#################################################################
## Code for Book: Applied Statistical Analysis in R
## A Graduate course for Psychology, Human Factors, and Data Science
## (c) 2018-2020 Shane T. Mueller, Ph.D.
## Michigan Technological University
## shanem@mtu.edu
##
## Textbook and videos available at http://pages.mtu.edu/~shanem/psy5210
##
##
## This file contains example software code relevant for Chapter 14
## Materials on testing assumptions of the linear model and detecting outliers
##

## Consider the following data sets. We can assess how good each model is by looking at R^2:

set.seed(302)
x <- rnorm(1000)
y <- 33 + x + rnorm(1000)

modela <- lm(y~x)
summary(modela)
anova(modela)

#pdf("c14-scatter1.pdf",width=9,height=5)
par(mfrow=c(1,3))
plot(x,y,col="gold",pch=16,ylim=c(10,50),
     main=paste("R2=",round(summary(modela)$r.squared,3)))
points(x,y)
abline(modela$coef)

## The same relationship with more residual noise:
y2 <- 33 + x + rnorm(1000)*5
modelb <- lm(y2~x)
plot(x,y2,col="gold",pch=16,ylim=c(10,50),
     main=paste("R2=",round(summary(modelb)$r.squared,3)))
abline(modelb$coef)
points(x,y2)
summary(modelb)

## The original data with a handful of extreme outliers added:
set.seed(100)
y3 <- y
noisy <- sample(1000,15)
y3[noisy] <- y3[noisy] + rnorm(15)*35
modelc <- lm(y3~x)
plot(x,y3,col="gold",pch=16,
     main=paste("R2=",round(summary(modelc)$r.squared,3)))
points(x,y3)
abline(modelc$coef)
#dev.off()


#############################
## Testing the assumptions of the model (normality)

set.seed(111)
x <- rnorm(20)
y <- x + rnorm(20)
g1 <- lm(y~x)

hist(g1$resid)
qqnorm(g1$resid)
shapiro.test(g1$resid)


## Detecting and handling influential observations and outliers

#pdf("c14-smallscatter.pdf",width=8,height=6)
plot(x,y,col="gold",pch=16,cex=1.5,
     main=paste("R=",round(cor(x,y),2)))
points(x,y,cex=1.5)
#dev.off()

cor(x,y)
cor.test(x,y)

library(BayesFactor)
correlationBF(x,y)

## What if we remove point 13?
plot(x[-13],y[-13])
cor.test(x[-13],y[-13])
correlationBF(x[-13],y[-13])

## Is point 13 real, or an error? Is it an outlier? Does it matter?


## Examining residuals and standardized residuals
## The goal is to see whether the residuals depend on the fitted value in a systematic way.

#pdf("c14-residuals.pdf",width=9,height=5)
par(mfrow=c(1,2))
plot(g1,which=1)
plot(g1$fit,g1$resid,main="Fit versus residuals of model")
abline(0,0,lty=3)
#dev.off()


## Non-constant variance (and a test for it)

set.seed(5)
x2 <- runif(200)*10
y2 <- 33 + x2 + rnorm(200)*(x2)   ## residual variance grows with x2

#pdf("c14-residuals2.pdf",width=9,height=5)
par(mfrow=c(1,2))
plot(x2,y2,col="gold",pch=16)
points(x2,y2)
g2 <- lm(y2~x2)
plot(g2,which=1)
#dev.off()

## Test for non-constant variance:
## called the Breusch-Pagan test, or the Cook & Weisberg (1983) score test
library(car)
ncvTest(g2)

## Look at the residuals divided by the sd of the residuals
g1 <- lm(y~x)
(g1$resid/sd(g1$resid))

#pdf("c14-standresid.pdf",width=9,height=5)
par(mfrow=c(1,3))
plot(g1$fit,g1$resid/sd(g1$resid))
plot(g1$fit,rstandard(g1))
abline(-2,0)
abline(-1,0)
abline(0,0)
abline(1,0)
abline(2,0)
plot(g1$resid/sd(g1$resid),rstandard(g1))
#dev.off()

cor(g1$resid,rstandard(g1))


## Leverage
g1 <- lm(y~x)
leverage <- hatvalues(g1)

#pdf("c14-leverage.pdf",width=8,height=5)
par(mfrow=c(1,2))
plot(x,y,pch=16,main="Leverage of data")
points(x,y,cex=1.2+leverage*5)
barplot(leverage)
abline(mean(leverage),0,lwd=2)
abline(2*mean(leverage),0)
#dev.off()

## Notice that with a single predictor, leverage is essentially the squared
## distance of each x value from the mean of x:
cor((x-mean(x))^2,leverage)

## Leverage with two predictors:
#pdf("c14-2dleverage.pdf")
x2 <- runif(20)
model2 <- lm(y~x+x2)
lev2 <- hatvalues(model2)   ## leverage depends only on the predictors, not on y!
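
## Side check (a sketch added here, not part of the original script): leverage
## values always sum to the number of estimated coefficients, and with a single
## predictor h_i = 1/n + (x_i - mean(x))^2 / sum((x - mean(x))^2).
sum(hatvalues(g1))   ## 2: intercept + one slope
sum(lev2)            ## 3: intercept + two slopes
h.byhand <- 1/length(x) + (x - mean(x))^2 / sum((x - mean(x))^2)
all.equal(as.numeric(hatvalues(g1)), h.byhand)   ## TRUE: matches the formula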
plot(x,x2,pch=16)
points(x,x2,cex=1.2+lev2*5)
#dev.off()


## Studentized residuals
## i.e., residuals standardized using an error estimate that accounts for each point's leverage
rstudent(g1)

#pdf("c14-studentized.pdf",width=9,height=5)
par(mfrow=c(1,2))
plot(x,y,pch=16)
points(x,y,cex=abs(rstudent(g1))+2)
abline(lm(y~x)$coef)
plot(g1$fit,rstudent(g1))
abline(0,0)
#dev.off()

leverage <- hat(model.matrix(g1))
plot(leverage,rstudent(g1))

?influence.measures

## influence() returns (among other things) the change in each coefficient when
## each case is dropped, so the full-model coefficients minus that change give
## the regression line with that case removed:
ig <- influence(g1)

#pdf("c14-influence.pdf",width=6,height=6)
plot(x,y,pch=16,col="gold")
points(x,y)
for(i in 1:20) abline(g1$coef - ig$coefficients[i,])
#dev.off()


## Jackknife the correlation: remove each point in turn and recompute
cors <- matrix(rep(0,20*3),ncol=3)
for(i in 1:20)
{
  keep <- rep(T,20)
  keep[i] <- F
  cortmp <- cor.test(x[keep],y[keep])
  cors[i,] <- c(cortmp$estimate,cortmp$statistic,cortmp$p.value)
}

#pdf("c14-jackknife.pdf",width=8,height=5)
par(mfrow=c(1,2))
plot(cors[,3],ylab="p-value of correlation when point removed")
abline(.05,0)

plot(x,y)
g1 <- lm(y~x)
g2 <- lm(y[-13]~x[-13])
points(x[13],y[13],pch=16,col="red")
abline(0,1,lty=3,lwd=3)           ## the 'true' line
abline(g1$coef)                   ## the full data set
abline(g2$coef,col="red",lty=2)   ## removing the 'outlier'
#dev.off()


## Jackknife using other outcomes: residuals
set.seed(111)
x <- rnorm(20)
y <- x + rnorm(20)

resid <- rep(0,20)
for(i in 1:20)
{
  keep <- rep(T,20)
  keep[i] <- F
  xtmp <- x[keep]
  model <- lm(y[keep]~xtmp)
  resid[i] <- (y[i]-predict(model,list(xtmp=x[i])))/sd(model$resid)
}
plot(resid/sd(resid),main="Jackknife on standardized residuals")

## Jackknife on the coefficients themselves:
model1 <- lm(y~x)
int <- rep(0,20)
beta <- rep(0,20)
for(i in 1:20)
{
  keep <- rep(T,20)
  keep[i] <- F
  xtmp <- x[keep]
  model <- lm(y[keep]~xtmp)
  int[i]  <- model1$coef[1]-model$coef[1]
  beta[i] <- model1$coef[2]-model$coef[2]
}

#pdf("c14-jackknife2.pdf",width=8,height=5)
par(mfrow=c(2,2),mar=c(2,2,2,2))
plot(int,main="Jackknife on intercept")
plot(beta,main="Jackknife on slope")
plot(x,y,main="Jackknife on linear model")
for(i in 1:20) abline(model1$coef[1]+int[i],beta[i]+model1$coef[2])
plot(x,y,main="Jackknife on beta diagnosticity")
fits <- dfbetas(model1)
for(i in 1:20) abline(fits[i,],col="red")
#dev.off()

## Compare several influence measures for each observation:
#pdf("c14-comparison.pdf",width=8,height=6)
plot(hatvalues(model1),type="o",col="red",ylim=c(-1,2))
points(cooks.distance(model1),type="o",col="darkgreen")
points(dffits(model1),type="o",col="orange")
#dev.off()

influence.measures(model1)


## A real data set: tmt.csv
dat <- read.csv("tmt.csv")
dat$age <- as.factor(dat$age)

#pdf("c14-tmt-model.pdf",width=9,height=5)
par(mfrow=c(1,2))
plot(dat$time~(dat$age)+dat$type)
lm1 <- lm(time~age+type,data=dat)
qqnorm(lm1$resid)
#dev.off()

#pdf("c14-tmt-model-lmplot.pdf")
par(mfrow=c(2,2))
plot(lm1)
#dev.off()

#pdf("c14-tmt-model-transforms.pdf")
par(mfrow=c(2,2))
hist(lm1$resid)
hist(log(lm1$resid-min(lm1$resid)+1),breaks=20)   ## shift by 1 so the smallest residual is not log(0)
plot(log(dat$time)~(dat$age))
points(dat$age,log(dat$time))
plot(log(dat$time)~(dat$type))
points(dat$type,log(dat$time),col="grey")
#dev.off()

lm2 <- lm(log(time)~age+type,data=dat)
qqnorm(lm2$resid)

## This is a bit better; the fastest time was around 50
lm3 <- lm(log(time-50)~age+type,data=dat)
qqnorm(lm3$resid)
hist(lm3$resid)

summary(lm1)
summary(lm2)
summary(lm3)


######### Downsides of transforming:
lm1.int <- lm(time~age*type,data=dat)
lm3.int <- lm(log(time-50)~age*type,data=dat)
anova(lm1.int)
anova(lm3.int)

agg1 <- tapply(dat$time,list(dat$age,dat$type),mean)
agg3 <- tapply(log(dat$time-50),list(dat$age,dat$type),mean)
matplot(t(agg1),type="o",ylab="Response time (s)")
matplot(t(agg3),type="o",ylab="log(Response time)",yaxt="n",ylim=c(2.5,5))
axis(2,log(c(65,75,100,150,200,250)-50),
     c(65,75,100,150,200,250),las=1)

library(car)
ncvTest(lm1.int)
ncvTest(lm3.int)
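
## Side note (a sketch, not part of the original analysis): instead of picking
## the log(time-50) transform by hand, the Box-Cox procedure in MASS can
## suggest a power transformation of the response. This assumes dat (tmt.csv)
## is loaded as above and that time is strictly positive.
library(MASS)
bc <- boxcox(time ~ age * type, data = dat, lambda = seq(-2, 2, 0.1))
bc$x[which.max(bc$y)]   ## best-fitting lambda; a value near 0 supports a log transform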