################################################################# ## Code for Book: Applied Statistical Analysis in R ## A Graduate course for Psychology, Human factors, and Data Science ## (c) 2018-2020 Shane T. Mueller, Ph.D. ## Michigan Technological University ## shanem@mtu.edu ## ## Textbook and videos available at http://pages.mtu.edu/~shanem/psy5210 ## ## ## This file contains example software code relevant for Chapter 11 ## Comparing nested models. ## ## ## Libraries used faraway (for some data example) ## ## ## A data set with correlated respo nses: dat <- read.csv("survey.csv") summary(dat) boxplot(dat[,2:9],col="gold",las=3) pairs(dat[,2:9]) round(cor(dat[,2:9]),3) ##Fit a model predicting academic by everything else: lm1 <- lm(academic~communication+leadership+handson+teaming+priorwork+multicultural+creative+group,data=dat) summary(lm1) drop1(lm1,test="F") ##It looks like dropping leadership and handson have the least impact lm2 <- lm(academic~communication+teaming+priorwork+multicultural+creative+group,data=dat) summary(lm2) drop1(lm2,test="F") anova(lm1,lm2) ##Multicultural is not adding much either lm3 <- lm(academic~communication+teaming+priorwork+creative+group,data=dat) summary(lm3) drop1(lm3,test="F") anova(lm1,lm3) lm4 <- lm(academic~communication+priorwork+creative+group,data=dat) lm5 <- lm(academic~communication+priorwork+group,data=dat) lm6 <- lm(academic~priorwork+group,data=dat) lm7 <- lm(academic~group,data=dat) lm8 <- lm(academic~1,data=dat) lm7g <- glm(academic~group,data=dat) anova(lm1,lm2,lm3,lm4,lm5,lm6,lm7,lm8) anova(lm1,lm4,lm8) anova(lm1,lm5) ##These two models are not nested: lm9 <- lm(academic~communication,data=dat) lm10 <- lm(academic~creative,data=dat) anova(lm9,lm10) anova(lm5,lm10) ##Score AIC for each model: data.frame(model=paste("lm",1:10,sep=""), rbind(extractAIC(lm1), extractAIC(lm2), extractAIC(lm3), extractAIC(lm4), extractAIC(lm5), extractAIC(lm6), extractAIC(lm7), extractAIC(lm8), extractAIC(lm9), extractAIC(lm10))) ##BIC is a small modification of AIC: extractBIC <- function(model) { extractAIC(model,k=log(length(model$residuals))) } data.frame(model=paste("lm",1:10,sep=""), rbind(extractBIC(lm1), extractBIC(lm2), extractBIC(lm3), extractBIC(lm4), extractBIC(lm5), extractBIC(lm6), extractBIC(lm7), extractBIC(lm8), extractBIC(lm9), extractBIC(lm10))) ##Use stepwise model search. Here, we use both directions every time. gsmall <- step(lm1,direction="both", k=log(nrow(dat))) summary(gsmall) ##Compare to bayes factor: library(BayesFactor) b1 <- lmBF(academic~communication+leadership+handson+teaming+priorwork+multicultural+creative+group,data=dat) plot(b1) b3 <- lmBF(academic~communication+teaming+priorwork+creative+group,data=dat) b3/b1 bmodel <- regressionBF(academic~communication+leadership+handson+teaming+priorwork+multicultural+creative,data=dat) plot(head(bmodel)) head(bmodel) ########################################################## #######The effect of parameter selection on prediction ##normal prediction set.seed(100) age <- 20+runif(50) * 80 income <- 20000+ runif(50)*50000 digitspan <- ( age * .5 + income * .003)/20 + rnorm(50) data=data.frame(age,income,digitspan) model <- lm(digitspan~age+income,data=data) par(mfrow=c(1,2)) plot(age,digitspan,col="gold",pch=16) points(age,digitspan) points(age,model$fit,cex=.5) segments(age,digitspan,age,model$fit,lty=3,lwd=.5) plot(income,digitspan,col="gold",pch=16) points(income,digitspan) points(income,model$fit,cex=.5) segments(income,digitspan,income,model$fit,lty=3,lwd=.5) plot(digitspan,model$fit,pch=16,col="gold",main="Observed vs. Predicted digit span",ylab="Predicted digit span") points(digitspan,model$fit) ##Age doesn't look very predictive, because income is more predictive here. But summary tells it it still matters. summary(model) ##Fit vs. predicted values--they are the same: model$fit predict(model,data) cor(model$fit,predict(model,data)) predict(model,list(age=30,income=10000)) predict(model,list(age=30,income=40000)) #WHAT IFWE HAD OTHER VARIABLES TOO? set.seed(1001) n <- 50 xcat <- as.factor(sample(c("A","B","C"), n,replace=T)) x2 <- runif(n) x3 <- runif(n) x4 <- 1/runif(n) x5 <- runif(n) x6 <- runif(n) x7 <- runif(n) y <- as.numeric(xcat)*3 + 5*x2 - 10*x3 + rnorm(n)*2.5 #y[sample(1:n,5)] <- rnorm(5,mean(y),sd(y)) dat <- data.frame(xcat,x2,x3,x4,x5,x6,x7,y) model <- lm(y~0+xcat+x2+x3+x4+x5+x6+x7,data=dat) summary(model) ##predict the original data: predict(model,dat) ##The fit is pretty good here: cor(predict(model,dat),y)^2 ## or A new data: predict(model,dat[1,]) dat[1,] tmp <- dat[1,] tmp$xcat="B" predict(model,tmp) predict(model,list(xcat="A",x2=.8,x3=.01,x4=1.5,x5=3,x6=1.5,x7=1.3)) set.seed(8) reorder <- (1:n)[order(runif(n))] half<- floor(n/2) fitset <- reorder[1:half] extra <- reorder[(half+1):n] ##fit the model on a random half of the data: modelb <- lm(y~0+xcat+x2+x3+x4+x5+x6+x7,data=dat[fitset,]) modelc <- lm(y~0+xcat+x2+x3,data=dat[fitset,]) summary(model) summary(modelb) summary(modelc) summary(model)$r.squared summary(modelb)$r.squared summary(modelc)$r.squared params <- rbind(model$coef, modelb$coef, c(modelc$coef,0,0,0,0), c(3,6,9,5,-10,0,0,0,0)) round(params,3) ##fit original model its own left-out data: cor(dat$y[fitset],predict(model,dat[fitset,]))^2 cor(dat$y[extra],predict(model,dat[extra,]))^2 ##overfit model: cor(dat$y[fitset],predict(modelb,dat[fitset,]))^2 cor(dat$y[extra],predict(modelb,dat[extra,]))^2 ##smaller model: cor(dat$y[fitset],predict(modelc,dat[fitset,]))^2 cor(dat$y[extra],predict(modelc,dat[extra,]))^2 ######################### ## Predicting categorical relationships: gender <- sample(as.factor(c("F","M")),50,replace=T) height <- 30 + as.numeric(gender) * 10 + runif(50)*30 ##height depends on gender weight <- 100 + as.numeric(gender) * 20 + runif(50)*50 ##weight depends on gender too plot(height,weight,col=gender,pch=16) model <- lm(as.numeric(gender)~height+weight) model plot(model$fit~gender,ylab="Gender coefficient") points(as.numeric(gender),model$fit) abline(1.5,0,lwd=3) predictedgender <- model$fit > 1.5 table(gender,c("F","M")[(predictedgender+1)]) ############Example: Modeling Chick Weight summary(ChickWeight) plotchicks <- function() { cscheme <- c("red","darkgreen","black","orange") ##write down the times of measurement--they are not regular times <- (c(0,2,4,6,8,10,12,14,16,18,20,21)) ##Aggregate growth over the diets and times cw.agg <- tapply((ChickWeight$weight), list(time=ChickWeight$Time, diet=ChickWeight$Diet),mean) cw.bysub<- tapply((ChickWeight$weight), list(time=ChickWeight$Time, chick=ChickWeight$Chick),mean) diets <- aggregate(as.numeric(as.character(ChickWeight$Diet)), list(chick=ChickWeight$Chick),median) #Replot the growth curves, but and make reasonable headers matplot(times,cw.bysub,col=cscheme[diets$x],type="l",lty=3, ylab="Weight (mg)",xlab="Age (days)",ylim=c(0,400),las=1) points(ChickWeight$Time,ChickWeight$weight,cex=.5, pch=1, col=cscheme[ChickWeight$Diet]) #Underlay a white line matplot(times,cw.agg,add=T,type="l",lwd=15, col="white",lty=1) matplot(times,cw.agg,add=T,type="l",lwd=5, col=cscheme,lty=1) ##Whoops, no title: title("Chick weights over time by diet") ##Lets add some gridpoints points(rep(-10:25,each=17), rep(0:16*25,36),pch=".",cex=2) ##Make a legend here. legend(2,350,paste("Diet",c(1:4)),col=cscheme,lty=1,lwd=5) } plotchicks() summary(ChickWeight) ##compare each diet with a t-test; cwd <- ChickWeight$Diet t.test(ChickWeight$weight[cwd==1],ChickWeight$weight[cwd==2]) t.test(ChickWeight$weight[cwd==1],ChickWeight$weight[cwd==3]) t.test(ChickWeight$weight[cwd==1],ChickWeight$weight[cwd==4]) t.test(ChickWeight$weight[cwd==2],ChickWeight$weight[cwd==3]) t.test(ChickWeight$weight[cwd==2],ChickWeight$weight[cwd==4]) t.test(ChickWeight$weight[cwd==3],ChickWeight$weight[cwd==4]) ##What assumptions does this violate? ##What is wrong here? lm1 <- lm(weight~Diet,data=ChickWeight) summary(lm1) plotchicks() abline(102,0,col="red",lwd=3) abline(102+19.97,0,col="darkgreen",lwd=3) abline(102+40.3,0,col="black",lwd=3) abline(102+32.62,0,col="orange",lwd=3) ##Pick out the diet of each chick: diets <- tapply(ChickWeight$Diet,list(ChickWeight$Chick),function(x)x[[1]]) minweights <- tapply(ChickWeight$weight,list(ChickWeight$Chick),min) maxweights <- tapply(ChickWeight$weight,list(ChickWeight$Chick),max) ##Let's recategorize by weight gains ratio <- maxweights/minweights gain <- maxweights-minweights hist(ratio,breaks=20) hist(gain,breaks=20) plot(ratio,gain) ##It hardly matters ##How about we do a t-test on gains, rather than all weights. t.test(gain[diets==1],gain[diets==2]) t.test(gain[diets==1],gain[diets==3]) t.test(gain[diets==1],gain[diets==4]) t.test(gain[diets==2],gain[diets==3]) t.test(gain[diets==2],gain[diets==4]) t.test(gain[diets==3],gain[diets==4]) ##How do these tests differ? Look at the degrees of freedom. plotchicks() ##what if we look at just the end-weights end <- tapply(gain,list(diets),mean) abline(end[1],0,col="red",lwd=3 ) abline(end[2],0,col="darkgreen",lwd=3 ) abline(end[3],0,col="black",lwd=3 ) abline(end[4],0,col="orange",lwd=3) ##that's not right--that is just the gain; ##it assumes chickens weight 0 on day 0 (but the intercepts were not 0) plotchicks() start <- tapply(minweights,list(diets),mean) end <- tapply(gain,list(diets),mean) abline(end[1]+start[1],0,col="red",lwd=3 ) abline(end[2]+start[2],0,col="darkgreen",lwd=3 ) abline(end[3]+start[3],0,col="black",lwd=3 ) abline(end[4]+start[4],0,col="orange",lwd=3 ) ##How about this--what slope and intercept are we modeling here? lm2 <- lm(weight~ Time+Diet,data=ChickWeight) summary(lm2) plotchicks() abline(lm2$coef[1],lm2$coef[2],col="red",lwd=3) abline(lm2$coef[1]+lm2$coef[3],lm2$coef[2],col="darkgreen",lwd=3) abline(lm2$coef[1]+lm2$coef[4],lm2$coef[2],col="black",lwd=3) abline(lm2$coef[1]+lm2$coef[5],lm2$coef[2],col="orange",lwd=3) ##How about this slope and intercept. : is interaction. lm3 <- lm(weight~Time:Diet,data=ChickWeight) summary(lm3) plotchicks() abline(lm3$coef[1],lm3$coef[2],col="red",lwd=5,lty=4) abline(lm3$coef[1],lm3$coef[3],col="darkgreen",lwd=5,lty=4) abline(lm3$coef[1],lm3$coef[4],col="black",lwd=5,lty=4) abline(lm3$coef[1],lm3$coef[5],col="orange",lwd=5,lty=4) ##How about here? lm4 <- lm(weight~Time*Diet,data=ChickWeight) summary(lm4) plotchicks() abline(lm4$coef[1],lm4$coef[2],col="red",lwd=5,lty=4) abline(lm4$coef[1]+lm4$coef[3],lm4$coef[2]+lm4$coef[6],col="darkgreen",lwd=5,lty=4) abline(lm4$coef[1]+lm4$coef[4],lm4$coef[2]+lm4$coef[7],col="black",lwd=5,lty=4) abline(lm4$coef[1]+lm4$coef[5],lm4$coef[2]+lm4$coef[8],col="orange",lwd=5,lty=4) ##One last look--categorical examination weightgain <- ifelse(maxweights>=300,"jumbo", ifelse(maxweights<=150,"runt","normal")) chickTable <- table(diets,weightgain) ## Does the type of chick depend of feed? chisq.test(chickTable)