################################################################# ## Code for Book: Applied Statistical Analysis in R ## A Graduate course for Psychology, Human factors, and Data Science ## (c) 2018 Shane T. Mueller, Ph.D. ## Michigan Technological University ## shanem@mtu.edu ## ## Textbook and videos available at http://pages.mtu.edu/~shanem/psy5210 ## ## ## This file contains example software code relevant for Chapter 12 ## Polynomial regression, non-parametric smoothing and regression, ## and transforms of the dependent variable. ## ## Libraries used MASS; rcompanion, rgl, gam ## ## set.seed(1004) x <- 1:10 y <- runif(10) #pdf("c13-curve.pdf",width=8,height=5) plot(x,y,pch=16,col="gold",main="Linear and quadratic models") points(x,y) ##add the squared term: x2 <- x^2 lm1 <- lm(y~x) lm2 <- lm(y~x+x2) anova(lm1,lm2) preddat <- data.frame(x=0:100/10,x2=(0:100/10)^2) points(preddat$x,predict(lm1,preddat),type="l") points(preddat$x,predict(lm2,preddat),type="l") #dev.off() cor(x,x2) x %*% x2 summary(lm1) summary(lm2) ##center the new predictors: xb <- x-mean(x) xb2 <- xb^2 xb2 <- xb2-mean(xb2) ##this needs to be done too. cor(xb,xb2) xb %*% xb2 lm(y~xb)$coef lm(y~xb+xb2)$coef #pdf("c13-poly.pdf",width=8,height=8) ##Form the polynomial values for 0..10 r2 <- 0:100/10 #identify a range to predict with par(mfrow=c(3,3),mar=c(2,3,2,1)) poly1 <- lm(y~x) plot(x,y,pch=16,main="1st order polynomial fit") points(r2,predict(poly1,list(x=r2)),type="l") x2 <- x^2 poly2 <- lm(y ~ x + I(x^2)) plot(x,y,pch=16,main="2nd order polynomial fit") points(x,poly2$fit,type="p") points(r2,predict(poly2,list(x=r2)),type="l") poly3 <- lm(y~x + I(x^2) + I( x^3)) plot(x,y,pch=16,main="3rd order polynomial fit") points(x,poly3$fit,type="p") points(r2,predict(poly3,list(x=r2)),type="l") poly4 <- lm(y~x + I(x^2) + I( x^3) + I(x^4)) plot(x,y,pch=16,main="4th order polynomial fit") points(x,poly4$fit,type="p") points(r2,predict(poly4,list(x=r2)),type="l") poly5 <- lm(y~x + I(x^2) + I( x^3) + I(x^4) + I(x^5)) plot(x,y,pch=16,main="5th order polynomial fit") points(x,poly5$fit,type="p") points(r2,predict(poly5,list(x=r2)),type="l") poly6 <- lm(y~x + I(x^2) + I( x^3) + I(x^4) + I(x^5) +I(x^6)) plot(x,y,pch=16,main="6th order polynomial fit") points(x,poly6$fit,type="p") points(r2,predict(poly6,list(x=r2)),type="l") poly7 <- lm(y~x + I(x^2) + I( x^3) + I(x^4) + I(x^5) +I(x^6) + I(x^7)) plot(x,y,pch=16,main="7th order polynomial fit") points(x,poly7$fit,type="p") points(r2,predict(poly7,list(x=r2)),type="l") poly8 <- lm(y~x + I(x^2) + I( x^3) + I(x^4) + I(x^5) +I(x^6) + I(x^7) + I(x^8)) plot(x,y,pch=16,main="8th order polynomial fit") points(x,poly8$fit,type="p") points(r2,predict(poly8,list(x=r2)),type="l") poly9 <- lm(y~x + I(x^2) + I( x^3) + I(x^4) + I(x^5) +I(x^6) + I(x^7) + I(x^8) + I(x^9)) plot(x,y,pch=16,main="9th order polynomial fit") points(x,poly9$fit,type="p") points(r2,predict(poly9,list(x=r2)),type="l") #dev.off() x.5 <- x-mean(x) x2.5 <- x^2 - mean(x^2) x3.5 <- x^3 - mean(x^3) x4.5 <- x^4 - mean(x^4) x5.5 <- x^5 - mean(x^5) poly5a <- lm(y~x+ I(x^2) + I( x^3) + I(x^4) + I(x^5)) poly5b <- lm(y~poly(x,5)) poly5c <- lm(y~x.5+x2.5+x3.5+x4.5+x5.5) poly5a poly5b poly5c summary(poly5a) summary(poly5b) summary(poly5c) #notice that the coefficients are all a bit different. The model fit is the same, but #depending on how you want to interpret it, things may differ. ##Scientific hypothesis and polynomial regression let <- read.csv("reading.csv",header=F) colnames(let) <- c("sub","length","trial","stim","rt") lm1 <- lm(rt~length,data=let) let$length2 <- let$length^2 lm2 <- lm(rt~length + length2,data=let) par(mfrow=c(1,1)) #pdf("c13rt.pdf",width=6,height=4) plot(let$length,let$rt,ylim=c(0,15000),xlab="Number of letters", ylab="Reading time (ms)",xlim=c(0,35)) abline(lm1$coeff,col='green') points(1:40,lm2$coef[1] + lm2$coef[2]*1:40+lm2$coef[3]*(1:40)^2,type="l",col='red') segments(-10,0:15*1000,40,0:15*1000,lty=3,col="grey") #dev.off() summary(lm1) summary(lm2) anova(lm1,lm2) ###########################3 ## Moving average: President approval ratings plot(presidents,type="o",main="Approval ratings of presidents") ##we could do this with polynomial regression, but what if we just ##used a moving average. That is, we replaced each value with a weighted ##sum of the values around it. lines(filter(presidents,c(1,1,1)/3),col="grey20",lwd=3) lines(filter(presidents,c(1,1,1,1,1)/5),col="red",lwd=3) ##build a moving average function ma ma <- function(x,n=5){filter(x,rep(1/n,n), sides=2)[1:length(x)]} ##apply ma to WWWUsage: pdf("c13www.pdf",width=6,height=4) plot(WWWusage,pch=16,type="o") points(ma(WWWusage,n=5),type="o",col= "blue") points(ma(WWWusage,n=15),type="o",col="green") plot(WWWusage) newdat <- WWWusage + rnorm(100)*20 points(newdat) points(ma(newdat,3),col="red",type="l",lwd=3) points(ma(newdat,5),col="red",type="l",lwd=3) points(ma(newdat,6),col="red",type="l",lwd=3) points(ma(newdat,10),col="red",type="l",lwd=3) points(ma(newdat,20),col="red",type="l",lwd=3) ##moving average with triangular kernel ma2 <- function(x,n=5){ tmp <- c(1:n, (n-1):1) kernel <- tmp/sum(tmp) filter(x,kernel, sides=2)} plot(WWWusage) points(newdat) lines(ma(newdat,5),col="red") lines(ma2(newdat,3),col="red") lines(ma2(newdat,5),col="red") dev.off() ################# ## Loess regression set.seed(100) datx <- runif(50)*5 daty <- sin(datx*5) + ((datx-2)/3)^2 + rnorm(50)*.25 plot(datx,daty) lmodel <- loess(daty~datx) xs <- 0:500/100 points(xs,predict(lmodel,xs),type="l",col="red",lwd=3) lmodel2 <- loess(daty~datx,span=.3) points(xs,predict(lmodel2,xs),type="l",col="blue",lwd=3) predict(lmodel2,10.3322) predict(lmodel2,3.3322) #################################################3 ###Ptracker example dat <- read.csv("ptracker.csv")[200:500,] time <- 117:168*100 ##Regularize in tenths of seconds mx <- loess(posx~time,span=.1,data=dat) my <- loess(posy~time,span=.1,data=dat) #pdf("c13ptracker.pdf") par(mfrow=c(2,2)) plot(dat$time,dat$posx,main="X position") points(time,predict(mx,data.frame(time=time)),type="l") plot(dat$time,dat$posy,main="Y position") points(time,predict(my,data.frame(time=time)),type="l") plot(dat$posx,dat$posy,type="b",main="X and Y") points(predict(mx,data.frame(time=time)), predict(my,data.frame(time=time) ), col="red",type="o",pch=1,cex=.3) time <- 11700:168000 ##Regularize in 100s of seconds points(predict(mx,data.frame(time=time)), predict(my,data.frame(time=time) ), col="red",type="o",pch=1,cex=.3) mx <- loess(posx~time,span=.02,data=dat) my <- loess(posy~time,span=.02,data=dat) plot(dat$posx,dat$posy,type="p") points(predict(mx,data.frame(time=time)), predict(my,data.frame(time=time) ), col="red",type="o",pch=1,cex=.3) dev.off() ############################### ## GAM (General Additive Models) and spline regression ##The gam library and some related functions let you essentially mix loess predictors into a linear model. ## We'll start with the gam function, which works like lm, but lets you more easily specify transformed ## predictor variables. It can use poly(), but also lo() around variables for loess fit, or ns() bs() for ## natural and b-splines. Unlike loess, it will permit inferential statistical tests. library(gam) cw <- ChickWeight g <- gam(weight~s(Time)+ Diet,data=cw) cw$fit <- g$fitted.values library(ggplot2) summary(g) plot(g) #pdf("c13-gam1.pdf") ggplot(cw,aes(x=Time,y=fit,group=Diet,color=Diet,group=Chick)) +geom_point() +geom_line() +theme_bw() #dev.off() ##All chicks have the same growth pattern at a different scale; diet is additive ## g <- gam(weight~lo(Time)*Chick+ Diet,data=cw) cw$fit2 <- g$fitted.values #pdf("c13-gam2.pdf") ggplot(cw,aes(x=Time,y=fit2,group=Chick,color=Diet)) +geom_point() + geom_line()+ theme_bw() #dev.off() summary(g) #plot(g) g <- lm(weight~ns(Time,5)+ Diet,data=cw) summary(g) anova(g) ##we can also mix in the splines directly into an lm using ns() or bs() in splines mx5 <- lm(posx~ns(time,df=5),data=dat) mx10 <- lm(posx~ns(time,df=10),data=dat) mx15 <- lm(posx~ns(time,df=15),data=dat) mx30<- lm(posx~ns(time,df=30),data=dat) par(mfrow=c(1,1)) plot(dat$time,dat$posx,type="l") points(dat$time,mx5$fit,col="red") points(dat$time,mx10$fit,col="gold") points(dat$time,mx15$fit,col="navy") points(dat$time,mx30$fit,col="darkgreen") #################### ###Interactions between continuous predictors x <- -100:100 y <- runif(201)*201-100 z <- 3*x -5*y + x*y - 50 library(rgl) plot3d(x,y,z) z <- 3*x -5*y + x*y - 50 + rnorm(201)*2000 plot3d(x,y,z) xy <- x*y interaction <- lm (z~x+y + x:y) interaction2 <- lm(z~ x*y) interaction3 <- lm(z~x + y + xy) ##Each of these are essentially the same: summary(interaction) summary(interaction2) summary(interaction3) ########################## ## Transformations of the outcome: data(ChickWeight) lm1 <- lm(weight~Time:Diet,data=ChickWeight) lm2 <- lm(log(weight)~Time:Diet,data=ChickWeight) lm3 <- lm(log(weight)~log(Time+1):Diet,data=ChickWeight) lm22 <- lm(log(weight)~Time:Chick,data=ChickWeight) ##exp(coef) gives the growth rate. diet <- t(table(ChickWeight$Diet,ChickWeight$Chick)>0) %*% c(1:4) #pdf("c13chick.pdf",width=8,height=5) plot(ChickWeight$Time,(ChickWeight$weight),col="grey",xlab="Days",ylab="Weight (g)") for(i in 2:length(lm22$coef)) { x <- 0:25 fit <- exp(lm22$coef[1] + lm22$coef[i]*x) points(x,fit,type='l',col=c("red","blue","darkgreen","orange")[diet[i-1]]) } points(x, exp(lm2$coef[1] + lm2$coef[2]*x),type="l",col="red",lwd=8,lty=4) points(x, exp(lm2$coef[1] + lm2$coef[3]*x),type="l",col="blue",lwd=8,lty=4) points(x, exp(lm2$coef[1] + lm2$coef[4]*x),type="l",col="darkgreen",lwd=8,lty=4) points(x, exp(lm2$coef[1] + lm2$coef[5]*x),type="l",col="orange",lwd=8,lty=4) #dev.off() ##Find the mean growth rate. tapply(exp(lm22$coef[2:length(lm22$coef)]),list(diet),mean) ################################### ## Transformations: census <- read.csv("census2013.csv") labels <- census$GEO.display.label states <-sapply(labels, function(x){strsplit(as.character(x),", ")[[1]][2]}) nrow(census) sum(census$respop72011<5000) sum(census$respop72011>50000) sum(census$respop72011>500000) hist(census$respop72011,breaks=100) hist(log(census$respop72011),breaks=100) ##aggergate states by mean or exp-log-mean: untrans <- aggregate(census$respop72011,list(state=states),mean) logged <- aggregate(census$respop72011,list(state=states),function(x){exp(mean(log(x)))}) untrans$logged <- logged$x barplot(t(as.matrix(untrans[,-1])),names=untrans$state,beside=T,horiz=T,las=1) x <-runif(100) * 20 z <- .1/x +runif(100)*.1 zinv <- (x+runif(100)/.2) hist(z) qqnorm(z) hist(zinv) qqnorm(zinv) plot(x,z) plot(x,1/z) model1 <- lm(zinv~x) model2 <- lm(1/z~x) summary(model1) summary(model2) ##Log-odds transform and arcsin transform: logodds <- function(p){log(p/(1-p))} logistic <- function(x){1/(1+exp(-x))} cond <- sample(1:15,1000,replace=T) success <- runif(1000)< 1-.4^(cond/3) cond[1:15] <- 1:15 ##make sure we have no perfect accuracies success[1:15] <- 0 pc <- aggregate(success,list(cond=cond),mean) plot(pc) lm1 <- lm(pc$x~pc$cond) lm2 <- lm(logodds(pc$x)~pc$cond) lm3 <- lm(asin(pc$x)~pc$cond) par(mfrow=c(1,1)) #pdf("c13-transforms.pdf",fig.width=5,fig.height=6) plot(pc$cond,pc$x,xlab="Condition",ylab="Observed value",ylim=c(0,1.1)) ##pretty bad abline(lm1$coef) ##prediction of log-odds model points(pc$cond, logistic(predict(lm2)),type="l",lwd=2,lty=3,col="cadetblue3") ##pretty good points(pc$cond, sin(predict(lm3)),type="l",lwd=2,lty=4,col="orange") legend(8,.4,c("Linear","Log-odds","Arcsine"),lty=c(1,3,4),col=c("black","cadetblue3","orange"), lwd=3) #dev.off() ## ##Common transforms: Tukey' spower transform and the Box-Cox library(rcompanion) z2 <- transformTukey(z,plotit=T) z2 plot(-z^(-.15),z2) model3 <- lm(z2~x) summary(model3) #if we look at the census data, a transform close to 0 (log) is ideal. census2 <- transformTukey(census$respop72011) hist(census2) ##Alternate: Box-Cox transform library(MASS) #pdf("c13-boxcox.pdf",width=8,height=5) par(mfrow=c(1,2)) bc<-boxcox(z~x,plotit=T) bc2 <- boxcox(census$respop72011~census$GEO.id2,plotit=T) #dev.off() ################################ ## Example polynomial regression: x<- 1:100 g1 <- lm(WWWusage~poly(x,1)) g2 <- lm(WWWusage~poly(x,2)) g3 <- lm(WWWusage~poly(x,3)) g4 <- lm(WWWusage~poly(x,4)) g5 <- lm(WWWusage~poly(x,5)) g6 <- lm(WWWusage~poly(x,6)) g7 <- lm(WWWusage~poly(x,7)) pdf("c13ex1.pdf",width=8,height=5) plot(x,WWWusage) points(g1$fit,type="l",col="red") points(g2$fit,type="l",col="red") points(g3$fit,type="l",col="red") points(g4$fit,type="l",col="red") points(g5$fit,type="l",col="red") points(g6$fit,type="l",col="red") points(g7$fit,type="l",col="red") dev.off()