#################################################################
##  Code for Book: Applied Statistical Analysis in R
##   A Graduate course for Psychology, Human factors, and Data Science
##  (c) 2018 Shane T. Mueller, Ph.D.
##  Michigan Technological University
##  shanem@mtu.edu
##
##  Textbook and videos available at http://pages.mtu.edu/~shanem/psy5210
##
##
## This file contains example software code relevant for Chapter 12
##  Polynomial regression, non-parametric smoothing and regression,
##   and transforms of the dependent variable.
##
##   Libraries used MASS; rcompanion, rgl, gam
##
##


set.seed(1004)
x <- 1:10
y <- runif(10)


#pdf("c13-curve.pdf",width=8,height=5)
plot(x,y,pch=16,col="gold",main="Linear and quadratic models")
points(x,y)

##add the squared term:
x2 <- x^2

lm1 <- lm(y~x)
lm2 <- lm(y~x+x2)
anova(lm1,lm2)

preddat <- data.frame(x=0:100/10,x2=(0:100/10)^2)
points(preddat$x,predict(lm1,preddat),type="l")
points(preddat$x,predict(lm2,preddat),type="l")

#dev.off()


cor(x,x2)
x %*% x2

summary(lm1)
summary(lm2)


##center the new predictors:
xb <- x-mean(x)
xb2 <- xb^2
xb2 <- xb2-mean(xb2) ##this needs to be done too.


cor(xb,xb2)
xb %*% xb2

lm(y~xb)$coef
lm(y~xb+xb2)$coef


#pdf("c13-poly.pdf",width=8,height=8)
##Form the polynomial values for 0..10
r2 <- 0:100/10  #identify a range to predict with
par(mfrow=c(3,3),mar=c(2,3,2,1))
poly1 <- lm(y~x)
plot(x,y,pch=16,main="1st order polynomial fit")
points(r2,predict(poly1,list(x=r2)),type="l")
x2 <- x^2
poly2 <- lm(y ~ x + I(x^2))
plot(x,y,pch=16,main="2nd order polynomial fit")
points(x,poly2$fit,type="p")
points(r2,predict(poly2,list(x=r2)),type="l")

poly3 <- lm(y~x + I(x^2) + I( x^3))
plot(x,y,pch=16,main="3rd order polynomial fit")
points(x,poly3$fit,type="p")
points(r2,predict(poly3,list(x=r2)),type="l")


poly4 <- lm(y~x + I(x^2) + I( x^3) + I(x^4))
plot(x,y,pch=16,main="4th order polynomial fit")
points(x,poly4$fit,type="p")
points(r2,predict(poly4,list(x=r2)),type="l")

poly5 <- lm(y~x + I(x^2) + I( x^3) + I(x^4) + I(x^5))
plot(x,y,pch=16,main="5th order polynomial fit")
points(x,poly5$fit,type="p")
points(r2,predict(poly5,list(x=r2)),type="l")

poly6 <- lm(y~x + I(x^2) + I( x^3) + I(x^4) + I(x^5) +I(x^6))
plot(x,y,pch=16,main="6th order polynomial fit")
points(x,poly6$fit,type="p")
points(r2,predict(poly6,list(x=r2)),type="l")


poly7 <- lm(y~x + I(x^2) + I( x^3) + I(x^4) + I(x^5) +I(x^6) + I(x^7))
plot(x,y,pch=16,main="7th order polynomial fit")
points(x,poly7$fit,type="p")
points(r2,predict(poly7,list(x=r2)),type="l")


poly8 <- lm(y~x + I(x^2) + I( x^3) + I(x^4) + I(x^5) +I(x^6) + I(x^7) + I(x^8))
plot(x,y,pch=16,main="8th order polynomial fit")
points(x,poly8$fit,type="p")
points(r2,predict(poly8,list(x=r2)),type="l")


poly9 <- lm(y~x + I(x^2) + I( x^3) + I(x^4) + I(x^5) +I(x^6) + I(x^7) + I(x^8) + I(x^9))
plot(x,y,pch=16,main="9th order polynomial fit")
points(x,poly9$fit,type="p")
points(r2,predict(poly9,list(x=r2)),type="l")


#dev.off()


x.5 <- x-mean(x)
x2.5 <- x^2 - mean(x^2)
x3.5 <- x^3 - mean(x^3)
x4.5 <- x^4 - mean(x^4)
x5.5 <- x^5 - mean(x^5)


poly5a <- lm(y~x+ I(x^2) + I( x^3) + I(x^4) + I(x^5))
poly5b <- lm(y~poly(x,5))
poly5c <- lm(y~x.5+x2.5+x3.5+x4.5+x5.5)
poly5a
poly5b
poly5c
summary(poly5a)
summary(poly5b)
summary(poly5c)
#notice that the coefficients are all a bit different. The model fit is the same, but
#depending on how you want to interpret it, things may differ.


##Scientific hypothesis and polynomial regression

let <- read.csv("reading.csv",header=F)
colnames(let) <- c("sub","length","trial","stim","rt")


lm1 <- lm(rt~length,data=let)
let$length2 <- let$length^2
lm2 <- lm(rt~length + length2,data=let)

par(mfrow=c(1,1))

#pdf("c13rt.pdf",width=6,height=4)
plot(let$length,let$rt,ylim=c(0,15000),xlab="Number of letters",
     ylab="Reading time (ms)",xlim=c(0,35))

abline(lm1$coeff,col='green')
points(1:40,lm2$coef[1] + lm2$coef[2]*1:40+lm2$coef[3]*(1:40)^2,type="l",col='red')    
segments(-10,0:15*1000,40,0:15*1000,lty=3,col="grey")
#dev.off()

summary(lm1)
summary(lm2)
anova(lm1,lm2)


###########################3
## Moving average: President approval ratings

plot(presidents,type="o",main="Approval ratings of presidents")


##we could do this with polynomial regression, but what if we just
##used a moving average. That is, we replaced each value with a weighted
##sum of the values around it.
lines(filter(presidents,c(1,1,1)/3),col="grey20",lwd=3)
lines(filter(presidents,c(1,1,1,1,1)/5),col="red",lwd=3)


##build a moving average function ma
ma <- function(x,n=5){filter(x,rep(1/n,n), sides=2)[1:length(x)]}
##apply ma to WWWUsage:


pdf("c13www.pdf",width=6,height=4)
plot(WWWusage,pch=16,type="o")
points(ma(WWWusage,n=5),type="o",col= "blue")
points(ma(WWWusage,n=15),type="o",col="green")


plot(WWWusage)
newdat <- WWWusage + rnorm(100)*20
points(newdat)
points(ma(newdat,3),col="red",type="l",lwd=3)
points(ma(newdat,5),col="red",type="l",lwd=3)
points(ma(newdat,6),col="red",type="l",lwd=3)
points(ma(newdat,10),col="red",type="l",lwd=3)

points(ma(newdat,20),col="red",type="l",lwd=3)
##moving average with triangular kernel
ma2 <- function(x,n=5){
  tmp <- c(1:n, (n-1):1)
  kernel <- tmp/sum(tmp)
  filter(x,kernel, sides=2)}
plot(WWWusage)
points(newdat)
lines(ma(newdat,5),col="red")
lines(ma2(newdat,3),col="red")
lines(ma2(newdat,5),col="red")
dev.off()

#################
## Loess regression

set.seed(100)
datx <- runif(50)*5
daty <- sin(datx*5) + ((datx-2)/3)^2 + rnorm(50)*.25
plot(datx,daty)

lmodel <- loess(daty~datx)
xs <- 0:500/100
points(xs,predict(lmodel,xs),type="l",col="red",lwd=3)

lmodel2 <- loess(daty~datx,span=.3)
points(xs,predict(lmodel2,xs),type="l",col="blue",lwd=3)


predict(lmodel2,10.3322)

predict(lmodel2,3.3322)

#################################################3
###Ptracker example
dat <- read.csv("ptracker.csv")[200:500,]

time <- 117:168*100  ##Regularize in tenths of seconds

mx <- loess(posx~time,span=.1,data=dat)
my <- loess(posy~time,span=.1,data=dat)
#pdf("c13ptracker.pdf")
par(mfrow=c(2,2))
plot(dat$time,dat$posx,main="X position")
points(time,predict(mx,data.frame(time=time)),type="l")

plot(dat$time,dat$posy,main="Y position")
points(time,predict(my,data.frame(time=time)),type="l")


plot(dat$posx,dat$posy,type="b",main="X and Y")

points(predict(mx,data.frame(time=time)),
       predict(my,data.frame(time=time) ),
       col="red",type="o",pch=1,cex=.3)

time <- 11700:168000  ##Regularize in 100s of seconds
points(predict(mx,data.frame(time=time)),
       predict(my,data.frame(time=time) ),
       col="red",type="o",pch=1,cex=.3)

mx <- loess(posx~time,span=.02,data=dat)
my <- loess(posy~time,span=.02,data=dat)

plot(dat$posx,dat$posy,type="p")

points(predict(mx,data.frame(time=time)),
       predict(my,data.frame(time=time) ),
       col="red",type="o",pch=1,cex=.3)

dev.off()

###############################
## GAM (General Additive Models) and spline regression
##The gam library and some related functions let you essentially mix loess predictors into a linear model.
## We'll start with the gam function, which works like lm, but lets you more easily specify transformed
## predictor variables. It can use poly(), but also lo() around variables for loess fit, or ns() bs() for
## natural and b-splines.  Unlike loess, it will permit inferential statistical tests.


library(gam)
cw <- ChickWeight
g <-  gam(weight~s(Time)+ Diet,data=cw)
cw$fit <- g$fitted.values
library(ggplot2)
summary(g)
plot(g)
#pdf("c13-gam1.pdf")
ggplot(cw,aes(x=Time,y=fit,group=Diet,color=Diet,group=Chick)) +geom_point() +geom_line() +theme_bw()
#dev.off()

##All chicks have the same growth pattern at a different scale; diet is additive
##
g <-  gam(weight~lo(Time)*Chick+ Diet,data=cw)
cw$fit2 <- g$fitted.values
#pdf("c13-gam2.pdf")
ggplot(cw,aes(x=Time,y=fit2,group=Chick,color=Diet)) +geom_point() + geom_line()+ theme_bw()
#dev.off()

summary(g)
#plot(g)

g <-  lm(weight~ns(Time,5)+ Diet,data=cw)
summary(g)
anova(g)


##we can also mix in the splines directly into an lm using ns() or bs() in splines
mx5 <- lm(posx~ns(time,df=5),data=dat)
mx10 <- lm(posx~ns(time,df=10),data=dat)
mx15 <- lm(posx~ns(time,df=15),data=dat)
mx30<- lm(posx~ns(time,df=30),data=dat)

par(mfrow=c(1,1))
plot(dat$time,dat$posx,type="l")
points(dat$time,mx5$fit,col="red")
points(dat$time,mx10$fit,col="gold")
points(dat$time,mx15$fit,col="navy")
points(dat$time,mx30$fit,col="darkgreen")


####################
###Interactions between continuous predictors

x <- -100:100
y <- runif(201)*201-100
z <-  3*x -5*y + x*y - 50
library(rgl)
plot3d(x,y,z)


z <-  3*x -5*y + x*y - 50 + rnorm(201)*2000
plot3d(x,y,z)


xy <- x*y

interaction <- lm (z~x+y + x:y)
interaction2 <- lm(z~ x*y)
interaction3 <- lm(z~x + y + xy)


##Each of these are essentially the same:
summary(interaction)
summary(interaction2)
summary(interaction3)


##########################
## Transformations of the outcome:


data(ChickWeight)  
lm1 <- lm(weight~Time:Diet,data=ChickWeight)
lm2 <- lm(log(weight)~Time:Diet,data=ChickWeight)
lm3 <- lm(log(weight)~log(Time+1):Diet,data=ChickWeight)
lm22 <- lm(log(weight)~Time:Chick,data=ChickWeight)

##exp(coef) gives the growth rate.

diet <- t(table(ChickWeight$Diet,ChickWeight$Chick)>0) %*% c(1:4)
#pdf("c13chick.pdf",width=8,height=5)
plot(ChickWeight$Time,(ChickWeight$weight),col="grey",xlab="Days",ylab="Weight (g)")
for(i in 2:length(lm22$coef))
{
  x <- 0:25
  fit <- exp(lm22$coef[1] + lm22$coef[i]*x)
  points(x,fit,type='l',col=c("red","blue","darkgreen","orange")[diet[i-1]]) 
}

points(x, exp(lm2$coef[1] + lm2$coef[2]*x),type="l",col="red",lwd=8,lty=4)
points(x, exp(lm2$coef[1] + lm2$coef[3]*x),type="l",col="blue",lwd=8,lty=4)
points(x, exp(lm2$coef[1] + lm2$coef[4]*x),type="l",col="darkgreen",lwd=8,lty=4)
points(x, exp(lm2$coef[1] + lm2$coef[5]*x),type="l",col="orange",lwd=8,lty=4)
#dev.off()
##Find the mean growth rate.
tapply(exp(lm22$coef[2:length(lm22$coef)]),list(diet),mean)


###################################
## Transformations:

census <- read.csv("census2013.csv")
labels <- census$GEO.display.label
states <-sapply(labels, function(x){strsplit(as.character(x),", ")[[1]][2]})

nrow(census)
sum(census$respop72011<5000)
sum(census$respop72011>50000)
sum(census$respop72011>500000)

hist(census$respop72011,breaks=100)
hist(log(census$respop72011),breaks=100)

##aggergate states by mean or exp-log-mean:
untrans <- aggregate(census$respop72011,list(state=states),mean)
logged <- aggregate(census$respop72011,list(state=states),function(x){exp(mean(log(x)))})
untrans$logged <- logged$x

barplot(t(as.matrix(untrans[,-1])),names=untrans$state,beside=T,horiz=T,las=1)


x <-runif(100) * 20
z <- .1/x +runif(100)*.1
zinv <- (x+runif(100)/.2)

hist(z)
qqnorm(z)
hist(zinv)
qqnorm(zinv)


plot(x,z)
plot(x,1/z)

model1 <- lm(zinv~x)
model2 <- lm(1/z~x)
summary(model1)
summary(model2)

##Log-odds transform and arcsin transform:
logodds <- function(p){log(p/(1-p))}
logistic <- function(x){1/(1+exp(-x))}
cond <- sample(1:15,1000,replace=T)
success <- runif(1000)< 1-.4^(cond/3)
cond[1:15] <- 1:15  ##make sure we have no perfect accuracies
success[1:15] <- 0

pc <- aggregate(success,list(cond=cond),mean)
plot(pc)

lm1 <- lm(pc$x~pc$cond)
lm2 <- lm(logodds(pc$x)~pc$cond)
lm3 <- lm(asin(pc$x)~pc$cond)
par(mfrow=c(1,1))
#pdf("c13-transforms.pdf",fig.width=5,fig.height=6)
plot(pc$cond,pc$x,xlab="Condition",ylab="Observed value",ylim=c(0,1.1)) ##pretty bad
abline(lm1$coef)

##prediction of log-odds model
points(pc$cond, logistic(predict(lm2)),type="l",lwd=2,lty=3,col="cadetblue3") ##pretty good
points(pc$cond, sin(predict(lm3)),type="l",lwd=2,lty=4,col="orange")
legend(8,.4,c("Linear","Log-odds","Arcsine"),lty=c(1,3,4),col=c("black","cadetblue3","orange"),
       lwd=3)

#dev.off()


##
##Common transforms: Tukey' spower transform and the Box-Cox 
library(rcompanion)
z2 <- transformTukey(z,plotit=T)
z2
plot(-z^(-.15),z2)
model3 <- lm(z2~x)
summary(model3)

#if we look at the census data, a transform close to 0 (log) is ideal.
census2 <- transformTukey(census$respop72011)
hist(census2)

##Alternate: Box-Cox transform
library(MASS)
#pdf("c13-boxcox.pdf",width=8,height=5)
par(mfrow=c(1,2))
bc<-boxcox(z~x,plotit=T)
bc2 <- boxcox(census$respop72011~census$GEO.id2,plotit=T)
#dev.off()


################################
## Example polynomial regression:

x<- 1:100
g1 <- lm(WWWusage~poly(x,1))
g2 <- lm(WWWusage~poly(x,2))
g3 <- lm(WWWusage~poly(x,3))
g4 <- lm(WWWusage~poly(x,4))
g5 <- lm(WWWusage~poly(x,5))
g6 <- lm(WWWusage~poly(x,6))
g7 <- lm(WWWusage~poly(x,7))

pdf("c13ex1.pdf",width=8,height=5)
plot(x,WWWusage)
points(g1$fit,type="l",col="red")
points(g2$fit,type="l",col="red")
points(g3$fit,type="l",col="red")
points(g4$fit,type="l",col="red")
points(g5$fit,type="l",col="red")
points(g6$fit,type="l",col="red")
points(g7$fit,type="l",col="red")
dev.off()