#################################################################
##  Code for Book: Applied Statistical Analysis in R
##   A Graduate course for Psychology, Human factors, and Data Science
##  (c) 2018-2020 Shane T. Mueller, Ph.D.
##  Michigan Technological University
##  shanem@mtu.edu
##
##  Textbook and videos available at http://pages.mtu.edu/~shanem/psy5210
##
##
## This file contains example software code relevant for Chapter 11
##  Comparing nested models.
##
##
##   Libraries used faraway (for some data example)
##
##


## A data set with correlated respo nses:
dat <- read.csv("survey.csv")

summary(dat)
boxplot(dat[,2:9],col="gold",las=3)
pairs(dat[,2:9])
round(cor(dat[,2:9]),3)

##Fit a model predicting academic by everything else:
lm1 <- lm(academic~communication+leadership+handson+teaming+priorwork+multicultural+creative+group,data=dat)
summary(lm1)
drop1(lm1,test="F")

##It looks like dropping leadership and handson have the least impact
lm2 <- lm(academic~communication+teaming+priorwork+multicultural+creative+group,data=dat)
summary(lm2)
drop1(lm2,test="F")

anova(lm1,lm2)


##Multicultural is not adding much either
lm3 <- lm(academic~communication+teaming+priorwork+creative+group,data=dat)
summary(lm3)
drop1(lm3,test="F")
anova(lm1,lm3)

lm4 <- lm(academic~communication+priorwork+creative+group,data=dat)
lm5 <- lm(academic~communication+priorwork+group,data=dat)
lm6 <- lm(academic~priorwork+group,data=dat)
lm7 <- lm(academic~group,data=dat)
lm8 <- lm(academic~1,data=dat)

lm7g <- glm(academic~group,data=dat)

anova(lm1,lm2,lm3,lm4,lm5,lm6,lm7,lm8)

anova(lm1,lm4,lm8)
anova(lm1,lm5)

##These two models are not nested:
lm9 <- lm(academic~communication,data=dat)
lm10 <- lm(academic~creative,data=dat)
anova(lm9,lm10)
anova(lm5,lm10)

##Score AIC for each model:
data.frame(model=paste("lm",1:10,sep=""),
           rbind(extractAIC(lm1),
extractAIC(lm2),
extractAIC(lm3),
extractAIC(lm4),
extractAIC(lm5),
extractAIC(lm6),
extractAIC(lm7),
extractAIC(lm8),
extractAIC(lm9),
extractAIC(lm10)))


##BIC is a small modification of AIC:

extractBIC <- function(model)
{
  extractAIC(model,k=log(length(model$residuals)))
}


data.frame(model=paste("lm",1:10,sep=""),
           rbind(extractBIC(lm1),
                 extractBIC(lm2),
                 extractBIC(lm3),
                 extractBIC(lm4),
                 extractBIC(lm5),
                 extractBIC(lm6),
                 extractBIC(lm7),
                 extractBIC(lm8),
                 extractBIC(lm9),
                 extractBIC(lm10)))

##Use stepwise model search.  Here, we use both directions every time.

gsmall <- step(lm1,direction="both", k=log(nrow(dat)))
summary(gsmall)


##Compare to bayes factor:
library(BayesFactor)
b1 <- lmBF(academic~communication+leadership+handson+teaming+priorwork+multicultural+creative+group,data=dat)
plot(b1)
b3 <- lmBF(academic~communication+teaming+priorwork+creative+group,data=dat)
b3/b1

bmodel <- regressionBF(academic~communication+leadership+handson+teaming+priorwork+multicultural+creative,data=dat)
                  
plot(head(bmodel))   
head(bmodel)


##########################################################
#######The effect of parameter selection on prediction

##normal prediction
set.seed(100)
age <- 20+runif(50) * 80
income <- 20000+ runif(50)*50000
digitspan <-  ( age * .5 + income * .003)/20 + rnorm(50)
data=data.frame(age,income,digitspan)

model <- lm(digitspan~age+income,data=data)

par(mfrow=c(1,2))
plot(age,digitspan,col="gold",pch=16)
points(age,digitspan)
points(age,model$fit,cex=.5)
segments(age,digitspan,age,model$fit,lty=3,lwd=.5)

plot(income,digitspan,col="gold",pch=16)
points(income,digitspan)
points(income,model$fit,cex=.5)
segments(income,digitspan,income,model$fit,lty=3,lwd=.5)

plot(digitspan,model$fit,pch=16,col="gold",main="Observed vs. Predicted digit span",ylab="Predicted digit span")
points(digitspan,model$fit)
##Age doesn't look very predictive, because income is more predictive here. But summary tells it it still matters.
summary(model)


##Fit vs. predicted values--they are the same:
model$fit
predict(model,data)

cor(model$fit,predict(model,data))
predict(model,list(age=30,income=10000))
predict(model,list(age=30,income=40000))


#WHAT IFWE HAD OTHER VARIABLES TOO?
set.seed(1001)
n <- 50
xcat <- as.factor(sample(c("A","B","C"), n,replace=T))
x2 <- runif(n)
x3 <- runif(n)
x4 <- 1/runif(n)
x5 <- runif(n)
x6 <- runif(n)
x7 <- runif(n)

y <- as.numeric(xcat)*3 + 5*x2 - 10*x3 + rnorm(n)*2.5

#y[sample(1:n,5)] <- rnorm(5,mean(y),sd(y))

dat <- data.frame(xcat,x2,x3,x4,x5,x6,x7,y)
model <- lm(y~0+xcat+x2+x3+x4+x5+x6+x7,data=dat)
summary(model)

##predict the original data:
predict(model,dat)

##The fit is pretty good here:
cor(predict(model,dat),y)^2

## or A new data:
predict(model,dat[1,])
dat[1,]


tmp <- dat[1,]
tmp$xcat="B"

predict(model,tmp)
predict(model,list(xcat="A",x2=.8,x3=.01,x4=1.5,x5=3,x6=1.5,x7=1.3))


set.seed(8)
reorder <- (1:n)[order(runif(n))]
half<- floor(n/2)
fitset <- reorder[1:half]
extra <- reorder[(half+1):n]

##fit the model on a random half of the data:
modelb <- lm(y~0+xcat+x2+x3+x4+x5+x6+x7,data=dat[fitset,])
modelc <- lm(y~0+xcat+x2+x3,data=dat[fitset,])

summary(model)
summary(modelb)
summary(modelc)

summary(model)$r.squared
summary(modelb)$r.squared
summary(modelc)$r.squared


params <- rbind(model$coef,
                modelb$coef,
                c(modelc$coef,0,0,0,0),
                c(3,6,9,5,-10,0,0,0,0))

round(params,3)


 ##fit original model its own left-out data:
 cor(dat$y[fitset],predict(model,dat[fitset,]))^2
 cor(dat$y[extra],predict(model,dat[extra,]))^2

##overfit model:
cor(dat$y[fitset],predict(modelb,dat[fitset,]))^2
cor(dat$y[extra],predict(modelb,dat[extra,]))^2

##smaller model:
cor(dat$y[fitset],predict(modelc,dat[fitset,]))^2
cor(dat$y[extra],predict(modelc,dat[extra,]))^2


#########################

## Predicting categorical relationships:

gender <- sample(as.factor(c("F","M")),50,replace=T)
height <- 30 + as.numeric(gender) * 10 + runif(50)*30  ##height depends on gender
weight <- 100 + as.numeric(gender) * 20 + runif(50)*50  ##weight depends on gender too

plot(height,weight,col=gender,pch=16)


model <- lm(as.numeric(gender)~height+weight)
model
plot(model$fit~gender,ylab="Gender coefficient")
points(as.numeric(gender),model$fit)
abline(1.5,0,lwd=3)


predictedgender <- model$fit > 1.5
table(gender,c("F","M")[(predictedgender+1)])


############Example: Modeling Chick Weight 

summary(ChickWeight)
plotchicks <- function()
{
  cscheme <- c("red","darkgreen","black","orange")
  ##write down the times of measurement--they are not regular
  times <- (c(0,2,4,6,8,10,12,14,16,18,20,21))
  
  ##Aggregate growth over the diets and times
  
  cw.agg <- tapply((ChickWeight$weight), list(time=ChickWeight$Time,
                                              diet=ChickWeight$Diet),mean)
  cw.bysub<- tapply((ChickWeight$weight),    
                    list(time=ChickWeight$Time,
                         chick=ChickWeight$Chick),mean)
  
  diets <- aggregate(as.numeric(as.character(ChickWeight$Diet)),
                     list(chick=ChickWeight$Chick),median)
  
  #Replot the growth curves, but and make reasonable headers
  matplot(times,cw.bysub,col=cscheme[diets$x],type="l",lty=3,
          ylab="Weight (mg)",xlab="Age (days)",ylim=c(0,400),las=1)
  points(ChickWeight$Time,ChickWeight$weight,cex=.5,
         pch=1, col=cscheme[ChickWeight$Diet])
  
  #Underlay a white line
  matplot(times,cw.agg,add=T,type="l",lwd=15, col="white",lty=1)
  matplot(times,cw.agg,add=T,type="l",lwd=5, col=cscheme,lty=1)
  ##Whoops, no title:
  title("Chick weights over time by diet")
  ##Lets add some gridpoints
  points(rep(-10:25,each=17), rep(0:16*25,36),pch=".",cex=2)
  
  ##Make a legend here.
  legend(2,350,paste("Diet",c(1:4)),col=cscheme,lty=1,lwd=5)
}

plotchicks()
summary(ChickWeight)

##compare each diet with a t-test;
cwd <- ChickWeight$Diet
t.test(ChickWeight$weight[cwd==1],ChickWeight$weight[cwd==2])
t.test(ChickWeight$weight[cwd==1],ChickWeight$weight[cwd==3])
t.test(ChickWeight$weight[cwd==1],ChickWeight$weight[cwd==4])
t.test(ChickWeight$weight[cwd==2],ChickWeight$weight[cwd==3])
t.test(ChickWeight$weight[cwd==2],ChickWeight$weight[cwd==4])
t.test(ChickWeight$weight[cwd==3],ChickWeight$weight[cwd==4])

##What assumptions does this violate?
##What is wrong here?


lm1 <- lm(weight~Diet,data=ChickWeight)
summary(lm1)

plotchicks()
abline(102,0,col="red",lwd=3)
abline(102+19.97,0,col="darkgreen",lwd=3)
abline(102+40.3,0,col="black",lwd=3)
abline(102+32.62,0,col="orange",lwd=3)


##Pick out the diet of each chick:
diets <- tapply(ChickWeight$Diet,list(ChickWeight$Chick),function(x)x[[1]])

minweights <- tapply(ChickWeight$weight,list(ChickWeight$Chick),min)
maxweights <- tapply(ChickWeight$weight,list(ChickWeight$Chick),max)

##Let's recategorize by weight gains
ratio <- maxweights/minweights
gain <- maxweights-minweights

hist(ratio,breaks=20)
hist(gain,breaks=20)
plot(ratio,gain)
##It hardly matters

##How about we do a t-test on gains, rather than all weights.
t.test(gain[diets==1],gain[diets==2])
t.test(gain[diets==1],gain[diets==3])
t.test(gain[diets==1],gain[diets==4])
t.test(gain[diets==2],gain[diets==3])
t.test(gain[diets==2],gain[diets==4])
t.test(gain[diets==3],gain[diets==4])

##How do these tests differ?  Look at the degrees of freedom.


plotchicks()

##what if we look at just the end-weights
end <- tapply(gain,list(diets),mean)

abline(end[1],0,col="red",lwd=3 )
abline(end[2],0,col="darkgreen",lwd=3 )
abline(end[3],0,col="black",lwd=3 )
abline(end[4],0,col="orange",lwd=3)

##that's not right--that is just the gain;
##it assumes chickens weight 0 on day 0 (but the intercepts were not 0)

plotchicks()
start <- tapply(minweights,list(diets),mean)
end <- tapply(gain,list(diets),mean)

abline(end[1]+start[1],0,col="red",lwd=3 )
abline(end[2]+start[2],0,col="darkgreen",lwd=3 )
abline(end[3]+start[3],0,col="black",lwd=3 )
abline(end[4]+start[4],0,col="orange",lwd=3 )


##How about this--what slope and intercept are we modeling here?

lm2 <- lm(weight~ Time+Diet,data=ChickWeight)
summary(lm2)

plotchicks()
abline(lm2$coef[1],lm2$coef[2],col="red",lwd=3)
abline(lm2$coef[1]+lm2$coef[3],lm2$coef[2],col="darkgreen",lwd=3)
abline(lm2$coef[1]+lm2$coef[4],lm2$coef[2],col="black",lwd=3)
abline(lm2$coef[1]+lm2$coef[5],lm2$coef[2],col="orange",lwd=3)


##How about this slope and intercept. : is interaction.
lm3 <- lm(weight~Time:Diet,data=ChickWeight)
summary(lm3)

plotchicks()
abline(lm3$coef[1],lm3$coef[2],col="red",lwd=5,lty=4)
abline(lm3$coef[1],lm3$coef[3],col="darkgreen",lwd=5,lty=4)
abline(lm3$coef[1],lm3$coef[4],col="black",lwd=5,lty=4)
abline(lm3$coef[1],lm3$coef[5],col="orange",lwd=5,lty=4)

##How about here?

lm4 <- lm(weight~Time*Diet,data=ChickWeight)
summary(lm4)

plotchicks()
abline(lm4$coef[1],lm4$coef[2],col="red",lwd=5,lty=4)
abline(lm4$coef[1]+lm4$coef[3],lm4$coef[2]+lm4$coef[6],col="darkgreen",lwd=5,lty=4)
abline(lm4$coef[1]+lm4$coef[4],lm4$coef[2]+lm4$coef[7],col="black",lwd=5,lty=4)
abline(lm4$coef[1]+lm4$coef[5],lm4$coef[2]+lm4$coef[8],col="orange",lwd=5,lty=4)

##One last look--categorical examination
weightgain <- ifelse(maxweights>=300,"jumbo",
                     ifelse(maxweights<=150,"runt","normal"))                     
chickTable <- table(diets,weightgain)
## Does the type of chick depend of feed?
chisq.test(chickTable)