#################################################################
## Code for Book: Applied Statistical Analysis in R
## A Graduate course for Psychology, Human Factors, and Data Science
## (c) 2018-2020 Shane T. Mueller, Ph.D.
## Michigan Technological University
## shanem@mtu.edu
##
## Textbook and videos available at http://pages.mtu.edu/~shanem/psy5210
##
##
## This file contains example software code relevant for Chapter 14
## Materials on testing assumptions of the linear model and detecting outliers
##

## Consider the following data sets. We can assess how good each model is by looking at R^2:

set.seed(302)
x <- rnorm(1000)
y <- 33 + x + rnorm(1000)

modela <- lm(y~x)
summary(modela)
anova(modela)

#pdf("c14-scatter1.pdf",width=9,height=5)
par(mfrow=c(1,3))
plot(x,y,col="gold",pch=16,ylim=c(10,50),
     main=paste("R2=",round(summary(modela)$r.squared,3)))
points(x,y)
abline(modela$coef)

## The same relationship with more residual noise:
y2 <- 33 + x + rnorm(1000)*5
modelb <- lm(y2~x)
plot(x,y2,col="gold",pch=16,ylim=c(10,50),
     main=paste("R2=",round(summary(modelb)$r.squared,3)))
abline(modelb$coef)
points(x,y2)
summary(modelb)

## The original data with a handful of extreme outliers added:
set.seed(100)
y3 <- y
noisy <- sample(1000,15)
y3[noisy] <- y3[noisy] + rnorm(15)*35
modelc <- lm(y3~x)
plot(x,y3,col="gold",pch=16,
     main=paste("R2=",round(summary(modelc)$r.squared,3)))
points(x,y3)
abline(modelc$coef)
#dev.off()


#############################
## Testing the assumptions of the model (normality)

set.seed(111)
x <- rnorm(20)
y <- x + rnorm(20)
g1 <- lm(y~x)

hist(g1$resid)
qqnorm(g1$resid)
shapiro.test(g1$resid)


## Detecting and handling influential observations and outliers

#pdf("c14-smallscatter.pdf",width=8,height=6)
plot(x,y,col="gold",pch=16,cex=1.5,
     main=paste("R=",round(cor(x,y),2)))
points(x,y,cex=1.5)
#dev.off()

cor(x,y)
cor.test(x,y)

library(BayesFactor)
correlationBF(x,y)

## What if we remove point 13?
plot(x[-13],y[-13])
cor.test(x[-13],y[-13])
correlationBF(x[-13],y[-13])

## Is point 13 real, or an error? Is it an outlier? Does it matter?


## Examining residuals and standardized residuals
## The goal is to see whether the residuals depend on the fitted value in a systematic way.

#pdf("c14-residuals.pdf",width=9,height=5)
par(mfrow=c(1,2))
plot(g1,which=1)
plot(g1$fit,g1$resid,main="Fit versus residuals of model")
abline(0,0,lty=3)
#dev.off()


## Non-constant variance (and a test for it)

set.seed(5)
x2 <- runif(200)*10
y2 <- 33 + x2 + rnorm(200)*(x2)   ## residual variance grows with x2

#pdf("c14-residuals2.pdf",width=9,height=5)
par(mfrow=c(1,2))
plot(x2,y2,col="gold",pch=16)
points(x2,y2)
g2 <- lm(y2~x2)
plot(g2,which=1)
#dev.off()

## Test for non-constant variance:
## called the Breusch-Pagan test, or the Cook & Weisberg (1983) score test
library(car)
ncvTest(g2)

## Look at the residuals divided by the sd of the residuals
g1 <- lm(y~x)
(g1$resid/sd(g1$resid))

#pdf("c14-standresid.pdf",width=9,height=5)
par(mfrow=c(1,3))
plot(g1$fit,g1$resid/sd(g1$resid))
plot(g1$fit,rstandard(g1))
abline(-2,0)
abline(-1,0)
abline(0,0)
abline(1,0)
abline(2,0)
plot(g1$resid/sd(g1$resid),rstandard(g1))
#dev.off()

cor(g1$resid,rstandard(g1))


## Leverage
g1 <- lm(y~x)
leverage <- hatvalues(g1)

#pdf("c14-leverage.pdf",width=8,height=5)
par(mfrow=c(1,2))
plot(x,y,pch=16,main="Leverage of data")
points(x,y,cex=1.2+leverage*5)
barplot(leverage)
abline(mean(leverage),0,lwd=2)
abline(2*mean(leverage),0)
#dev.off()

## Notice that with a single predictor, leverage is essentially the squared
## distance of each x value from the mean of x:
cor((x-mean(x))^2,leverage)

## Leverage with two predictors:
#pdf("c14-2dleverage.pdf")
x2 <- runif(20)
model2 <- lm(y~x+x2)
lev2 <- hatvalues(model2)   ## leverage depends only on the predictors, not on y!
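
## Side check (a sketch added here, not part of the original script): leverage
## values always sum to the number of estimated coefficients, and with a single
## predictor h_i = 1/n + (x_i - mean(x))^2 / sum((x - mean(x))^2).
sum(hatvalues(g1))   ## 2: intercept + one slope
sum(lev2)            ## 3: intercept + two slopes
h.byhand <- 1/length(x) + (x - mean(x))^2 / sum((x - mean(x))^2)
all.equal(as.numeric(hatvalues(g1)), h.byhand)   ## TRUE: matches the formula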
plot(x,x2,pch=16)
points(x,x2,cex=1.2+lev2*5)
#dev.off()


## Studentized residuals
## i.e., residuals standardized using an error estimate that accounts for each point's leverage
rstudent(g1)

#pdf("c14-studentized.pdf",width=9,height=5)
par(mfrow=c(1,2))
plot(x,y,pch=16)
points(x,y,cex=abs(rstudent(g1))+2)
abline(lm(y~x)$coef)
plot(g1$fit,rstudent(g1))
abline(0,0)
#dev.off()

leverage <- hat(model.matrix(g1))
plot(leverage,rstudent(g1))

?influence.measures

## influence() returns (among other things) the change in each coefficient when
## each case is dropped, so the full-model coefficients minus that change give
## the regression line with that case removed:
ig <- influence(g1)

#pdf("c14-influence.pdf",width=6,height=6)
plot(x,y,pch=16,col="gold")
points(x,y)
for(i in 1:20) abline(g1$coef - ig$coefficients[i,])
#dev.off()


## Jackknife the correlation: remove each point in turn and recompute
cors <- matrix(rep(0,20*3),ncol=3)
for(i in 1:20)
{
  keep <- rep(T,20)
  keep[i] <- F
  cortmp <- cor.test(x[keep],y[keep])
  cors[i,] <- c(cortmp$estimate,cortmp$statistic,cortmp$p.value)
}

#pdf("c14-jackknife.pdf",width=8,height=5)
par(mfrow=c(1,2))
plot(cors[,3],ylab="p-value of correlation when point removed")
abline(.05,0)

plot(x,y)
g1 <- lm(y~x)
g2 <- lm(y[-13]~x[-13])
points(x[13],y[13],pch=16,col="red")
abline(0,1,lty=3,lwd=3)           ## the 'true' line
abline(g1$coef)                   ## the full data set
abline(g2$coef,col="red",lty=2)   ## removing the 'outlier'
#dev.off()


## Jackknife using other outcomes: residuals
set.seed(111)
x <- rnorm(20)
y <- x + rnorm(20)

resid <- rep(0,20)
for(i in 1:20)
{
  keep <- rep(T,20)
  keep[i] <- F
  xtmp <- x[keep]
  model <- lm(y[keep]~xtmp)
  resid[i] <- (y[i]-predict(model,list(xtmp=x[i])))/sd(model$resid)
}
plot(resid/sd(resid),main="Jackknife on standardized residuals")

## Jackknife on the coefficients themselves:
model1 <- lm(y~x)
int <- rep(0,20)
beta <- rep(0,20)
for(i in 1:20)
{
  keep <- rep(T,20)
  keep[i] <- F
  xtmp <- x[keep]
  model <- lm(y[keep]~xtmp)
  int[i]  <- model1$coef[1]-model$coef[1]
  beta[i] <- model1$coef[2]-model$coef[2]
}

#pdf("c14-jackknife2.pdf",width=8,height=5)
par(mfrow=c(2,2),mar=c(2,2,2,2))
plot(int,main="Jackknife on intercept")
plot(beta,main="Jackknife on slope")
plot(x,y,main="Jackknife on linear model")
for(i in 1:20) abline(model1$coef[1]+int[i],beta[i]+model1$coef[2])
plot(x,y,main="Jackknife on beta diagnosticity")
fits <- dfbetas(model1)
for(i in 1:20) abline(fits[i,],col="red")
#dev.off()

## Compare several influence measures for each observation:
#pdf("c14-comparison.pdf",width=8,height=6)
plot(hatvalues(model1),type="o",col="red",ylim=c(-1,2))
points(cooks.distance(model1),type="o",col="darkgreen")
points(dffits(model1),type="o",col="orange")
#dev.off()

influence.measures(model1)


## A real data set: tmt.csv
dat <- read.csv("tmt.csv")
dat$age <- as.factor(dat$age)

#pdf("c14-tmt-model.pdf",width=9,height=5)
par(mfrow=c(1,2))
plot(dat$time~(dat$age)+dat$type)
lm1 <- lm(time~age+type,data=dat)
qqnorm(lm1$resid)
#dev.off()

#pdf("c14-tmt-model-lmplot.pdf")
par(mfrow=c(2,2))
plot(lm1)
#dev.off()

#pdf("c14-tmt-model-transforms.pdf")
par(mfrow=c(2,2))
hist(lm1$resid)
hist(log(lm1$resid-min(lm1$resid)+1),breaks=20)   ## shift by 1 so the smallest residual is not log(0)
plot(log(dat$time)~(dat$age))
points(dat$age,log(dat$time))
plot(log(dat$time)~(dat$type))
points(dat$type,log(dat$time),col="grey")
#dev.off()

lm2 <- lm(log(time)~age+type,data=dat)
qqnorm(lm2$resid)

## This is a bit better; the fastest time was around 50
lm3 <- lm(log(time-50)~age+type,data=dat)
qqnorm(lm3$resid)
hist(lm3$resid)

summary(lm1)
summary(lm2)
summary(lm3)


######### Downsides of transforming:
lm1.int <- lm(time~age*type,data=dat)
lm3.int <- lm(log(time-50)~age*type,data=dat)
anova(lm1.int)
anova(lm3.int)

agg1 <- tapply(dat$time,list(dat$age,dat$type),mean)
agg3 <- tapply(log(dat$time-50),list(dat$age,dat$type),mean)
matplot(t(agg1),type="o",ylab="Response time (s)")
matplot(t(agg3),type="o",ylab="log(Response time)",yaxt="n",ylim=c(2.5,5))
axis(2,log(c(65,75,100,150,200,250)-50),
     c(65,75,100,150,200,250),las=1)

library(car)
ncvTest(lm1.int)
ncvTest(lm3.int)
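
## Side note (a sketch, not part of the original analysis): instead of picking
## the log(time-50) transform by hand, the Box-Cox procedure in MASS can
## suggest a power transformation of the response. This assumes dat (tmt.csv)
## is loaded as above and that time is strictly positive.
library(MASS)
bc <- boxcox(time ~ age * type, data = dat, lambda = seq(-2, 2, 0.1))
bc$x[which.max(bc$y)]   ## best-fitting lambda; a value near 0 supports a log transform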