#################################################################
##  Code for Book: Applied Statistical Analysis in R
##   A Graduate course for Psychology, Human factors, and Data Science
##  (c) 2018 Shane T. Mueller, Ph.D.
##  Michigan Technological University
##  shanem@mtu.edu
##
##  Textbook and videos available at http://pages.mtu.edu/~shanem/psy5210
##
##
## This file contains example software code relevant for Chapter 16 
##  Materials on categorical predictors in linear models, contrasts, and the one-way ANOVA.
##

## Libraries: car, MASS, agricolae, vioplot, BayesFactor
##
specialty.0 <- c("Advertising","Advertising","Advertising","Advertising","Advertising",
                         "Business","Business","Business","Business","Business",
                         "Certification","Certification","Certification","Certification","Certification")

specialty <- as.factor(c("Advertising","Advertising","Advertising","Advertising","Advertising",
                      "Business","Business","Business","Business","Business",
                      "Certification","Certification","Certification","Certification","Certification"))

salary <- c(30,35,32,40,34,70,56,45,65,28,19,23,28,18,32)


mean(salary)
aggregate(salary,list(specialty),mean)
lm(salary~specialty)
lm(salary~specialty+0)
lm(salary~specialty-1)
lm(salary~specialty)

lm(salary~specialty.0)

##different ways to examine how the contrasts are represented:
contrasts(specialty)
contr.treatment(3)
contr.treatment(levels(specialty))
contrasts(specialty) <-contr.treatment(levels(specialty))
contrasts(specialty)


#######SAS treatment coding:
contr.SAS(3)
contrasts(specialty)<- contr.SAS(levels(specialty))
contrasts(specialty)
lm(salary~specialty)
lm(salary~specialty+0)

##Helmert: compare each level to the mean of previous (or subsequent) levels:
contr.helmert(levels(specialty))
contrasts(specialty)<- contr.helmert(levels(specialty))

summary(lm(salary~specialty))

contr.helmert(5)

#notice they are all uncorrelated
cor(contr.helmert(5))

##using helmert coding:

values <- c(1,1.1,1,1.3,.9,1.2, 4,5,6,8,10,9)
months <- as.factor(c("Jan","Feb","Mar","Apr","May","Jun","July","Aug","Sep","Oct","Nov","Dec"))
months
contrasts(months) <- contr.helmert(levels(months))
barplot(values,names=months)
lm(values~months)

##but uh-oh:
contrasts(months)


months.0 <- c("Jan","Feb","Mar","Apr","May","Jun","July","Aug","Sep","Oct","Nov","Dec")
months<- factor(months.0,levels=months.0)

contrasts(months) <- contr.helmert(levels(months))
lm(values~months)

plot(lm(values~months)$coef[-1])

#######################################################
##Successive difference coding:
library(MASS)
contr.sdif(3)
contr.sdif(4)


contrasts(months) <- contr.sdif(months)
lm(values~months)


contrasts(specialty)<-contr.sdif(levels(specialty))
lm(salary~specialty)

summary(lm(salary~specialty))


contr.sum(3)

##look at the specialty model again:
contrasts(specialty)<-contr.sum(levels(specialty))

lm(salary~specialty)
mean(salary)
tapply(salary,specialty,mean)
##calculate the mean plus estimates:
37 +2.8-15.8  ##==24
37 +15.8  ## == 52.8
37-2.8    ## == 34.2


##Others
contr.poly(5)
###
###
###Using the entire sets of predictors in a model--does it depend on anything?

contrasts(specialty)<-contr.sum(levels(specialty))
specialty2 <- specialty
contrasts(specialty2) <- contr.helmert(levels(specialty2))
lm(salary~specialty)
lm(salary~specialty2)
anova(lm(salary~specialty))
anova(lm(salary~specialty2))

aov(salary~specialty)
summary(aov(salary~specialty))


#######################################################################3
# Tooth growth Example using contrasts

lm.tooth1 <- lm(len~dose,data=ToothGrowth)
summary(lm.tooth1)

pdf("c16-toothmodel1.pdf",width=6,height=4)
plot(aggregate(ToothGrowth$len,list(ToothGrowth$dose),mean),ylim=c(0,30),
     xlab="Dose",ylab="Growth",col="gold",pch=16,cex=2)
points(aggregate(ToothGrowth$len,list(ToothGrowth$dose),mean),cex=2)

abline(lm(len~dose,data=ToothGrowth))
dev.off()

means <- aggregate(ToothGrowth$len,list(ToothGrowth$dose),mean)
means
mean(ToothGrowth$len)


#This doesn't seem completely linear. We might try to fit a polynomial regression,
#but what if we instead treated these as categories.
dosage <- as.factor(ToothGrowth$dose)
ToothGrowth$dosage <- dosage
lm.tooth2 <- lm(ToothGrowth$len~dosage)
summary(lm.tooth2)


##this says that dose 1 and dose 2 each differ from dose 0.  
contrasts(dosage)

#what if we wanted to know whether each additional dosage makes a difference?

contrasts(dosage) <- contr.sdif(levels(dosage))
contrasts(dosage)

summary(lm(ToothGrowth$len~dosage))


##intercept is grand mean
#Maybe we wanted to do (reverse) helmert coding?

contrasts(dosage) <- contr.helmert(levels(dosage))
contrasts(dosage)
summary(lm(ToothGrowth$len~dosage))

means <- aggregate(ToothGrowth$len,list(ToothGrowth$dose),mean)

 b0 = mean(means$x)  #grand mean
 b1 =   (means$x[2] - mean(means$x[1]))/2   # (2 - 1)/2
 b2= (means$x[3]  - mean(means$x[1:2]) )/3  # (3-(1:2))/3

  c(b0,b1,b2)


contrasts(dosage) <- contr.helmert(levels(dosage))[3:1,]
contrasts(dosage)
summary(lm(ToothGrowth$len~dosage))


############################################
#############One Way ANOVA##################

## Ensure we have standard treatment coding:
contrasts(specialty) <- contr.treatment(levels(specialty))
##look at the F-value here:
summary(lm(salary~specialty))

anova(lm(salary~specialty),
      lm(salary~1))

anova(lm(salary~specialty))


model <- aov(salary~specialty)
summary(model)
summary(aov(salary~specialty))


##Does the ANOVA depend on the contrasts?

contrasts(specialty) <- contr.treatment( levels(specialty))
summary(lm(salary~specialty))$fstatistic
contrasts(specialty) <- contr.SAS( levels(specialty))
summary(lm(salary~specialty))$fstatistic
contrasts(specialty) <- contr.sdif( levels(specialty))
summary(lm(salary~specialty))$fstatistic
contrasts(specialty) <- contr.helmert( levels(specialty))
summary(lm(salary~specialty))$fstatistic
contrasts(specialty) <- contr.sum( levels(specialty))
summary(lm(salary~specialty))$fstatistic

contrasts(specialty ) <- cbind(c(1,1,2),c(3,-1,4))
summary(lm(salary~specialty))$fstatistic

contrasts(specialty ) <- cbind(c(1,1,2),c(0,0,0))
summary(lm(salary~specialty))$fstatistic

##be sure to reset these:
contrasts(specialty) <- contr.treatment( levels(specialty))


boxplot(salary~specialty)

##################
##Testing ANOVA Assumptions
##here, salary by specialty fails;
bartlett.test(salary~specialty)  
bartlett.test(salary,specialty)


##more robust to non-normality
library(car)
leveneTest(salary,specialty)
##This is significant; unlikely to have occurred if the variances were the same.
##Non-parametric
fligner.test(salary~specialty)

ll <-lm(salary~specialty)
library(vioplot)
#pdf("c16-boxplots.pdf",width=9,height=5)
par(mfrow=c(1,2))
boxplot(salary~specialty,col="gold")
vioplot(salary[specialty=="Advertising"],
        salary[specialty=="Business"],
        salary[specialty=="Certification"],
        names=c("Advertising","Business","Certification"),
        h=5,col="gold")
#dev.off()


##not assuming equal variances:
oneway.test(salary~specialty)

##These two are the same:
##version of welch's t-test
oneway.test(salary~specialty,var.equal=T)
summary(aov(salary~specialty))

#version of wilcox/mann-whitney-U
kruskal.test(salary~specialty)
kruskal.test(len~dose,data=ToothGrowth)

kruskal.test(len~dose,data=ToothGrowth)
#kruskal.test(len~dose+supp,data=ToothGrowth)
ToothGrowth$ds <- paste(ToothGrowth$dose,ToothGrowth$supp,sep="-")
kruskal.test(len~ds,data=ToothGrowth)


library(BayesFactor)

dat <- data.frame(salary,specialty)

dat <- data.frame(salary,specialty)
abf <- anovaBF(salary~specialty,data=dat)
abf


##similar tests:
#lbf <- lmBF(salary~specialty,data=dat)
#generalTestBF(salary~specialty,data=dat)
#abf <- anovaBF(salary~specialty,data=dat)
abf


ToothGrowth$dosage <- as.factor(ToothGrowth$dose)
tg <- anovaBF(len~dosage+supp,data=ToothGrowth)
tg
tg[1]/tg[3]
tgchains.dosage <- posterior(tg,index=2,iterations=10000)
tgchains.sup <- posterior(tg,index=3,iterations=10000)

summary(tgchains.sup)
####################3
## comparing group means
contrasts(ToothGrowth$dosage) <- contr.treatment(levels(ToothGrowth$dosage))
summary(lm(len ~ dosage,data=ToothGrowth))

(15.495-9.13)/ 1.34 
4.75
1-pt(4.75,57)
# 7.079305e-06
   
pairwise.t.test(ToothGrowth$len,ToothGrowth$dose)
hh<-pairwise.t.test(ToothGrowth$len,ToothGrowth$dose,p.adj="bonf")

contrasts(specialty) <- contr.treatment(levels(specialty))

TukeyHSD(aov(salary~specialty))
TukeyHSD(aov(len~dosage,data=ToothGrowth))

library(agricolae)
##from agricolae package, there is a nice implementation
HSD.test(aov(len~dosage,data=ToothGrowth),"dosage", group=TRUE,
         main="Effect of dosage",console=T)
HSD.test(aov(salary~specialty),"specialty",group=T,console=T)

######Bayesian post-hoc


chains <- posterior(abf,iterations=10000)
chains[1:5,]
summary(chains)
hist(chains[,1])
plot(chains[,2:4])
hist(chains[,2],xlim=c(-40,40),breaks=100)  ##histogram of advertising
hist(chains[,3],add=T,col="blue",breaks=100) ##histogram of business
hist(chains[,4],add=T,col="red",breaks=100)  ##histogram of certification

hist(chains[,4]-chains[,3],breaks=100)
abline(v=0,lwd=3)
mean(chains[,4]>chains[,3])  ##probability that 4 > 5==1%
mean(chains[,4]>chains[,2])  ##probability that 4 > 2==11%
mean(chains[,3]>chains[,2])  ##probability that 3 > 2==97%