################################################################# ## Code for Book: Applied Statistical Analysis in R ## A Graduate course for Psychology, Human factors, and Data Science ## (c) 2018 Shane T. Mueller, Ph.D. ## Michigan Technological University ## shanem@mtu.edu ## ## Textbook and videos available at http://pages.mtu.edu/~shanem/psy5210 ## ## ## This file contains example software code relevant for Chapter 16 ## Materials on categorical predictors in linear models, contrasts, and the one-way ANOVA. ## ## Libraries: car, MASS, agricolae, vioplot, BayesFactor ## specialty.0 <- c("Advertising","Advertising","Advertising","Advertising","Advertising", "Business","Business","Business","Business","Business", "Certification","Certification","Certification","Certification","Certification") specialty <- as.factor(c("Advertising","Advertising","Advertising","Advertising","Advertising", "Business","Business","Business","Business","Business", "Certification","Certification","Certification","Certification","Certification")) salary <- c(30,35,32,40,34,70,56,45,65,28,19,23,28,18,32) mean(salary) aggregate(salary,list(specialty),mean) lm(salary~specialty) lm(salary~specialty+0) lm(salary~specialty-1) lm(salary~specialty) lm(salary~specialty.0) ##different ways to examine how the contrasts are represented: contrasts(specialty) contr.treatment(3) contr.treatment(levels(specialty)) contrasts(specialty) <-contr.treatment(levels(specialty)) contrasts(specialty) #######SAS treatment coding: contr.SAS(3) contrasts(specialty)<- contr.SAS(levels(specialty)) contrasts(specialty) lm(salary~specialty) lm(salary~specialty+0) ##Helmert: compare each level to the mean of previous (or subsequent) levels: contr.helmert(levels(specialty)) contrasts(specialty)<- contr.helmert(levels(specialty)) summary(lm(salary~specialty)) contr.helmert(5) #notice they are all uncorrelated cor(contr.helmert(5)) ##using helmert coding: values <- c(1,1.1,1,1.3,.9,1.2, 4,5,6,8,10,9) months <- as.factor(c("Jan","Feb","Mar","Apr","May","Jun","July","Aug","Sep","Oct","Nov","Dec")) months contrasts(months) <- contr.helmert(levels(months)) barplot(values,names=months) lm(values~months) ##but uh-oh: contrasts(months) months.0 <- c("Jan","Feb","Mar","Apr","May","Jun","July","Aug","Sep","Oct","Nov","Dec") months<- factor(months.0,levels=months.0) contrasts(months) <- contr.helmert(levels(months)) lm(values~months) plot(lm(values~months)$coef[-1]) ####################################################### ##Successive difference coding: library(MASS) contr.sdif(3) contr.sdif(4) contrasts(months) <- contr.sdif(months) lm(values~months) contrasts(specialty)<-contr.sdif(levels(specialty)) lm(salary~specialty) summary(lm(salary~specialty)) contr.sum(3) ##look at the specialty model again: contrasts(specialty)<-contr.sum(levels(specialty)) lm(salary~specialty) mean(salary) tapply(salary,specialty,mean) ##calculate the mean plus estimates: 37 +2.8-15.8 ##==24 37 +15.8 ## == 52.8 37-2.8 ## == 34.2 ##Others contr.poly(5) ### ### ###Using the entire sets of predictors in a model--does it depend on anything? contrasts(specialty)<-contr.sum(levels(specialty)) specialty2 <- specialty contrasts(specialty2) <- contr.helmert(levels(specialty2)) lm(salary~specialty) lm(salary~specialty2) anova(lm(salary~specialty)) anova(lm(salary~specialty2)) aov(salary~specialty) summary(aov(salary~specialty)) #######################################################################3 # Tooth growth Example using contrasts lm.tooth1 <- lm(len~dose,data=ToothGrowth) summary(lm.tooth1) pdf("c16-toothmodel1.pdf",width=6,height=4) plot(aggregate(ToothGrowth$len,list(ToothGrowth$dose),mean),ylim=c(0,30), xlab="Dose",ylab="Growth",col="gold",pch=16,cex=2) points(aggregate(ToothGrowth$len,list(ToothGrowth$dose),mean),cex=2) abline(lm(len~dose,data=ToothGrowth)) dev.off() means <- aggregate(ToothGrowth$len,list(ToothGrowth$dose),mean) means mean(ToothGrowth$len) #This doesn't seem completely linear. We might try to fit a polynomial regression, #but what if we instead treated these as categories. dosage <- as.factor(ToothGrowth$dose) ToothGrowth$dosage <- dosage lm.tooth2 <- lm(ToothGrowth$len~dosage) summary(lm.tooth2) ##this says that dose 1 and dose 2 each differ from dose 0. contrasts(dosage) #what if we wanted to know whether each additional dosage makes a difference? contrasts(dosage) <- contr.sdif(levels(dosage)) contrasts(dosage) summary(lm(ToothGrowth$len~dosage)) ##intercept is grand mean #Maybe we wanted to do (reverse) helmert coding? contrasts(dosage) <- contr.helmert(levels(dosage)) contrasts(dosage) summary(lm(ToothGrowth$len~dosage)) means <- aggregate(ToothGrowth$len,list(ToothGrowth$dose),mean) b0 = mean(means$x) #grand mean b1 = (means$x[2] - mean(means$x[1]))/2 # (2 - 1)/2 b2= (means$x[3] - mean(means$x[1:2]) )/3 # (3-(1:2))/3 c(b0,b1,b2) contrasts(dosage) <- contr.helmert(levels(dosage))[3:1,] contrasts(dosage) summary(lm(ToothGrowth$len~dosage)) ############################################ #############One Way ANOVA################## ## Ensure we have standard treatment coding: contrasts(specialty) <- contr.treatment(levels(specialty)) ##look at the F-value here: summary(lm(salary~specialty)) anova(lm(salary~specialty), lm(salary~1)) anova(lm(salary~specialty)) model <- aov(salary~specialty) summary(model) summary(aov(salary~specialty)) ##Does the ANOVA depend on the contrasts? contrasts(specialty) <- contr.treatment( levels(specialty)) summary(lm(salary~specialty))$fstatistic contrasts(specialty) <- contr.SAS( levels(specialty)) summary(lm(salary~specialty))$fstatistic contrasts(specialty) <- contr.sdif( levels(specialty)) summary(lm(salary~specialty))$fstatistic contrasts(specialty) <- contr.helmert( levels(specialty)) summary(lm(salary~specialty))$fstatistic contrasts(specialty) <- contr.sum( levels(specialty)) summary(lm(salary~specialty))$fstatistic contrasts(specialty ) <- cbind(c(1,1,2),c(3,-1,4)) summary(lm(salary~specialty))$fstatistic contrasts(specialty ) <- cbind(c(1,1,2),c(0,0,0)) summary(lm(salary~specialty))$fstatistic ##be sure to reset these: contrasts(specialty) <- contr.treatment( levels(specialty)) boxplot(salary~specialty) ################## ##Testing ANOVA Assumptions ##here, salary by specialty fails; bartlett.test(salary~specialty) bartlett.test(salary,specialty) ##more robust to non-normality library(car) leveneTest(salary,specialty) ##This is significant; unlikely to have occurred if the variances were the same. ##Non-parametric fligner.test(salary~specialty) ll <-lm(salary~specialty) library(vioplot) #pdf("c16-boxplots.pdf",width=9,height=5) par(mfrow=c(1,2)) boxplot(salary~specialty,col="gold") vioplot(salary[specialty=="Advertising"], salary[specialty=="Business"], salary[specialty=="Certification"], names=c("Advertising","Business","Certification"), h=5,col="gold") #dev.off() ##not assuming equal variances: oneway.test(salary~specialty) ##These two are the same: ##version of welch's t-test oneway.test(salary~specialty,var.equal=T) summary(aov(salary~specialty)) #version of wilcox/mann-whitney-U kruskal.test(salary~specialty) kruskal.test(len~dose,data=ToothGrowth) kruskal.test(len~dose,data=ToothGrowth) #kruskal.test(len~dose+supp,data=ToothGrowth) ToothGrowth$ds <- paste(ToothGrowth$dose,ToothGrowth$supp,sep="-") kruskal.test(len~ds,data=ToothGrowth) library(BayesFactor) dat <- data.frame(salary,specialty) dat <- data.frame(salary,specialty) abf <- anovaBF(salary~specialty,data=dat) abf ##similar tests: #lbf <- lmBF(salary~specialty,data=dat) #generalTestBF(salary~specialty,data=dat) #abf <- anovaBF(salary~specialty,data=dat) abf ToothGrowth$dosage <- as.factor(ToothGrowth$dose) tg <- anovaBF(len~dosage+supp,data=ToothGrowth) tg tg[1]/tg[3] tgchains.dosage <- posterior(tg,index=2,iterations=10000) tgchains.sup <- posterior(tg,index=3,iterations=10000) summary(tgchains.sup) ####################3 ## comparing group means contrasts(ToothGrowth$dosage) <- contr.treatment(levels(ToothGrowth$dosage)) summary(lm(len ~ dosage,data=ToothGrowth)) (15.495-9.13)/ 1.34 4.75 1-pt(4.75,57) # 7.079305e-06 pairwise.t.test(ToothGrowth$len,ToothGrowth$dose) hh<-pairwise.t.test(ToothGrowth$len,ToothGrowth$dose,p.adj="bonf") contrasts(specialty) <- contr.treatment(levels(specialty)) TukeyHSD(aov(salary~specialty)) TukeyHSD(aov(len~dosage,data=ToothGrowth)) library(agricolae) ##from agricolae package, there is a nice implementation HSD.test(aov(len~dosage,data=ToothGrowth),"dosage", group=TRUE, main="Effect of dosage",console=T) HSD.test(aov(salary~specialty),"specialty",group=T,console=T) ######Bayesian post-hoc chains <- posterior(abf,iterations=10000) chains[1:5,] summary(chains) hist(chains[,1]) plot(chains[,2:4]) hist(chains[,2],xlim=c(-40,40),breaks=100) ##histogram of advertising hist(chains[,3],add=T,col="blue",breaks=100) ##histogram of business hist(chains[,4],add=T,col="red",breaks=100) ##histogram of certification hist(chains[,4]-chains[,3],breaks=100) abline(v=0,lwd=3) mean(chains[,4]>chains[,3]) ##probability that 4 > 5==1% mean(chains[,4]>chains[,2]) ##probability that 4 > 2==11% mean(chains[,3]>chains[,2]) ##probability that 3 > 2==97%