## Code for Book: Applied Statistical Analysis in R
## A Graduate course for Psychology, Human factors, and Data Science
## (c) 2018 Shane T. Mueller, Ph.D.
## Michigan Technological University
## shanem@mtu.edu
##
## Textbook and videos available at http://pages.mtu.edu/~shanem/psy5210
##
##
## This file contains example software code relevant for Chapter 8.

#Some practice questions:
##look at the iris data set.
## see picture of the iris
## Is Sepal Length different from Sepal Width?
## Is Petal Width smaller than Petal Length?
## Is sepal length for versicolor smaller than virginica?

par(mfrow=c(2,1))
hist(rnorm(1000,mean=0,sd=1),col="gold",xlim=c(-3,3), main="Histogram of data")

numexps <- 100000
means <- rep(0,numexps)
sds <- rep(0,numexps)
expsize <- 15
for(i in 1:numexps)
{
  #Generate one experiment:
  data <- rnorm(expsize,mean=0,sd=1)
  #estimate its mean
  means[i] <- mean(data)
  sds[i] <- sd(data)
}

hist(means,xlim=c(-3,3),col="grey20", main="Histogram of means")
mean(means)
sd(means)

#pdf("c8-montecarlo.pdf",width=9,height=5)
par(mfrow=c(1,2))
x <- hist(means,col="gold",breaks=-20:20/10,main="Histogram of obtained means\n under null hypothesis")
abline(v=0.5,lwd=3,col="cadetblue3")
plot(x$mids,cumsum(x$density)/10,type="o",ylab="Cumulative density",xlab="Bin midpoint",pch=16,cex=.5)
abline(v=0.5,lwd=3,col="cadetblue3")
#dev.off()

## this compares the null distribution versus one where the mean is +.5.
par(mfrow=c(1,1))
cadblue <- rgb(122/255,197/255,205/255,.7)
x <- hist(means,col="gold",breaks=-20:20/10,main="Histogram of obtained means\n under null hypothesis")
hist(means+.5,col=cadblue,add=T)
abline(v=0,lwd=3,col="red")
abline(v=.5,lwd=3,col="red")
abline(v=.25,lwd=3,col="red")
abline(v=.7,lwd=3,col="red")

###Example t distribution
#pdf("c8-tdist.pdf",width=9,height=5)
par(mfrow=c(1,1))
vals <- -500:500/100
plot(vals,dnorm(vals),type="l",lty=1,lwd=4,col="gold",
     main="t distribution",ylim=c(0,.5),xlab="Mean")
points(vals,dt(vals,2),type="l")
points(vals,dt(vals,5),type="l")
points(vals,dt(vals,15),type="l")
points(vals,dt(vals,50),type="l")
abline(v=2)
#dev.off()

1- pt(2.0,2)  ## 2 df/3 observations
1- pt(2.0,5)  ## 5 df/ 6 observations
1- pt(2.0,15) ## 15 df/ 16 observations
1- pt(2.0,49) ## 49 df/ 50 observations

##one-sample t-test by hand
set.seed(1000)
x0 <- rnorm(30,0)
x1 <- rnorm(20,.2)
x1a <- rnorm(200,.2)
x2 <- rnorm(20,.5)
x3 <- rnorm(20,mean=.2,sd=.2)
x4 <- exp(rnorm(100))-1

library(vioplot)
#pdf("c8-vioplots.pdf",width=9,height=6)
vioplot(x0,x1,x1a,x2,x3,x4,col="gold",
        names=c("x0","x1","x1a","x2","x3","x4"))
abline(0,0)
#dev.off()

qqnorm(x0)

##one-sample t-test by hand:
mu <- mean(x1)
sd <- sd(x1)
se <- sd/sqrt(length(x1))
t <- mu/se
1-pt(t,19)
(1- pt(t,19))*2

t.test(x0,alternative="greater")
t.test(x1,alternative="greater")
t.test(x1,alternative="two.sided")
t.test(x1a,alternative="greater")
t.test(x2,alternative="greater")
t.test(x3,alternative="greater")
t.test(x4)

##############################################################
###non-parametric one-sample test: sign test/binomial test
binom.test(sum(x0>0),length(x0),p=.5)
binom.test(sum(x1>0),length(x1),p=.5)

##The BSDA library has a simpler-to-use version:
#install.packages("BSDA")
library(BSDA)
SIGN.test(x0)
binom.test(sum(x1>0),length(x1),p=.5)
SIGN.test(x1)
SIGN.test(x1a)
SIGN.test(x2)
SIGN.test(x3)
SIGN.test(x4)

###########################
### Bayesian one-sample
library(BayesFactor)
x <- ttestBF(x1)
print(x)
ttestBF(x0)

## Extract samples to get posterior distributions
samples <- ttestBF(x1,iterations=10000,posterior=TRUE)
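## Optional extra (not from the original chapter code): a quick look at the
## posterior draws before plotting. This assumes the object returned with
## posterior=TRUE behaves like a coda mcmc matrix with one named column per
## parameter, as in current versions of BayesFactor.
colnames(samples)   ## which parameters were sampled
summary(samples)    ## posterior means, SDs, and quantiles for each parameter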
#pdf("c8-posteriors.pdf",width=8,height=5)
par(mfrow=c(1,2))
hist(samples[,1],breaks=50,main="Posterior distribution of mu",xlab="Sampled values",col="gold")
hist(samples[,2],breaks=50,main="Posterior distribution of sigma2",xlab="Sampled Values",col="gold")
#dev.off()

##95% credible interval on the posterior samples of the mean:
quantile(samples[,1],c(.025,.975))

ttestBF(x2)

#####################################################################################
###End of (Chapter 8-part 1)

######################################################
### Chapter 8, part 2
######################################################

##We will re-use these data sets:
set.seed(1000)
x0 <- rnorm(30,0)
x1 <- rnorm(20,.2)
x1a <- rnorm(200,.2)
x2 <- rnorm(20,.5)
x3 <- rnorm(20,mean=.2,sd=.2)
x4 <- exp(rnorm(100))-1
x0b <- x0 + runif(length(x0),min=-.1,max=.2)

######################################################
###Paired tests:
##t.test
t.test(x0,x0b,paired=T)
t.test(x0,x0b,paired=T,alternative="less")
t.test(x0-x0b)   ##one-sample version
t.test(x0,x0b)   ##this is wrong, and not a paired t-test

##non-parametric paired tests:
diff <- x0-x0b
binom.test(sum(diff>0),length(diff))
library(BSDA)
SIGN.test(diff)
wilcox.test(diff,mu=0)
wilcox.test(x0b,x0,paired=T)
wilcox.test(x0b,x0,paired=T,alternative="greater")

##calculate wilcox.test by hand for one-sample
nums <- runif(100)-.4
v <- sum(rank(abs(nums))[nums>0])
1-psignrank(v,length(nums))
wilcox.test(nums,alternative="greater")

##Bayes factor paired t-tests
ttestBF(x0,x0b,paired=T)
ttestBF(x0-x0b)   ##use a difference instead of pairing
ttestBF(x0-x0b,nullInterval=c(-Inf,0))  ##one-sided tests require specifying an interval hypothesis

######################################################
##Independent samples tests

##Independent samples t-test by hand:
muX <- mean(x1)
muY <- mean(x1a)
sdX <- sd(x1)
sdY <- sd(x1a)

se.pooled <- function(x,y)
{
  ## standard error of the difference in means, using each group's own variance
  varx <- var(x)
  vary <- var(y)
  nx <- length(x)
  ny <- length(y)
  sqrt(varx/nx + vary/ny)
}

t <- (muX-muY)/se.pooled(x1,x1a)
t
pt(t,218,lower.tail=T)     ## df = 20 + 200 - 2
pt(t,23.719,lower.tail=T)  ## Welch-corrected df

## independent-samples t-test; one-sided using the t.test function:
t.test(x1,x1a,alternative="less")

##non-parametric Wilcoxon / Mann-Whitney U test:
wilcox.test(x1,x1a,alternative="less")

##Wilcoxon test for independent samples
set.seed(100)
x <- runif(20)
y <- runif(20)+.1
pairs <- outer(x,y,">=")
pairs
sum(pairs)
length(pairs)
plot(pwilcox(0:400,20,20))
plot(dwilcox(0:400,20,20))
1-pwilcox(115,20,20)
wilcox.test(x,y)
wilcox.test(x,y,alternative = "less")
wilcox.test(log(10+x),log(10+y),alternative = "less")
wilcox.test(x1,x1a)

##alternative way to execute the test:
groups <- rep(1:2,each=length(x))
values <- c(x,y)
w <- wilcox.test(values~groups)

#pdf("c8wilcox.pdf",width=8,height=4)
par(mfrow=c(1,2))
plot(pwilcox(0:400,20,20),type="s",xlab="U/W statistic",ylab="Cumulative Probability",main="Wilcox Rank Sum Distribution")
plot(dwilcox(0:400,20,20),type="s",xlab="U/W statistic",ylab="Density",main="Wilcox Rank Sum Distribution")
#dev.off()

##bayesian independent samples tests:
library(BayesFactor)
ttestBF(x1,x1a)
ttestBF(x,y)
ttestBF(x,y,nullInterval=c(-.1,.1))

##############################################################
## Computing covariance and correlation
##
set.seed(100)
data <- runif(1000)
var.sample <- function(x){ sum((x-mean(x))^2) / (length(x)-1) }

##these are the same:
var(data)
var.sample(data)

##factor the formula into two parts
var.sample2 <- function(x){ sum((x-mean(x)) * (x-mean(x))) / (length(x)-1) }
var.sample2(data)
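## Optional check (not from the original chapter code): confirm that the
## refactored function matches R's built-in var() up to floating-point tolerance.
all.equal(var.sample2(data), var(data))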
##separate the first and the second:
var.sample3 <- function(x,y){ sum((x-mean(x))*(y-mean(y))) / (length(x)-1) }
var.sample3(data,data)

x <- -500:500/100
y <- rnorm(1001)*7-x
var.sample <- function(x,y){ sum((x-mean(x))*(y-mean(y))) / (length(x)-1) }
var.sample(x,y)

#pdf("c8-scatter1.pdf",width=5,height=4)
plot(x,y,cex=.5,col="cadetblue3",pch=16)
text(0,22,paste("Covariance: ", round(var.sample(x,y),3)))
#dev.off()

var.sample(x,y)
var.sample(x,-y)
var.sample(-x,x)

cov(x,y)
cov(10*x,y)

cov.normalized <- function(x,y){
  cov <- sum((x-mean(x))*(y-mean(y)))/(length(x)-1)
  var1 <- sum((x-mean(x))^2)/(length(x)-1)
  var2 <- sum((y-mean(y))^2)/(length(y)-1)
  cov/sqrt(var1)/sqrt(var2)
}

cov.normalized(x,y)
cov.normalized(x,y*1000)

x <- runif(10000)
y <- runif(10000)
z <- x + y
cor(x,y)
cor(x,z)
cor(y,z)
cor(x,z)^2
sqrt(.5)

set.seed(1000)
x <- -500:500/100
y <- rnorm(1001)*5-x
cor.test(x,y)

sub <- 1:100
cor.test(x[sub],y[sub])
sub2 <- sample(1:1000,100)
cor.test(x[sub2],y[sub2])

# pdf("c8scatter2.pdf",width=5,height=5)
par(mfrow=c(1,1))
plot(x,y)
points(x[sub],y[sub],col="red",pch=16)
points(x[sub2],y[sub2],col="blue",pch=16)
# dev.off()

#pdf("c8corsubsample.pdf",width=6,height=6)
par(mfrow=c(2,2))
sub <- sample(1:1000,50)
test <- cor.test(x[sub],y[sub])
plot(x[sub],y[sub], main=paste("Correlation =",round(test$estimate,3)))

sub <- sample(1:1000,50)
test <- cor.test(x[sub],y[sub])
plot(x[sub],y[sub], main=paste("Correlation =",round(test$estimate,3)))

sub <- sample(1:1000,50)
test <- cor.test(x[sub],y[sub])
plot(x[sub],y[sub], main=paste("Correlation =",round(test$estimate,3)))

sub <- sample(1:1000,50)
test <- cor.test(x[sub],y[sub])
plot(x[sub],y[sub], main=paste("Correlation =",round(test$estimate,3)))
#dev.off()

set.seed(100)
cors <- rep(0,10000)
for(i in 1:10000)
{
  sub <- sample(1:1000,20)
  cors[i] <- cor.test(x[sub],y[sub])$estimate
}

#pdf("c8hist.pdf", width=5,height=4)
hist(cors,col="gold",xlab="Sampled correlation estimates")
#dev.off()

###############################
## Non-parametric correlation estimates:
x <- runif(100)
y <- runif(100)
cor.test(x,y)

##Change one value:
x[2] <- 5
y[2] <- 5
cor.test(x,y)
cor.test(x,y,method="spearman")
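## Optional extra (not from the original chapter code): Kendall's tau is another
## rank-based correlation that resists the single outlier; cor.test supports it
## via method="kendall".
cor.test(x, y, method="kendall")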
library(BayesFactor)
#Bayes factor test still fooled.
print(correlationBF(x,y))

x <- -500:500/100
y <- rnorm(1001)*7 +x
x2 <- exp(x)

pdf("c8-skew.pdf",width=6,height=4)
plot(x2,y,pch=16,col="cadetblue3",cex=1.3)
points(x2,y,pch=1,col="grey20",cex=1.3)
text(75,-20,paste("Pearson R = ",round(cor(x2,y),3)))
text(75,-25,paste("Spearman Rho = ",round(cor(x2,y,method="spearman"),3)))
dev.off()

cor(x,y)
cor(x,y)
cor(x2,y)
cor(x2,y,method="spearman")
cor.test(rank(x2),rank(y))

data(iris)
pairs(iris)
#pdf("c8-iris.pdf",width=8,height=8)
pairs(iris,col=iris[,5],pch=16)
#dev.off()

library(GGally)
library(ggplot2)
pdf("c8-pairs.pdf",width=12,height=12)
ggpairs(iris,ggplot2::aes(colour=Species))
dev.off()

cor(iris[,1:4])

for(i in 1:3)
{
  for (j in (i+1):4)
  {
    cat("Comparing",i,"to",j,"\n")
    print(cor.test(iris[,i],iris[,j]))
  }
}

##Bayes factor tests:
for(i in 1:3)
{
  for (j in (i+1):4)
  {
    cat("Comparing",i,"to",j,"\n")
    print(correlationBF(iris[,i],iris[,j]))
  }
}

#####################################
###Part 3 of Chapter 8
######################################
## Categorical data:

print(HairEyeColor)

#pdf("c8-hair1.pdf",width=8,height=4)
par(mfrow=c(1,2))
barplot(HairEyeColor[,,1],main="Male")
barplot(HairEyeColor[,,2],main="Female")
#dev.off()

#pdf("c8-hair2.pdf",width=8,height=4)
par(mfrow=c(1,2))
barplot(t(HairEyeColor[,,1]),col=c("brown4","blue","khaki","darkgreen"),main="Male eye color",xlab="Hair color")
barplot(t(HairEyeColor[,,2]),col=c("brown4","blue","khaki","darkgreen"),main="Female eye color",xlab="Hair color")
#dev.off()

#pdf("c8-hair3.pdf",width=6,height=4)
hc2 <- t(HairEyeColor[,,1]+HairEyeColor[,,2])
par(mfrow=c(1,1))
barplot(hc2,col=c("brown4","blue","khaki","darkgreen"))
#dev.off()

observed <- hc2[,4]

#Compute the expected proportions for brown versus blond hair:
brownblond <- hc2[,c(2,4)]
expected.prop <- rowSums(brownblond)/sum(brownblond)
expected <- matrix(rep(colSums(brownblond),each=4),nrow=4) *
  matrix(rep(expected.prop,2),nrow=4)

#pdf("c8-expected.pdf",width=6,height=4)
barplot(expected,beside=T,names=c("Brown","Blond"),
        col=c("brown4","blue","khaki","darkgreen"),
        ylab="Expected count",main="Expected Distribution")
#dev.off()

sumdiff <- sum((brownblond - expected)^2/(expected))
sumdiff
pchisq(sumdiff,3,lower.tail=FALSE)   ##p-value for the chi-squared statistic

chisq.test(brownblond)
#Pearson's Chi-squared test
#
#data: brownblond
#X-squared = 85.5966, df = 3, p-value < 2.2e-16

chisq.test(hc2[,2],p=hc2[,4],rescale.p=T)
chisq.test(hc2[,c(2,4)])

##Chi-squared test is sort of a non-parametric test already.
## We can do a bayes factor version:
contingencyTableBF(brownblond, sampleType = "indepMulti", fixedMargin = "cols")

colSums(hc2)

##Alternative uses of a chi-squared test:
##test for equality of outcome for a marginal distribution
colSums(hc2)
chisq.test(colSums(hc2),p=c(.25,.25,.25,.25))

##Test for equality between two subsets:
## look at eye color for black versus brown hair
chisq.test(hc2[,1],p=hc2[,2], rescale.p=T)
chisq.test(hc2[,1:2])
##Notice that the above are different. One asserts that brown hair is the truth;
## the other expects them to be the same and looks for deviations from that sameness.

##############
## Chi-squared test on raw data outcomes rather than tables:
library(BayesFactor)
set.seed(100)
sample1 <- sample(LETTERS[1:6], 1000, prob=c(1,1,1,1,1,.2), replace=T)
sample2 <- sample(LETTERS[1:6], 1000, replace=T)
sample3 <- sample(c("yes","no","maybe"), 1000, replace=T)

chisq.test(sample1,sample2)
chisq.test(sample1,sample3)
chisq.test(table(sample1,sample2))  ##this is the same.
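## Optional extra (not from the original chapter code): the htest object returned
## by chisq.test() stores the expected cell counts under independence, so we can
## see where the observed table deviates. The name tst is just a local example.
tst <- chisq.test(table(sample1, sample2))
round(tst$expected, 1)                 ## expected counts under independence
round(tst$observed - tst$expected, 1)  ## cell-by-cell deviations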
contingencyTableBF(table(sample1,sample2), sampleType = "indepMulti", fixedMargin = "cols")

###Alternative: rather than whether they are independent, test for whether the
###marginal distributions differ
table <- tapply(rep(1,2000), list(c(sample1,sample2), rep(1:2, each=1000)), length)
table[is.na(table)] <- 0
table
chisq.test(table)
contingencyTableBF(table, sampleType = "indepMulti", fixedMargin = "cols")

##So, although the two samples are independent, they do not have the same distributions.
## We can only do this comparison because the set of labels is the same for the two;
## maybe they are choices before and after a treatment.

######################
## Technical issues (mostly on t-tests)
######################

## Non-normal and skewed data
## The non-parametric tests are not impacted by non-normality--this is their main benefit.
## As N increases, the traditional tests are impacted minimally, as they are estimating
## the distribution of the estimate of the mean. Likelihood and Bayesian tests are
## highly dependent on the form of the distribution, and so violations here can have large consequences.
## In these cases, it is usually a good idea to try to transform your data in such a
## way that they end up being roughly normally distributed. But you must also be confident
## that you can interpret the transformed data. The alternative is to model the
## distribution exactly using an alternative distribution. This is often done in
## Bayesian and ML approaches, and even within more traditional approaches in the
## context of generalized linear regression models.

########################
## Different sample size
x1 <- rnorm(20,.2)
y1 <- x1 + .3 + rnorm(20,mean=.3,sd=.2)
t.test(x1,y1)
t.test(c(x1,rnorm(50,.2)),y1)

####################################
## Between versus within
###Exercise
set.seed(1000)
x0 <- rnorm(30,0)
x1 <- rnorm(20,.2)
x1a <- rnorm(50,.2)
y1 <- x1 + rnorm(20,mean=.25)
y1a <- x1a + rnorm(50,mean=.25)

##For this one, it does not matter
t.test(x1,y1,paired=T)
t.test(x1,y1,paired=F)

##for this one, it does matter:
t.test(x1a,y1a,paired=T)
t.test(x1a,y1a,paired=F)

set.seed(1001)
x <- runif(20)
y1 <- x + 1.3 + rnorm(20)*.3
y2 <- x + 1.3 + rnorm(20)*5
par(mfrow=c(1,2))
plot(x,y2)
matplot(cbind(x,y1,y2))
t.test(x,y1,paired=T)
t.test(x,y2,paired=T)

#########################
##Effect sizes:
## Cohen's d for independent samples:
(mean(x)-mean(y1)) / sd(c(x-mean(x),y1-mean(y1)))
(mean(x)-mean(y2)) / sd(c(x-mean(x),y2-mean(y2)))

## Cohen's d for paired samples:
delta1 <- x - y1
delta2 <- x - y2
mean(delta1)/sd(delta1)
mean(delta2)/sd(delta2)

library(effectsize)
##using some examples from earlier:
effectsize(t.test(x,y1,paired=T))
cohens_d(x - y1)   ##paired-samples d, computed on the differences
effectsize(chisq.test(table))

#####
## Example: pre-post testing using paired-samples/one-sample tests
library(vioplot)
set.seed(101)
n <- 40
pretest <- exp(rnorm(n))
posttest <- exp(log(pretest) + rnorm(n)+.3)
difference <- posttest-pretest

vioplot(pretest,posttest,col="gold")
hist(pretest,breaks=20)
hist(posttest,breaks=20)
hist(difference,col="gold")
abline(v=0)

t.test(difference)
binom.test(sum(difference>0),40)
library(BayesFactor)
ttestBF(difference)
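## Optional extra (not from the original chapter code): because pretest and
## posttest were generated as lognormal, the raw difference is skewed; a paired
## test on the log scale matches the data-generating process and illustrates the
## transformation strategy discussed above.
hist(log(posttest) - log(pretest), col="gold")
t.test(log(posttest) - log(pretest))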
##In-class exercises:
##For each of these, answer the question, explaining the result using the
## appropriate statistical test. Accompany the test with a figure showing the
## result.

##For BSDA::Movie, was the difference in moral aptitude positive or negative?
BSDA::Movie

##Compare ToothGrowth data for OJ versus VC, under each dosage.
## Is OJ better or worse than VC?
ToothGrowth

##compare bone density by group for the BSDA::Bones data set
## Medical advice suggests that bone density should be 210 units. Determine
## whether the active group is higher than 210. Determine whether the nonactive group
## is lower than 210.
BSDA::Bones

##Did rankings for dogs in 1998 differ from those in 1992?
## note--treat each dog rating as a separate independent identical observation measured twice.
## We would like to have the raw numbers by which these values were established,
## instead of averages within a class.
BSDA::Dogs
##

###For HairEyeColor,
## Does the proportion of eye color depend on gender?
## Does the proportion of hair color depend on gender?
## Does the combined proportion of hair and eye color depend on gender?
HairEyeColor

## Some examples from class
set.seed(100)
data <- rnorm(25,mean=.2)
data2 <- rnorm(30,mean=-.2)
data3 <- rnorm(30, mean=-5)
data4 <- data + rnorm(25,mean=.2,sd=.3)

t.test(data)
library(BSDA)
SIGN.test(data)
library(BayesFactor)
ttestBF(data)

##independent-samples t-test
t.test(data,data2)
t.test(data2,data3,var.equal = FALSE)
t.test(data2,data3,var.equal = TRUE)
ttestBF(data,data2)

##paired tests
t.test(data,data4,paired=T)
SIGN.test(data,data4)   ##paired sign test
wilcox.test(data,data4,paired=T)
ttestBF(data,data4,paired=T)
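## Optional wrap-up (not from the original chapter code): the standardized effect
## size for the paired comparison above, computed by hand from the differences,
## mirroring the Cohen's d calculations earlier in the chapter.
d <- data4 - data
mean(d) / sd(d)   ## paired-samples Cohen's d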