## Code for Book: Applied Statistical Analysis in R for Psychology and Human factors ## (c) Shane T. Mueller, Ph.D. ## Michigan Technological University ## shanem@mtu.edu ## ## Textbook and videos available at http://pages.mtu.edu/~shanem/psy5210 ## ## ## This file contains example software code relevant for Chapter 2. ## ########################################### ## 2.1. Reading and writing data from files ##auto-generated from menu. Notice the hard-coded file name 'c5data' <- read.table("~/Documents/data/c5data.txt", quote="\"") c5data <- read.table("c5data.txt", quote="\"", comment.char="") ##simple approach requiring directory to be set data <- read.table("c5data.txt") head(data) ##read in a version with the header: data <- read.table("c5data2.txt",header=T) ## Set the column headers in R when it is read in: head <- c("subcode","trial","step","time","trialtime", "targx","targy","mousex","mousey","ontarget", "dtime","totaltime","distoff","sumdistoff") data <- read.table("c5data.txt",col.names=head) ##Set the column headers after reading in: data <- read.table("c5data.txt") colnames(data) <- head ##watch out: data <- read.table("c5data2.txt",header=F) data[1:10,] ##another problem: data <- read.table("c5data.txt",header=T) data[1:5,] ##Skip the first line of the data file data <- read.table("c5data2.txt",skip=1, header=F) colnames(data) <- head ######### ## Writing out data to a file ######### write.csv(data,"data.csv") write.table(data,"data.txt") ## Read in a .csv or tab-delimited dat2 <- read.table("data.csv",sep=",") dat2 <- read.csv("data.csv") ########################################### ## 2.2. Examining data structures ########################################### #Some ways to get basic information about a data structure data(trees) ##load the data ## look at some summary values dim(trees) str(trees) attributes(trees) summary(trees) ##try plotting to see what is going on #pdf("c2f1.pdf",width=9,height=5) par(mfrow=c(1,2)) matplot(trees) boxplot(trees) #dev.off() ##look at the top of the data: head(trees) trees[1:5,] ##look at just some columns: plot(trees[,1:2]) plot(trees[,-3]) ########################################### ## 2.3. Sorting ########################################### #create a data frame x <- runif(100) y <- x + runif(100,-.3,.3) dat <- data.frame(x=x,y=y) print(dat) ##example use of order order(c(2,.5,1,1.5,2,2.5)) rank(c(2,.5,1,1.5,2,2.5)) order(order(c(2,.5,1,1.5,2,2.5))) #sort dat by y: ord <- order(dat$y) dat2 <- dat[ord,] head(dat2) ##comes in handy when plotting: #pdf("c2f2.pdf",width=8,height=6) par(mfrow=c(1,2),mar=c(3,3,0,0)) ##Make two plots plot(dat,type="b") ##Plotted in random order plot(dat2,type="b") ##sequentially ordered by x #dev.off() ########################################### ## 2.4. Aggregation ########################################### ##create a voter database: party <- c("R","R","D","R","R","D","D","D","R","R","D") gender <- c("M","M","F","F","F","F","M","M","F","M","M") vote <- c("A","B","A","A","A","B","A","A","B","B","A") survey <- data.frame(party,gender,vote) ##look at values of each variable: table(survey$party) table(survey$gender) table(survey$vote) ## look at pairs/contingency tables: table(survey$gender,survey$vote) table(survey$party,survey$vote) ##look at it all: xtab <- table(survey) xtab ##slice the table: xtab[1,,] #party D xtab[2,,] #party R xtab["D",,] #party = D xtab[1,2,] #Democratic Men ##magical creatures example. Which ice cream to different animals like: x <- sample(c("a","b","c"),50,replace=T) y <- sample(c("unicorns","pegasuses"),50,replace=T) z <- sample(c("chocolate","vanilla","strawberry"),50,replace=T) table(x) table(x,y) ##These are the same: table(x,y,z) table(data.frame(x,y,z)) cross <- table(x,y,z) cross[1,,] cross["a",,] cross[,2,] cross[,2,3] cross[1,1,2] cross[1:2,2,3] ########################################### ##2.5. Aggregate and tapply functions ########################################### set.seed(111) x <- rnorm(500) ##generate random numbers y <- x + runif(500,-.3,.3) ##related random numbers dat3 <- data.frame(x=x,y=y) ##create a data frame dat3$factor <- factor(round(dat3$x/10,1)*10) ##make bins from x ##aggregate y by the bins: dat3.agg <- aggregate(dat3$y,list(bin=dat3$factor),mean) ##aggregate x by the same bins: dat3.agg$xvals <- aggregate(dat3$x,list(bin=dat3$factor),mean)$x ##use tapply to aggregate y by the bins: dat3.tab <- tapply(dat3$y,list(bin=dat3$factor),mean) dat3.agg dat3.tab ##tables using rank set.seed(111) dat3$factor2 <- floor(rank(dat3$x)/20) dat3.agg2 <- aggregate(dat3$y,list(bin=dat3$factor2),mean) dat3.agg2$xvals <-aggregate(dat3$x,list(bin=dat3$factor2),mean)$x #pdf("c2f3.pdf",width=6,height=6) plot(dat3.agg$xvals,dat3.agg$x,col="blue",pch=16, xlab="Mean values of x bins",ylab="Values of y") col2 <- rgb(1,0,0,.6) #a see-through red col1 <- rgb(0,0,1,.6) # a see-through blue points(dat3.agg2$xvals,dat3.agg2$x,cex=3,col=col2,pch=17) points(dat3.agg$xvals,dat3.agg$x,cex=1,col=col1,pch=16) #dev.off() ########################################### ##2.7. The apply function ########################################### set.seed(100) m <- matrix(runif(28),7,4) m ##aggregate by row: round(apply(m,1,var),3) ##By row #Find minimum in each column mins <- apply(m,2,min) ##By column #find maximum in each column: maxs <- apply(m,2,max) ##By column pdf("c2f4.pdf",width=6,height=4) par(mfrow=c(1,1)) matplot(t(m),type="p",pch=16,col="black") points(mins,type="l") points(maxs,type="l") points(apply(m,2,mean),type="o",lwd=3) dev.off() ##aggregating by row or column: rowSums(m) colMeans(m) ########################################### ##2.6. An end-to-end example: Chick Weight ########################################### #preliminary investigation: data(ChickWeight) attributes(ChickWeight)$labels head(ChickWeight) length(table(ChickWeight$Chick)) #plot the cloud pdf("c2chick1.pdf",width=8,height=5) plot(ChickWeight$Time,ChickWeight$weight, ylab="Weight (g)",xlab="Age (days)") dev.off() #color the points: #pdf("c2chick2.pdf",width=8,height=5) cscheme <- c("red","darkgreen","black","orange") plot(ChickWeight$Time,ChickWeight$weight,ylab="Weight (g)",xlab="Age (days)", pch=16, col=cscheme[ChickWeight$Diet]) #dev.off() ##aggregate: cw.agg <- tapply(ChickWeight$weight, list(time=ChickWeight$Time, diet=ChickWeight$Diet),mean) ##record the true days: times <- c(0,2,4,6,8,10,12,14,16,18,20,21) times <- as.numeric(rownames(cw.agg)) #pdf("c2chick3.pdf",width=8,height=5) plot(ChickWeight$Time,ChickWeight$weight,cex=.5, ylab="Weight (g)",xlab="Age (days)", pch=1, col=cscheme[ChickWeight$Diet]) matplot(times,cw.agg,add=T,type="l",lwd=3, col=cscheme,lty=1) #dev.off() ##dig a little deeper: cw.bysub<- tapply(ChickWeight$weight, list(time=ChickWeight$Time, chick=ChickWeight$Chick),mean) ## look at just the first ten columns: cw.bysub[,1:10] diets <- tapply(as.numeric(as.character(ChickWeight$Diet)), list(chick=ChickWeight$Chick),median) diets <- aggregate(as.numeric(as.character(ChickWeight$Diet)), list(chick=ChickWeight$Chick),median)$x diets #pdf("c2chick4.pdf",width=8,height=5) matplot(times,cw.bysub,col=cscheme[diets],type="l",lty=3, ylab="Weight (mg)",xlab="Age (days)") ##Lets add some gridlines segments(-10,0:15*25,25,0:15*25,lty=3,col="grey") segments(-10,0:7*50,25,0:7*50,lty=1,col="grey") points(ChickWeight$Time,ChickWeight$weight,cex=.5, pch=1, col=cscheme[ChickWeight$Diet]) #erase back of line matplot(times,cw.agg,add=T,type="l",lwd=7, col="white",lty=1) #replot line: matplot(times,cw.agg,add=T,type="l",lwd=3, col=cscheme,lty=1) title("Chick weights over time by diet") ##Make a legend here. legend(2,350,paste("Diet",c(1:4)),col=cscheme,lty=1,lwd=5,bg="white") #dev.off()