##  Code for Book: Applied Statistical Analysis in R for Psychology and Human factors
##  (c) Shane T. Mueller, Ph.D.
##  Michigan Technological University
##  shanem@mtu.edu
##
##  Textbook and videos available at http://pages.mtu.edu/~shanem/psy5210
##
##
## This file contains example software code relevant for Chapter 2.
##


###########################################
##  2.1. Reading and writing data from files

##auto-generated from menu. Notice the hard-coded file name
'c5data' <- read.table("~/Documents/data/c5data.txt", quote="\"")


c5data <- read.table("c5data.txt", quote="\"", comment.char="")


##simple approach requiring directory to be set
data <- read.table("c5data.txt")
head(data)

##read in a version with the header:
data <- read.table("c5data2.txt",header=T)


## Set the column headers in R when it is read in:
head <- c("subcode","trial","step","time","trialtime",
           "targx","targy","mousex","mousey","ontarget",
           "dtime","totaltime","distoff","sumdistoff")
data <- read.table("c5data.txt",col.names=head)


##Set the column headers after reading in:
data <- read.table("c5data.txt")
colnames(data) <- head

##watch out:
data <- read.table("c5data2.txt",header=F)
data[1:10,]

##another problem:
data <- read.table("c5data.txt",header=T)
data[1:5,]


##Skip the first line of the data file
data <- read.table("c5data2.txt",skip=1, header=F)
colnames(data) <- head


#########
## Writing out data to a file
#########

write.csv(data,"data.csv")
write.table(data,"data.txt")


## Read in a .csv or tab-delimited

dat2 <- read.table("data.csv",sep=",")
dat2 <- read.csv("data.csv")


###########################################
## 2.2. Examining data structures
###########################################

 
#Some ways to get basic information about a data structure 
data(trees)  ##load the data

 
 ## look at some summary values
 dim(trees)
 str(trees)
 attributes(trees)
 summary(trees)
 
 ##try plotting to see what is going on
#pdf("c2f1.pdf",width=9,height=5)
par(mfrow=c(1,2))
matplot(trees)
boxplot(trees)
#dev.off()

##look at the top of the data:
head(trees) 
trees[1:5,] 

##look at just some columns:  
plot(trees[,1:2])
plot(trees[,-3])

###########################################
## 2.3. Sorting
###########################################

#create a data frame
x <- runif(100)
y <- x + runif(100,-.3,.3)
dat <- data.frame(x=x,y=y)
print(dat)

##example use of order
order(c(2,.5,1,1.5,2,2.5))

rank(c(2,.5,1,1.5,2,2.5))
order(order(c(2,.5,1,1.5,2,2.5)))

#sort dat by y:
ord <- order(dat$y)
dat2 <- dat[ord,]
head(dat2)      

##comes in handy when plotting:
#pdf("c2f2.pdf",width=8,height=6)
par(mfrow=c(1,2),mar=c(3,3,0,0))    ##Make two plots
plot(dat,type="b")   ##Plotted in random order
plot(dat2,type="b")  ##sequentially ordered by x
#dev.off()

###########################################
## 2.4. Aggregation
###########################################

##create a voter database:
party <-  c("R","R","D","R","R","D","D","D","R","R","D")
gender <- c("M","M","F","F","F","F","M","M","F","M","M")
vote   <- c("A","B","A","A","A","B","A","A","B","B","A")
survey <- data.frame(party,gender,vote)

##look at values of each variable: 
table(survey$party)
table(survey$gender)
table(survey$vote)

## look at pairs/contingency tables:
table(survey$gender,survey$vote)
table(survey$party,survey$vote)

##look at it all:
xtab <- table(survey)
xtab

##slice the table:
 xtab[1,,]   #party D
 xtab[2,,]   #party R
 xtab["D",,] #party = D
 xtab[1,2,]   #Democratic Men

 
##magical creatures example. Which ice cream to different animals like:
 
 x <- sample(c("a","b","c"),50,replace=T)
 y <- sample(c("unicorns","pegasuses"),50,replace=T)
 z <- sample(c("chocolate","vanilla","strawberry"),50,replace=T)

table(x)
table(x,y)
 
 ##These are the same:
 table(x,y,z)
 table(data.frame(x,y,z))

 cross  <-  table(x,y,z)
 cross[1,,]
 cross["a",,]
 cross[,2,]
 cross[,2,3]
 cross[1,1,2]
 cross[1:2,2,3]

 
###########################################
##2.5.  Aggregate and tapply functions
###########################################
 set.seed(111)
x <- rnorm(500)                  ##generate random numbers
y <- x + runif(500,-.3,.3)       ##related random numbers
dat3 <- data.frame(x=x,y=y)      ##create a data frame
dat3$factor <- factor(round(dat3$x/10,1)*10)  ##make bins from x

##aggregate y by the bins:
dat3.agg <-     aggregate(dat3$y,list(bin=dat3$factor),mean)
##aggregate x by the same bins:
dat3.agg$xvals <- aggregate(dat3$x,list(bin=dat3$factor),mean)$x
##use tapply to aggregate y by the bins:
dat3.tab <- tapply(dat3$y,list(bin=dat3$factor),mean)

dat3.agg

dat3.tab

##tables using rank
set.seed(111)
dat3$factor2 <- floor(rank(dat3$x)/20)
dat3.agg2 <- aggregate(dat3$y,list(bin=dat3$factor2),mean)
dat3.agg2$xvals <-aggregate(dat3$x,list(bin=dat3$factor2),mean)$x 


#pdf("c2f3.pdf",width=6,height=6)
plot(dat3.agg$xvals,dat3.agg$x,col="blue",pch=16,
     xlab="Mean values of x bins",ylab="Values of y")
col2 <- rgb(1,0,0,.6) #a see-through red
col1 <- rgb(0,0,1,.6) # a see-through blue
points(dat3.agg2$xvals,dat3.agg2$x,cex=3,col=col2,pch=17)
points(dat3.agg$xvals,dat3.agg$x,cex=1,col=col1,pch=16)
#dev.off()

###########################################
##2.7. The apply function
###########################################
set.seed(100)
m <- matrix(runif(28),7,4)
m
##aggregate by row:
round(apply(m,1,var),3) ##By row
#Find minimum in each column
mins <- apply(m,2,min) ##By column
#find maximum in each column:
maxs <- apply(m,2,max) ##By column


pdf("c2f4.pdf",width=6,height=4)
par(mfrow=c(1,1))
matplot(t(m),type="p",pch=16,col="black")
points(mins,type="l")
points(maxs,type="l")
points(apply(m,2,mean),type="o",lwd=3)
dev.off()


##aggregating by row or column:
rowSums(m)
colMeans(m)


###########################################
##2.6. An end-to-end example: Chick Weight
###########################################
#preliminary investigation:
data(ChickWeight)
attributes(ChickWeight)$labels
head(ChickWeight)
length(table(ChickWeight$Chick))
 
#plot the cloud
pdf("c2chick1.pdf",width=8,height=5)
plot(ChickWeight$Time,ChickWeight$weight,
      ylab="Weight (g)",xlab="Age (days)")
dev.off()


#color the points:

#pdf("c2chick2.pdf",width=8,height=5)
cscheme <- c("red","darkgreen","black","orange")
plot(ChickWeight$Time,ChickWeight$weight,ylab="Weight (g)",xlab="Age (days)",
     pch=16, col=cscheme[ChickWeight$Diet])
#dev.off()

##aggregate:
cw.agg <- tapply(ChickWeight$weight, 
                    list(time=ChickWeight$Time,
                    diet=ChickWeight$Diet),mean)
##record the true days:
times <- c(0,2,4,6,8,10,12,14,16,18,20,21)
times <- as.numeric(rownames(cw.agg))


#pdf("c2chick3.pdf",width=8,height=5)
plot(ChickWeight$Time,ChickWeight$weight,cex=.5,
     ylab="Weight (g)",xlab="Age (days)",
     pch=1, col=cscheme[ChickWeight$Diet])

matplot(times,cw.agg,add=T,type="l",lwd=3,
       col=cscheme,lty=1)
#dev.off()


##dig a little deeper:

cw.bysub<- tapply(ChickWeight$weight, 
                   list(time=ChickWeight$Time,
                       chick=ChickWeight$Chick),mean)


## look at just the first ten columns:
cw.bysub[,1:10]

diets <- tapply(as.numeric(as.character(ChickWeight$Diet)),
                   list(chick=ChickWeight$Chick),median)


diets <- aggregate(as.numeric(as.character(ChickWeight$Diet)),
                   list(chick=ChickWeight$Chick),median)$x
diets 


#pdf("c2chick4.pdf",width=8,height=5)

matplot(times,cw.bysub,col=cscheme[diets],type="l",lty=3,
         ylab="Weight (mg)",xlab="Age (days)")

##Lets add some gridlines
segments(-10,0:15*25,25,0:15*25,lty=3,col="grey")
segments(-10,0:7*50,25,0:7*50,lty=1,col="grey")

points(ChickWeight$Time,ChickWeight$weight,cex=.5,
     pch=1, col=cscheme[ChickWeight$Diet])

#erase back of line
matplot(times,cw.agg,add=T,type="l",lwd=7, col="white",lty=1)
#replot line:
matplot(times,cw.agg,add=T,type="l",lwd=3, col=cscheme,lty=1)

title("Chick weights over time by diet")

##Make a legend here.
legend(2,350,paste("Diet",c(1:4)),col=cscheme,lty=1,lwd=5,bg="white")
#dev.off()