## End-to-end analysis example ## This example is intended to cumulatively address many of the ## things we have learned over the course of the semester. ## 1. Data reading, management ## ## For the weather data set, look at thhe snowfall across months of the year. You are ## a climatologist tasks to help residents better understand the the snowfall ## patterns in Houghton, both within a year and historically. Your ultimate ## product will be a short report meant for planners and historians. ## ## Create two graphics that show basic snowfall trends, one giving some type of summary ## over the entire data set from 1890, and one showing the trends across the ## different months of the year. ## For each graphic, write a paragraph decription of the trends you are displaying. ## ## For each graph, you will get a baseline credit of 5 points for creating the ## the graphic, 3 points for the text description, and up to 7 additional points ## (1 pt. each) for additional improvement to the graph. These improvement could ## include adding color, gridlines, or otherwise changing the graphical appearance, ## adding layers showing means/medians/max/min or ## other percentiles, adding a legend, etc. library(gplots) filter <- 1:131 alldat <- read.csv("weather.csv")[filter,] ##snowfall and other data: years <- alldat[,1] weather <- alldat[,1:9] elnino <- alldat[,12:23] sunspots <- alldat[,25:36] avgtemp <- alldat[,38:49] ice <- alldat[,50:57] matplot(t(ice),type="o",pch=16,col="black",lty=3, main="Ice cover") icebymonth <- as.vector(unlist(t(ice[,-1]))) plot(icebymonth[550:900],type="l") ## A straight matplot is sort of hard to grasp: matplot(weather[,2:9],type="l") barplot(as.matrix(weather[,2:9])) cols <- rev(rich.colors(8,"blues")) barplot(t(as.matrix(weather[,2:9])),col=cols) ##how might a transform work? barplot(t(log(1+as.matrix(weather[,2:9]))),col=cols) ## I'd like to make a stacked line plot. To do this, I can apply the cumsum ##function to each row of the data frame: bymonth <- apply(weather[,2:9],1,cumsum) par(mfrow=c(1,2)) matplot(bymonth,type="l",col="black",lty=1,main="Cumulative snowfall") matplot(t(bymonth),type="l",col="black",lty=1) #I'd like to fill in the area, and use a color gradient #to color each month-region separately. The following evolved over iterations filled.line <- function(x,ymatrix,xlab="",ylab="",main="") { matplot(x,ymatrix,lty=1,type="l",xlab=xlab,ylab=ylab,main=main) ##Figure out some vertical lines. xrange <- round(range(x),-1) abline(v=seq(xrange[1],xrange[2],10),lty=2,col="grey") cols <- rev(rich.colors((ncol(ymatrix)),"blues")) for(i in ncol(ymatrix):1) { polygon(c(min(x),x,max(x)), c(0,ymatrix[,i],0),col=cols[i],lty=0) } polygon(c(min(x),x,max(x)), c(0,ymatrix[,ncol(ymatrix)],0),lty=1) # fit <- lm(ymatrix[,ncol(ymatrix)]~poly(x,degree=10)) # points(x,fit$fit,lty=1,lwd=3,type="l") legend(min(x),355,rev(c("Oct","Nov","Dec","Jan","Feb","Mar","Apr","May")),bty="o",pt.cex=1.9, pch=15,col=rev(cols),y.intersp=.9,lty=0) legend(min(x),355,rev(c("","","","","","","","")),bty="n",pt.cex=1.9, pch=0,y.intersp=.9) } years.num <- as.numeric(substr(years,1,4)) pdf("houghton.pdf",width=8,height=5) filled.line(years.num[filter],t(bymonth)[filter,],ylab="Inches of snow per Winter",xlab="Year",main="Houghton County Annual Snowfall") dev.off() ## Also, we'd like to be able to look at this by month ## matplot(t(weather[,2:9]),type="o",pch=16,cex=.2,col="black",xaxt= "n",lty=3, ylab="Monthly snowfall (in)",las=1) axis(1,1:8,c("OCT","NOV","DEC","JAN","FEB","MAR","APR","MAY")) for(i in seq(0,120,20)) abline(i,0,lty=2,lwd=.8,col="grey") points(rowMeans(t(weather[filter,2:9])),type="l",col="navy",lwd=5) ##can we identify which month has the most snow? weather$max <- apply(weather[,2:9],1,which.max) points(weather$max, weather[,2:9][cbind(1:nrow(weather),weather$max)],col="gold",pch=15) ##Question 2: ## There appears to be a shift in snowfall patterns ## over the past century, with an increase happening around 1920. ## Consider two time eras, one for the 30 years before 1920, and one ## for the 93 years after 1920. Conduct ## statistical tests at p=.05 that show whether snowfall increased in each month ## of the year, or just some of the months. library(BayesFactor) early <- rep(c(T,F),c(30,length(years)-30)) late <- rep(c(F,T),c(30,length(years)-30)) t.test(weather$OCT[early],weather$OCT[late]) t.test(weather$NOV[early],weather$NOV[late]) t.test(weather$DEC[early],weather$DEC[late]) t.test(weather$JAN[early],weather$JAN[late]) t.test(weather$FEB[early],weather$FEB[late]) t.test(weather$MAR[early],weather$MAR[late]) t.test(weather$APR[early],weather$APR[late]) ttestBF(weather$OCT[early],weather$OCT[late]) ttestBF(weather$NOV[early],weather$NOV[late]) ttestBF(weather$DEC[early],weather$DEC[late]) ttestBF(weather$JAN[early],weather$JAN[late]) ttestBF(weather$FEB[early],weather$FEB[late]) ttestBF(weather$MAR[early],weather$MAR[late]) ttestBF(weather$APR[early],weather$APR[late]) plot(colMeans(weather[early,2:9]),type="o",ylim=c(0,80), ylab="Average monthly snowfall" ,xaxt="n",xlab="Month", las=1,main="Average snowfall by month",cex=2,pch=16) points(colMeans(weather[late,2:9],na.rm=T),type="o",pch=15,cex=2) axis(1,1:8,c("OCT","NOV","DEC","JAN","FEB","MAR","APR","MAY")) for(i in c(0:8*10)) abline(i,0,col="grey",lty=2) means <-rbind(colMeans(weather[early,2:9],na.rm=T), colMeans(weather[late,2:9],na.rm=T)) reliable <- c(F,F,T,T,T,T,F,F) rect((1:8)[reliable]-.12, means[1,reliable]-5, (1:8)[reliable]+.12, means[2,reliable]+5, border="navy",lwd=3) text(5,77,"Reliable differences") segments(5,75,3:6,means[2,reliable]+6) points(colMeans(weather[early,2:9]),type="o",ylim=c(0,80),col="red",pch=16,cex=1.9) points(colMeans(weather[late,2:9],na.rm=T),type="o",col="blue",pch=15,cex=1.9) legend(1,80,c("1920-present","1890-1920"),lwd=2,lty=1,col=c("blue","red"),pt.cex=2,pch=c(15,16)) ##Question 3: ## It appears that in the 'old' era, December tended to get more snow ## than January, but this has switched. Conduct an appropriate statistical ## test determining whether each of these are reliable. ## Are these differences reliable. That is, for each era, was January larger ## than December? Describe the results as you would in a research report. You will get ## 2 points for conducting a test that answers the question, 2 additional points ## for conducting the most appropriate test for the question, and 2 points for ## properly describing the results. ##Test for all years: ##This is the 'wrong' test t.test(weather$DEC,weather$JAN) ##This is a better one to use: t.test(weather$DEC,weather$JAN,paired=T) ttestBF(weather$DEC,weather$JAN,paired=T) t.test(weather$DEC[early],weather$JAN[early],paired=T) t.test(weather$DEC[late],weather$JAN[late],paired=T) ttestBF(weather$DEC[early],weather$JAN[early],paired=T) ttestBF(weather$DEC[late],weather$JAN[late],paired=T) ##these seem to suggest that the difference in the amount ##that fell dec-jan did differ in the early era, ## but may have in the late era. ## Question 4: ## ## ## ##You can find out which month had the highest snowfall using code such as: weather$max <- apply(weather[,2:9],1,which.max) weather[1:5,] ##Create a table showing how many times each month got the highest ## snowfall, for the early and late eras. Format the table as if you were ## publishing the data in a brochure or magazine article. Did the distribution ## change between the early and late era? Conduct a statistical test, ## (using a p value criterion of 0.10) to determine if this difference is ## just 'by chance'. Write a paragraph describing your results. ## Points: 3 pts for creating the table, 3 pts for conducting the proper statistical test, ## 3 pts for describing the results. highmonth <- table(weather$max,early) highmonth chisq.test(highmonth) ## Question 5: ## Build a model to predict snowfall in a given year. ## We can use: ## 1. some type of non-linear regression method that will permit fitting ## long-term fluctuations ## 2. The previous years snowfall ## 3. sunspot records ## 4. el nino records ## 5. ??? other ideas (temperature) ##can we predict one month snowfall based on other months in the same year? g1 <- lm(weather$MAR~weather$OCT+weather$NOV+weather$DEC+weather$JAN+weather$FEB) summary(g1) ##the complete model with all months only predicts march with an R2 of .14 ##and suggests only February helps predict March. This is maybe not too surprising step(g1) ##only february ############################################### ### What about annual snowfalL? snowfall <- rowSums(weather[,2:9]) snowfall #temperature <- read.csv("temperature.csv") temperature <- avgtemp # Let's look at some possible predictors plot(as.vector(as.matrix(elnino)),type="b",main="El nino weather Cycle (monthly)") plot(rowMeans(elnino),type="b", main="El nino weather cycle (annual)") plot(rowMeans(elnino),snowfall,main=paste("El nino vs. snowfall\n", round(cor(rowMeans(elnino),snowfall),3))) #small but positive correlation! plot(as.vector(as.matrix(sunspots)),type="b") plot(rowMeans(sunspots),type="b") plot(rowMeans(sunspots),snowfall, main=paste("Sunspots vs. snowfall\n", round(cor(rowMeans(sunspots),snowfall,use="pairwise.complete"),3))) ##larger positive correlation! plot(as.vector(as.matrix(temperature[,-1]))) plot(as.vector(t(as.matrix(temperature[,-1]))),type="o",main="Mean temperature by month") plot(rowMeans(temperature[,-1]),main="Mean temperature by year",type="o") plot(rowMeans(temperature[,-1]),snowfall, main=paste("Temperature vs. snowfall\n", round(cor(rowMeans(temperature[,-1]),snowfall,use="pairwise.complete"),3))) ###let's combine all of this. ##we have 126 seasons that are complete as of Oct 2016. thisyear <- nrow(alldat) lastyear <- thisyear -1 alldat[thisyear,] ##let's try estimate snowfall by predicting based on data in the previous year ##only things we could know then. We will only have 125 data row then prevSnow <- rowSums(weather[1:lastyear,2:9]) snowfallbyyear <- rowSums(weather[2:thisyear,2:9]) elninobyyear <- rowMeans(elnino[1:lastyear,]) ##average for the whole previous year sunspotsbyyear <- rowMeans(sunspots[1:lastyear,]) ##average for whole previous year sunspotSF <- rowMeans(sunspots[1:lastyear,9:12]) ##true predicting--by previous summer ##get a summer and a fall elnino separately: elninoSF <- rowMeans(elnino[1:lastyear,9:12]) tempbyyear <- rowMeans(temperature[1:lastyear,]) tempSF <- rowMeans(temperature[1:lastyear,10:12]) n <- lastyear ##year 1 has NAs,so we will just remove year 1 newdat <- data.frame(Year=years[1:n], year=1:n, snow=snowfallbyyear, elnino=elninobyyear, elninoSF, sunspots=sunspotsbyyear, ssSF=sunspotSF, prev = prevSnow, logprev = log(prevSnow), temp = tempbyyear, tempSF=tempSF )[-1,] newdat[1:5,] ##It would be nice to fit the annual trend as a polynomial, to ##hopefully factor the mean level out. model1 <- lm(snow ~ year + poly(year,10),dat=newdat) anova(model1) summary(model1) ##R^2 is .52 with just year-polynomial matplot(cbind(snowfallbyyear,elninobyyear,sunspotsbyyear/12),type="l") points(model1$fit) qqnorm(model1$resid)##fairly reasonably normal hist(model1$resid) model2a <- lm(snow ~ poly(year,10),dat=newdat) ##same as model1 model2b <- lm(log(snow) ~ poly(year,10),dat=newdat) summary(model2a) ##.528 summary(model2b) ##improves R^2 to .573, but is it fair? cor(newdat$snow,exp(model2b$fit))^2 ##.529 ##not really an improvement hist(model2a$resid) ##look into residuals cor(model2a$resid,newdat[,-(1:2)]) cor(model2b$resid,newdat[,-(1:2)]) ##residuals are highly correlated with annual elnino, but not previous summer's ##maybe we can gain a little by by using the information ## have. We will continue to make two separate models. model3a <- lm(snow ~ year + poly(year,10)+ elnino +elninoSF+ sunspots + ssSF + temp + tempSF+ prev, dat=newdat) model3b <-lm(log(snow) ~ year + poly(year,10)+ elnino +elninoSF+ sunspots + ssSF + temp + tempSF+ logprev, dat=newdat) summary(model3a) summary(model3b) par(mfrow=c(1,2)) plot((newdat$snow),type="l") points(model2a$fit,type="o",col="red") points(model3a$fit,type="o",col="blue") plot(log(newdat$snow),type="l") points(model2b$fit,type="o",col="red") points(model3b$fit,type="o",col="blue") anova(model3b) summary(model3b)##r2 of .64 ##there looks like there are normal periodic devations. plot(model3a$resid,type="l") spectrum(model3a$resid) ##noise is pretty flat. ## OK, now this master prediction shown three reliable predictors. ## modelBICa <- step(model3a,direction="both",k=log(nrow(newdat))) ##just elnino sf modelAICa <- step(model3a,direction="both") ##ditto summary(modelAICa) summary(modelBICa) anova(model3a,modelAICa,modelBICa) modelBICb <- step(model3b,direction="both",k=log(nrow(newdat))) modelAICb <- step(model3b,direction="both") summary(modelAICb) #sssf; elninosf,logprev, etc. summary(modelBICb) ##small model as above anova(model3b,modelAICb,modelBICb) ##Lets predict 2020 N <- nrow(newdat) thisYear <- data.frame(Year="THIS YEAR", year=N-1, ##let's not extrapolate for now snow=0, ##not used elnino=rowMeans(elnino[N,]), elninoSF=rowMeans(elnino[N,9:12]), sunspots=rowMeans(sunspots[N,]), ssSF=rowMeans(sunspots[N,9:12]), temp=rowMeans(temperature[N,-1]), tempSF=rowMeans(temperature[N,10:12]), prev=rowSums(weather[N,2:9]), logprev=log(rowSums(weather[N,2:9]))) newdat2 <- rbind(newdat,thisYear) newdat2[-(1:120),] predict(model3a,thisYear) predict(modelBICa,thisYear) predict(modelAICa,thisYear) exp(predict(model3b,thisYear)) exp(predict(modelAICb,thisYear)) exp(predict(modelBICb,thisYear)) ##THe predictions of the models vary a lot! #pdf("winter-prediction.pdf",width=8,height=6) par(mfrow=c(1,1)) filled.line(years.num,t(bymonth),ylab="Inches of snow per Winter",xlab="Year",main="Houghton County Annual Snowfall") preda <- (predict(model3a,newdat2)) newyears <- 1891:2016 points(newyears,preda,lwd=3,type="o",pch=16,cex=.5) predb <- (predict(modelAICa,newdat2)) points(newyears,predb,lwd=3,type="l",pch=16,cex=.5,col="grey20") #dev.off()