#################################################################
## Code for Book: Applied Statistical Analysis in R
## A Graduate course for Psychology, Human factors, and Data Science
## (c) 2018 Shane T. Mueller, Ph.D.
## Michigan Technological University
## shanem@mtu.edu
##
## Textbook and videos available at http://pages.mtu.edu/~shanem/psy5210
##
##
## This file contains example software code relevant for Chapter 12
## Orthogonality, Correlation, identifiability, multicolinearity.
##
##
## Libraries used: mctest
##
##

## Define orthogonal as the inner (dot) product == 0.
a <- c(1, 0)
b <- c(0, 1)
a %*% b  ## 0: a and b are orthogonal

plot(0, 0, type = "n")
segments(0, 0, 1, 0)
segments(0, 0, 0, 1)

## A second orthogonal pair.  Renamed from c/d to cvec/dvec so that
## base::c() is not masked by a local variable.
cvec <- c(-1, 1)
dvec <- c(-1, -1)
cvec %*% dvec  ## 0: also orthogonal

plot(0, 0, type = "n")
segments(0, 0, -1, 1)
segments(0, 0, -1, -1)

## Figure: all four vectors drawn on one set of axes.
#pdf("c12-orthogonnal1.pdf",width=6,height=6)
plot(0, type = "n", bty = "n", xlab = "x", ylab = "y",
     xlim = c(-1.5, 1.5), ylim = c(-1.50, 1.5))
segments(-1.5, -3:3 / 2, 1.5, -3:3 / 2, lty = 3)  ## horizontal grid lines
segments(-3:3 / 2, 1.5, -3:3 / 2, -1.5, lty = 3)  ## vertical grid lines
abline(0, 0, lty = 1, lwd = 3, col = "grey20")    ## x axis
abline(v = 0, lty = 1, lwd = 3, col = "grey20")   ## y axis
## White discs behind the labels so they stay readable over the grid.
points(c(.5, .1, -.5, -.5), c(.1, .5, .6, -.6), pch = 16, cex = 5, col = "white")
text(.5, .1, "a")
text(.1, .5, "b")
text(-.5, .6, "c")
text(-.5, -.6, "d")
arrows(0, 0, 1, 0, lwd = 3, col = "red")
arrows(0, 0, 0, 1, lwd = 3, col = "red")
arrows(0, 0, -1, 1, lwd = 3, col = "darkgreen")
arrows(0, 0, -1, -1, lwd = 3, col = "darkgreen")
#dev.off()

## Two 6-element predictors whose dot product is 0.
x <- c(1, 3, 3, 4, -4, 3)
y <- c(3, 1, 3, -5, 1, 3)
pdf("c12-orthogonal2.pdf", width = 6, height = 6)
plot(x, y, col = "gold", pch = 16, cex = 1.2,
     main = "Orthogonal predictors x and y")
grid()
points(x, y, cex = 1.2)
arrows(0, 0, x, y, length = .1)
dev.off()
x %*% y  ## 0: orthogonal

## Orthogonal does NOT imply uncorrelated: these two vectors have a
## dot product of 0 but a non-zero correlation.
orthx1 <- c(1, -5, 3, -1)
orthx2 <- c(5, 1, 1, 3)
cor(orthx1, orthx2)  ## non-zero
orthx1 %*% orthx2    ## 0

## ...and uncorrelated does NOT imply orthogonal:
uncorx1 <- c(0, 0, 1, 1)
uncorx2 <- c(1, 0, 1, 0)
cor(uncorx1, uncorx2)    ## 0
(uncorx1) %*% (uncorx2)  ## non-zero

## Centering uncorrelated predictors makes them orthogonal as well:
uncorx1b <- uncorx1 - mean(uncorx1)
uncorx2b <- uncorx2 - mean(uncorx2)
cor(uncorx1b, uncorx2b)
uncorx1b %*% uncorx2b  ## now 0

## With orthogonal predictors and no intercept, each coefficient is the
## same whether the other predictor is in the model or not:
orthy <- c(.2, .4, .8, 1)
lm(orthy ~ orthx1 + orthx2 + 0)$coef
lm(orthy ~ orthx1 + 0)$coef
lm(orthy ~ orthx2 + 0)$coef
lm(orthy ~ uncorx1 + uncorx2 + 0)
lm(orthy ~ uncorx1 + 0)
lm(orthy ~ uncorx2 + 0)

## Crossed factorial predictors: correlated raw, orthogonal once centered.
set.seed(100)
x1.raw <- rep(1:10, each = 10)
x2.raw <- rep(1:10, 10)
x1.raw %*% x2.raw   ## non-zero: raw codes are not orthogonal
cor(x1.raw, x2.raw) ## ...but they are uncorrelated (balanced design)
x1 <- x1.raw - mean(x1.raw)
x2 <- x2.raw - mean(x2.raw)
x1 %*% x2  ## 0 after centering
cor(x1, x2)

y <- x1 + x2 + rnorm(100)

## Orthogonal predictors: coefficients unchanged across models.
lm(y ~ x1)$coef
lm(y ~ x2)$coef
lm(y ~ x1 + x2)$coef

## Uncorrelated (raw) predictors: slopes match, intercept differs.
lm(y ~ x1.raw)$coef
lm(y ~ x2.raw)$coef
lm(y ~ x1.raw + x2.raw)$coef

## Non-orthogonal predictor (x1 plus noise):
x1b <- x1 + rnorm(100) * .5
lm(y ~ x1b + x2)$coef
cor(x1b, x2)
x1b %*% x2

## Two related variables (both built from the same x1):
set.seed(999)
x1 <- runif(50) * 10
x2 <- x1 + rnorm(50) * 1
x3 <- x1 + rnorm(50) * 1
cor(x2, x3)
x2 %*% x3
(x2 - mean(x2)) %*% (x3 - mean(x3))

## Deterministic outcome (no noise): coefficients still recoverable here.
y <- 100 + .8 * x2 + 1.5 * x3
model1 <- lm(y ~ x2 + x3)
model1
lm(y ~ x2)$coef
lm(y ~ x3)$coef
lm(y ~ x2 + x3)$coef

## Centered versions of the same predictors:
x2b <- x2 - mean(x2)
x3b <- x3 - mean(x3)
lm(y ~ x2b)$coef
lm(y ~ x3b)$coef
lm(y ~ x2b + x3b)$coef

###
### Non-orthogonal predictors: x12 is a linear combination of x1 and x2,
### so one coefficient is aliased (NA); which one depends on entry order.
x12 <- x1 * 3 + x2
lm(y ~ x1 + x2 + x12)
lm(y ~ x12 + x1 + x2)

## Simulation 1: fixed collinear x2/x3, new noise in y each time.
## The estimated coefficients trade off against one another.
x1 <- runif(50) * 10
x2 <- x1 + rnorm(50) * 1
x3 <- x1 + rnorm(50) * 1
n <- 1000
simcoef <- matrix(0, nrow = n, ncol = 3)
for (i in seq_len(n)) {
  y <- 100 + .8 * x2 + 1.5 * x3 + rnorm(50) * 1
  model <- lm(y ~ x2 + x3)
  simcoef[i, ] <- model$coef
}
colMeans(simcoef)  ## unbiased on average...
pairs(simcoef)
cor(simcoef[, 2], simcoef[, 3])  ## ...but strongly negatively correlated
plot(simcoef[, 2], simcoef[, 3])

## Simulation 2: independent (uncorrelated) predictors each round.
set.seed(500)
n <- 1000
simcoef2 <- matrix(0, nrow = n, ncol = 3)
for (i in seq_len(n)) {
  x2 <- runif(50)
  x3 <- runif(50)
  y <- 100 + .8 * x2 + 1.5 * x3 + rnorm(50) * 1
  model <- lm(y ~ x2 + x3)
  simcoef2[i, ] <- model$coef
}
colMeans(simcoef2)
cor(simcoef2[, 2], simcoef2[, 3])
pairs(simcoef2)
cor(simcoef2)

## Simulation 3: as above but with centered predictors each round.
set.seed(500)
n <- 1000
simcoef3 <- matrix(0, nrow = n, ncol = 3)
for (i in seq_len(n)) {
  x2 <- runif(50); x2 <- x2 - mean(x2)
  x3 <- runif(50); x3 <- x3 - mean(x3)
  y <- 100 + .8 * x2 + 1.5 * x3 + rnorm(50) * 1
  model <- lm(y ~ x2 + x3)
  simcoef3[i, ] <- model$coef
}
cor(simcoef3)

###############################
## Non-identifiability
set.seed(100)
x1 <- rep(1:10, each = 10)
x2 <- rep(1:10, 10)
x1a <- x1  ## exact duplicate of x1
y <- x1 + x2 + rnorm(100)
lm12 <- lm(y ~ x1 + x2)
lm12b <- lm(y ~ x1 + x1a + x2)  ## duplicated predictor -> NA coefficient
lm12
lm12b

x12 <- x1 * 3 + x2  ## exact linear combination -> also aliased
lm(y ~ x1 + x2 + x12)

## x2 is exactly 2*x1, so the 100*x1 + 33*x2 generating weights cannot
## be separated; lm reports an aliased (NA) coefficient.
x1 <- 1:10
x2 <- x1 * 2
x3 <- runif(10)
y <- 50 + 100 * x1 + 33 * x2 + 17 * x3
model <- lm(y ~ x1 + x2 + x3)
model

## Ten predictors, ten observations: a saturated (perfectly fitting,
## unidentified) model.
y <- runif(10)
x <- matrix(runif(100), 10, 10)
model <- lm(y ~ x)
summary(model)

## Fit y on the first n columns of the global matrix x and plot the
## fitted values against y (uses globals x and y defined above).
fitx <- function(n = 1) {
  model <- lm(y ~ x[, 1:n])
  plot(y, model$fit, ylim = c(0, 1), xlim = c(0, 1))
  abline(0, 1)
}
## Interactive demo (RStudio only).  Slider renamed from x to k so it
## does not shadow the data matrix x used inside fitx().
library(manipulate)
manipulate(fitx(k), k = slider(1, 10))

## Detecting multi-co-linearity.
##############################
library(mctest)
set.seed(999)
x1 <- runif(100) * 10
x2 <- x1 + rnorm(100) * .2
x3 <- x1 + rnorm(100) * .2
z <- rnorm(100)
y <- x1 + rnorm(100) + z
pairs(cbind(x1, x2, x3, z))
summary(lm(y ~ x1 + x2 + x3 + z))
summary(lm(y ~ x2))
summary(lm(y ~ x3))
mctest(lm(y ~ x1 + x2 + x3 + z))              ## overall diagnostics
mctest(lm(y ~ x1 + x2 + x3 + z), type = "i")  ## individual diagnostics
plot(x2, x3)
mctest(lm(y ~ x2 + x3 + z), type = "i")
mctest(lm(z ~ x2 + x3 + y), type = "i", method = "VIF")

## What if the correlation were stronger?
## NOTE(review): the noise SD here is the same *.2 as above, so the
## correlation is not actually any stronger -- confirm intended values.
x2 <- x1 + rnorm(100) * .2
x3 <- x1 + rnorm(100) * .2
mctest(cbind(x2, x3, z), y)  ## older mctest (x, y) calling convention
mctest(cbind(x2, x3, z), y, type = "i", method = "VIF")

######## Exercise 1
x <- c(0, 1, 0, 1, 0, 1, 0, 1)
y <- c(1, 1, 2, 2, 3, 3, 4, 4)
x %*% y  ## 10: not orthogonal
## Appending the point (-5, 2) makes the vectors orthogonal.
x2 <- c(x, -5)
y2 <- c(y, 2)
x2 %*% y2  ## 0 (fixed: original recomputed x %*% y here)
cor(x, y)
## Centering the original vectors also zeroes the dot product:
x3 <- x - mean(x)
y3 <- y - mean(y)
x3 %*% y3
cor(x3, y3)

## Balanced 2x2 design; renamed from c to cval so base::c() is not
## masked (identical runif draw after set.seed(100)).
set.seed(100)
a <- c(1, 1, 0, 0)
b <- c(0, 1, 0, 1)
cval <- runif(4)
mat <- tapply(cval, list(a, b), mean)
matplot(mat, type = "o")
lm(cval ~ a)
summary(lm(cval ~ a + b))
aa <- a - mean(a)
bb <- b - mean(b)
lm(cval ~ aa)
lm(cval ~ bb)
summary(lm(cval ~ aa + bb))