#################################################################
## Code for Book: Applied Statistical Analysis in R
## A Graduate course for Psychology, Human Factors, and Data Science
## (c) 2022 Shane T. Mueller, Ph.D.
## Michigan Technological University
## shanem@mtu.edu
##
## Textbook and videos available at http://pages.mtu.edu/~shanem/psy5210
##
##
## This file contains example software code relevant for Chapter 9
## Introduction to Linear Regression
##
##
## Libraries used: quantreg, plotly, rgl, scatterplot3d, manipulate
##
##
#################################################################


## ESTIMATING LINEAR REGRESSION PARAMETERS

## Load the manipulate library, which is already part of RStudio:
## it will allow us to make little controllers for our data.

#install.packages(c("manipulate","faraway","rgl","scatterplot3d"))
library(manipulate)

set.seed(500)

## Here is a sample randomly generated data set:
x <- runif(100)*10
y <- x*5 + 15 + rnorm(100)*10
#y <- -2*x^2 + x*5 + 150 + rnorm(100)*10

plot(x,y,pch=16,col="cadetblue3")

## This function will plot x vs. y with a line having
## intercept a and slope b, and report the RMSE of that fit.
plotme <- function(x,y,a=0,b=1)
{
  plot(x,y,col="gold",pch=16,
       main=paste("intercept:",a,"slope:",b))
  points(x,y)
  abline(a,b)

  ## expected value of y for each x, under this line:
  pred <- a + b*x
  points(x,pred,pch=1,cex=.5)

  ## root-mean-square error of the line:
  rmse <- sqrt(mean((y-pred)^2))
  title(sub=paste("RMSE=",round(rmse,3)))
}

## You can play with plotme to find values that seem to work:
plotme(x,y,35,0)
plotme(x,y,36,0)
plotme(x,y,37,0)
plotme(x,y,20,5)
plotme(x,y,5,10)
plotme(x,y,5,8)
plotme(x,y,14,5.2)

## Here are four candidate lines, plotted together for comparison:
#pdf("c9-lm1.pdf",width=9,height=9)
par(mfrow=c(2,2))
plotme(x,y,50,0)
plotme(x,y,0,8)
plotme(x,y,20,4)
plotme(x,y,15,5)
#dev.off()

#####  Exercise from book:
#####  create a new data set and use plotme to fit it.
#####

## The manipulate function lets you create a slider widget that
## makes this easier.
par(mfrow=c(1,1))
manipulate(plotme(x,y,a,b),
           a=slider(-20,100,step=.1),
           b=slider(0,10,step=.1))

## Task: how can we make plotme better and more useful to us?
##
## ideas:
##  - computing expected values of the line
##  - coloring/shading of points based on under/over
##  - computing an R^2 or RMSE deviation statistic to help find the best fit
##  - counting how many points are over/under
##
##
## Task: Create a new set of x and y values and try to use plotme/manipulate
## to find the best-fitting line.


## Closed-form least-squares estimates: beta1 is the slope,
## beta2 the intercept.
n <- 100
beta1 <- (sum(x*y) - sum(x)*sum(y)/n) / (sum(x^2) - sum(x)^2/n)
beta2 <- mean(y) - beta1 * mean(x)
beta1
beta2

## some magic: the same estimates via the normal equations,
## solving (X'X)^{-1} X'y with an added intercept column:
xs <- cbind(1,x)
solve(t(xs) %*% xs) %*% t(xs) %*% y

plotme(x,y,14.05,5.14)

model <- lm(y~x)
model

## regenerate the same data:
set.seed(500)
x <- runif(100)*10
y <- x*5 + 15 + rnorm(100)*10

#pdf("c9-plotbestline.pdf",width=8,height=5)
par(mfrow=c(1,1))
model <- lm(y~x)
model
plot(x,y,pch=16,cex=1.5,col="gold",
     main=paste("Best-fitting line\n",
                "y = ",round(model$coef[1],2)," + ",
                round(model$coef[2],3)," * x",sep=""))
points(x,y,pch=1,cex=1.5,col="grey20")
abline(model$coef,lwd=2)
#dev.off()

## we can use the model for prediction:
predictedy <- model$coef[1] + model$coef[2]*x
points(x,predictedy)

plot(y,predictedy,cex=.5,col="grey20",pch=16)
abline(0,1)
cor(y,predictedy)^2
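
## A quick sanity check -- a sketch, assuming the model fitted above:
## the squared correlation just computed should equal the R^2 that
## summary() reports, and the root-mean-square of the residuals is the
## same RMSE statistic plotme() displayed for candidate lines.
summary(model)$r.squared        ## matches cor(y,predictedy)^2
sqrt(mean(model$residuals^2))   ## in-sample RMSE of the fitted line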
## two other ways to get the predicted y values for our x values:
cor(predictedy,model$fit)
cor(predictedy,predict(model,newdata=data.frame(x=x)))

## we can use predict to get predictions for x values we did not see:
predict(model,newdata=data.frame(x=c(-100,0,10,30,5000)))


#######################################
#### slope-only model estimation

costs <- data.frame(
  product=c("apples","bananas","eggs","steak","potatoes","oranges"),
  cost1950=c(0.39,0.14,0.79,0.59,0.07,.23),
  cost1990=c(1.98,0.48,1.05,2.99,0.31,.74))

plot(costs$cost1950,costs$cost1990,xlim=c(0,1),cex=.3,ylim=c(0,4),type="n",
     xlab="Cost in 1950",ylab="Cost in 1990")
text(costs$cost1950,costs$cost1990,costs$product)
abline(lm(cost1990~cost1950,data=costs)$coef,col="red")
abline(0,lm(cost1990~cost1950+0,data=costs)$coef,col="black")

lm(cost1990~cost1950,data=costs)
lm(cost1990~cost1950+0,data=costs)

####################################
## quantile-based regression.
library(quantreg)
rq(cost1990~cost1950,data=costs)

set.seed(3)
n <- 500
ses <- runif(n)*10

## one-predictor model
y <- -12 + 5*ses + (exp(rnorm(n)*1.4 - .25))*25

mod1.lm <- lm(y~ses)$coef
mod1.rq <- rq(y~ses)$coef

#pdf("c9-quantreg.pdf",width=9,height=5)
par(mfrow=c(1,2))
plot(ses,y,col="gold",pch=16,cex=.8,
     main="Socio-economic status of parents\n and children's income",
     xlab="Parent Socio-economic status",ylab="Child's income (x1000)")
abline(mod1.lm,lty=1)
abline(mod1.rq,lty=3)
abline(-12+20,5,lwd=2,lty=2)
legend(0,2500,c("linear","quantile","ideal"),lty=c(1,3,2),lwd=c(1,1,2))

plot(ses,y,col="gold",pch=16,cex=.8,
     main="Socio-economic status of parents\n and children's income",
     xlab="Parent Socio-economic status",ylab="Child's income (x1000)",
     ylim=c(0,100))
abline(mod1.lm,lty=1)
abline(mod1.rq,lty=3)
abline(-12+20,5,lwd=2,lty=2)
#dev.off()


####################################################
## Data with two predictors
##
## Suppose you have some DV that is related to two IVs?
##
set.seed(1)
x <- runif(50)*10
y <- runif(50)*10
z <- 5*x + 15*y - 12 + rnorm(50)*10

## Plot observed versus predicted values for a candidate plane
## b0 + b1*x + b2*y, coloring each point by whether it falls above
## or below its predicted value:
plotme2 <- function(x,y,z,b0,b1,b2)
{
  pred <- b0 + b1*x + b2*y
  cols <- c(rgb(.9,.2,.2,.7), rgb(.2,.8,.4,.7))[1+(z<pred)]
  plot(pred,z,col=cols,pch=16,
       xlab="Predicted",ylab="Observed",
       main=paste("b0:",b0," b1:",b1," b2:",b2))
  abline(0,1)
  rmse <- sqrt(mean((z-pred)^2))
  title(sub=paste("RMSE=",round(rmse,3)))
}

## As with plotme, use sliders to search for good coefficients:
manipulate(plotme2(x,y,z,b0,b1,b2),
           b0=slider(-50,50,step=.5),
           b1=slider(-20,20,step=.1),
           b2=slider(-20,20,step=.1))

## A 3D scatterplot of the raw data, via plotly:
library(plotly)
plot_ly(x=x,y=y,z=z) %>% add_markers()

## For more than one predictor, you can visualize observed versus
## predicted values, or the DV versus a single predictor, possibly
## overlaying the predicted values (see plotme above).


############################################
## Exercise I
## (A fake data set)
##
x <- 1:100
x2 <- runif(100)
y <- -.4*x - 150 - log(runif(100)*x*.3)*10
y2 <- -.4*x - 150 - log(runif(100)*x*.3)*10 + x2*3
##
## 0. Look at the relationship visually.
## 1. Compute the linear model for y versus x.
## 2. Determine your best estimate for y based only on x.
## 3. Plot actual versus predicted values.
## 4. Examine the residuals.
## 5. Compare the estimates to those from quantreg's rq regression.
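
## One possible skeleton for the exercise -- a sketch only, with
## illustrative object names (ex.model, ex.pred); it assumes quantreg
## was already loaded earlier in this file.
plot(x,y)                       ## 0. look at the relationship
ex.model <- lm(y~x)             ## 1. the linear model for y versus x
ex.pred <- predict(ex.model)    ## 2. best estimates of y based on x
plot(y,ex.pred)                 ## 3. actual versus predicted
abline(0,1)
hist(ex.model$residuals)        ## 4. examine the residuals
rq(y~x)$coef                    ## 5. compare these to the lm estimates:
ex.model$coef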