plotme(x,y,20,4) plotme(x,y,15,5) par(mfrow=c(1,1)) manipulate(plotme(x,y,a,b),a=slider(-20,100,step=.1),b=slider(0,10,step=.1)) knitr::opts_chunk$set(echo = TRUE) library(BayesFactor) set.seed(1000) x <- runif(20) y <- runif(20) z <- x + .5*y z1 <- z z2 <- z+10 z3 <- log(z) z4 <- z*10 z5 <- z + 20*runif(20) + 5*runif(20) plot(x,z) cor(x,z) cor(x,z)^2 cor.test(x,z1) cor.test(x,z1,method="spearman") correlationBF(x,z1) plot(x,z2,col="navy",pch=16,cex=3,main="Additive transform") cor.test(x,z2) cor.test(x,z2,method="spearman") correlationBF(x,z2) plot(x,z3,col="darkred",cex=2.5,pch=16,main="log transform") cor.test(x,z3) cor.test(x,z3,method="spearman") correlationBF(x,z3) plot(x,z5,pch=16,cex=3,col="seashell") points(x,z5,cex=3) cor.test(x,z5) cor.test(x,z5,method="spearman") correlationBF(x,z5) set.seed(1000) indoor<- sample(c("A","B","C","D"),prob=c(5,8,25,6), size=200,replace=T) outdoor <-sample(c("A","B","C","D"),prob=c(5,20,3,6), size=200,replace=T) tab1 <- table(indoor,outdoor) tab2 <- table(c(indoor,outdoor),rep(c("I","O"),each=200)) tab1 tab2 library(BayesFactor) chisq.test(tab1) contingencyTableBF(tab1,sampleType = "indepMulti", fixedMargin = "cols") tab1 + diag(4) * c(25,30,15,20) chisq.test(tab1 + diag(4) * c(25,30,15,20)) tabx <- tab1 tabx[4,2] <- 90 tabx chisq.test(tabx) chisq.test(tab2) contingencyTableBF(tab2,sampleType = "indepMulti", fixedMargin = "cols") contingencyTableBF(tab2,sampleType = "indepMulti", fixedMargin = "rows") contingencyTableBF(tab2,sampleType = "indepMulti", fixedMargin = "rows") library(RColorBrewer) palette(brewer.pal(8,"Accent")) tabA <- table(cardat$gender,cardat$type) cardat <- read.table(text="age gender type origin origin.last carval carval.last 1 34 M Truck US Europe 16400 15900 2 31 M Truck Europe Europe 16900 15800 3 47 F Sedan Europe Europe 18800 16600 4 21 F SUV Japan Japan 16000 16200 5 42 F Truck US US 16800 16500 6 43 F Truck Japan Japan 17200 16200 7 60 M Truck Japan Japan 19900 20800 8 37 F Sedan Japan Japan 17100 16200 9 46 M SUV US US 16900 16700 10 27 F Sedan US Japan 16200 15600 11 50 M SUV US Europe 18800 17800 12 64 F Sedan Japan Japan 50700 18000 13 33 M SUV US US 16500 15700 14 39 M SUV US US 17000 17600 15 58 F Truck US US 19400 16600 16 53 M Sedan Japan US 19200 17700 17 29 F Sedan Japan US 16300 15600 18 37 M Truck Japan Europe 17300 16200 19 37 M Truck US US 18200 16600 20 54 M Sedan US Europe 24500 16400 21 46 M Truck Japan Europe 18000 17700 22 55 F Truck US US 28900 17900 23 46 F Sedan Japan Japan 16600 16200 24 57 F Truck Europe Europe 24300 16600 25 40 F Truck US US 16800 16700 26 27 F Sedan Japan US 16900 15600 27 58 M Sedan US US 20300 19000 28 64 M Sedan US US 40600 23600 29 47 F Sedan US US 18400 19800 30 32 F SUV US Europe 15900 15800 31 43 F Sedan Japan Europe 17200 16300 32 66 M SUV US US 19100 30200 33 36 F SUV Japan Europe 16900 15800 34 68 F Sedan Japan Europe 69300 18500 35 54 M Sedan US US 17000 17200 36 64 F SUV US US 34900 42900 37 27 F Truck US Japan 15800 15500 38 51 F Sedan Japan Japan 29000 17900 39 69 F Sedan Europe Japan 54400 18700 40 25 M Truck Japan Japan 15800 15400",header=T) library(RColorBrewer) palette(brewer.pal(8,"Accent")) tabA <- table(cardat$gender,cardat$type) tabA chisq.test(tabA) contingencyTableBF(tabA,sampleType = "indepMulti", fixedMargin = "rows") barplot(tabA,beside=T,col=1:2, ylab="Number of respondents") legend(3,12,c("Women","Men"),pch=15,col=1:2,bty="n",cex=2) ##Note that the hardcoded numbers below should be deleted and the ones above (or the code below) should be used post-2022. cardat <- read.table(text="age gender type origin origin.last carval carval.last 34 F SUV US US 16400 15800 31 M Truck US Europe 16900 16000 47 M Sedan US US 18800 17100 21 F Sedan Japan Japan 16000 15500 42 M SUV US Japan 16800 16100 43 F SUV US US 17200 16300 60 F Truck Europe Europe 19900 17800 37 M Truck Europe Europe 17100 16200 46 F SUV Japan Japan 16900 16300 27 M Sedan US US 16200 15700 50 M SUV US US 18800 17100 64 F SUV Japan US 50700 31700 33 M SUV Japan Japan 16500 15900 39 M Truck US Europe 17000 16200 58 F Sedan Japan US 19400 17500 53 F SUV US Europe 19200 17400 29 F Sedan US Japan 16300 15700 37 F Sedan US US 17300 16300 37 M SUV US Japan 18200 16700 54 F Sedan Japan Japan 24500 19800 46 F SUV Japan Europe 18000 16700 55 F SUV US Japan 28900 21700 46 F Truck US Europe 16600 16100 57 M SUV Europe Europe 24300 19700 40 M SUV US US 16800 16100 27 M Sedan Japan US 16900 16000 58 M SUV Europe Europe 20300 17900 64 M Truck US US 40600 27100 47 M Truck US Europe 18400 16900 32 M Truck US US 15900 15600 43 F Sedan Japan US 17200 16300 66 M Truck Europe Europe 19100 17500 36 F SUV US Japan 16900 16100 68 M Truck US US 69300 40100 54 F Sedan Japan US 17000 16400 64 M Truck Japan Europe 34900 24600 27 M SUV Japan Europe 15800 15500 51 F Sedan Japan Japan 29000 21700 69 M Sedan US Japan 54400 33400 25 F Sedan Japan Japan 15800 15500",header=T) if(0) { set.seed(100) age <- round(runif(40,min=18,max=70)) carval<- round(15000 + age * 25 + (exp (rnorm(40 ,mean= age/10)*.8))* 50,-2) carval.last <- round(15000 + (age-10) * 25 + (exp (rnorm(40 ,mean= (age-10)/10)*.8))* 50,-2) origin <- sample(c("US","Japan","Europe"),p=c(.5,.4,.1),40,replace=T) origin.last <- origin origin.last[sample(1:40,25)] <- sample(c("US","Japan","Europe"),25,replace=T) type <- sample(c("Sedan","SUV","Truck"),replace=T,40) gender <- c("F","M")[(as.numeric(as.factor(type)) + rnorm(40)*2 > 2)+ 1] cardat <- data.frame(age,gender,type,origin,origin.last,carval,carval.last) } library(RColorBrewer) palette(brewer.pal(8,"Accent")) tabA <- table(cardat$gender,cardat$type) tabA chisq.test(tabA) contingencyTableBF(tabA,sampleType = "indepMulti", fixedMargin = "rows") barplot(tabA,beside=T,col=1:2, ylab="Number of respondents") legend(3,12,c("Women","Men"),pch=15,col=1:2,bty="n",cex=2) tabA*4 chisq.test(tabA*4) contingencyTableBF(tabA*4,sampleType = "indepMulti", fixedMargin = "rows") par(mfrow=c(1,2)) hist(cardat$carval,breaks=20,col=1, main="Histogram of car value",xlab="Car value (US$)") qqnorm(cardat$carval,pch=16,col=2,cex=1.5) wilcox.test(carval~gender,data=cardat) ttestBF(cardat$carval[cardat$gender=="F"], cardat$carval[cardat$gender=="M"]) barplot(aggregate(cardat$carval,list(cardat$gender),mean)$x,main="Car purchase price",xlab="Gender",ylab="Purchase price",col=1:2,names=c("Women","Men"), las=1) tabc <- table(prior=cardat$origin.last,current=cardat$origin) tabc barplot(tabc,col=3:5,ylim=c(0,20),xlab="Current vehicle",ylab="Number of buyers") legend(0.1,20,levels(cardat$origin),pch=15,col=3:5,cex=1.5,bty="n",title="Previous vehicle") chisq.test(tabc) contingencyTableBF(tabc,sampleType = "indepMulti", fixedMargin = "rows") tabc chisq.test(tabc) contingencyTableBF(tabc,sampleType = "indepMulti", fixedMargin = "rows") par(mfrow=c(1,3)) hist(cardat$age,col=3,xlab="Age of buyer") hist(cardat$carval,col=4,xlab="Purchase price") plot(cardat$age,cardat$carval,pch=16,col=5,xlab="Age of buyer",ylab="Purchase price") cor.test(cardat$age,cardat$carval,method="spearman") correlationBF(rank(age),rank(carval),data=cardat) lm1 <- lm(carval~age,data=cardat) pred1 <- predict(lm1,data.frame(age=1:70)) plot(cardat$age,cardat$carval,pch=16,col=5,xlab="Age of buyer",ylab="Purchase price",ylim=c(0,80000)) points(c(32,52,62),pred1[c(32,52,62)],col=5,cex=2) abline(lm1$coef) pred1[c(32,52,62)] lm2 <- lm(log(carval)~age,data=cardat) pred2 <- predict(lm2,data.frame(age=1:70)) plot(cardat$age,cardat$carval,pch=16,col=5,ylim=c(0,80000),xlab="Age of buyer",ylab="Purchase price") points(1:70,exp(pred2),type="l") points(c(32,52,62),exp(pred2[c(32,52,62)]),col=5,cex=2) exp(pred2[c(32,52,62)]) mean(cardat$carval[abs(cardat$age-32)<5]) mean(cardat$carval[abs(cardat$age-52)<5]) mean(cardat$carval[abs(cardat$age-62)<5]) par(mfrow=c(1,2)) plot(cardat$carval.last,cardat$carval,xlab="Previous car value", ylab="Current car value",pch=16,col=3) plot(rank(cardat$carval.last),rank(cardat$carval),xlab="Previous car value", ylab="Current car value",pch=16,col=3) cor.test(cardat$carval.last,cardat$carval) cor.test(cardat$carval.last,cardat$carval,method="spearman") correlationBF(cardat$carval.last,cardat$carval) correlationBF(rank(cardat$carval.last),rank(cardat$carval)) hist(cardat$carval-cardat$carval.last,col=1) qqnorm(cardat$carval-cardat$carval.last,pch=16,col=2,cex=1.3) binom.test(sum((cardat$carval-cardat$carval.last)>0), nrow(cardat)) t.test(cardat$carval,cardat$carval.last,paired=T,direction="less") ttestBF(cardat$carval,cardat$carval.last,paired=T,nullInterval=c(-Inf,0)) library(vioplot) cars <- cardat$carval[cardat$type=="Truck"] suvs <- cardat$carval[cardat$type=="SUV"] vioplot(cars,suvs,h=5000,col=3,names=c("Truck","SUV")) title(ylab="Vehicle cost ($US)") wilcox.test(cars,suvs) ttestBF(cars,suvs) runif(1) runif(1) set.seed(100) runif(1) runif(1) set.seed(100) runif(1) runif(1) x <- runif(100)*10 y <- x*5 + 15 + rnorm(100)*10 plot(x,y,pch=16,col="cadetblue3") ## plotme <- function(x,y,a=0,b=1) { plot(x,y,col="gold",pch=16, main=paste("intercept:",a,"slope:",b)) points(x,y) abline(a,b) pred <- a + b*x points(x,pred,pch=1,cex=.5) rmse <- sqrt(mean((y-pred)^2)) text(4,-10,paste("RMSE=",round(rmse,3))) } ##You can play with plotme to find values that seem to work: plotme(x,y,35,0) plotme <- function(x,y,a=0,b=1) { plot(x,y,col="gold",pch=16, main=paste("intercept:",a,"slope:",b)) points(x,y) abline(a,b) pred <- a + b*x points(x,pred,pch=1,cex=.5) rmse <- sqrt(mean((y-pred)^2)) mtext(1,3,paste("RMSE=",round(rmse,3))) } ##You can play with plotme to find values that seem to work: plotme(x,y,35,0) plotme <- function(x,y,a=0,b=1) { plot(x,y,col="gold",pch=16, main=paste("intercept:",a,"slope:",b)) points(x,y) abline(a,b) pred <- a + b*x points(x,pred,pch=1,cex=.5) rmse <- sqrt(mean((y-pred)^2)) mtext(1,3,paste("RMSE=",round(rmse,3))) } ##You can play with plotme to find values that seem to work: plotme(x,y,35,0) plotme <- function(x,y,a=0,b=1) { plot(x,y,col="gold",pch=16, main=paste("intercept:",a,"slope:",b)) points(x,y) abline(a,b) pred <- a + b*x points(x,pred,pch=1,cex=.5) rmse <- sqrt(mean((y-pred)^2)) mtext(1,3,paste("RMSE=",round(rmse,3))) } ##You can play with plotme to find values that seem to work: plotme(x,y,35,0) plotme <- function(x,y,a=0,b=1) { plot(x,y,col="gold",pch=16, main=paste("intercept:",a,"slope:",b)) points(x,y) abline(a,b) pred <- a + b*x points(x,pred,pch=1,cex=.5) rmse <- sqrt(mean((y-pred)^2)) title(sub=paste("RMSE=",round(rmse,3))) } ##You can play with plotme to find values that seem to work: plotme(x,y,35,0) plotme(x,y,36,0) ##You can play with plotme to find values that seem to work: plotme(x,y,35,0) plotme(x,y,36,0) plotme(x,y,37,0) plotme(x,y,37,5) plotme(x,y,20,5) plotme(x,y,5,10) plotme(x,y,5,8) plotme(x,y,14,5.2) plotme(x,y,15,5) par(mfrow=c(1,1)) manipulate(plotme(x,y,a,b),a=slider(-20,100,step=.1),b=slider(0,10,step=.1)) n <- 100 beta1 <- (sum(x*y) - sum(x) * sum(y)/n) /(sum(x^2) - sum(x)^2/n) beta2 <- mean(y) - beta1 * mean(x) beta2 beta2 beta1 xs <- cbind(1,x) solve(t(xs) %*% xs) %*% t(xs) %*% y x <- 1:1000000 y <- x + runif(1000000) lm1 <- lm(y~x) lm1 <- lm(y~x) lm1 <- lm(y~x) plot(x,y,pch=16,cex=1.5,col="gold", main=paste("Best-fitting line\n", "y = ",round(model$coef[1] ,2) ," + ", round(model$coef[2],3) , " * x",sep="")) plot(x,y,pch=16,cex=1.5,col="gold", main=paste("Best-fitting line\n", "y = ",round(model$coef[1] ,2) ," + ", round(model$coef[2],3) , " * x",sep="")) y <- x^2 plot(y,x) x <- 1:100 y <- x^2 + runif(100) plot(x,y) lm(y~x) abline(-1717,101) aa <- lm(y~x) predict(aa,10) predict(aa,list(x=10)) predict(aa,list(x=1200)) y <- y + rnorm(100)*100 plot(x,y) y <- y + rnorm(100)*1000 plot(x,y) costs <- data.frame( product=c("apples", "bananas", "eggs", "steak", "potatoes","oranges"), cost1950=c(0.39,0.14,0.79,0.59,0.07,.23), cost1990=c(1.98, 0.48, 1.05, 2.99, 0.31,.74) ) plot(costs$cost1950,costs$cost1990,xlim=c(0,1),cex=.3,ylim=c(0,4),type="n", xlab="Cost in 1950",ylab="Cost in 1990") text(costs$cost1950,costs$cost1990,costs$product) abline(lm(cost1990~cost1950,data=costs)$coef, col="red") abline(0,lm(cost1990~cost1950+0,data=costs)$coef,col="black") lm(cost1990~cost1950,data=costs) lm(cost1990~cost1950+0,data=costs) setwd("~/Dropbox/courses/5210-2022/web-5210/psy5210/Projects/Chapter9") set.seed(20) predictor <- c(0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1) outcome <- predictor + rnorm(40) predictor outcome plot(predictor,outcome) aggregate(outcome,list(predictor),mean) outcome predictor t.test(outcome~predictor) t.test(outcome[predictor==0],outcome[predictor==1]) cor(outcome,predictor) lm(outcome~predictor) summary(lm(outcome~predictor)) 8.547^(-2) sqrt(8.547) outcome pred2 <- runif(40) summary(lm(outcome~predictor+pred2)) summary(lm(outcome~predictor)) summary(lm(outcome~predictor+pred2)) summary(lm(outcome~1)) lm0 <- lm(outcome~1) lm2 <- lm(outcome~predictor+pred2) lm1 <- lm(outcome~predictor) anova(lm1,lm0) anova(lm2,lm0) anova(lm2,lm1) summary(lm2) pf(5.8,2,30) pf(5.8,2,60) pf(5.8,2,1000) pf(2.8,2,1000) pf(2.8,2,1000000) pf(1,2,1000) pf(1000,2,1000) library(faraway) data(stat500) summary(stat500) lm1 <- lm(total~midterm+final+hw,data=stat500) lm2 <- lm(final~midterm,data=stat500) lm3 <- lm(final~midterm+hw,data=stat500) stat500 lm1 lm1$coefficients round( lm1$coefficients,8) stat500$newscore <- stat500$midterm*.5 + stat500$hw*.25+stat500$final*.25 stat500 lm5 <- lm(newscore~hw+midterm+final,data=stat500) lm5 round(lm5$coefficients,3) lm3 summary(lm(final~hw,data=stat500)) summary(lm2) summary(lm3) anova(lm3,lm2) summary(lm2) sqrt(.299) summary(lm3) summary(lm2) summary(lm3) lm3 lm3$coefficients lm3$fitted.values plot(stat500$midterm,stat500$final) points(stat500$midterm,lm3$fitted.values,col="red",pch=16) cor(stat500$midterm,lm3$fitted.values) cor(stat500$final,lm3$fitted.values) cor(stat500$final,lm3$fitted.values)^2 cor(lm1$fit,z)^2 q1 <- runif(50) q2 <- runif(50) q3 <- runif(50) q4 <- runif(50) q5 <- runif(50) q6 <- runif(50) q7 <- runif(50) q8 <- runif(50) q9 <- runif(50) q10 <- runif(50) summary(lm(z~q1))$r.squared set.seed(1000) sd <- 10 n <- 50 x <- 1:n/n*10 y <- runif(n)*10 y2 <- runif(n) * 3 z <- 10 +3*x + 5*y + rnorm(n)*sd lm1 <- lm(z~x+y+y2) lm1 res1 <- sqrt(sum(lm1$resid^2)/(n-4)) print(res1) summary(lm1) ## Anova is the same as a t-test? ####comparing x+y to just x: lm2 <- lm(z~x+y) res2 <-(sum(lm2$resid^2)) lm3 <- lm(z~x) res3 <-(sum(lm3$resid^2)) summary(lm3) tval <- sqrt((res3-res2)/(res2/(50-3)) ) tval ## comparing R^2 versus ##multiple R^2 cor(lm1$fit,z)^2 summary(lm1) q1 <- runif(50) q2 <- runif(50) q3 <- runif(50) q4 <- runif(50) q5 <- runif(50) q6 <- runif(50) q7 <- runif(50) q8 <- runif(50) q9 <- runif(50) q10 <- runif(50) summary(lm(z~q1))$r.squared summary(lm(z~q1))$adj.r.squared summary(lm(z~q1+q2))$r.squared summary(lm(z~q1+q2))$adj.r.squared fruitfly table(fruitfly$activity) library(faraway) data(fruitfly) summary(fruitfly) ff.lm0 <- lm(longevity~activity+0,data=fruitfly) ff.lm <- lm(longevity~thorax,data=fruitfly) summary(ff.lm) ff.lm2 <- lm(longevity~thorax+activity+0,data=fruitfly) summary(ff.lm2) ff.lm1 <- lm(longevity~thorax+activity,data=fruitfly) ff.lm1 summary(ff.lm1) ff.lm <- lm(longevity~thorax,data=fruitfly) ff.lm ff.lm1 anova(ff.lm,ff.lm1) anova(ff.lm1) ff.lm0 <- lm(longevity~activity+0,data=fruitfly) ff.lm2 <- lm(longevity~thorax+activity+0,data=fruitfly) summary(ff.lm2) ?fruitfly