#################################################################
## Code for Book: Applied Statistical Analysis in R
## A Graduate course for Psychology, Human Factors, and Data Science
## (c) 2022 Shane T. Mueller, Ph.D.
## Michigan Technological University
## shanem@mtu.edu
##
## Textbook and videos available at http://pages.mtu.edu/~shanem/psy5210
##
##
## This file contains example software code relevant for Chapter 9
## Introduction to Linear Regression
##
##
## Libraries used: quantreg, plotly, rgl, scatterplot3d, manipulate
##
##
#################################################################


## ESTIMATING LINEAR REGRESSION PARAMETERS

## Load the manipulate library, which is already part of RStudio:
## it will allow us to make little controllers for our data.

#install.packages(c("manipulate","faraway","rgl","scatterplot3d"))
library(manipulate)

set.seed(500)

## Here is a sample randomly generated data set:
x <- runif(100)*10
y <- x*5 + 15 + rnorm(100)*10
#y <- -2*x^2 + x*5 + 150 + rnorm(100)*10

plot(x,y,pch=16,col="cadetblue3")

## This function will plot x vs. y with a line having
## intercept a and slope b, and report the RMSE of that fit.
plotme <- function(x,y,a=0,b=1)
{
  plot(x,y,col="gold",pch=16,
       main=paste("intercept:",a,"slope:",b))
  points(x,y)
  abline(a,b)

  ## expected value of y for each x, under this line:
  pred <- a + b*x
  points(x,pred,pch=1,cex=.5)

  ## root-mean-square error of the line:
  rmse <- sqrt(mean((y-pred)^2))
  title(sub=paste("RMSE=",round(rmse,3)))
}

## You can play with plotme to find values that seem to work:
plotme(x,y,35,0)
plotme(x,y,36,0)
plotme(x,y,37,0)
plotme(x,y,20,5)
plotme(x,y,5,10)
plotme(x,y,5,8)
plotme(x,y,14,5.2)

## Here are four candidate lines, plotted together for comparison:
#pdf("c9-lm1.pdf",width=9,height=9)
par(mfrow=c(2,2))
plotme(x,y,50,0)
plotme(x,y,0,8)
plotme(x,y,20,4)
plotme(x,y,15,5)
#dev.off()

#####  Exercise from book:
#####  create a new data set and use plotme to fit it.
#####

## The manipulate function lets you create a slider widget that
## makes this easier.
par(mfrow=c(1,1))
manipulate(plotme(x,y,a,b),
           a=slider(-20,100,step=.1),
           b=slider(0,10,step=.1))

## Task: how can we make plotme better and more useful to us?
##
## ideas:
##  - computing expected values of the line
##  - coloring/shading of points based on under/over
##  - computing an R^2 or RMSE deviation statistic to help find the best fit
##  - counting how many points are over/under
##
##
## Task: Create a new set of x and y values and try to use plotme/manipulate
## to find the best-fitting line.


## Closed-form least-squares estimates: beta1 is the slope,
## beta2 the intercept.
n <- 100
beta1 <- (sum(x*y) - sum(x)*sum(y)/n) / (sum(x^2) - sum(x)^2/n)
beta2 <- mean(y) - beta1 * mean(x)
beta1
beta2

## some magic: the same estimates via the normal equations,
## solving (X'X)^{-1} X'y with an added intercept column:
xs <- cbind(1,x)
solve(t(xs) %*% xs) %*% t(xs) %*% y

plotme(x,y,14.05,5.14)

model <- lm(y~x)
model

## regenerate the same data:
set.seed(500)
x <- runif(100)*10
y <- x*5 + 15 + rnorm(100)*10

#pdf("c9-plotbestline.pdf",width=8,height=5)
par(mfrow=c(1,1))
model <- lm(y~x)
model
plot(x,y,pch=16,cex=1.5,col="gold",
     main=paste("Best-fitting line\n",
                "y = ",round(model$coef[1],2)," + ",
                round(model$coef[2],3)," * x",sep=""))
points(x,y,pch=1,cex=1.5,col="grey20")
abline(model$coef,lwd=2)
#dev.off()

## we can use the model for prediction:
predictedy <- model$coef[1] + model$coef[2]*x
points(x,predictedy)

plot(y,predictedy,cex=.5,col="grey20",pch=16)
abline(0,1)
cor(y,predictedy)^2
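
## A quick sanity check -- a sketch, assuming the model fitted above:
## the squared correlation just computed should equal the R^2 that
## summary() reports, and the root-mean-square of the residuals is the
## same RMSE statistic plotme() displayed for candidate lines.
summary(model)$r.squared        ## matches cor(y,predictedy)^2
sqrt(mean(model$residuals^2))   ## in-sample RMSE of the fitted line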
## two other ways to get the predicted y values for our x values:
cor(predictedy,model$fit)
cor(predictedy,predict(model,newdata=data.frame(x=x)))

## we can use predict to get predictions for x values we did not see:
predict(model,newdata=data.frame(x=c(-100,0,10,30,5000)))


#######################################
#### slope-only model estimation

costs <- data.frame(
  product=c("apples","bananas","eggs","steak","potatoes","oranges"),
  cost1950=c(0.39,0.14,0.79,0.59,0.07,.23),
  cost1990=c(1.98,0.48,1.05,2.99,0.31,.74))

plot(costs$cost1950,costs$cost1990,xlim=c(0,1),cex=.3,ylim=c(0,4),type="n",
     xlab="Cost in 1950",ylab="Cost in 1990")
text(costs$cost1950,costs$cost1990,costs$product)
abline(lm(cost1990~cost1950,data=costs)$coef,col="red")
abline(0,lm(cost1990~cost1950+0,data=costs)$coef,col="black")

lm(cost1990~cost1950,data=costs)
lm(cost1990~cost1950+0,data=costs)

####################################
## quantile-based regression.
library(quantreg)
rq(cost1990~cost1950,data=costs)

set.seed(3)
n <- 500
ses <- runif(n)*10

## one-predictor model
y <- -12 + 5*ses + (exp(rnorm(n)*1.4 - .25))*25

mod1.lm <- lm(y~ses)$coef
mod1.rq <- rq(y~ses)$coef

#pdf("c9-quantreg.pdf",width=9,height=5)
par(mfrow=c(1,2))
plot(ses,y,col="gold",pch=16,cex=.8,
     main="Socio-economic status of parents\n and children's income",
     xlab="Parent Socio-economic status",ylab="Child's income (x1000)")
abline(mod1.lm,lty=1)
abline(mod1.rq,lty=3)
abline(-12+20,5,lwd=2,lty=2)
legend(0,2500,c("linear","quantile","ideal"),lty=c(1,3,2),lwd=c(1,1,2))

plot(ses,y,col="gold",pch=16,cex=.8,
     main="Socio-economic status of parents\n and children's income",
     xlab="Parent Socio-economic status",ylab="Child's income (x1000)",
     ylim=c(0,100))
abline(mod1.lm,lty=1)
abline(mod1.rq,lty=3)
abline(-12+20,5,lwd=2,lty=2)
#dev.off()


####################################################
## Data with two predictors
##
## Suppose you have some DV that is related to two IVs?
##
set.seed(1)
x <- runif(50)*10
y <- runif(50)*10
z <- 5*x + 15*y - 12 + rnorm(50)*10

## Plot observed versus predicted values for a candidate plane
## b0 + b1*x + b2*y, coloring each point by whether it falls above
## or below its predicted value:
plotme2 <- function(x,y,z,b0,b1,b2)
{
  pred <- b0 + b1*x + b2*y
  cols <- c(rgb(.9,.2,.2,.7), rgb(.2,.8,.4,.7))[1+(z<pred)]
  plot(pred,z,col=cols,pch=16,
       xlab="Predicted",ylab="Observed",
       main=paste("b0:",b0," b1:",b1," b2:",b2))
  abline(0,1)
  rmse <- sqrt(mean((z-pred)^2))
  title(sub=paste("RMSE=",round(rmse,3)))
}

## As with plotme, use sliders to search for good coefficients:
manipulate(plotme2(x,y,z,b0,b1,b2),
           b0=slider(-50,50,step=.5),
           b1=slider(-20,20,step=.1),
           b2=slider(-20,20,step=.1))

## A 3D scatterplot of the raw data, via plotly:
library(plotly)
plot_ly(x=x,y=y,z=z) %>% add_markers()

## For more than one predictor, you can visualize observed versus
## predicted values, or the DV versus a single predictor, possibly
## overlaying the predicted values (see plotme above).


############################################
## Exercise I
## (A fake data set)
##
x <- 1:100
x2 <- runif(100)
y <- -.4*x - 150 - log(runif(100)*x*.3)*10
y2 <- -.4*x - 150 - log(runif(100)*x*.3)*10 + x2*3
##
## 0. Look at the relationship visually.
## 1. Compute the linear model for y versus x.
## 2. Determine your best estimate for y based only on x.
## 3. Plot actual versus predicted values.
## 4. Examine the residuals.
## 5. Compare the estimates to those from quantreg's rq regression.
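
## One possible skeleton for the exercise -- a sketch only, with
## illustrative object names (ex.model, ex.pred); it assumes quantreg
## was already loaded earlier in this file.
plot(x,y)                       ## 0. look at the relationship
ex.model <- lm(y~x)             ## 1. the linear model for y versus x
ex.pred <- predict(ex.model)    ## 2. best estimates of y based on x
plot(y,ex.pred)                 ## 3. actual versus predicted
abline(0,1)
hist(ex.model$residuals)        ## 4. examine the residuals
rq(y~x)$coef                    ## 5. compare these to the lm estimates:
ex.model$coef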