---
title: "Simple Linear Regression"
author: "Lesson 10"
output:
  html_document:
    css: http://www.bradthiessen.com/batlab2.css
    highlight: pygments
    theme: spacelab
    toc: yes
    toc_depth: 5
    fig_width: 5.6
    fig_height: 4
---

## Load ggvis, ggplot2, mosaic, and broom packages
library(ggvis)
library(ggplot2)
library(mosaic)
library(broom)

*****

## Scenario: Predict success of first-year students at St. Ambrose

## Load ACT/GPA data.frame
actgpa <- read.csv("http://www.bradthiessen.com/html5/data/actgpa.csv")

## Scatterplot (hsGPA, GPA)
actgpa %>% 
  ggvis(x=~hsGPA, y=~fallGPA, 
        fill:= "steelblue", stroke:="steelblue", fillOpacity:=.2, strokeOpacity:=.4) %>%
  layer_points() %>%
    layer_model_predictions(model = "lm", strokeWidth:=5, stroke:="red", strokeOpacity:=.6) %>%
  add_axis("x", title = "High School GPA") %>%
  scale_numeric("x", domain=c(1.8, 4), nice=FALSE, clamp=TRUE)  %>%
  add_axis("y", title = "SAU 1st semester GPA") %>%
  scale_numeric("y", domain=c(0, 4), nice=FALSE, clamp=TRUE)

## Summary statistics
favstats(~fallGPA, data=actgpa)
favstats(~hsGPA, data=actgpa)
cor(fallGPA ~ hsGPA, data=actgpa)

## Coefficients of linear model
lm(fallGPA~hsGPA, data=actgpa)


## Scatterplot (ACT, GPA)
actgpa %>% 
  ggvis(x=~ACTcomp, y=~fallGPA, 
        fill:= "steelblue", stroke:="steelblue", fillOpacity:=.25, strokeOpacity:=.4) %>%
  layer_points() %>%
    layer_model_predictions(model = "lm", strokeWidth:=5, stroke:="red", strokeOpacity:=.6) %>%
  add_axis("x", title = "ACT Composite Score") %>%
  scale_numeric("x", domain=c(14, 36), nice=FALSE, clamp=TRUE)  %>%
  add_axis("y", title = "SAU 1st semester GPA") %>%
  scale_numeric("y", domain=c(0, 4), nice=FALSE, clamp=TRUE)

## Summary statistics
favstats(~ACTcomp, data=actgpa)
cor(fallGPA ~ ACTcomp, data=actgpa)

## Coefficients of linear model
lm(fallGPA~ACTcomp, data=actgpa)


*****

## Uncorrelated data

x <- as.numeric(c(1:20))
z <- sin(x-500)
y <- cos(x-500)
lineardata <- data.frame(x=z,y=y)

lineardata %>% 
  ggvis(x=~z, y=~y, fill := "steelblue", stroke:="steelblue", fillOpacity:=.9, strokeOpacity:=1) %>%
  layer_points(size:=100) %>%
  add_axis("x", title = "X", grid=F) %>%
  scale_numeric("x", domain=c(-1.2, 1.2), nice=FALSE, clamp=TRUE)  %>%
  add_axis("y", title = "Y", grid=F) %>%
  scale_numeric("y", domain=c(-1.2, 1.2), nice=FALSE, clamp=TRUE)

cor(z, y, data=lineardata)

*****

## Input beer dataset
brand <- c("Busch", "MGD", "Bud Light", "Coors Light", "Miller Lite", "Budweiser")
media <- c(8.7, 21.5, 68.7, 76.6, 100.1, 120.0)
ship <- c(8.1, 5.6, 20.7, 13.2, 15.9, 36.3)
beer <- data.frame(brand, media, ship)

## Scatterplot
beer %>% 
  ggvis(x=~media, y=~ship, fill := "steelblue", stroke:="steelblue", fillOpacity:=.9, strokeOpacity:=1) %>%
  layer_points(size:=100) %>%
  add_axis("x", title = "Media expenditures (millions of $)") %>%
  scale_numeric("x", domain=c(0, 125), nice=FALSE, clamp=TRUE)  %>%
  add_axis("y", title = "Shipments (millions of bottles)") %>%
  scale_numeric("y", domain=c(0, 40), nice=FALSE, clamp=TRUE)

## Scatterplot with least squares line
beer %>% 
  ggvis(x=~media, y=~ship, fill := "steelblue", stroke:="steelblue", fillOpacity:=.9, strokeOpacity:=1) %>%
  layer_points(size:=100) %>%
  layer_model_predictions(model = "lm", strokeWidth:=3, stroke:="red") %>%
  add_axis("x", title = "Media expenditures (millions of $)", grid=F) %>%
  scale_numeric("x", domain=c(0, 125), nice=FALSE, clamp=TRUE)  %>%
  add_axis("y", title = "Shipments (millions of bottles)", grid=F) %>%
  scale_numeric("y", domain=c(0, 40), nice=FALSE, clamp=TRUE)

## Scatterplot with lowess regression (curve)
beer %>% 
  ggvis(x=~media, y=~ship, fill := "steelblue", stroke:="steelblue", fillOpacity:=.9, strokeOpacity:=1) %>%
  layer_points(size:=100) %>%
  layer_smooths(strokeWidth:=5, stroke:="red") %>%
  add_axis("x", title = "Media expenditures (millions of $)") %>%
  scale_numeric("x", domain=c(0, 125), nice=FALSE, clamp=TRUE)  %>%
  add_axis("y", title = "Shipments (millions of bottles)") %>%
  scale_numeric("y", domain=c(0, 40), nice=FALSE, clamp=TRUE)

## Summary statistics for beer data
beer %>%
  summarize(avgShip = mean(ship), sdShip = sd(ship),
            acgMedia = mean(media), sdMedia = sd(media))
cor(media, ship, data=beer)

## Get coefficients for best-fitting line
model <- lm(ship~media, data=beer)
model
summary(model)
predict(model)
plot(model)
mplot(model)

## Simple scatterplot
beer %>% 
  ggvis(x=~media, y=~ship) %>%
  layer_points() %>%
  layer_model_predictions(model = "lm")

## Tidy model and other commands
tidymodel <- tidy(model)
tidymodel$p.value[2]
glance(model)$r.squared
anova(model)
augment(model)
coef(model)
confint(model)	
residuals(model)


*****

# Load ACT/GPA data.frame
actgpa <- read.csv("http://www.bradthiessen.com/html5/data/actgpa.csv")

## Scatterplot (hsGPA, GPA)
actgpa %>% 
  ggvis(x=~hsGPA, y=~fallGPA, 
        fill:= "steelblue", stroke:="steelblue", fillOpacity:=.2, strokeOpacity:=.4) %>%
  layer_points() %>%
    layer_model_predictions(model = "lm", strokeWidth:=5, stroke:="red", strokeOpacity:=.6) %>%
  add_axis("x", title = "High School GPA") %>%
  scale_numeric("x", domain=c(1.8, 4), nice=FALSE, clamp=TRUE)  %>%
  add_axis("y", title = "SAU 1st semester GPA") %>%
  scale_numeric("y", domain=c(0, 4), nice=FALSE, clamp=TRUE)

# Summary statistics
favstats(~fallGPA, data=actgpa)
favstats(~hsGPA, data=actgpa)
cor(fallGPA ~ hsGPA, data=actgpa)

## Coefficients of linear model
gpa.model <- lm(fallGPA~hsGPA, data=actgpa)
summary(gpa.model)

## A randomized model
gpa2 <- data.frame(fallGPA = actgpa$fallGPA, hsGPA = shuffle(actgpa$hsGPA))

gpa2 %>% 
  ggvis(x=~hsGPA, y=~fallGPA, 
        fill:= "steelblue", stroke:="steelblue", fillOpacity:=.2, strokeOpacity:=.4) %>%
  layer_points() %>%
    layer_model_predictions(model = "lm", strokeWidth:=5, stroke:="green", strokeOpacity:=.8) %>%
  add_axis("x", title = "High School GPA") %>%
  scale_numeric("x", domain=c(1.8, 4), nice=FALSE, clamp=TRUE)  %>%
  add_axis("y", title = "SAU 1st semester GPA") %>%
  scale_numeric("y", domain=c(0, 4), nice=FALSE, clamp=TRUE)

## Another randomized model
gpa3 <- data.frame(fallGPA = actgpa$fallGPA, hsGPA = shuffle(actgpa$hsGPA))

gpa3 %>% 
  ggvis(x=~hsGPA, y=~fallGPA, 
        fill:= "steelblue", stroke:="steelblue", fillOpacity:=.2, strokeOpacity:=.4) %>%
  layer_points() %>%
    layer_model_predictions(model = "lm", strokeWidth:=5, stroke:="green", strokeOpacity:=.8) %>%
  add_axis("x", title = "High School GPA") %>%
  scale_numeric("x", domain=c(1.8, 4), nice=FALSE, clamp=TRUE)  %>%
  add_axis("y", title = "SAU 1st semester GPA") %>%
  scale_numeric("y", domain=c(0, 4), nice=FALSE, clamp=TRUE)

## Coefficients for randomized models
lm(fallGPA~hsGPA, data=gpa2)
lm(fallGPA~hsGPA, data=gpa3)


## Randomization-based test
test.slope <- 0.9372

rand.slopes <- do(10000) * lm(fallGPA ~ shuffle(hsGPA), data=actgpa)

histogram(~hsGPA., data=rand.slopes,
          xlab="Possible slopes assuming null hypothesis is true", 
          groups=hsGPA. >= test.slope,   # Highlight values > test statistic
          main=paste("p-value = ", prop(~hsGPA. >= test.slope, data=rand.slopes)))
ladd(panel.abline(v=test.slope))   # Add vertical line at test statistic


## Bootstrap confidence interval
bstrap <- do(10000) * lm(fallGPA ~ hsGPA, data=resample(actgpa))
densityplot(~hsGPA, data=bstrap, plot.points = FALSE, col="steelblue", lwd=4)
cdata(0.95, hsGPA, data = bstrap)


## Likelihood ratio test
fullmodel <- lm(ship ~ media, data=beer)
reducedmodel <- lm(ship ~ 1, data=beer)
glance(fullmodel)
logLik(fullmodel)
logLik(reducedmodel)
2 * (logLik(fullmodel) - logLik(reducedmodel))
pchisq(6.96824, df=3, lower.tail=FALSE)
library(lmtest)
lrtest(reducedmodel, fullmodel)


## AIC
AIC(reducedmodel, fullmodel)
(6 * log(191.0244/6)) +2
(6 * (log(2 * pi) + 1 + log(191.0244/6)) + (2 * (2+1)))
((-2) * (-18.89556)) + (2*3)

## ANOVA to compare models
anova(reducedmodel, fullmodel) 
anova(fullmodel)
beer$pred <- predict(fullmodel)
beer$error <- resid(fullmodel)
beer$error2 <- beer$error^2
sum(beer$error2)

anova(reducedmodel)
(6 * log(610.19/6)) +2

*****

## Violin data
violin <- read.csv("http://www.bradthiessen.com/html5/data/violin.csv")

violin %>% 
  ggvis(x=~years, y=~neural, 
        fill:= "steelblue", stroke:="steelblue", fillOpacity:=.8, strokeOpacity:=.4) %>%
  layer_points() %>%
  scale_numeric("x", domain=c(0, 20), nice=FALSE, clamp=TRUE)  %>%
  scale_numeric("y", domain=c(0, 30), nice=FALSE, clamp=TRUE)

model <- lm(neural ~ years, data=violin)
summary(model)
anova(model)
mplot(model)


*****

######################
## Prestige example ##
######################


## Load data
prestige <- read.csv("http://www.bradthiessen.com/html5/data/prestige.csv")

## Set occupation type as a factor variable
prestige$type <- as.factor(prestige$occ_type)

## Eliminate the job titles
prestige2 <- prestige %>%
  dplyr::select(-title, -blue, -professional, -white)

## Create scatterplot matrix with histograms on diagonals
panel.hist <- function(x, ...)
{
    usr <- par("usr"); on.exit(par(usr))
    par(usr = c(usr[1:2], 0, 1.5) )
    h <- hist(x, plot = FALSE)
    breaks <- h$breaks; nB <- length(breaks)
    y <- h$counts; y <- y/max(y)
    rect(breaks[-nB], 0, breaks[-1], y, col = "cyan", ...)
}
pairs(prestige2,
      cex = 1, pch = 1, bg = "steelblue",
      diag.panel = panel.hist, cex.labels = 1, font.labels = 2)

## ANOVA, summary statistics, Bartlett's test, Tukey/Bonferroni
model <- lm(prestige ~ as.factor(occ_type), data=prestige)
anova(model)

prestige %>%
  group_by(occ_type) %>%
  summarize(n=n(), mean=mean(prestige), sd=sd(prestige))

bartlett.test(prestige ~ occ_type, data = prestige)

TukeyHSD(model, conf.level=.95)
plot(TukeyHSD(model))

with(prestige, pairwise.t.test(prestige, occ_type, p.adjust.method="bonferroni"))


##### Construct linear model: Prestige = f(income)

## Scatterplot with regression line
prestige %>% 
  ggvis(x=~income, y=~prestige, fill := "steelblue", stroke:="steelblue", fillOpacity:=.3, strokeOpacity:=.5) %>%
  layer_points() %>%
    layer_model_predictions(model = "lm", strokeWidth:=5, stroke:="red") %>%
  add_axis("x", title = "Income") %>%
  scale_numeric("x", domain=c(0, 28000), nice=FALSE, clamp=TRUE)  %>%
  add_axis("y", title = "Prestige") %>%
  scale_numeric("y", domain=c(0, 100), nice=FALSE, clamp=TRUE)

## Create model
model <- lm(prestige~income, data=prestige)
# Get coefficients
coef(model)
# Summarize model to get R-squared
summary(model)
# Get ANOVA summary table
anova(model)

# Get confidence intervals for coefficients
confint(model)

### Create new data (job with income = 7000)
newdata = data.frame(income=7000)

## Find the predicted prestige for that new job
predict(model, newdata) 

## Find confidence interval
predict(model, newdata, interval="confidence") 

## Find prediction interval
predict(model, newdata, interval="predict") 

## Assumptions
plot(model)

## Homoscedasticity test
ncvTest(model)

# Normality visualization: component + residual plot 
crPlots(model)


# Square transformation
lm(prestige^2 ~ income, data=prestige)

## Robust regression
library(MASS)
rlm(prestige ~ income, data=prestige)

## Quantile regression
library(quantreg)
rq(prestige ~ income, tau=.5, data=prestige)
summary(rq(prestige ~ income, tau=.5, data=prestige))

## Lowess
lowess(prestige$prestige ~ prestige$income)

prestige %>% 
  ggvis(x=~income, y=~prestige, fill := "steelblue", stroke:="steelblue", fillOpacity:=.4, strokeOpacity:=.5) %>%
  layer_points() %>%
  layer_smooths(stroke:="red", strokeWidth:=5) %>%
  add_axis("x", title = "Income") %>%
  scale_numeric("x", domain=c(0, 28000), nice=FALSE, clamp=TRUE)  %>%
  add_axis("y", title = "Prestige") %>%
  scale_numeric("y", domain=c(0, 100), nice=FALSE, clamp=TRUE)

## Nonlinear -- quadratic model
quad = lm(prestige ~ income + I(income^2), data=prestige)
coef(quad)
summary(quad)
plot(prestige~income, data=prestige)
curve(14.18319+0.0061535*x-.0000001433097*x^2,add=T)
plot(quad)


## Log model
logmod = lm(prestige ~ log(income), data=prestige)
summary(logmod)
plot(prestige~income, data=prestige)
curve(-139.856 + 21.556*log(x),add=T)
plot(logmod)