--- title: "Simple Linear Regression" author: "Lesson 10" output: html_document: css: http://www.bradthiessen.com/batlab2.css highlight: pygments theme: spacelab toc: yes toc_depth: 5 fig_width: 5.6 fig_height: 4 --- ## Load ggvis, ggplot2, mosaic, and broom packages library(ggvis) library(ggplot2) library(mosaic) library(broom) ***** ## Scenario: Predict success of first-year students at St. Ambrose ## Load ACT/GPA data.frame actgpa <- read.csv("http://www.bradthiessen.com/html5/data/actgpa.csv") ## Scatterplot (hsGPA, GPA) actgpa %>% ggvis(x=~hsGPA, y=~fallGPA, fill:= "steelblue", stroke:="steelblue", fillOpacity:=.2, strokeOpacity:=.4) %>% layer_points() %>% layer_model_predictions(model = "lm", strokeWidth:=5, stroke:="red", strokeOpacity:=.6) %>% add_axis("x", title = "High School GPA") %>% scale_numeric("x", domain=c(1.8, 4), nice=FALSE, clamp=TRUE) %>% add_axis("y", title = "SAU 1st semester GPA") %>% scale_numeric("y", domain=c(0, 4), nice=FALSE, clamp=TRUE) ## Summary statistics favstats(~fallGPA, data=actgpa) favstats(~hsGPA, data=actgpa) cor(fallGPA ~ hsGPA, data=actgpa) ## Coefficients of linear model lm(fallGPA~hsGPA, data=actgpa) ## Scatterplot (ACT, GPA) actgpa %>% ggvis(x=~ACTcomp, y=~fallGPA, fill:= "steelblue", stroke:="steelblue", fillOpacity:=.25, strokeOpacity:=.4) %>% layer_points() %>% layer_model_predictions(model = "lm", strokeWidth:=5, stroke:="red", strokeOpacity:=.6) %>% add_axis("x", title = "ACT Composite Score") %>% scale_numeric("x", domain=c(14, 36), nice=FALSE, clamp=TRUE) %>% add_axis("y", title = "SAU 1st semester GPA") %>% scale_numeric("y", domain=c(0, 4), nice=FALSE, clamp=TRUE) ## Summary statistics favstats(~ACTcomp, data=actgpa) cor(fallGPA ~ ACTcomp, data=actgpa) ## Coefficients of linear model lm(fallGPA~ACTcomp, data=actgpa) ***** ## Uncorrelated data x <- as.numeric(c(1:20)) z <- sin(x-500) y <- cos(x-500) lineardata <- data.frame(x=z,y=y) lineardata %>% ggvis(x=~z, y=~y, fill := "steelblue", stroke:="steelblue", fillOpacity:=.9, strokeOpacity:=1) %>% layer_points(size:=100) %>% add_axis("x", title = "X", grid=F) %>% scale_numeric("x", domain=c(-1.2, 1.2), nice=FALSE, clamp=TRUE) %>% add_axis("y", title = "Y", grid=F) %>% scale_numeric("y", domain=c(-1.2, 1.2), nice=FALSE, clamp=TRUE) cor(z, y, data=lineardata) ***** ## Input beer dataset brand <- c("Busch", "MGD", "Bud Light", "Coors Light", "Miller Lite", "Budweiser") media <- c(8.7, 21.5, 68.7, 76.6, 100.1, 120.0) ship <- c(8.1, 5.6, 20.7, 13.2, 15.9, 36.3) beer <- data.frame(brand, media, ship) ## Scatterplot beer %>% ggvis(x=~media, y=~ship, fill := "steelblue", stroke:="steelblue", fillOpacity:=.9, strokeOpacity:=1) %>% layer_points(size:=100) %>% add_axis("x", title = "Media expenditures (millions of $)") %>% scale_numeric("x", domain=c(0, 125), nice=FALSE, clamp=TRUE) %>% add_axis("y", title = "Shipments (millions of bottles)") %>% scale_numeric("y", domain=c(0, 40), nice=FALSE, clamp=TRUE) ## Scatterplot with least squares line beer %>% ggvis(x=~media, y=~ship, fill := "steelblue", stroke:="steelblue", fillOpacity:=.9, strokeOpacity:=1) %>% layer_points(size:=100) %>% layer_model_predictions(model = "lm", strokeWidth:=3, stroke:="red") %>% add_axis("x", title = "Media expenditures (millions of $)", grid=F) %>% scale_numeric("x", domain=c(0, 125), nice=FALSE, clamp=TRUE) %>% add_axis("y", title = "Shipments (millions of bottles)", grid=F) %>% scale_numeric("y", domain=c(0, 40), nice=FALSE, clamp=TRUE) ## Scatterplot with lowess regression (curve) beer %>% ggvis(x=~media, y=~ship, fill 
:= "steelblue", stroke:="steelblue", fillOpacity:=.9, strokeOpacity:=1) %>% layer_points(size:=100) %>% layer_smooths(strokeWidth:=5, stroke:="red") %>% add_axis("x", title = "Media expenditures (millions of $)") %>% scale_numeric("x", domain=c(0, 125), nice=FALSE, clamp=TRUE) %>% add_axis("y", title = "Shipments (millions of bottles)") %>% scale_numeric("y", domain=c(0, 40), nice=FALSE, clamp=TRUE) ## Summary statistics for beer data beer %>% summarize(avgShip = mean(ship), sdShip = sd(ship), acgMedia = mean(media), sdMedia = sd(media)) cor(media, ship, data=beer) ## Get coefficients for best-fitting line model <- lm(ship~media, data=beer) model summary(model) predict(model) plot(model) mplot(model) ## Simple scatterplot beer %>% ggvis(x=~media, y=~ship) %>% layer_points() %>% layer_model_predictions(model = "lm") ## Tidy model and other commands tidymodel <- tidy(model) tidymodel$p.value[2] glance(model)$r.squared anova(model) augment(model) coef(model) confint(model) residuals(model) ***** # Load ACT/GPA data.frame actgpa <- read.csv("http://www.bradthiessen.com/html5/data/actgpa.csv") ## Scatterplot (hsGPA, GPA) actgpa %>% ggvis(x=~hsGPA, y=~fallGPA, fill:= "steelblue", stroke:="steelblue", fillOpacity:=.2, strokeOpacity:=.4) %>% layer_points() %>% layer_model_predictions(model = "lm", strokeWidth:=5, stroke:="red", strokeOpacity:=.6) %>% add_axis("x", title = "High School GPA") %>% scale_numeric("x", domain=c(1.8, 4), nice=FALSE, clamp=TRUE) %>% add_axis("y", title = "SAU 1st semester GPA") %>% scale_numeric("y", domain=c(0, 4), nice=FALSE, clamp=TRUE) # Summary statistics favstats(~fallGPA, data=actgpa) favstats(~hsGPA, data=actgpa) cor(fallGPA ~ hsGPA, data=actgpa) ## Coefficients of linear model gpa.model <- lm(fallGPA~hsGPA, data=actgpa) summary(gpa.model) ## A randomized model gpa2 <- data.frame(fallGPA = actgpa$fallGPA, hsGPA = shuffle(actgpa$hsGPA)) gpa2 %>% ggvis(x=~hsGPA, y=~fallGPA, fill:= "steelblue", stroke:="steelblue", fillOpacity:=.2, strokeOpacity:=.4) %>% layer_points() %>% layer_model_predictions(model = "lm", strokeWidth:=5, stroke:="green", strokeOpacity:=.8) %>% add_axis("x", title = "High School GPA") %>% scale_numeric("x", domain=c(1.8, 4), nice=FALSE, clamp=TRUE) %>% add_axis("y", title = "SAU 1st semester GPA") %>% scale_numeric("y", domain=c(0, 4), nice=FALSE, clamp=TRUE) ## Another randomized model gpa3 <- data.frame(fallGPA = actgpa$fallGPA, hsGPA = shuffle(actgpa$hsGPA)) gpa3 %>% ggvis(x=~hsGPA, y=~fallGPA, fill:= "steelblue", stroke:="steelblue", fillOpacity:=.2, strokeOpacity:=.4) %>% layer_points() %>% layer_model_predictions(model = "lm", strokeWidth:=5, stroke:="green", strokeOpacity:=.8) %>% add_axis("x", title = "High School GPA") %>% scale_numeric("x", domain=c(1.8, 4), nice=FALSE, clamp=TRUE) %>% add_axis("y", title = "SAU 1st semester GPA") %>% scale_numeric("y", domain=c(0, 4), nice=FALSE, clamp=TRUE) ## Coefficients for randomized models lm(fallGPA~hsGPA, data=gpa2) lm(fallGPA~hsGPA, data=gpa3) ## Randomization-based test test.slope <- 0.9372 rand.slopes <- do(10000) * lm(fallGPA ~ shuffle(hsGPA), data=actgpa) histogram(~hsGPA., data=rand.slopes, xlab="Possible slopes assuming null hypothesis is true", groups=hsGPA. >= test.slope, # Highlight values > test statistic main=paste("p-value = ", prop(~hsGPA. 
## Randomization-based test

```{r}
test.slope <- 0.9372
rand.slopes <- do(10000) * lm(fallGPA ~ shuffle(hsGPA), data = actgpa)
histogram(~hsGPA., data = rand.slopes,
          xlab = "Possible slopes assuming null hypothesis is true",
          groups = hsGPA. >= test.slope,  # Highlight values > test statistic
          main = paste("p-value = ", prop(~hsGPA. >= test.slope, data = rand.slopes)))
ladd(panel.abline(v = test.slope))  # Add vertical line at test statistic
```

## Bootstrap confidence interval

```{r}
bstrap <- do(10000) * lm(fallGPA ~ hsGPA, data = resample(actgpa))
densityplot(~hsGPA, data = bstrap, plot.points = FALSE, col = "steelblue", lwd = 4)
cdata(0.95, hsGPA, data = bstrap)
```

## Likelihood ratio test

```{r}
fullmodel <- lm(ship ~ media, data = beer)
reducedmodel <- lm(ship ~ 1, data = beer)
glance(fullmodel)
logLik(fullmodel)
logLik(reducedmodel)
2 * (logLik(fullmodel) - logLik(reducedmodel))
# df = difference in number of estimated parameters (3 - 2 = 1)
pchisq(6.96824, df = 1, lower.tail = FALSE)
library(lmtest)
lrtest(reducedmodel, fullmodel)
```

## AIC

```{r}
AIC(reducedmodel, fullmodel)
# AIC for the full model up to additive constants: n * log(SSE/n) + 2
(6 * log(191.0244 / 6)) + 2
# Full version: n * (log(2*pi) + 1 + log(SSE/n)) + 2k, with k = 3 (intercept, slope, sigma)
(6 * (log(2 * pi) + 1 + log(191.0244 / 6))) + (2 * (2 + 1))
# Equivalently, -2 * logLik + 2 * df
((-2) * (-18.89556)) + (2 * 3)
```

## ANOVA to compare models

```{r}
anova(reducedmodel, fullmodel)
anova(fullmodel)
beer$pred <- predict(fullmodel)   # predicted values
beer$error <- resid(fullmodel)    # residuals
beer$error2 <- beer$error^2       # squared residuals
sum(beer$error2)                  # SSE for the full model
anova(reducedmodel)
# AIC for the reduced model up to additive constants: n * log(SST/n) + 2
(6 * log(610.19 / 6)) + 2
```

*****

## Violin data

```{r}
violin <- read.csv("http://www.bradthiessen.com/html5/data/violin.csv")
violin %>%
  ggvis(x = ~years, y = ~neural, fill := "steelblue", stroke := "steelblue",
        fillOpacity := .8, strokeOpacity := .4) %>%
  layer_points() %>%
  scale_numeric("x", domain = c(0, 20), nice = FALSE, clamp = TRUE) %>%
  scale_numeric("y", domain = c(0, 30), nice = FALSE, clamp = TRUE)

model <- lm(neural ~ years, data = violin)
summary(model)
anova(model)
mplot(model)
```
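## Verifying R-squared and F by hand

To connect the `summary()` and `anova()` output above, this sketch (added here as a check, not part of the original analysis) recomputes R-squared and the F-statistic for the violin model directly from its residuals:

```{r}
SSE <- sum(resid(model)^2)                           # residual (error) sum of squares
SST <- sum((violin$neural - mean(violin$neural))^2)  # total sum of squares
r.squared <- 1 - SSE / SST                           # proportion of variation explained
F.stat <- ((SST - SSE) / 1) / (SSE / (nrow(violin) - 2))  # MSR / MSE, with df = 1 and n - 2
c(r.squared = r.squared, F = F.stat)
```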
*****

## Prestige example

## Load data

```{r}
prestige <- read.csv("http://www.bradthiessen.com/html5/data/prestige.csv")
```

## Set occupation type as a factor variable

```{r}
prestige$type <- as.factor(prestige$occ_type)
```

## Eliminate the job titles

```{r}
prestige2 <- prestige %>% dplyr::select(-title, -blue, -professional, -white)
```

## Create scatterplot matrix with histograms on diagonals

```{r}
# Panel function that draws a histogram on each diagonal (from the ?pairs examples)
panel.hist <- function(x, ...) {
  usr <- par("usr"); on.exit(par(usr))
  par(usr = c(usr[1:2], 0, 1.5))
  h <- hist(x, plot = FALSE)
  breaks <- h$breaks; nB <- length(breaks)
  y <- h$counts; y <- y / max(y)
  rect(breaks[-nB], 0, breaks[-1], y, col = "cyan", ...)
}
pairs(prestige2, cex = 1, pch = 1, bg = "steelblue",
      diag.panel = panel.hist, cex.labels = 1, font.labels = 2)
```

## ANOVA, summary statistics, Bartlett's test, Tukey/Bonferroni

```{r}
# TukeyHSD() requires an aov object, so fit the ANOVA with aov()
model <- aov(prestige ~ as.factor(occ_type), data = prestige)
anova(model)
prestige %>%
  group_by(occ_type) %>%
  summarize(n = n(), mean = mean(prestige), sd = sd(prestige))
bartlett.test(prestige ~ occ_type, data = prestige)
TukeyHSD(model, conf.level = .95)
plot(TukeyHSD(model))
with(prestige, pairwise.t.test(prestige, occ_type, p.adjust.method = "bonferroni"))
```

##### Construct linear model: Prestige = f(income)

## Scatterplot with regression line

```{r}
prestige %>%
  ggvis(x = ~income, y = ~prestige, fill := "steelblue", stroke := "steelblue",
        fillOpacity := .3, strokeOpacity := .5) %>%
  layer_points() %>%
  layer_model_predictions(model = "lm", strokeWidth := 5, stroke := "red") %>%
  add_axis("x", title = "Income") %>%
  scale_numeric("x", domain = c(0, 28000), nice = FALSE, clamp = TRUE) %>%
  add_axis("y", title = "Prestige") %>%
  scale_numeric("y", domain = c(0, 100), nice = FALSE, clamp = TRUE)
```

## Create model

```{r}
model <- lm(prestige ~ income, data = prestige)
# Get coefficients
coef(model)
# Summarize model to get R-squared
summary(model)
# Get ANOVA summary table
anova(model)
# Get confidence intervals for coefficients
confint(model)
```

### Create new data (job with income = 7000)

```{r}
newdata <- data.frame(income = 7000)
# Find the predicted prestige for that new job
predict(model, newdata)
# Find confidence interval for the mean prestige at this income
predict(model, newdata, interval = "confidence")
# Find prediction interval for an individual job at this income
predict(model, newdata, interval = "prediction")
```

## Assumptions

```{r}
plot(model)
```

## Homoscedasticity test

```{r}
# ncvTest() and crPlots() come from the car package
library(car)
ncvTest(model)
# Component + residual plot (checks linearity of the income effect)
crPlots(model)
```

```{r}
# Square transformation
lm(prestige^2 ~ income, data = prestige)
```

## Robust regression

```{r}
library(MASS)
rlm(prestige ~ income, data = prestige)
```

## Quantile regression

```{r}
library(quantreg)
rq(prestige ~ income, tau = .5, data = prestige)
summary(rq(prestige ~ income, tau = .5, data = prestige))
```

## Lowess

```{r}
lowess(prestige$prestige ~ prestige$income)
prestige %>%
  ggvis(x = ~income, y = ~prestige, fill := "steelblue", stroke := "steelblue",
        fillOpacity := .4, strokeOpacity := .5) %>%
  layer_points() %>%
  layer_smooths(stroke := "red", strokeWidth := 5) %>%
  add_axis("x", title = "Income") %>%
  scale_numeric("x", domain = c(0, 28000), nice = FALSE, clamp = TRUE) %>%
  add_axis("y", title = "Prestige") %>%
  scale_numeric("y", domain = c(0, 100), nice = FALSE, clamp = TRUE)
```

## Nonlinear -- quadratic model

```{r}
quad <- lm(prestige ~ income + I(income^2), data = prestige)
coef(quad)
summary(quad)
plot(prestige ~ income, data = prestige)
curve(14.18319 + 0.0061535 * x - .0000001433097 * x^2, add = TRUE)
plot(quad)
```

## Log model

```{r}
logmod <- lm(prestige ~ log(income), data = prestige)
summary(logmod)
plot(prestige ~ income, data = prestige)
curve(-139.856 + 21.556 * log(x), add = TRUE)
plot(logmod)
```
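## Comparing the candidate models

The straight-line, quadratic, and log models can be compared with the AIC introduced earlier in this lesson; lower values indicate a better fit after the penalty for extra parameters. The sketch below refits the straight-line model under the hypothetical name `linmod` so all three fits can be passed to `AIC()` together:

```{r}
linmod <- lm(prestige ~ income, data = prestige)  # same fit as `model` above
AIC(linmod, quad, logmod)
```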