# Install and load all necessary packages
list.of.packages <- c("tidyverse", "broom", "mosaic", "ggExtra")
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
if(length(new.packages)) install.packages(new.packages)
library(tidyverse)
library(mosaic)
library(broom)
library(modelr)
library(ggExtra)

Contest

The class will choose the criteria we use to declare the winner.

Celebrity	Guess	Actual	???
1. Oprah
2. Tom Hanks
3. Betty White
4. Pelé
5. Michael Jordan
6. Taylor Swift
7. Will Smith
8. George Washington
9. (students’ choice)
10. (students’ choice)
			score = __________

What was the point of that?

Scenario: College admissions

Suppose you work in the admissions office and are tasked with admitting students who are predicted to be successful at St. Ambrose.

How do you define success at St. Ambrose? What information do you think you would need to predict a student’s success? Which factor do you think best predicts success?

Click the tabs to see plots of data from first-year students at St. Ambrose from 2011-2015. On each plot, I’ve sketched the line that best fits the data. Interpret the slope and y-intercept of the line in the first plot. Then, use that plot to predict the first-semester GPA for a student with a high school GPA of 3.0.

High school GPAs

# Read data
sau <- read.csv("http://www.bradthiessen.com/html5/data/sau.csv")
# Convert female and minority variables to factors
sau <- sau %>%
  mutate(female = factor(female, labels = c("male", "female")),
         minority = factor(minority, labels = c("white", "minority")))
  
p <- ggplot(sau, aes(x = hsgpa, y=fallgpa)) +
  geom_point(alpha=.3, color="steelblue") +
  geom_smooth(method = "lm", se=FALSE, color="black") + 
  scale_y_continuous(limits = c(0.5,4),breaks=seq(0, 4, 1), minor_breaks=seq(0.5, 3.5, .5)) +
  scale_x_continuous(limits = c(0.5,4), breaks=seq(0, 4, 1), minor_breaks=seq(0.5, 3.5, .5)) +
  theme(legend.position="none") +
  labs(
    x = "High School GPA",
    y = "Fall semester GPA at St. Ambrose"
  ) +
  annotate("text", x=1.1, y=1.5, label="y = 0.058 + 0.826x")
# Use library(ggExtra) to plot marginal histograms
ggMarginal(p, type="histogram", fill="steelblue", color="white")
#    Warning: Removed 26 rows containing non-finite values (stat_smooth).

#    Warning: Removed 26 rows containing non-finite values (stat_smooth).
#    Warning: Removed 26 rows containing missing values (geom_point).
#    Warning: Removed 26 rows containing non-finite values (stat_bin).

ACT Composite

p <- ggplot(sau, aes(x = jitter(ACTtotal), y=fallgpa)) +
  geom_point(alpha=.3, color="steelblue") +
  geom_smooth(method = "lm", se=FALSE, color="black") + 
  scale_y_continuous(limits = c(0.5,4),breaks=seq(0, 4, 1), minor_breaks=seq(0.5, 3.5, .5)) +
  scale_x_continuous(limits = c(15,36), breaks=seq(16, 36, 2), minor_breaks=seq(15, 35, 2)) +
  theme(legend.position="none") +
  labs(
    x = "ACT Composite",
    y = "Fall semester GPA at St. Ambrose"
  )

# Use library(ggExtra) to plot marginal histograms
ggMarginal(p, type="histogram", fill="steelblue", color="white")
#    Warning: Removed 26 rows containing non-finite values (stat_smooth).

#    Warning: Removed 26 rows containing non-finite values (stat_smooth).
#    Warning: Removed 26 rows containing missing values (geom_point).
#    Warning: Removed 26 rows containing non-finite values (stat_bin).

ACT Math

p <- ggplot(sau, aes(x = jitter(ACTmath), y=fallgpa)) +
  geom_point(alpha=.3, color="steelblue") +
  geom_smooth(method = "lm", se=FALSE, color="black") + 
  scale_y_continuous(limits = c(0.5,4),breaks=seq(0, 4, 1), minor_breaks=seq(0.5, 3.5, .5)) +
  scale_x_continuous(limits = c(14,36), breaks=seq(14, 36, 2), minor_breaks=seq(15, 35, 2)) +
  theme(legend.position="none") +
  labs(
    x = "ACT Math",
    y = "Fall semester GPA at St. Ambrose"
  )
ggMarginal(p, type="histogram", fill="steelblue", color="white")
#    Warning: Removed 31 rows containing non-finite values (stat_smooth).
#    Warning: Removed 3 rows containing non-finite values (stat_bin).
#    Warning: Removed 30 rows containing non-finite values (stat_smooth).
#    Warning: Removed 27 rows containing missing values (geom_point).
#    Warning: Removed 4 rows containing non-finite values (stat_bin).
#    Warning: Removed 26 rows containing non-finite values (stat_bin).

HSGPA and Retention

ggplot(sau, aes(x = hsgpa, y=jitter(retention, .1))) +
  geom_point(alpha=.3, color="steelblue") +
  geom_smooth(method = "lm", se=FALSE, color="black") + 
  scale_y_continuous(limits = c(0,1), breaks=seq(0, 1, 1), minor_breaks=NULL) +
  theme(legend.position="none") +
  labs(
    x = "high school GPA",
    y = "1 = student returned for sophomore year"
  )
#    Warning: Removed 963 rows containing non-finite values (stat_smooth).
#    Warning: Removed 949 rows containing missing values (geom_point).

Simple linear model

Here’s some simulated data showing the relationship between ACT scores and 1st-semester college GPAs.

# Simulate ACT and first-semester GPA data
set.seed(123)
firstyear <- tibble(
  act = c(rep(c(14, 16, 18, 20, 22, 24, 26, 28, 30, 32),3)),
  gpa = round((act/9) + rnorm(30, 0, .3),2),
  act2 = act + rnorm(30,0,.1)
)

# Plot the data
ggplot(data = firstyear, aes(x = act, y = gpa)) +
  geom_point() +
  scale_y_continuous(breaks=seq(0, 4, 1), minor_breaks=seq(0.5, 3.5, .5)) +
  scale_x_continuous(breaks=seq(12, 36, 2), minor_breaks=NULL)

Describe and parameterize a model of GPAs as a function of ACT scores.

Let’s randomly generate 300 lines of the form \(y = b_0+b_1x\) to display on our data:


# Looks linear.  Randomly choose 250 slopes and y-intercepts. 
set.seed(123)
models <- tibble(
  b0 = runif(300, -2, 3.5),
  b1 = runif(300, -.2, .2)
)

# Plot all 300 models on top of the data
ggplot(data = firstyear, aes(x = act, y = gpa)) +
  geom_abline(aes(intercept = b0, slope = b1), data = models, alpha = 0.25) +
  geom_point() +
  geom_abline(aes(intercept = 0.0638, slope = .1077), data = models, alpha = 0.25) +
  scale_y_continuous(breaks=seq(0, 4, 1), minor_breaks=seq(0.5, 3.5, .5)) +
  scale_x_continuous(breaks=seq(12, 36, 2), minor_breaks=NULL)

Choosing the best line

How can we determine which line best fits our data? Interpret total (absolute) distance and root mean square deviation on the plots displayed below.

Model #1

# Copy our data to new data frame
withpredictions <- firstyear

# Choose a slope and y-intercept
b00 <- 42/24
b10 <- 1/24
# Predict GPAs with this slope and y-intercept
withpredictions$pred2 <- b00 + (b10 * withpredictions$act)
# Add errors (vertical distances)
withpredictions$resid2 <- withpredictions$gpa - ( b00 + (b10 * withpredictions$act) )


ggplot(data = withpredictions, aes(x = act2, y = gpa)) +
  geom_point() +
  geom_abline(aes(intercept = b00, slope = b10), color="blue", size=1) + 
  annotate("segment", x=withpredictions$act2, xend=withpredictions$act2, 
           y=withpredictions$gpa, yend=withpredictions$pred2, color="red") +
  annotate("text", x = 27, y = 1.6, label=paste("root mean square deviation =", 
                                                round( sqrt(mean(withpredictions$resid2^2)), 3))) +
  annotate("text", x = 27, y = 1.8, label=paste("total (absolute) distance =", 
                                                round( sum(abs(withpredictions$resid2)), 3))) +
  scale_y_continuous(breaks=seq(0, 4, 1), minor_breaks=seq(0.5, 3.5, .5)) +
  scale_x_continuous(breaks=seq(12, 36, 2), minor_breaks=NULL)

Model #2

# Choose a slope and y-intercept
b01 <- 3/50
b11 <- 1/9
# Predict GPAs with this slope and y-intercept
withpredictions$pred1 <- b01 + (b11 * withpredictions$act)
# Add errors (vertical distances)
withpredictions$resid1 <- withpredictions$gpa - ( b01 + (b11 * withpredictions$act) )


ggplot(data = withpredictions, aes(x = act2, y = gpa)) +
  geom_point() +
  geom_abline(aes(intercept = b01, slope = b11), color="blue", size=1) + 
  annotate("segment", x=withpredictions$act2, xend=withpredictions$act2, 
           y=withpredictions$gpa, yend=withpredictions$pred1, color="red") +
  annotate("text", x = 27, y = 1.6, label=paste("root mean square deviation =", 
                                                round( sqrt(mean(withpredictions$resid1^2)), 3))) +
  annotate("text", x = 27, y = 1.8, label=paste("total (absolute) distance =", 
                                                round( sum(abs(withpredictions$resid1)), 3))) +
  scale_y_continuous(breaks=seq(0, 4, 1), minor_breaks=seq(0.5, 3.5, .5)) +
  scale_x_continuous(breaks=seq(12, 36, 2), minor_breaks=NULL)

Model #3

# Choose a slope and y-intercept
b02 <- 3.5
b12 <- -1/42
# Predict GPAs with this slope and y-intercept
withpredictions$pred3 <- b02 + (b12 * withpredictions$act)
# Add errors (vertical distances)
withpredictions$resid3 <- withpredictions$gpa - ( b02 + (b12 * withpredictions$act) )


ggplot(data = withpredictions, aes(x = act2, y = gpa)) +
  geom_point() +
  geom_abline(aes(intercept = b02, slope = b12), color="blue", size=1) + 
  annotate("segment", x=withpredictions$act2, xend=withpredictions$act2, 
           y=withpredictions$gpa, yend=withpredictions$pred3, color="red") +
  annotate("text", x = 27, y = 1.6, label=paste("root mean square deviation =", 
                                                round( sqrt(mean(withpredictions$resid3^2)), 3))) +
  annotate("text", x = 27, y = 1.8, label=paste("total (absolute) distance =", 
                                                round( sum(abs(withpredictions$resid3)), 3))) +
  scale_y_continuous(breaks=seq(0, 4, 1), minor_breaks=seq(0.5, 3.5, .5)) +
  scale_x_continuous(breaks=seq(12, 36, 2), minor_breaks=NULL)

Finding smallest RMSE through iteration

We can calculate this root mean square deviation for all 300 lines we plotted earlier. Let’s display the ten best models (with the smallest RMSE).

# Don't worry about this code.
# Function to calculate predictions based on slope and y-intercept
model1 <- function(a, data) {
  a[1] + data$act * a[2]
}

# Function to calculate vertical error distances
measure_distance <- function(mod, data) {
  diff <- data$gpa - model1(mod, data)
  sqrt(mean(diff ^ 2))
}

# Compute distance for all 300 models
sim1_dist <- function(b0, b1) {
  measure_distance(c(b0, b1), firstyear)
}

# Add distances to the models data frame
models <- models %>% 
  mutate(dist = purrr::map2_dbl(b0, b1, sim1_dist))

# Find 10 best models (with smallest rmse) and plot on data
ggplot(firstyear, aes(act, gpa)) + 
  geom_point(size = 2, colour = "black") + 
  geom_abline(
    aes(intercept = b0, slope = b1, colour = -dist), 
    data = filter(models, rank(dist) <= 10)
  ) +
  labs(title = "10 best models (with smallest RMSE)")

Each of the ten lines displayed above has a y-intercept and a slope, so we can think of each model as a point: \((b_0, \ b_1)\). Let’s plot these “points” for all 300 linear models:

# Plot the slopes and y-intercepts of all 300 models
# Highlight best combinations of slope and intercept with red circles
ggplot(models, aes(b0, b1)) +
  geom_point(data = filter(models, rank(dist) <= 10), size = 4, colour = "red") +
  geom_point(aes(colour = -dist))

The points with lighter shades of blue represent models with smaller amounts of error. The red circles highlight the 10 best models that were displayed earlier.

How could you use this plot to identify the single best model (which might not be any of the 300 points appearing on the plot)?

Let’s try a grid search to find the best combination of slope and y-intercept.

# Create the grid by selecting ranges for b0 and b1
grid <- expand.grid(
  b0 = seq(-1, 1, length = 25),
  b1 = seq(0.05, .15, length = 25)
  ) %>% 
  mutate(dist = purrr::map2_dbl(b0, b1, sim1_dist))

# Zoom and enhance
grid %>% 
  ggplot(aes(b0, b1)) +
  geom_point(data = filter(grid, rank(dist) <= 10), size = 4, colour = "red") +
  geom_point(aes(colour = -dist))

Let’s take these 10 best models and display them back on the data.

ggplot(firstyear, aes(act, gpa)) + 
  geom_point(size = 2, colour = "grey30") + 
  geom_abline(
    aes(intercept = b0, slope = b1, colour = -dist), 
    data = filter(grid, rank(dist) <= 10)
  )

We could continue this zoom-and-enhance method until we really narrowed down our best y-intercept and slope. Thankfully, there’s an easier way.

We want to find the model that minimizes the distances from each point to the line. Think back to your previous math courses. What methods have you used to find minimums (or to minimize functions)?

Why might we be more interested in vertical errors than horizontal or perpendicular errors?

Vertical errors

# Choose a slope and y-intercept
b0 <- 0.06384
b1 <- 0.10772
# Predict GPAs with this slope and y-intercept
withpredictions$pred <- b0 + (b1 * withpredictions$act)
# Add errors (vertical distances)
withpredictions$resid <- withpredictions$gpa - ( b0 + (b1 * withpredictions$act) )

# Plot vertical errors
ggplot(data = withpredictions, aes(x = act2, y = gpa)) +
  geom_point() +
  geom_abline(aes(intercept = b0, slope = b1), color="blue", size=1) + 
  annotate("segment", x=withpredictions$act2, xend=withpredictions$act2, 
           y=withpredictions$gpa, yend=withpredictions$pred, color="red") +
  scale_y_continuous(breaks=seq(0, 4, 1), minor_breaks=seq(0.5, 3.5, .5)) +
  scale_x_continuous(breaks=seq(12, 36, 2), minor_breaks=NULL) +
  labs(x = "ACT", y = "GPA")

Horizontal

# Plot horizontal errors
ggplot(data = withpredictions, aes(x = act2, y = gpa)) +
  geom_point() +
  geom_abline(aes(intercept = b0, slope = b1), color="blue", size=1) + 
  annotate("segment", x=withpredictions$act2, xend= ( (withpredictions$gpa - b0) / b1), 
           y=withpredictions$gpa, yend=withpredictions$gpa, color="red") +
  scale_y_continuous(breaks=seq(0, 4, 1), minor_breaks=seq(0.5, 3.5, .5)) +
  scale_x_continuous(breaks=seq(12, 36, 2), minor_breaks=NULL) +
  labs(x = "ACT", y = "GPA")

Perpendicular

# Plot horizontal errors
ggplot(data = withpredictions, aes(x = act2, y = gpa)) +
  geom_point() +
  geom_abline(aes(intercept = b0, slope = b1), color="blue", size=1) + 
  annotate("segment", x=withpredictions$act2, xend= ((withpredictions$act2 + b1 * withpredictions$gpa - b0 * b1) / (1 + b1^2)), 
           y=withpredictions$gpa, yend = (b0 + b1 * withpredictions$act2), color="red") +
  scale_y_continuous(breaks=seq(0, 4, 1), minor_breaks=seq(0.5, 3.5, .5)) +
  scale_x_continuous(breaks=seq(12, 36, 2), minor_breaks=NULL) + 
  geom_abline(aes(intercept = 200, slope=-3)) + coord_fixed(ratio = 1, xlim=c(18,26)) +
    labs(x = "ACT", y=NULL)

Squared (vertical)

# Plot squared errors
ggplot(data = withpredictions, aes(x = act2, y = gpa)) +
  geom_point() +
  geom_abline(aes(intercept = b0, slope = b1), color="blue", size=1) + 
  annotate("rect", xmin=withpredictions$act2, xmax=withpredictions$act2 + (withpredictions$gpa - withpredictions$pred),
           ymin=withpredictions$gpa, ymax=withpredictions$pred, alpha=.3, fill="red") +
  scale_y_continuous(breaks=seq(0, 4, 1), minor_breaks=seq(0.5, 3.5, .5)) +
  scale_x_continuous(breaks=seq(12, 36, 2), minor_breaks=NULL) + coord_fixed(ratio = 1, xlim=c(18,26)) +
    labs(x = "ACT", y=NULL)

Before we derive the formula for the line of best fit, why should we be willing to accept any amount of error? Why use a line when we can connect-the-dots or use a curve?

Connect-the-dots

ggplot(data = withpredictions, aes(x = act, y = gpa)) +
  geom_point() +
  geom_line(color="blue", size=1) +
  scale_y_continuous(breaks=seq(0, 4, 1), minor_breaks=seq(0.5, 3.5, .5)) +
  scale_x_continuous(breaks=seq(12, 36, 2), minor_breaks=NULL) +
  labs(x = "ACT", y = "GPA")

Curve

ggplot(data = withpredictions, aes(x = act, y = gpa)) +
  geom_point() +
  geom_smooth(method="loess", se=FALSE, span=.4) +
  scale_y_continuous(breaks=seq(0, 4, 1), minor_breaks=seq(0.5, 3.5, .5)) +
  scale_x_continuous(breaks=seq(12, 36, 2), minor_breaks=NULL) +
  labs(x = "ACT", y = "GPA")

Line

ggplot(data = withpredictions, aes(x = act, y = gpa)) +
  geom_point() +
  geom_smooth(method="lm", se=FALSE) +
  scale_y_continuous(breaks=seq(0, 4, 1), minor_breaks=seq(0.5, 3.5, .5)) +
  scale_x_continuous(breaks=seq(12, 36, 2), minor_breaks=NULL) +
  labs(x = "ACT", y = "GPA")

Deriving the least squares regression line

We want to minimize the squared vertical distances from predictions made by our model, \(\hat{y}_i=b_0+b_1x_i\) to each observed data point, \((x_i, y_i)\).

Each of these vertical distances can be written as: \(e_i = y_i-(b_0+b_1x_i)=y_i-b_0-b_1x_i\)

Our goal is to find values for the y-intercept \((b_0)\) and slope \((b_1)\) that minimize: \(Q =\sum_{i=1}^{n}(y_i-b_0-b_1x_i)^2\).

To minimize a function, we can set its first derivative equal to zero and solve (and verify the second derivative is positive at that value). Since we have two variables in \(Q\), we’ll need to take partial derivatives:

Partial derivative of Q with respect to b0

\(\frac{\partial Q }{\partial b_0}=\frac{\partial \sum(y_i-b_0-b_1x_i)^2 }{\partial b_0}=2\sum(y_i-b_0-b_1x_i)\frac{\partial \sum(y_i-b_0-b_1x_i) }{\partial b_0}=-2\sum(y_i-b_0-b_1x_i)\)

Set this partial derivative equal to zero:

\(-2\sum(y_i-b_0-b_1x_i)=0\)

\(\sum(y_i-b_0-b_1x_i)=0\)

\(\sum{y_i}=nb_0+b_1\sum{x_i}\)

Partial derivative of Q with respect to b1

\(\frac{\partial Q }{\partial b_1}=\frac{\partial \sum(y_i-b_0-b_1x_i)^2 }{\partial b_1}=2\sum(y_i-b_0-b_1x_i)\frac{\partial \sum(y_i-b_0-b_1x_i) }{\partial b_0}=-2\sum{x_i}(y_i-b_0-b_1x_i)\)

Set this partial derivative equal to zero:

\(-2\sum{x_i}(y_i-b_0-b_1x_i)=0\)

\(\sum{x_i}(y_i-b_0-b_1x_i)=0\)

\(\sum{x_iy_i}-b_0\sum{x_i}-b_1\sum{x_i^2}=0\)

\(\sum{x_iy_i}=b_0\sum{x_i}+b_1\sum{x_i^2}=0\)

Solve this system of normal equations

\(\sum{y_i}=nb_0+b_1\sum{x_i}\)

\(\sum{x_iy_i}=b_0\sum{x_i}+b_1\sum{x_i^2}\)

We can solve this system to get:

\(b_1=\frac{n\sum{x_iy_i}-\sum{x_i}\sum{y_i}}{n\sum{x_i^2} \ - (\sum{x_i})^2}\)

and

\(b_0=\frac{\sum{x_i^2}\sum{y_i}-\sum{x_i}\sum{x_iy_i}}{n\sum{x_i^2} \ - (\sum{x_i})^2}=\frac{\sum{y_i}}{n}-b_1\frac{\sum{x_i}}{n}=\overline{y}-b_1\overline{x}\)

Using formulas for variance and covariance, we can rewrite \(b_1\) as:

\(b_1=\frac{n\sum{x_iy_i}-\sum{x_i}\sum{y_i}}{n\sum{x_i^2} \ - (\sum{x_i})^2}=\frac{s_{xy}}{s_x^2}=r\frac{s_y}{s_x}\)

The line that minimizes the sum of squared errors has:

\(\textrm{slope}=b_1=r\frac{s_y}{s_x}\)
\(\textrm{y-intercept}=b_0=\overline{y}-b_1\overline{x}\)

For this sample of 30 observations:

\(\overline{Y}=\overline{GPA}=2.541\), \(s_y=s_{GPA}=0.694\)

\(\overline{X}=\overline{ACT}=23.0\), \(s_x=s_{ACT}=5.843\)

\(n = 30\), \(r_{y,x}=r_{GPA,ACT}=0.906\)

Calculate the slope and y-intercept for the least squares regression line.

# Get summary statistics
firstyear %>%
  summarize(meanx = mean(act), sdx = sd(act),
            meany = mean(gpa), sdy = sd(gpa),
            n = n(), cor = cor(act, gpa))
#    # A tibble: 1 x 6
#      meanx      sdx    meany       sdy     n       cor
#      <dbl>    <dbl>    <dbl>     <dbl> <int>     <dbl>
#    1    23 5.842767 2.541333 0.6944397    30 0.9062938

# Calculate b1
firstyear %>%
  summarize(b1 = cor(gpa, act) * sd(gpa) / sd(act),
            b0 = mean(gpa) - (b1 * mean(act)) )
#    # A tibble: 1 x 2
#             b1         b0
#          <dbl>      <dbl>
#    1 0.1077172 0.06383838

Fitting linear models in R

In R, we can use the lm() function to fit linear models. Expand the code –>

# lm = linear model
# lm(y ~ x, data)
lm(gpa ~ act, data = firstyear)
#    
#    Call:
#    lm(formula = gpa ~ act, data = firstyear)
#    
#    Coefficients:
#    (Intercept)          act  
#        0.06384      0.10772
#
# The output shows coefficients
# It's typically more useful to store the model
model1 <- lm(gpa ~ act, data = firstyear)
#
# We can then investigate this model
# Summary statistics
summary(model1)
#    
#    Call:
#    lm(formula = gpa ~ act, data = firstyear)
#    
#    Residuals:
#         Min       1Q   Median       3Q      Max 
#    -0.55992 -0.17981 -0.02775  0.17302  0.55095 
#    
#    Coefficients:
#                Estimate Std. Error t value Pr(>|t|)    
#    (Intercept) 0.063838   0.225053   0.284    0.779    
#    act         0.107717   0.009493  11.347 5.51e-12 ***
#    ---
#    Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#    
#    Residual standard error: 0.2987 on 28 degrees of freedom
#    Multiple R-squared:  0.8214,   Adjusted R-squared:  0.815 
#    F-statistic: 128.7 on 1 and 28 DF,  p-value: 5.515e-12
#
# Coefficients
coef(model1)
#    (Intercept)         act 
#     0.06383838  0.10771717
#
# Broom package:  tidy(), glance(), augment() functions
# Tidy the model coefficients, standard errors, and p-values
tidy(model1)
#    # A tibble: 2 x 5
#             term   estimate   std.error  statistic      p.value
#            <chr>      <dbl>       <dbl>      <dbl>        <dbl>
#    1 (Intercept) 0.06383838 0.225052612  0.2836598 7.787580e-01
#    2         act 0.10771717 0.009493271 11.3466861 5.514804e-12
#
# Glance at summary statistics for the model
glance(model1)
#    # A tibble: 1 x 11
#      r.squared adj.r.squared     sigma statistic      p.value    df
#          <dbl>         <dbl>     <dbl>     <dbl>        <dbl> <int>
#    1 0.8213685     0.8149888 0.2986988  128.7473 5.514804e-12     2
#    # ... with 5 more variables: logLik <dbl>, AIC <dbl>, BIC <dbl>,
#    #   deviance <dbl>, df.residual <int>
#
# Augment the data with predictions and residuals 
head(augment(model1))
#    # A tibble: 6 x 9
#        gpa   act  .fitted    .se.fit      .resid       .hat    .sigma
#    * <dbl> <dbl>    <dbl>      <dbl>       <dbl>      <dbl>     <dbl>
#    1  1.39    14 1.571879 0.10136040 -0.18187879 0.11515152 0.3018954
#    2  1.71    16 1.787313 0.08596523 -0.07731313 0.08282828 0.3037830
#    3  2.47    18 2.002747 0.07229860  0.46725253 0.05858586 0.2897173
#    4  2.24    20 2.218182 0.06152343  0.02181818 0.04242424 0.3041497
#    5  2.48    22 2.433616 0.05535481  0.04638384 0.03434343 0.3040443
#    6  3.18    24 2.649051 0.05535481  0.53094949 0.03434343 0.2858551
#    # ... with 2 more variables: .cooksd <dbl>, .std.resid <dbl>
#
#
# Modelr package: add_predictions() function
# Add predictions to our dataset
firstyear <- firstyear %>%
  add_predictions(model1)
#
# Now we can plot these predictions as a line
ggplot(data = firstyear, aes(x = act, y = gpa)) +
  geom_point() +
  geom_line(aes(y = pred), color="blue", size=1) +
  scale_y_continuous(breaks=seq(0, 4, 1), minor_breaks=seq(0.5, 3.5, .5)) +
  scale_x_continuous(breaks=seq(12, 36, 2), minor_breaks=NULL)

Is the best line any good?

Just because we’re able to calculate the best line, it doesn’t mean that line is meaningful. For example, here’s the line that best fits this data:

# Simulate data forming a parabola
sim_data <- tibble(
  x = c(-5:5),
  y = x^2
)

# Plot the data
# geom_smooth(method = "lm") plots the line of best fit
ggplot(data = sim_data, aes(x = x, y = y)) +
  geom_point(size=2) +
  geom_smooth(method="lm", se=FALSE)

It looks like we’ll need some way of measuring just how well a model (a line) fits a given dataset. Let’s derive some of these measures for three different datasets:

One in which the best line fits the data perfectly
One for our sample data (where the best line seems to fit pretty well)
One in which the best line does not fit the data at all

We’ll complete the following table to see the value of each measure in each case:

Measures of fit

SSE

We already know the best line is the one that minimizes the sum of squared errors (SSE). Can we use SSE to evaluate how well a line fits the data?

What would SSE equal if a line perfectly fit the data?

Now let’s consider a situation where the best line doesn’t fit the data at all. In this situation, X and Y would be uncorrelated – knowing the value of X would not tell you anything about the value of Y. To find the best fitting line, what value of \(a\) would minimize: \(\Sigma{(Y-a)^2}\).

If you substitute your answer for \(a\), what does this expression represent? What’s the maximum value for SSE?

# Later, we'll see what this ANOVA table represents
# For now, it gives us SSE (Sum Sq Residuals)
anova(model1)
#    # A tibble: 2 x 5
#         Df  `Sum Sq`   `Mean Sq` `F value`     `Pr(>F)`
#    * <int>     <dbl>       <dbl>     <dbl>        <dbl>
#    1     1 11.486959 11.48695919  128.7473 5.514804e-12
#    2    28  2.498187  0.08922098        NA           NA

# We can also calculate this directly with the residuals
sum(residuals(model1)^2)
#    [1] 2.498187

What will happen to the value of SSE if we add any additional point to our data? Why is this a problem?

MSE, RSE, and RMSE

Perhaps a better measure of fit would be the average squared error (the average squared distance from observation to the prediction line). One simple estimate of this average squared error would be MSE:

\(s_{y|x}^2=\frac{SS_E}{df_E}=\frac{\Sigma(y_i-\hat{y})^2}{n-2}\)

What would \(s_{y|x}^2\) be in a situation with perfect fit? worst fit? What’s the maximum value of \(s_{y|x}^2\)?

# MSE is displayed in the ANOVA summary table.  Calculate directly with:
sum(residuals(model1)^2) / model1$df.residual
#    [1] 0.08922098

It would be easier to interpret if our measure were not in squared units. Suppose we calculate RSE (residual standard error): \(RSE=s_{y|x}=\sqrt{s_{y|x}^2}=\sqrt\frac{SS_E}{df_E}=\sqrt\frac{\Sigma(y_i-\hat{y})^2}{n-2}\). What would RSE be in a situation with perfect fit? worst fit?

# We can calculate it directly with...
sqrt( sum(residuals(model1)^2) / model1$df.residual )
#    [1] 0.2986988

# We can also glance at it (sigma) with the broom package
glance(model1)
#    # A tibble: 1 x 11
#      r.squared adj.r.squared     sigma statistic      p.value    df
#          <dbl>         <dbl>     <dbl>     <dbl>        <dbl> <int>
#    1 0.8213685     0.8149888 0.2986988  128.7473 5.514804e-12     2
#    # ... with 5 more variables: logLik <dbl>, AIC <dbl>, BIC <dbl>,
#    #   deviance <dbl>, df.residual <int>

The \(n-2\) in the denominator is used to provide an unbiased estimate. Another popular measure of model fit is RMSE (root mean square error): \(RMSE=\sqrt\frac{\Sigma(y_i-\hat{y})^2}{n}\). Sketch a rough representation for what RMSE represents. The code calculates RMSE for our sample data.

# The modelr package has a rmse() function
rmse(model1, firstyear)
#    [1] 0.2885705

# Calculate it directly.  length = sample size
sqrt( sum(residuals(model1)^2) / length(firstyear$act) )
#    [1] 0.2885705

So far, all our measures of fit are unbounded. Ideally, we’d might like a measure that has a known minimum and maximum. Suppose we took our total sums of squares (the total variation in Y) and partitioned it. Explain what the following formula represents, then calculate its value in the perfect and worst-fit scenarios:

\(\frac{SS_E}{SS_Y}=\frac{\Sigma(y_i-\overline{Y})^2}{\Sigma(y_i-\hat{Y})^2}=1-R^2\)

# The modelr package has an rsquare() function
1 - rsquare(model1, firstyear)
#    [1] 0.1786315

That seems backwards – the measure should be zero when we have no fit and 1.0 when we have perfect fit. Let’s try this:

\(R^2=\frac{SS_Y-SS_E}{SS_Y}=\frac{SS_{reg}}{SS_Y}=\frac{\Sigma(\hat{y}-\overline{Y})^2}{\Sigma(y_i-\hat{Y})^2}\)

This is the coefficient of determination (which has the same interpretation as eta-squared in ANOVA). Fill-in-the-blanks to show the value of \(R^2\) in perfect fit and worst fit scenarios.

# The modelr package has an rsquare() function
rsquare(model1, firstyear)
#    [1] 0.8213685

# We can glance at it with the broom package
glance(model1)
#    # A tibble: 1 x 11
#      r.squared adj.r.squared     sigma statistic      p.value    df
#          <dbl>         <dbl>     <dbl>     <dbl>        <dbl> <int>
#    1 0.8213685     0.8149888 0.2986988  128.7473 5.514804e-12     2
#    # ... with 5 more variables: logLik <dbl>, AIC <dbl>, BIC <dbl>,
#    #   deviance <dbl>, df.residual <int>

# It also appears in the summary
summary(model1)
#    
#    Call:
#    lm(formula = gpa ~ act, data = firstyear)
#    
#    Residuals:
#         Min       1Q   Median       3Q      Max 
#    -0.55992 -0.17981 -0.02775  0.17302  0.55095 
#    
#    Coefficients:
#                Estimate Std. Error t value Pr(>|t|)    
#    (Intercept) 0.063838   0.225053   0.284    0.779    
#    act         0.107717   0.009493  11.347 5.51e-12 ***
#    ---
#    Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#    
#    Residual standard error: 0.2987 on 28 degrees of freedom
#    Multiple R-squared:  0.8214,   Adjusted R-squared:  0.815 
#    F-statistic: 128.7 on 1 and 28 DF,  p-value: 5.515e-12

# We can also get it from the ANOVA table
anova(model1)
#    # A tibble: 2 x 5
#         Df  `Sum Sq`   `Mean Sq` `F value`     `Pr(>F)`
#    * <int>     <dbl>       <dbl>     <dbl>        <dbl>
#    1     1 11.486959 11.48695919  128.7473 5.514804e-12
#    2    28  2.498187  0.08922098        NA           NA
11.487/(11.487+2.4982)
#    [1] 0.8213683

As we’ll see later, none of these measures are perfect. R-squared and RMSE seem to be the most widely used, though. Identify at least one advantage each measure has over the other.

Conditions of linear regression

Validity: The data we’re analyzing maps to the research question we are trying to answer
Additivity and Linearity: The deterministic component of the model is a linear function of the predictors.
Diagnosis: Look at plots of observed vs predicted or residuals vs predicted values. The points should be symmetrically distributed around a diagonal line in the former plot or around horizontal line in the latter plot, with a roughly constant variance.
Independent errors: No correlation among errors
Diagnosis: If you have time series data, be careful that consecutive errors are not related.
Equal variance of errors (homoscedasticity): The variance in the errors is the same across all levels of X.
Diagnosis: Look at the plot of residuals vs predicted values. If the residuals grow larger as a function of X, you have a problem.
Normality of errors: The variance in the errors is the same across all levels of X.
Diagnosis: Look at a P-P or Q-Q plot of the residuals. The residuals should fall near the diagonal line. You could also run a test for normality, like the Shapiro-Wilk or Kologorov-Smirnov tests. Note that the dependent and independent variables in a regression model do not need to be normally distributed by themselves–only the prediction errors need to be normally distributed

The plot() function displays some diagnostic plots for our model. Evaluate the conditions listed above based on our dataset, question of interest, and the diagnostic plots.

plot(model1)

Regression: Inference on slope

Let’s finally use our real dataset. To change things up a bit, let’s regress Fall semester GPAs on high school GPAs. Make sure you can interpret all this output:

full_model <- lm(fallgpa ~ hsgpa, data = sau)
summary(full_model)
#    
#    Call:
#    lm(formula = fallgpa ~ hsgpa, data = sau)
#    
#    Residuals:
#         Min       1Q   Median       3Q      Max 
#    -3.12108 -0.30133  0.07284  0.36865  1.74310 
#    
#    Coefficients:
#                Estimate Std. Error t value Pr(>|t|)    
#    (Intercept)  0.05767    0.07984   0.722     0.47    
#    hsgpa        0.82572    0.02425  34.046   <2e-16 ***
#    ---
#    Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#    
#    Residual standard error: 0.5832 on 1902 degrees of freedom
#    Multiple R-squared:  0.3787,   Adjusted R-squared:  0.3783 
#    F-statistic:  1159 on 1 and 1902 DF,  p-value: < 2.2e-16

rmse(full_model, data = sau)
#    [1] 0.5828609

plot(full_model)

Earlier, we learned that we always expect to get a non-zero correlation in any sample of data. Similarly, we expect linear models with non-zero slopes in any sample of data (even if we have reason to believe the variables are uncorrelated).

Our model to predict fall semester GPAs is: FallGPA = 0.058 + 0.926(hsGPA)

Does the magnitude of this slope (0.926) imply high school and college GPAs have a relationship for our population of interest or could we have obtained a slope of this magnitude even if the variables are uncorrelated?

Explain how we’ll use randomization-based methods to test a null hypothesis that the slope of our linear model is zero. If it helps, explain what the two randomizations displayed below represent.

Explain how we’ll use randomization-based methods to test a null hypothesis that the slope of our linear model is zero.

# Store our observed slope of 0826
observed_slope <- full_model$coefficients[2]

# Run 10,000 randomizations, shuffling the high school GPA
rand_slopes <- Do(10000) * lm(fallgpa ~ shuffle(hsgpa), data = sau)

# The slopes are stored in the "hsgpa" variable.  Let's plot and estimate the p-value.
ggplot(data = rand_slopes, aes(x = hsgpa)) +
  geom_histogram(fill="lightblue", color="white", alpha = 0.8) +
  labs(
      title = "Randomized slopes",
      x = "Slope"
      ) +
  scale_x_continuous(breaks=seq(-.1, .1, .025), minor_breaks=NULL) +
    theme(
    axis.text.x = element_text(size = 11, color="grey10"),
    legend.position = "none",
    panel.grid.major.y = element_line(colour = "white"),
    panel.grid.major.x = element_line(colour = "white", size=.15),
    panel.grid.minor = element_blank(),
    panel.background = element_rect(fill = "grey93")
  ) +
  annotate("text", x = .075, y = 600, label = paste("p = 0"))
#    `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Bootstrap confidence interval for the slope

Explain how the following bootstrap confidence interval for the slope was constructed.

boot <- Do(10000) * lm(fallgpa ~ hsgpa, data = sample(sau, replace=TRUE))

bootstrapCI <- confint(boot$hsgpa, level = 0.95, method = "quantile")
lower <- as.numeric(bootstrapCI[1]) # Store lower CI bound
upper <- as.numeric(bootstrapCI[2]) # Store upper CI bound

# Density plot
ggplot(data = boot, aes(x = hsgpa)) +
  geom_density(fill="lightblue", color="white", alpha = 0.8) +
  labs(
      title = "Bootstrap distribution of correlations",
      x = "bootstrap correlations"
      ) +
  scale_x_continuous(breaks=seq(.7, 1, 0.05), minor_breaks=NULL) +
    theme(
    axis.text.x = element_text(size = 11, color="grey10"),
    legend.position = "none",
    panel.grid.major.y = element_line(colour = "white"),
    panel.grid.major.x = element_line(colour = "white", size=.15),
    panel.grid.minor = element_blank(),
    panel.background = element_rect(fill = "grey93")
  ) +
  annotate("text", x = lower, y = 5, label = round(lower,3)) +
  annotate("text", x = upper, y = 5, label = round(upper,3)) +
  annotate("text", x = median(boot$hsgpa), y = 6, label = paste("95%")) +
  annotate("segment", x = lower, xend = upper, y = 4, yend = 4, color = "red")

Model selection

Two trees have fallen down during a windy night. Which of the following two possible explanations do you think is better? Why?

a) The wind has blown them down.

b) Two meteorites have each taken one tree down, and after that hit each other and removed any trace of themselves.

In the 14th century, William of Ockham wrote Entia non sunt multiplicanda praeter necessitatem, which translates to: More things should not be used than are necessary.

When comparing two competing theories, we might use Ockham’s Razor and prefer theories that are simpler (more parsimonious in that they require fewer assumptions or are easier to explain). On the other hand, we might prefer more complex theories if they more accurately predict future events.

When we compare competing statistical models, we’ll have to trade-off accuracy and simplicity. We’ll have to learn to navigate between:

Overfitting which leads to poor prediction by learning too much from our sample data

and

Underfitting which leads to poor prediction by learning too little from our sample data

You’ve already dealt with this issue once. Explain, again, why you might prefer a linear model to a curve or a connect-the-dots model.

If we’re interested in the best prediction of fall semester GPAs, one approach we might take is:

Start with a basic model with one predictor: \(y = b_0 + b_1\textrm{(hsGPA)}\). Calculate a measure of model fit.
Add a second predictor: \(y = b_0 + b_1\textrm{(hsGPA)} + b_2\textrm{(ACT)}\). Calculate a measure of model fit. If this model fits noticeably better, keep this more complicated model.
Add a third predictor: \(y = b_0 + b_1\textrm{(hsGPA)} + b_2\textrm{(ACT)} + b_3\textrm{(female)}\). Calculate a measure of model fit. If this model fits noticeably better than the previous model, keep this more complicated model.

We could also start with a complicated model, remove a predictor, and determine if the fit worsens noticeably.

When comparing models, it’s helpful to write out the full (more complicated) model and the reduced model. Write out the full and reduced models if I want to know whether ACT scores predict fall semester GPAs.

Below, I’ve sketched the full and reduced models on our (simulated) dataset. What does SSE represent in our reduced model? Will SSE always get smaller as we add more predictor variables?

Reduced Model

ggplot(data = withpredictions, aes(x = act2, y = gpa)) +
  geom_point() +
  geom_abline(aes(intercept = mean(gpa), slope = 0), color="blue", size=1) + 
  annotate("rect", xmin=withpredictions$act2, xmax=withpredictions$act2 + (withpredictions$gpa - mean(withpredictions$gpa)),
           ymin=mean(withpredictions$gpa), ymax=withpredictions$gpa, alpha=.3, fill="red") +
  scale_y_continuous(breaks=seq(0, 4, 1), minor_breaks=seq(0.5, 3.5, .5)) +
  scale_x_continuous(breaks=seq(12, 36, 2), minor_breaks=NULL) + coord_fixed(ratio = 1, xlim=c(18,26)) +
    labs(x = "ACT", y=NULL) +
    labs(title = paste("Reduced model SSE =", round(anova(lm(gpa ~ act, data = firstyear))[2,2]+anova(lm(gpa ~ act, data = firstyear))[1,2],5)))

Full Model

ggplot(data = withpredictions, aes(x = act2, y = gpa)) +
  geom_point() +
  geom_abline(aes(intercept = b0, slope = b1), color="blue", size=1) + 
  annotate("rect", xmin=withpredictions$act2, xmax=withpredictions$act2 + (withpredictions$gpa - withpredictions$pred),
           ymin=withpredictions$gpa, ymax=withpredictions$pred, alpha=.3, fill="red") +
  scale_y_continuous(breaks=seq(0, 4, 1), minor_breaks=seq(0.5, 3.5, .5)) +
  scale_x_continuous(breaks=seq(12, 36, 2), minor_breaks=NULL) + coord_fixed(ratio = 1, xlim=c(18,26)) +
    labs(x = "ACT", y=NULL) +
  labs(title = paste("Full model SSE =", round(anova(lm(gpa ~ act, data = firstyear))[2,2],5)))

.

Recall that for this sample of data:

\(\overline{Y}=\overline{GPA}=2.541\), \(s_y=s_{GPA}=0.694\)

\(\overline{X}=\overline{ACT}=23.0\), \(s_x=s_{ACT}=5.843\)

\(n = 30\), \(r_{y,x}=r_{GPA,ACT}=0.906\)

Complete the ANOVA summary table to compare our full and reduced models? Explain what each value represents. From this, what could we conclude about our models? You can estimate the p-value with the F-distribution applet at http://lock5stat.com/statkey/theoretical_distribution/theoretical_distribution.html#F

anova(lm(gpa ~ act, data = firstyear))
#    # A tibble: 2 x 5
#         Df  `Sum Sq`   `Mean Sq` `F value`     `Pr(>F)`
#    * <int>     <dbl>       <dbl>     <dbl>        <dbl>
#    1     1 11.486959 11.48695919  128.7473 5.514804e-12
#    2    28  2.498187  0.08922098        NA           NA

t-test for slope

The only difference between our full and reduced models is the \(b_1\) coefficient (the slope). If \(b_1=0\), the full model would be the same as our reduced model. Another way, then, to compare our full and reduced models would be to test the hypothesis: \(H_0:b_1=0\). We can test this hypothesis with a t-test.

Explain what the following derivation shows. Then, conduct this t-test. Compare the value of the t-statistic to the value of the MSR (F) you calculated in the ANOVA summary table.

Finally, conduct a test of the hypothesis: \(H_0:r_{\textrm{GPA,ACT}}=0\)

t <- cor(firstyear$gpa, firstyear$act) / sqrt( (1-cor(firstyear$gpa, firstyear$act)^2) / (length(firstyear$act)-2))
t
#    [1] 11.34669
t^2
#    [1] 128.7473

Omnibus F-test

We can also calculate our MSR (F-statistic) with formula displayed below. Verify this yields the same value you calculated in the ANOVA summary table. In the code, I’ll calculate this manually and then use the anova() function.

# Manual calculation
Rf <- cor(firstyear$gpa, firstyear$act)
Rr <- 0
N <- length(firstyear$gpa)
kf <- 1
kr <- 0
F <- ( ((Rf^2 - Rr^2) / (kf - kr)) / ((1-Rf^2)/(N-kf-1)) )
F
#    [1] 128.7473

#
# ANOVA function
fullmodel <- lm(gpa ~ act, data=firstyear)
reducedmodel <- lm(gpa ~ 1, data=firstyear)
anova(reducedmodel, fullmodel)
#    # A tibble: 2 x 6
#      Res.Df       RSS    Df `Sum of Sq`        F     `Pr(>F)`
#    *  <dbl>     <dbl> <dbl>       <dbl>    <dbl>        <dbl>
#    1     29 13.985147    NA          NA       NA           NA
#    2     28  2.498187     1    11.48696 128.7473 5.514804e-12

Other criteria for model selection

As we’ll soon see, R-squared and p-values resulting from the omnibus F-test aren’t the best measures to use when comparing models. Here are a couple alternatives we’ll build upon later:

Likelihood

The likelihood of a model is the probability it produces our data given its parameter estimates. If we assume all our observations are independent, then we can write our likelihood function as:

We want to calculate \(P(\textrm{observation 1} \ \cap \ \textrm{observation 2} \ \cap \ ...)\). When we assume independence, we can calculate this as: \(P(\textrm{observation 1})P(\textrm{observation 2})...\).

Given our model with \(b_0\) and \(b_1\) – along with our assumption that errors are normally distributed – we can use a normal distribution to find \(P(\textrm{observation 1})\). We then simply have to multiply these probabilities for each observation in our dataset.

As an example, our first observation is: ACT = 14 with GPA = 1.39. Recall our model has \(b_0=0.06384\), \(b_1=0.10772\), and \(RSE=\sqrt{MS_E}=\sqrt{0.0892}=0.2987\).

For all students with an ACT score of 14, our model expects a GPA of: \(\hat{y}=0.06384+0.10772(14)=1.5719\).

Given that expectation, we can find the probability of actually observing a GPA of 1.39:

\(P[y=1.39 \ | \ y \sim \textrm{ND}(\mu=1.5719, \sigma=0.2987)]\)

Expand the code to see how to calculate this manually in R.

# We'll use dnorm to calculate the likelihood of each observation
# Here's the likelihood of the first observation:
dnorm(1.39, mean = (b0 + (b1 * 14)), sd = 0.2987)
#    [1] 1.109504

# Let's calculate the likelihood for ALL observations
# and multiply those together to get the model likelihood
# We'll also calculate the log-likelihood (which we'll see in a bit)
firstyear %>%
  mutate(likelihood = dnorm(gpa, mean = (b0 + (b1 * act)), sd = 0.2987)) %>%
  summarize(L = round(prod(likelihood),3 ),
            logL = round( log(L),2 ))
#    # A tibble: 1 x 2
#          L  logL
#      <dbl> <dbl>
#    1 0.005  -5.3

Typically, it’s easier to work with the natural log of the likelihood. This log-likelihood will always be negative, with values closer to zero indicating better model fit.

We can use log-likelihood to compare full and reduced models (by calculating the likelihood ratio):

If the full model is much better than the reduced model, we would expect:

\(L(\textrm{reduced})\) to be small
\(L(\textrm{full})\) to be large
\(\frac{L(\textrm{reduced})}{L(\textrm{full})}\) to be small
\(ln(\frac{L(\textrm{reduced})}{L(\textrm{full})})\) to be large and negative.
\(-2ln(\frac{L(\textrm{reduced})}{L(\textrm{full})})\) to be large and positive

To estimate a p-value from this likelihood ratio, we can use a chi-square distribution with \(df = df_{\textrm{full}} - df_{\textrm{reduced}}\)

Expand the code to see this calculation in R:

# Let's first calculate log-likelihood
# We can glance at the with the broom package
glance(fullmodel)
#    # A tibble: 1 x 11
#      r.squared adj.r.squared     sigma statistic      p.value    df
#          <dbl>         <dbl>     <dbl>     <dbl>        <dbl> <int>
#    1 0.8213685     0.8149888 0.2986988  128.7473 5.514804e-12     2
#    # ... with 5 more variables: logLik <dbl>, AIC <dbl>, BIC <dbl>,
#    #   deviance <dbl>, df.residual <int>

# We can calculate it directly with logLik()
logLik(fullmodel)
#    'log Lik.' -5.283677 (df=3)
logLik(reducedmodel)
#    'log Lik.' -31.12013 (df=2)

# Calculate the likelihood ratio
LR <- 2 * (logLik(fullmodel) - logLik(reducedmodel))
LR
#    'log Lik.' 51.67291 (df=3)

# Calculate the p-value
pchisq(LR, df=1, lower.tail=FALSE)
#    'log Lik.' 6.556111e-13 (df=3)

# If you install the lmtest package, you can use lrtest
library(lmtest)
#    Loading required package: zoo
#    
#    Attaching package: 'zoo'
#    The following objects are masked from 'package:base':
#    
#        as.Date, as.Date.numeric
lrtest(reducedmodel, fullmodel)
#    # A tibble: 2 x 5
#      `#Df`     LogLik    Df    Chisq `Pr(>Chisq)`
#    * <dbl>      <dbl> <dbl>    <dbl>        <dbl>
#    1     2 -31.120132    NA       NA           NA
#    2     3  -5.283677     1 51.67291 6.556111e-13

AIC

Akaike’s Information Criterion is another measure to compare competing models. AIC seeks to find the model that has a good fit to the data with relatively few parameters (predictors). It’s defined as:

where b = the number of coefficients estimated in our model (slope(s) and intercept) and p = the number of predictors in our model. When comparing models, we prefer the model that produces the smaller AIC value.

Expand the code to see the AIC() function in action:

AIC(reducedmodel, fullmodel)
#    # A tibble: 2 x 2
#         df      AIC
#    * <dbl>    <dbl>
#    1     2 66.24026
#    2     3 16.56735
anova(fullmodel)
#    # A tibble: 2 x 5
#         Df  `Sum Sq`   `Mean Sq` `F value`     `Pr(>F)`
#    * <int>     <dbl>       <dbl>     <dbl>        <dbl>
#    1     1 11.486959 11.48695919  128.7473 5.514804e-12
#    2    28  2.498187  0.08922098        NA           NA

Example: Musical neurons

Explain how we could use ANOVA to determine if an increase in neural activity is associated with playing stringed instruments. Expand the code and interpret the output. Also, calculate the omnibus F-statistic.

neuron <- read.csv("http://www.bradthiessen.com/html5/data/violin.csv")

# Convert years played to a factor variable.  I'll create a new variable.
neuron <- neuron %>%
  mutate(years_group = case_when(
    .$years == 0 ~ "a) 0 years",
    .$years > 0 & .$years < 10 ~ "b) <10 years",
    .$years > 10 ~ "c) >10 years"),
    years_group = as.factor(years_group))

anova(aov(neural ~ years_group, data = neuron))
#    # A tibble: 2 x 5
#         Df  `Sum Sq`  `Mean Sq` `F value`     `Pr(>F)`
#    * <int>     <dbl>      <dbl>     <dbl>        <dbl>
#    1     2 800.36548 400.182738  92.93477 1.287295e-07
#    2    11  47.36667   4.306061        NA           NA

Write out the full and reduced models to determine if years playing a stringed instrument is associated with neural activity.

Expand the code and interpret the output. Verify calculations as necessary, especially those in the ANOVA summary table. Interpret all plots.

# Create the models
full_neuron <- lm(neural ~ years, data = neuron)
reduced_neuron <- lm(neural ~ 1, data = neuron)

# Summarize the full model
summary(full_neuron)
#    
#    Call:
#    lm(formula = neural ~ years, data = neuron)
#    
#    Residuals:
#        Min      1Q  Median      3Q     Max 
#    -4.8644 -2.3730  0.1614  2.3713  4.6471 
#    
#    Coefficients:
#                Estimate Std. Error t value Pr(>|t|)    
#    (Intercept)   8.3873     1.1149   7.523 4.35e-06 ***
#    years         0.9971     0.1110   8.980 6.18e-07 ***
#    ---
#    Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#    
#    Residual standard error: 3.009 on 13 degrees of freedom
#    Multiple R-squared:  0.8612,   Adjusted R-squared:  0.8505 
#    F-statistic: 80.63 on 1 and 13 DF,  p-value: 6.178e-07

# ANOVA summary table
anova(full_neuron)
#    # A tibble: 2 x 5
#         Df `Sum Sq`  `Mean Sq` `F value`     `Pr(>F)`
#    * <int>    <dbl>      <dbl>     <dbl>        <dbl>
#    1     1 730.2060 730.206005  80.63275 6.178311e-07
#    2    13 117.7273   9.055948        NA           NA

# Model comparison
AIC(full_neuron, reduced_neuron)
#    # A tibble: 2 x 2
#         df       AIC
#    * <dbl>     <dbl>
#    1     3  79.47297
#    2     2 107.08943
lrtest(reducedmodel, fullmodel)
#    # A tibble: 2 x 5
#      `#Df`     LogLik    Df    Chisq `Pr(>Chisq)`
#    * <dbl>      <dbl> <dbl>    <dbl>        <dbl>
#    1     2 -31.120132    NA       NA           NA
#    2     3  -5.283677     1 51.67291 6.556111e-13

# Get predicted values and residuals
neuron <- neuron %>% add_predictions(full_neuron)
neuron <- neuron %>% add_residuals(full_neuron)

# Plot model on data
ggplot(data = neuron, aes(x = years, y = neural)) +
  geom_point() +
  geom_line(aes(y = pred), color="blue", size=1) +
  scale_y_continuous(breaks=seq(0, 30, 10), minor_breaks=seq(5, 25, 5)) +
  scale_x_continuous(breaks=seq(0, 20, 5), minor_breaks=NULL)


# Plot residuals vs. predicted values
ggplot(data = neuron, aes(x = pred, y = resid)) +
  geom_point() +
  geom_hline(yintercept = 0)

The Prestige

The linear model predicting prestige from income is displayed below. Interpret the slope and y-intercept. Expand the code and interpret the output from R.

prestige_data <- read.csv("http://www.bradthiessen.com/html5/data/prestige.csv")

ggplot(data = prestige_data, aes(x = income, y = prestige)) +
  geom_point(color="steelblue") +
  geom_smooth(method="lm", se=FALSE) +
  annotate("text", x=15000, y=35, label="R-squared = 0.511") +
  annotate("text", x=15000, y=28, label="y = 27.141 + 0.0029x")


prestige_model <- lm(prestige ~ income, data=prestige_data)

tidy(prestige_model)
#    # A tibble: 2 x 5
#             term     estimate    std.error statistic      p.value
#            <chr>        <dbl>        <dbl>     <dbl>        <dbl>
#    1 (Intercept) 27.141176368 2.2677036186  11.96857 5.135229e-21
#    2      income  0.002896799 0.0002833245  10.22432 3.192004e-17

Notice the t-statistic for the slope estimate is 10.224. Write out full and reduced models and complete the ANOVA summary table. Verify the MSR using both the omnibus F-statistic and the t-statistic of 10.224.?

tidy(anova(prestige_model))
#    # A tibble: 2 x 6
#           term    df    sumsq     meansq statistic      p.value
#          <chr> <int>    <dbl>      <dbl>     <dbl>        <dbl>
#    1    income     1 15279.26 15279.2567  104.5367 3.192004e-17
#    2 Residuals   100 14616.17   146.1617        NA           NA

For this model, RMSE = 11.97. Interpret this value.

Confidence intervals for predictions

We can easily predict the average prestige of all jobs that pay an income of 7,000: \(27.14 + 0.0029(7000)=47.4\). We know that prediction won’t be perfect, so it might make sense to construct a confidence interval around that prediction. Interpret the confidence interval displayed below:

Confidence interval for predictions: \(\hat{y} \ \pm \ t_{n-2}^{\alpha/2}s_{y|x}\sqrt{\frac{1}{n}+\frac{(x_0-\overline{X})^2}{(n-1)s_x^2}}\) where \(s_{y|x}=\sqrt{MS_E}\)

A 95% confidence interval for our prediction of 7000 is, then: \(7000 \ \pm \ 2\sqrt{146.16}\sqrt{\frac{1}{102}+\frac{(7000-6797.9)^2}{(102-1)(s_x^2)4245.92^2}}=47.4 \ \pm \ 2.38\)

A 95% confidence interval is displayed over our data. Why does the width of the interval differ across values of X?

ggplot(data = prestige_data, aes(x = income, y = prestige)) +
  geom_point(color="steelblue") +
  geom_smooth(method = "lm", se=TRUE, color="red")

If you interpreted the confidence interval correctly, it probably didn’t give you what you wanted. If you want an interval to predict the prestige of individual jobs with specific incomes (and not the average of all jobs with that income), you’ll need to use a prediction interval. Expand the code to see how this was constructed. Why are prediction intervals wider than confidence intervals?

Prediction Interval: \(\hat{y} \ \pm \ t_{n-2}^{\alpha/2}s_{y|x}\sqrt{1+\frac{1}{n}+\frac{(x_0-\overline{X})^2}{(n-1)s_x^2}}\)

A 95% prediction interval for our prediction of 7000 is, then: \(7000 \ \pm \ 2\sqrt{146.16}\sqrt{1+\frac{1}{102}+\frac{(7000-6797.9)^2}{(102-1)(s_x^2)4245.92^2}}=47.4 \ \pm \ 24.10\)

We can get a rough estimate of this interval by simply taking: \(\hat{y} \ \pm \ 2\sqrt{MS_E}\)

# Get predictions (actual and lower/upper confidence)
predictions <- predict(prestige_model, interval="prediction")
#    Warning in predict.lm(prestige_model, interval = "prediction"): predictions on current data refer to _future_ responses

# Add them to data frame
prestige_data <- cbind(prestige_data, predictions)

# Plot
ggplot(data = prestige_data, aes(x = income, y = prestige)) +
  geom_ribbon(aes(ymin = lwr, ymax = upr), fill="grey70", alpha=.5) +
  geom_point(color="steelblue") +
  geom_smooth(method="lm", se=FALSE)

Linear Regression