Load packages
library(mosaic)
library(dplyr)
library(ggplot2)
library(ggvis)
library(parallel)


Multiple Linear Regression

## Load data
prestige <- read.csv("http://www.bradthiessen.com/html5/data/prestige.csv")

## Set occupation type as a factor variable
prestige$type <- as.factor(prestige$occ_type)

## Eliminate the job titles
prestige2 <- prestige %>%
  dplyr::select(-title, -blue, -professional, -white)

## Create scatterplot matrix with histograms on diagonals
panel.hist <- function(x, ...)
{
    usr <- par("usr"); on.exit(par(usr))
    par(usr = c(usr[1:2], 0, 1.5) )
    h <- hist(x, plot = FALSE)
    breaks <- h$breaks; nB <- length(breaks)
    y <- h$counts; y <- y/max(y)
    rect(breaks[-nB], 0, breaks[-1], y, col = "cyan", ...)
}
pairs(prestige2,
      cex = 1, pch = 1, bg = "steelblue",
      diag.panel = panel.hist, cex.labels = 1, font.labels = 2)

## Simple linear regression
## Prestige = f(income)
income.model <- lm(prestige ~ income, data=prestige)
summary(income.model)
## 
## Call:
## lm(formula = prestige ~ income, data = prestige)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -33.007  -8.378  -2.378   8.432  32.084 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 2.714e+01  2.268e+00   11.97   <2e-16 ***
## income      2.897e-03  2.833e-04   10.22   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12.09 on 100 degrees of freedom
## Multiple R-squared:  0.5111, Adjusted R-squared:  0.5062 
## F-statistic: 104.5 on 1 and 100 DF,  p-value: < 2.2e-16
prestige %>% 
  ggvis(x=~income, y=~prestige, fill := "steelblue", stroke:="steelblue", fillOpacity:=.3, strokeOpacity:=.5) %>%
  layer_points() %>%
    layer_model_predictions(model = "lm", strokeWidth:=5, stroke:="red") %>%
  add_axis("x", title = "Income") %>%
  scale_numeric("x", domain=c(0, 28000), nice=FALSE, clamp=TRUE)  %>%
  add_axis("y", title = "Prestige") %>%
  scale_numeric("y", domain=c(0, 100), nice=FALSE, clamp=TRUE)

## Prestige = f(education)
education.model <- lm(prestige ~ education, data=prestige)
summary(education.model)
## 
## Call:
## lm(formula = prestige ~ education, data = prestige)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -26.0397  -6.5228   0.6611   6.7430  18.1636 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -10.732      3.677  -2.919  0.00434 ** 
## education      5.361      0.332  16.148  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.103 on 100 degrees of freedom
## Multiple R-squared:  0.7228, Adjusted R-squared:   0.72 
## F-statistic: 260.8 on 1 and 100 DF,  p-value: < 2.2e-16
anova(income.model)
## Analysis of Variance Table
## 
## Response: prestige
##            Df Sum Sq Mean Sq F value    Pr(>F)    
## income      1  15279 15279.3  104.54 < 2.2e-16 ***
## Residuals 100  14616   146.2                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
prestige %>% 
  ggvis(x=~education, y=~prestige, fill := "steelblue", stroke:="steelblue", fillOpacity:=.3, strokeOpacity:=.5) %>%
  layer_points() %>%
    layer_model_predictions(model = "lm", strokeWidth:=5, stroke:="red") %>%
  add_axis("x", title = "Education") %>%
  scale_numeric("x", domain=c(0, 20), nice=FALSE, clamp=TRUE)  %>%
  add_axis("y", title = "Prestige") %>%
  scale_numeric("y", domain=c(0, 100), nice=FALSE, clamp=TRUE)

## Prestige = f(% women)
percwomn.model <- lm(prestige ~ percwomn, data=prestige)
summary(percwomn.model)
## 
## Call:
## lm(formula = prestige ~ percwomn, data = prestige)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -33.444 -12.391  -4.126  13.034  39.185 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 48.69300    2.30760  21.101   <2e-16 ***
## percwomn    -0.06417    0.05385  -1.192    0.236    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 17.17 on 100 degrees of freedom
## Multiple R-squared:  0.014,  Adjusted R-squared:  0.004143 
## F-statistic:  1.42 on 1 and 100 DF,  p-value: 0.2362
anova(percwomn.model)
## Analysis of Variance Table
## 
## Response: prestige
##            Df  Sum Sq Mean Sq F value Pr(>F)
## percwomn    1   418.6  418.63  1.4202 0.2362
## Residuals 100 29476.8  294.77
prestige %>% 
  ggvis(x=~percwomn, y=~prestige, fill := "steelblue", stroke:="steelblue", fillOpacity:=.3, strokeOpacity:=.5) %>%
  layer_points() %>%
    layer_model_predictions(model = "lm", strokeWidth:=5, stroke:="red") %>%
  add_axis("x", title = "Education") %>%
  scale_numeric("x", domain=c(0, 100), nice=FALSE, clamp=TRUE)  %>%
  add_axis("y", title = "Prestige") %>%
  scale_numeric("y", domain=c(0, 100), nice=FALSE, clamp=TRUE)

## Prestige = f(income, education)
inc.ed.model <- lm(prestige ~ income + education, data=prestige)
summary(inc.ed.model)
## 
## Call:
## lm(formula = prestige ~ income + education, data = prestige)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -19.4040  -5.3308   0.0154   4.9803  17.6889 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -6.8477787  3.2189771  -2.127   0.0359 *  
## income       0.0013612  0.0002242   6.071 2.36e-08 ***
## education    4.1374444  0.3489120  11.858  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.81 on 99 degrees of freedom
## Multiple R-squared:  0.798,  Adjusted R-squared:  0.7939 
## F-statistic: 195.6 on 2 and 99 DF,  p-value: < 2.2e-16
coef(inc.ed.model)
##  (Intercept)       income    education 
## -6.847778720  0.001361166  4.137444384
anova(inc.ed.model)
## Analysis of Variance Table
## 
## Response: prestige
##           Df  Sum Sq Mean Sq F value    Pr(>F)    
## income     1 15279.3 15279.3  250.49 < 2.2e-16 ***
## education  1  8577.3  8577.3  140.62 < 2.2e-16 ***
## Residuals 99  6038.9    61.0                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
plot(inc.ed.model)

## Prestige = f(income, education, %women)
inc.ed.wmn.model <- lm(prestige ~ income + education + percwomn, data=prestige)
summary(inc.ed.wmn.model)
## 
## Call:
## lm(formula = prestige ~ income + education + percwomn, data = prestige)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -19.8246  -5.3332  -0.1364   5.1587  17.5045 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -6.7943342  3.2390886  -2.098   0.0385 *  
## income       0.0013136  0.0002778   4.729 7.58e-06 ***
## education    4.1866373  0.3887013  10.771  < 2e-16 ***
## percwomn    -0.0089052  0.0304071  -0.293   0.7702    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.846 on 98 degrees of freedom
## Multiple R-squared:  0.7982, Adjusted R-squared:  0.792 
## F-statistic: 129.2 on 3 and 98 DF,  p-value: < 2.2e-16
coef(inc.ed.wmn.model)
##  (Intercept)       income    education     percwomn 
## -6.794334203  0.001313560  4.186637275 -0.008905157
anova(inc.ed.wmn.model)
## Analysis of Variance Table
## 
## Response: prestige
##           Df  Sum Sq Mean Sq  F value Pr(>F)    
## income     1 15279.3 15279.3 248.1727 <2e-16 ***
## education  1  8577.3  8577.3 139.3167 <2e-16 ***
## percwomn   1     5.3     5.3   0.0858 0.7702    
## Residuals 98  6033.6    61.6                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
plot(inc.ed.wmn.model)

# Compare model with no predictors to model with income & education
no.model <- lm(prestige~1, data=prestige)
anova(income.model, no.model)
## Analysis of Variance Table
## 
## Model 1: prestige ~ income
## Model 2: prestige ~ 1
##   Res.Df   RSS Df Sum of Sq      F    Pr(>F)    
## 1    100 14616                                  
## 2    101 29895 -1    -15279 104.54 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# adjusted R-squared manual calculation
0.7982 - ((1-0.7982)*(3/(102-3-1)))
## [1] 0.7920224
## Standardized beta coefficients
coef(lm(scale(prestige) ~ scale(income) + scale(education) + scale(percwomn), data=prestige))
##      (Intercept)    scale(income) scale(education)  scale(percwomn) 
##    -2.462036e-17     3.241757e-01     6.639551e-01    -1.642104e-02
# Manual calculation for beta (for education predictor)
4.1866*(2.7284/17.204)
## [1] 0.6639572
## Assumptions check
par(mfrow=c(2,2))    # visualize four graphs at once
plot(inc.ed.wmn.model)

par(mfrow=c(1,1))    # reset the graphics defaults


## Compare model 1 vs model 2
anova(income.model, inc.ed.model)
## Analysis of Variance Table
## 
## Model 1: prestige ~ income
## Model 2: prestige ~ income + education
##   Res.Df     RSS Df Sum of Sq      F    Pr(>F)    
## 1    100 14616.2                                  
## 2     99  6038.9  1    8577.3 140.62 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Confidence interval for a prediction
predict(inc.ed.wmn.model, list(income=5000,education=10,percwomn=40), interval="conf")
##        fit      lwr      upr
## 1 41.28363 39.57498 42.99229
## Prediction interval
predict(inc.ed.wmn.model, list(income=5000,education=10,percwomn=40), interval="pred")
##        fit      lwr      upr
## 1 41.28363 25.61911 56.94816
###########################
###########################
## Height/Weight Example ##
###########################
###########################

## Load data
htwt <- read.csv("http://www.bradthiessen.com/html5/data/htwt.csv")

## Rename female variable to "gender"
htwt <- htwt %>%
  rename(gender = female)

## Create scatterplot matrix with histograms on diagonals
pairs(htwt,
      cex = 1, pch = 1, bg = "steelblue",
      diag.panel = panel.hist, cex.labels = 1, font.labels = 2)

## Summary statistics
table(htwt$gender)
## 
## female   male 
##    509    491
htwt %>%
  summarize(mn_height=mean(height), sd_height=sd(height),
            mn_weight=mean(weight), sd_weight=sd(weight),
            mn_mal=mean(mal), sd_mal=sd(mal))
##   mn_height sd_height mn_weight sd_weight mn_mal   sd_mal
## 1   166.163  8.025138  57.17209  9.656277  2.591 2.842851
## Reduced model (no predictors)
null.model <- lm(weight ~ 1, data=htwt)
summary(null.model)
## 
## Call:
## lm(formula = weight ~ 1, data = htwt)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -22.252  -6.372  -1.382   4.968  54.188 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  57.1721     0.3054   187.2   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.656 on 999 degrees of freedom
## Full model
modelx1x2 <- lm(weight ~ height + gender, data=htwt)
coef(modelx1x2)
## (Intercept)      height  gendermale 
## -53.7879605   0.6717494  -1.3438638
summary(modelx1x2)
## 
## Call:
## lm(formula = weight ~ height + gender, data = htwt)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -24.936  -5.398  -1.351   3.172  47.475 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -53.78796    6.31386  -8.519   <2e-16 ***
## height        0.67175    0.03895  17.245   <2e-16 ***
## gendermale   -1.34386    0.62501  -2.150   0.0318 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.238 on 997 degrees of freedom
## Multiple R-squared:  0.2736, Adjusted R-squared:  0.2721 
## F-statistic: 187.8 on 2 and 997 DF,  p-value: < 2.2e-16
anova(modelx1x2)
## Analysis of Variance Table
## 
## Response: weight
##            Df Sum Sq Mean Sq  F value  Pr(>F)    
## height      1  25173 25172.9 370.9122 < 2e-16 ***
## gender      1    314   313.8   4.6231 0.03178 *  
## Residuals 997  67664    67.9                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(null.model, modelx1x2)
## Analysis of Variance Table
## 
## Model 1: weight ~ 1
## Model 2: weight ~ height + gender
##   Res.Df   RSS Df Sum of Sq      F    Pr(>F)    
## 1    999 93150                                  
## 2    997 67664  2     25487 187.77 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Assumptions check
par(mfrow=c(2,2))    # visualize four graphs at once
plot(modelx1x2)

par(mfrow=c(1,1))    # reset the graphics defaults

## Other models (to get R-squared values)
modelx1 <- lm(weight ~ height, data=htwt)
summary(modelx1)
## 
## Call:
## lm(formula = weight ~ height, data = htwt)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -23.405  -5.462  -1.446   3.306  46.691 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -46.76375    5.41280  -8.639   <2e-16 ***
## height        0.62551    0.03254  19.224   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.253 on 998 degrees of freedom
## Multiple R-squared:  0.2702, Adjusted R-squared:  0.2695 
## F-statistic: 369.6 on 1 and 998 DF,  p-value: < 2.2e-16
modelx2 <- lm(weight ~ gender, data=htwt)
summary(modelx2)
## 
## Call:
## lm(formula = weight ~ gender, data = htwt)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -24.597  -6.217  -0.930   4.510  51.843 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  54.9101     0.4159 132.043  < 2e-16 ***
## gendermale    4.6070     0.5935   7.763 2.05e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.382 on 998 degrees of freedom
## Multiple R-squared:  0.05694,    Adjusted R-squared:  0.056 
## F-statistic: 60.26 on 1 and 998 DF,  p-value: 2.05e-14
modelx3 <- lm(weight ~ mal, data=htwt)
summary(modelx3)
## 
## Call:
## lm(formula = weight ~ mal, data = htwt)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -22.360  -6.614  -1.530   4.976  55.355 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  57.6439     0.4129 139.620   <2e-16 ***
## mal          -0.1821     0.1074  -1.696   0.0902 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.647 on 998 degrees of freedom
## Multiple R-squared:  0.002874,   Adjusted R-squared:  0.001875 
## F-statistic: 2.877 on 1 and 998 DF,  p-value: 0.09018
modelx1x3 <- lm(weight ~ height +mal, data=htwt)
summary(modelx1x3)
## 
## Call:
## lm(formula = weight ~ height + mal, data = htwt)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -23.561  -5.397  -1.382   3.381  46.766 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -48.03581    5.52756  -8.690   <2e-16 ***
## height        0.63152    0.03296  19.158   <2e-16 ***
## mal           0.10530    0.09305   1.132    0.258    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.252 on 997 degrees of freedom
## Multiple R-squared:  0.2712, Adjusted R-squared:  0.2697 
## F-statistic: 185.5 on 2 and 997 DF,  p-value: < 2.2e-16
modelx2x3 <- lm(weight ~ gender +mal, data=htwt)
summary(modelx2x3)
## 
## Call:
## lm(formula = weight ~ gender + mal, data = htwt)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -24.597  -6.216  -0.933   4.503  51.860 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 54.917679   0.539810 101.735  < 2e-16 ***
## gendermale   4.603995   0.608943   7.561 9.08e-14 ***
## mal         -0.002374   0.107137  -0.022    0.982    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.387 on 997 degrees of freedom
## Multiple R-squared:  0.05694,    Adjusted R-squared:  0.05505 
## F-statistic:  30.1 on 2 and 997 DF,  p-value: 2.027e-13
modelx1x2x3 <- lm(weight ~ height + gender + mal, data=htwt)
summary(modelx1x2x3)
## 
## Call:
## lm(formula = weight ~ height + gender + mal, data = htwt)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -24.954  -5.452  -1.309   3.222  47.481 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -54.26812    6.34372  -8.555   <2e-16 ***
## height        0.67323    0.03901  17.260   <2e-16 ***
## gendermale   -1.26238    0.63344  -1.993   0.0465 *  
## mal           0.07500    0.09415   0.797   0.4259    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.24 on 996 degrees of freedom
## Multiple R-squared:  0.2741, Adjusted R-squared:  0.2719 
## F-statistic: 125.3 on 3 and 996 DF,  p-value: < 2.2e-16
## Compare models to determine if gender matters
anova(modelx1, modelx1x2)
## Analysis of Variance Table
## 
## Model 1: weight ~ height
## Model 2: weight ~ height + gender
##   Res.Df   RSS Df Sum of Sq      F  Pr(>F)  
## 1    998 67978                              
## 2    997 67664  1    313.76 4.6231 0.03178 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Split data and run separate regressions
males <- htwt %>%
  filter(gender=="male")

females <- htwt %>%
  filter(gender=="female")

male.model <- lm(weight ~ height, data=males)
coef(male.model)
## (Intercept)      height 
## -72.0137587   0.7706638
female.model <- lm(weight ~ height, data=females)
coef(female.model)
## (Intercept)      height 
## -33.7505460   0.5479189
## Full model including interaction
full.model <- lm(weight ~ height*gender, data=htwt)
coef(full.model)
##       (Intercept)            height        gendermale height:gendermale 
##       -33.7505460         0.5479189       -38.2632127         0.2227448
anova(full.model)
## Analysis of Variance Table
## 
## Response: weight
##                Df Sum Sq Mean Sq  F value    Pr(>F)    
## height          1  25173 25172.9 373.5646 < 2.2e-16 ***
## gender          1    314   313.8   4.6562  0.031181 *  
## height:gender   1    548   547.8   8.1297  0.004445 ** 
## Residuals     996  67116    67.4                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(full.model)
## 
## Call:
## lm(formula = weight ~ height * gender, data = htwt)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -22.622  -5.428  -1.428   3.263  48.135 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       -33.75055    9.43230  -3.578 0.000363 ***
## height              0.54792    0.05825   9.407  < 2e-16 ***
## gendermale        -38.26321   12.96338  -2.952 0.003235 ** 
## height:gendermale   0.22274    0.07812   2.851 0.004445 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.209 on 996 degrees of freedom
## Multiple R-squared:  0.2795, Adjusted R-squared:  0.2773 
## F-statistic: 128.8 on 3 and 996 DF,  p-value: < 2.2e-16
## Compare full model (with interaction) to model without interaction
anova(modelx1x2, full.model)
## Analysis of Variance Table
## 
## Model 1: weight ~ height + gender
## Model 2: weight ~ height * gender
##   Res.Df   RSS Df Sum of Sq      F   Pr(>F)   
## 1    997 67664                                
## 2    996 67116  1    547.83 8.1297 0.004445 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Manual calculation for omnibus F-test
(.2795-.2736)/((1-.2795)/(1000-3-1))
## [1] 8.156003
## Scatterplot with two regression lines
htwt %>% 
  ggvis(x=~height, y=~weight, fill := "steelblue", stroke:="steelblue", fillOpacity:=.2, strokeOpacity:=.3) %>%
  layer_points(fill=~gender) %>%
  group_by(gender) %>%
  layer_model_predictions(model = "lm", formula=weight~height,
                          strokeWidth:=5, stroke=~gender, strokeOpacity:=1) %>%
  add_axis("x", title = "Height (cm)") %>%
  scale_numeric("x", domain=c(140, 190), nice=FALSE, clamp=TRUE)  %>%
  add_axis("y", title = "Weight (kg)") %>%
  scale_numeric("y", domain=c(30, 115), nice=FALSE, clamp=TRUE)

###########################
###########################
####### GPA EXAMPLE #######
###########################
###########################

## Load data
gpadata <- read.csv("http://www.bradthiessen.com/html5/data/f13s14.csv")

## Select GPA, ACT, gender, and hours studied data
gpa <-
  gpadata %>%
  select(S1_P49FallTermGPA, S1_P49HSGPA, S1_P49HSPerRank, 
         S1_P49StudentAthlete, S1_P49ACTComposite, S1_NA150, S1_P49Gender)

# Rename variables
gpa <- 
  gpa %>%
  rename(sauGPA = S1_P49FallTermGPA, hsGPA = S1_P49HSGPA,
         hsRANK = S1_P49HSPerRank, athlete = S1_P49StudentAthlete,
         ACTscore = S1_P49ACTComposite, hoursSTUDY = S1_NA150,
         gender = S1_P49Gender)
  
# Create factor variables for athlete and gender
gpa$gender <- factor(gpa$gender, labels = c("male", "female"))
gpa$athlete <- factor(gpa$athlete, labels = c("not athlete", "athlete"))

# Complete cases only (no missing data)
gpa.data <- gpa[complete.cases(gpa),]

# Save this data (final format for website)
write.table(gpa.data, file="~/Desktop/gpadata.csv", sep=",")

## Load data from website
gpa <- read.csv("http://www.bradthiessen.com/html5/data/gpadata.csv")

# Look at data
head(gpa)
##   sauGPA hsGPA hsRANK     athlete ACTscore hoursSTUDY gender
## 1   2.87  2.82     43 not athlete       24          5   male
## 2   3.16  3.49     76 not athlete       32          7   male
## 3   2.67  2.97     72 not athlete       22         15 female
## 4   3.00  3.29     63 not athlete       20         15   male
## 5   3.00  3.41     65 not athlete       21          8 female
## 6   1.69  3.26     70 not athlete       21          4   male
tail(gpa)
##     sauGPA hsGPA hsRANK     athlete ACTscore hoursSTUDY gender
## 250   1.71  3.09     54     athlete       27         14   male
## 251   3.08  3.70     78 not athlete       23         10 female
## 252   0.19  2.86     30 not athlete       20          5   male
## 253   3.06  3.49     68 not athlete       20          5 female
## 254   2.62  2.52     31 not athlete       18          3   male
## 255   2.33  3.12     45 not athlete       23          2 female
# Summarize data
summary(gpa)
##      sauGPA         hsGPA           hsRANK              athlete   
##  Min.   :0.00   Min.   :1.960   Min.   :  6.00   athlete    : 89  
##  1st Qu.:2.27   1st Qu.:2.900   1st Qu.: 44.00   not athlete:166  
##  Median :2.80   Median :3.350   Median : 66.00                    
##  Mean   :2.65   Mean   :3.275   Mean   : 63.27                    
##  3rd Qu.:3.20   3rd Qu.:3.680   3rd Qu.: 83.00                    
##  Max.   :3.78   Max.   :4.000   Max.   :100.00                    
##     ACTscore       hoursSTUDY       gender   
##  Min.   :17.00   Min.   : 0.00   female:144  
##  1st Qu.:20.00   1st Qu.: 4.00   male  :111  
##  Median :22.00   Median : 8.00               
##  Mean   :22.96   Mean   :10.62               
##  3rd Qu.:25.00   3rd Qu.:15.00               
##  Max.   :34.00   Max.   :50.00
## Here's a package that gives more detailed summary statistics
## with the describe() command
library(psych)
describe(gpa)
##            vars   n  mean    sd median trimmed   mad   min    max range
## sauGPA        1 255  2.65  0.75   2.80    2.73  0.67  0.00   3.78  3.78
## hsGPA         2 255  3.27  0.52   3.35    3.30  0.52  1.96   4.00  2.04
## hsRANK        3 255 63.27 24.48  66.00   64.33 28.17  6.00 100.00 94.00
## athlete*      4 255  1.65  0.48   2.00    1.69  0.00  1.00   2.00  1.00
## ACTscore      5 255 22.96  3.66  22.00   22.62  2.97 17.00  34.00 17.00
## hoursSTUDY    6 255 10.62  8.90   8.00    9.17  5.93  0.00  50.00 50.00
## gender*       7 255  1.44  0.50   1.00    1.42  0.00  1.00   2.00  1.00
##             skew kurtosis   se
## sauGPA     -1.11     1.28 0.05
## hsGPA      -0.40    -0.86 0.03
## hsRANK     -0.33    -0.93 1.53
## athlete*   -0.63    -1.61 0.03
## ACTscore    0.76     0.06 0.23
## hoursSTUDY  1.69     3.16 0.56
## gender*     0.26    -1.94 0.03
## Scatterplot matrix
pairs(gpa,
      cex = 1, pch = 1, bg = "steelblue",
      diag.panel = panel.hist, cex.labels = 1, font.labels = 2)

## Null model 
null.model <- lm(sauGPA ~ 1, data=gpa)

## Full model 
# Collinearity problem
cor(hsGPA, hsRANK, data=gpa)
## [1] 0.903263
# Fit model
full.model <- lm(sauGPA ~ hsGPA + athlete + ACTscore + hoursSTUDY + gender, data=gpa)
coef(full.model)
##        (Intercept)              hsGPA athletenot athlete 
##        -0.54890739         0.68848215        -0.02926881 
##           ACTscore         hoursSTUDY         gendermale 
##         0.04286319         0.00927878        -0.27532973
confint(full.model)
##                           2.5 %      97.5 %
## (Intercept)        -1.043328143 -0.05448664
## hsGPA               0.525462014  0.85150228
## athletenot athlete -0.178247137  0.11970951
## ACTscore            0.020047281  0.06567909
## hoursSTUDY          0.001558211  0.01699935
## gendermale         -0.422525385 -0.12813407
summary(full.model)
## 
## Call:
## lm(formula = sauGPA ~ hsGPA + athlete + ACTscore + hoursSTUDY + 
##     gender, data = gpa)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.97642 -0.24947  0.04454  0.33731  1.07124 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        -0.548907   0.251034  -2.187 0.029704 *  
## hsGPA               0.688482   0.082771   8.318 5.91e-15 ***
## athletenot athlete -0.029269   0.075641  -0.387 0.699129    
## ACTscore            0.042863   0.011584   3.700 0.000265 ***
## hoursSTUDY          0.009279   0.003920   2.367 0.018697 *  
## gendermale         -0.275330   0.074736  -3.684 0.000282 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5248 on 249 degrees of freedom
## Multiple R-squared:  0.5184, Adjusted R-squared:  0.5088 
## F-statistic: 53.61 on 5 and 249 DF,  p-value: < 2.2e-16
# Compare null and full models
anova(null.model, full.model)
## Analysis of Variance Table
## 
## Model 1: sauGPA ~ 1
## Model 2: sauGPA ~ hsGPA + athlete + ACTscore + hoursSTUDY + gender
##   Res.Df     RSS Df Sum of Sq      F    Pr(>F)    
## 1    254 142.414                                  
## 2    249  68.583  5    73.832 53.612 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Assumptions check
par(mfrow=c(2,2))    # visualize four graphs at once
plot(full.model)

par(mfrow=c(1,1))    # reset the graphics defaults


## Include all interaction terms
interact.model <- lm(sauGPA ~ hsGPA*athlete*ACTscore*hoursSTUDY*gender, data=gpa)
summary(interact.model)
## 
## Call:
## lm(formula = sauGPA ~ hsGPA * athlete * ACTscore * hoursSTUDY * 
##     gender, data = gpa)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.86303 -0.23787  0.06885  0.32812  1.16645 
## 
## Coefficients:
##                                                          Estimate
## (Intercept)                                             -2.858533
## hsGPA                                                    1.426448
## athletenot athlete                                      -3.421158
## ACTscore                                                 0.187316
## hoursSTUDY                                              -0.189176
## gendermale                                              -2.068469
## hsGPA:athletenot athlete                                 0.782668
## hsGPA:ACTscore                                          -0.043748
## athletenot athlete:ACTscore                              0.114351
## hsGPA:hoursSTUDY                                         0.059479
## athletenot athlete:hoursSTUDY                            0.322663
## ACTscore:hoursSTUDY                                      0.004147
## hsGPA:gendermale                                         0.371719
## athletenot athlete:gendermale                           12.750745
## ACTscore:gendermale                                      0.057158
## hoursSTUDY:gendermale                                    0.189588
## hsGPA:athletenot athlete:ACTscore                       -0.024624
## hsGPA:athletenot athlete:hoursSTUDY                     -0.098374
## hsGPA:ACTscore:hoursSTUDY                               -0.001345
## athletenot athlete:ACTscore:hoursSTUDY                  -0.007738
## hsGPA:athletenot athlete:gendermale                     -3.125614
## hsGPA:ACTscore:gendermale                               -0.010055
## athletenot athlete:ACTscore:gendermale                  -0.633812
## hsGPA:hoursSTUDY:gendermale                             -0.042746
## athletenot athlete:hoursSTUDY:gendermale                -0.507255
## ACTscore:hoursSTUDY:gendermale                          -0.005725
## hsGPA:athletenot athlete:ACTscore:hoursSTUDY             0.002488
## hsGPA:athletenot athlete:ACTscore:gendermale             0.155965
## hsGPA:athletenot athlete:hoursSTUDY:gendermale           0.112163
## hsGPA:ACTscore:hoursSTUDY:gendermale                     0.001181
## athletenot athlete:ACTscore:hoursSTUDY:gendermale        0.028069
## hsGPA:athletenot athlete:ACTscore:hoursSTUDY:gendermale -0.006461
##                                                         Std. Error t value
## (Intercept)                                               8.896733  -0.321
## hsGPA                                                     2.616383   0.545
## athletenot athlete                                       10.001927  -0.342
## ACTscore                                                  0.413151   0.453
## hoursSTUDY                                                0.984171  -0.192
## gendermale                                               10.249080  -0.202
## hsGPA:athletenot athlete                                  2.923433   0.268
## hsGPA:ACTscore                                            0.117887  -0.371
## athletenot athlete:ACTscore                               0.467675   0.245
## hsGPA:hoursSTUDY                                          0.280278   0.212
## athletenot athlete:hoursSTUDY                             1.043032   0.309
## ACTscore:hoursSTUDY                                       0.041278   0.100
## hsGPA:gendermale                                          3.018346   0.123
## athletenot athlete:gendermale                            12.524280   1.018
## ACTscore:gendermale                                       0.479190   0.119
## hoursSTUDY:gendermale                                     1.092736   0.173
## hsGPA:athletenot athlete:ACTscore                         0.132866  -0.185
## hsGPA:athletenot athlete:hoursSTUDY                       0.295006  -0.333
## hsGPA:ACTscore:hoursSTUDY                                 0.011449  -0.117
## athletenot athlete:ACTscore:hoursSTUDY                    0.044410  -0.174
## hsGPA:athletenot athlete:gendermale                       3.679553  -0.849
## hsGPA:ACTscore:gendermale                                 0.136975  -0.073
## athletenot athlete:ACTscore:gendermale                    0.584537  -1.084
## hsGPA:hoursSTUDY:gendermale                               0.316015  -0.135
## athletenot athlete:hoursSTUDY:gendermale                  1.302175  -0.390
## ACTscore:hoursSTUDY:gendermale                            0.045755  -0.125
## hsGPA:athletenot athlete:ACTscore:hoursSTUDY              0.012228   0.203
## hsGPA:athletenot athlete:ACTscore:gendermale              0.165924   0.940
## hsGPA:athletenot athlete:hoursSTUDY:gendermale            0.372868   0.301
## hsGPA:ACTscore:hoursSTUDY:gendermale                      0.012903   0.092
## athletenot athlete:ACTscore:hoursSTUDY:gendermale         0.055376   0.507
## hsGPA:athletenot athlete:ACTscore:hoursSTUDY:gendermale   0.015409  -0.419
##                                                         Pr(>|t|)
## (Intercept)                                                0.748
## hsGPA                                                      0.586
## athletenot athlete                                         0.733
## ACTscore                                                   0.651
## hoursSTUDY                                                 0.848
## gendermale                                                 0.840
## hsGPA:athletenot athlete                                   0.789
## hsGPA:ACTscore                                             0.711
## athletenot athlete:ACTscore                                0.807
## hsGPA:hoursSTUDY                                           0.832
## athletenot athlete:hoursSTUDY                              0.757
## ACTscore:hoursSTUDY                                        0.920
## hsGPA:gendermale                                           0.902
## athletenot athlete:gendermale                              0.310
## ACTscore:gendermale                                        0.905
## hoursSTUDY:gendermale                                      0.862
## hsGPA:athletenot athlete:ACTscore                          0.853
## hsGPA:athletenot athlete:hoursSTUDY                        0.739
## hsGPA:ACTscore:hoursSTUDY                                  0.907
## athletenot athlete:ACTscore:hoursSTUDY                     0.862
## hsGPA:athletenot athlete:gendermale                        0.397
## hsGPA:ACTscore:gendermale                                  0.942
## athletenot athlete:ACTscore:gendermale                     0.279
## hsGPA:hoursSTUDY:gendermale                                0.893
## athletenot athlete:hoursSTUDY:gendermale                   0.697
## ACTscore:hoursSTUDY:gendermale                             0.901
## hsGPA:athletenot athlete:ACTscore:hoursSTUDY               0.839
## hsGPA:athletenot athlete:ACTscore:gendermale               0.348
## hsGPA:athletenot athlete:hoursSTUDY:gendermale             0.764
## hsGPA:ACTscore:hoursSTUDY:gendermale                       0.927
## athletenot athlete:ACTscore:hoursSTUDY:gendermale          0.613
## hsGPA:athletenot athlete:ACTscore:hoursSTUDY:gendermale    0.675
## 
## Residual standard error: 0.5361 on 223 degrees of freedom
## Multiple R-squared:   0.55,  Adjusted R-squared:  0.4874 
## F-statistic: 8.791 on 31 and 223 DF,  p-value: < 2.2e-16
# Compare to model with no interaction terms
anova(full.model, interact.model)
## Analysis of Variance Table
## 
## Model 1: sauGPA ~ hsGPA + athlete + ACTscore + hoursSTUDY + gender
## Model 2: sauGPA ~ hsGPA * athlete * ACTscore * hoursSTUDY * gender
##   Res.Df    RSS Df Sum of Sq      F Pr(>F)
## 1    249 68.583                           
## 2    223 64.091 26     4.492 0.6011 0.9384
# No athlete model
no.athlete.model <- lm(sauGPA ~ hsGPA + ACTscore + hoursSTUDY + gender, data=gpa)
coef(no.athlete.model)
##  (Intercept)        hsGPA     ACTscore   hoursSTUDY   gendermale 
## -0.566182830  0.688931448  0.042495854  0.009302746 -0.264003535
confint(no.athlete.model)
##                    2.5 %      97.5 %
## (Intercept) -1.051883884 -0.08048178
## hsGPA        0.526207969  0.85165493
## ACTscore     0.019795834  0.06519587
## hoursSTUDY   0.001596431  0.01700906
## gendermale  -0.399206710 -0.12880036
summary(no.athlete.model)
## 
## Call:
## lm(formula = sauGPA ~ hsGPA + ACTscore + hoursSTUDY + gender, 
##     data = gpa)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.99086 -0.24437  0.05333  0.33249  1.06572 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.566183   0.246612  -2.296 0.022513 *  
## hsGPA        0.688931   0.082622   8.338 5.09e-15 ***
## ACTscore     0.042496   0.011526   3.687 0.000278 ***
## hoursSTUDY   0.009303   0.003913   2.377 0.018183 *  
## gendermale  -0.264004   0.068649  -3.846 0.000153 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5239 on 250 degrees of freedom
## Multiple R-squared:  0.5181, Adjusted R-squared:  0.5104 
## F-statistic: 67.21 on 4 and 250 DF,  p-value: < 2.2e-16
# Compare null and full models
anova(no.athlete.model, full.model)
## Analysis of Variance Table
## 
## Model 1: sauGPA ~ hsGPA + ACTscore + hoursSTUDY + gender
## Model 2: sauGPA ~ hsGPA + athlete + ACTscore + hoursSTUDY + gender
##   Res.Df    RSS Df Sum of Sq      F Pr(>F)
## 1    250 68.624                           
## 2    249 68.583  1  0.041239 0.1497 0.6991
# No athlete or hours studying model
no.hours.model <- lm(sauGPA ~ hsGPA + ACTscore + gender, data=gpa)
coef(no.hours.model)
## (Intercept)       hsGPA    ACTscore  gendermale 
## -0.64166123  0.70510522  0.04799384 -0.27525872
confint(no.hours.model)
##                  2.5 %      97.5 %
## (Intercept) -1.1277550 -0.15556748
## hsGPA        0.5414414  0.86876909
## ACTscore     0.0255507  0.07043699
## gendermale  -0.4113817 -0.13913572
summary(no.hours.model)
## 
## Call:
## lm(formula = sauGPA ~ hsGPA + ACTscore + gender, data = gpa)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -3.07674 -0.24683  0.06869  0.34865  1.00116 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.64166    0.24682  -2.600  0.00988 ** 
## hsGPA        0.70511    0.08310   8.485 1.89e-15 ***
## ACTscore     0.04799    0.01140   4.212 3.54e-05 ***
## gendermale  -0.27526    0.06912  -3.983 8.94e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5288 on 251 degrees of freedom
## Multiple R-squared:  0.5072, Adjusted R-squared:  0.5014 
## F-statistic: 86.13 on 3 and 251 DF,  p-value: < 2.2e-16
# Compare to our previous model
anova(no.hours.model, no.athlete.model)
## Analysis of Variance Table
## 
## Model 1: sauGPA ~ hsGPA + ACTscore + gender
## Model 2: sauGPA ~ hsGPA + ACTscore + hoursSTUDY + gender
##   Res.Df    RSS Df Sum of Sq      F  Pr(>F)  
## 1    251 70.175                              
## 2    250 68.624  1    1.5516 5.6525 0.01818 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#######################
#######################
## Questions 34a-34c ##
#######################
#######################

## a) ACT scores
act.model <- lm(sauGPA ~ ACTscore, data=gpa)
summary(act.model)
## 
## Call:
## lm(formula = sauGPA ~ ACTscore, data = gpa)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.3123 -0.2870  0.0755  0.4415  1.1649 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.13044    0.25245   0.517    0.606    
## ACTscore     0.10972    0.01086  10.105   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6333 on 253 degrees of freedom
## Multiple R-squared:  0.2875, Adjusted R-squared:  0.2847 
## F-statistic: 102.1 on 1 and 253 DF,  p-value: < 2.2e-16
anova(act.model)
## Analysis of Variance Table
## 
## Response: sauGPA
##            Df  Sum Sq Mean Sq F value    Pr(>F)    
## ACTscore    1  40.948  40.948   102.1 < 2.2e-16 ***
## Residuals 253 101.466   0.401                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## b) ACT scores on top of GPA
gpa.model <- lm(sauGPA ~ hsGPA, data=gpa)
gpa.act.model <- lm(sauGPA ~ hsGPA + ACTscore, data=gpa)
anova(gpa.model, gpa.act.model)
## Analysis of Variance Table
## 
## Model 1: sauGPA ~ hsGPA
## Model 2: sauGPA ~ hsGPA + ACTscore
##   Res.Df   RSS Df Sum of Sq      F    Pr(>F)    
## 1    253 78.66                                  
## 2    252 74.61  1      4.05 13.679 0.0002662 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## c) Add hours studied per week
gpa.act.hours.model <- lm(sauGPA ~ hsGPA + ACTscore + hoursSTUDY, data=gpa)
anova(gpa.model, gpa.act.model, gpa.act.hours.model)
## Analysis of Variance Table
## 
## Model 1: sauGPA ~ hsGPA
## Model 2: sauGPA ~ hsGPA + ACTscore
## Model 3: sauGPA ~ hsGPA + ACTscore + hoursSTUDY
##   Res.Df    RSS Df Sum of Sq       F    Pr(>F)    
## 1    253 78.660                                   
## 2    252 74.610  1    4.0500 13.9860 0.0002282 ***
## 3    251 72.683  1    1.9262  6.6518 0.0104758 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#######################
#######################
### Questions 35-37 ###
#######################
#######################

## T-test
t.test(sauGPA ~ athlete, var.equal=TRUE, sig.level=0.05, data=gpa)
## 
##  Two Sample t-test
## 
## data:  sauGPA by athlete
## t = -2.3323, df = 253, p-value = 0.02047
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.41952830 -0.03539793
## sample estimates:
##     mean in group athlete mean in group not athlete 
##                  2.501573                  2.729036
t.test(sauGPA ~ athlete, var.equal=TRUE, conf.level=0.95, data=gpa)$conf.int
## [1] -0.41952830 -0.03539793
## attr(,"conf.level")
## [1] 0.95
## Athlete alone
ath.model <- lm(sauGPA ~ athlete, data=gpa)
summary(ath.model)
## 
## Call:
## lm(formula = sauGPA ~ athlete, data = gpa)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.7290 -0.3453  0.1410  0.5410  1.2784 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         2.50157    0.07869  31.792   <2e-16 ***
## athletenot athlete  0.22746    0.09753   2.332   0.0205 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7423 on 253 degrees of freedom
## Multiple R-squared:  0.02105,    Adjusted R-squared:  0.01718 
## F-statistic:  5.44 on 1 and 253 DF,  p-value: 0.02047
(.02105)/((1-.02105)/(255-1-1))
## [1] 5.440165
## ACT 
act.model <- lm(sauGPA ~ ACTscore, data=gpa)
summary(act.model)
## 
## Call:
## lm(formula = sauGPA ~ ACTscore, data = gpa)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.3123 -0.2870  0.0755  0.4415  1.1649 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.13044    0.25245   0.517    0.606    
## ACTscore     0.10972    0.01086  10.105   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6333 on 253 degrees of freedom
## Multiple R-squared:  0.2875, Adjusted R-squared:  0.2847 
## F-statistic: 102.1 on 1 and 253 DF,  p-value: < 2.2e-16
## Athlete + ACT 
ath.act.model <- lm(sauGPA ~ ACTscore + athlete, data=gpa)
summary(ath.act.model)
## 
## Call:
## lm(formula = sauGPA ~ ACTscore + athlete, data = gpa)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.3498 -0.2638  0.1023  0.4435  1.1081 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         0.08294    0.25309   0.328   0.7434    
## ACTscore            0.10779    0.01088   9.909   <2e-16 ***
## athletenot athlete  0.14093    0.08335   1.691   0.0921 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.631 on 252 degrees of freedom
## Multiple R-squared:  0.2955, Adjusted R-squared:  0.2899 
## F-statistic: 52.86 on 2 and 252 DF,  p-value: < 2.2e-16
## Model comparison
anova(act.model, ath.act.model)
## Analysis of Variance Table
## 
## Model 1: sauGPA ~ ACTscore
## Model 2: sauGPA ~ ACTscore + athlete
##   Res.Df    RSS Df Sum of Sq      F  Pr(>F)  
## 1    253 101.47                              
## 2    252 100.33  1    1.1381 2.8587 0.09212 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
####################
####################
# Cross-validation #
####################
####################
library(DAAG)
cross.validated.model <- lm(sauGPA ~ hsGPA + athlete + ACTscore + hoursSTUDY + gender, data=gpa)
CVlm(gpa, cross.validated.model, m=10)
## Analysis of Variance Table
## 
## Response: sauGPA
##             Df Sum Sq Mean Sq F value  Pr(>F)    
## hsGPA        1   63.8    63.8  231.47 < 2e-16 ***
## athlete      1    0.5     0.5    1.70 0.19340    
## ACTscore     1    4.0     4.0   14.40 0.00019 ***
## hoursSTUDY   1    1.9     1.9    6.92 0.00907 ** 
## gender       1    3.7     3.7   13.57 0.00028 ***
## Residuals  249   68.6     0.3                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Warning in CVlm(gpa, cross.validated.model, m = 10): 
## 
##  As there is >1 explanatory variable, cross-validation
##  predicted values for a fold are not a linear function
##  of corresponding overall predicted values.  Lines that
##  are shown for the different folds are approximate

## 
## fold 1 
## Observations in test set: 25 
##                  9     48    65   71    86    92    99    100   101
## Predicted    2.764  2.026 3.376 2.91 3.050 3.285  2.41  2.040 3.127
## cvpred       2.814  2.024 3.347 2.90 3.025 3.235  2.43  2.068 3.129
## sauGPA       2.400  1.210 3.470 3.29 3.290 3.730  2.18  1.420 3.470
## CV residual -0.414 -0.814 0.123 0.39 0.265 0.495 -0.25 -0.648 0.341
##                 106    109  119    122   130   150   151    175     187
## Predicted    3.4890 3.0162 3.69  3.088 3.038 2.266 2.419  2.411  2.4800
## cvpred       3.4773 2.9837 3.64  3.116 3.036 2.242 2.422  2.461  2.5609
## sauGPA       3.4500 3.0600 3.77  2.730 3.590 3.000 2.710  1.830  2.5000
## CV residual -0.0273 0.0763 0.13 -0.386 0.554 0.758 0.288 -0.631 -0.0609
##               191     215   217   227   229    236    240
## Predicted   1.751  2.4334 2.428 2.149  2.21  2.982  1.791
## cvpred      1.699  2.4568 2.474 2.172  2.27  2.962  1.826
## sauGPA      2.220  2.4000 2.790 2.290  0.92  2.860  1.670
## CV residual 0.521 -0.0568 0.316 0.118 -1.35 -0.102 -0.156
## 
## Sum of squares = 5.69    Mean square = 0.23    n = 25 
## 
## fold 2 
## Observations in test set: 26 
##                 11     26     34     42    43     47      53     55    59
## Predicted    3.277  2.684  3.187  3.409 2.844  1.964  3.1188  1.772 3.165
## cvpred       3.287  2.712  3.218  3.421 2.852  1.998  3.1302  1.783 3.176
## sauGPA       3.130  2.500  2.820  3.030 3.250  1.210  3.1200  1.440 3.330
## CV residual -0.157 -0.212 -0.398 -0.391 0.398 -0.788 -0.0102 -0.343 0.154
##                 62    63      68    78     83   104  107   127    146
## Predicted    3.627  2.20  3.0608 2.351  3.309 3.519 2.68 2.228  2.603
## cvpred       3.638  2.24  3.0823 2.363  3.311 3.525 2.71 2.252  2.589
## sauGPA       3.490  0.58  3.0400 2.490  3.200 3.750 3.56 2.470  2.360
## CV residual -0.148 -1.66 -0.0423 0.127 -0.111 0.225 0.85 0.218 -0.229
##                149   196   204    207   218    235   245    246
## Predicted   2.8837 2.007  2.92  2.301 3.094  3.436 2.843  2.174
## cvpred      2.9025 2.004  2.94  2.316 3.051  3.392 2.864  2.207
## sauGPA      3.0000 2.870  1.67  2.130 3.500  3.110 3.300  1.860
## CV residual 0.0975 0.866 -1.27 -0.186 0.449 -0.282 0.436 -0.347
## 
## Sum of squares = 7.98    Mean square = 0.31    n = 26 
## 
## fold 3 
## Observations in test set: 26 
##                 2     27    31     36     37     39     41    61    76
## Predicted   2.986  3.099 2.690  3.059  2.339 3.3850  2.792 3.205 2.769
## cvpred      2.935  3.137 2.666  3.044  2.374 3.3653  2.817 3.158 2.755
## sauGPA      3.160  2.630 3.060  2.930  1.860 3.3800  2.210 3.730 3.290
## CV residual 0.225 -0.507 0.394 -0.114 -0.514 0.0147 -0.607 0.572 0.535
##                77     79   105  155     163    164   168     177   181
## Predicted   3.031  2.803 2.889 2.39  3.2956  1.737 2.190  2.6837 2.188
## cvpred      3.037  2.806 2.897 2.38  3.2299  1.744 2.181  2.6696 2.192
## sauGPA      3.470  2.500 3.000 3.39  3.1800  1.420 2.930  2.5800 2.380
## CV residual 0.433 -0.306 0.103 1.01 -0.0499 -0.324 0.749 -0.0896 0.188
##                183    192     206   219   241  243   252    255
## Predicted    1.742  2.692  1.5063 1.619 2.161 2.82  2.02  2.574
## cvpred       1.761  2.693  1.5203 1.645 2.168 2.85  2.03  2.589
## sauGPA       1.000  2.420  1.4300 1.920 2.600 3.38  0.19  2.330
## CV residual -0.761 -0.273 -0.0903 0.275 0.432 0.53 -1.84 -0.259
## 
## Sum of squares = 8.42    Mean square = 0.32    n = 26 
## 
## fold 4 
## Observations in test set: 26 
##                 4      14    15     46      57    87     89   91  113
## Predicted   2.408  3.0860 3.110  2.330  2.1444 3.058 3.3833 3.34 3.04
## cvpred      2.398  3.0747 3.081  2.324  2.1181 3.043 3.3711 3.35 3.02
## sauGPA      3.000  3.0600 3.400  2.190  2.0700 3.500 3.4100 3.50 3.17
## CV residual 0.602 -0.0147 0.319 -0.134 -0.0481 0.457 0.0389 0.15 0.15
##               131   136   143    157   170   173  174   194   210  213
## Predicted    3.52 2.803 3.341  2.577 1.721 1.755 2.18 1.788 2.338 1.93
## cvpred       3.50 2.791 3.322  2.558 1.696 1.724 2.15 1.754 2.312 1.90
## sauGPA       3.20 3.200 3.460  2.360 2.030 2.270 3.25 2.270 2.430 2.50
## CV residual -0.30 0.409 0.138 -0.198 0.334 0.546 1.10 0.516 0.118 0.60
##               216   222     223   228    234    242  247
## Predicted   1.670 2.456  1.6917  1.62 2.4865  2.176 2.58
## cvpred      1.639 2.427  1.6558  1.60 2.4697  2.159 2.56
## sauGPA      2.160 3.070  1.5600  0.21 2.5100  1.880 3.00
## CV residual 0.521 0.643 -0.0958 -1.39 0.0403 -0.279 0.44
## 
## Sum of squares = 6.22    Mean square = 0.24    n = 26 
## 
## fold 5 
## Observations in test set: 26 
##                  6      13    25    44      50     58    66      82    84
## Predicted    2.328  2.6402 3.156 2.722  2.1290  3.686  3.58  3.2844 2.375
## cvpred       2.323  2.7101 3.147 2.706  2.1383  3.721  3.60  3.2764 2.387
## sauGPA       1.690  2.6700 3.500 3.020  2.0800  3.140  3.47  3.2500 2.800
## CV residual -0.633 -0.0401 0.353 0.314 -0.0583 -0.581 -0.13 -0.0264 0.413
##                102    103   115    121    123   125   153   160   161
## Predicted   3.6079  3.207 3.010 3.1638  2.681 2.657 2.257 2.950 1.796
## cvpred      3.6355  3.238 3.002 3.1571  2.719 2.641 2.243 2.953 1.767
## sauGPA      3.6700  3.130 3.440 3.2100  2.130 2.790 2.570 3.060 2.080
## CV residual 0.0345 -0.108 0.438 0.0529 -0.589 0.149 0.327 0.107 0.313
##                179     200   205    211    221   225    250   253
## Predicted    3.212  2.3787 2.463  2.697  2.517 2.582  2.590 2.728
## cvpred       3.212  2.3961 2.453  2.677  2.529 2.588  2.615 2.715
## sauGPA       2.230  2.3800 2.860  2.370  2.360 2.790  1.710 3.060
## CV residual -0.982 -0.0161 0.407 -0.307 -0.169 0.202 -0.905 0.345
## 
## Sum of squares = 4.18    Mean square = 0.16    n = 26 
## 
## fold 6 
## Observations in test set: 26 
##                  8    22    29     40    49    51    70    72     75
## Predicted    3.202 3.601 2.887  3.214 2.414 2.022 2.185  2.85  3.181
## cvpred       3.262 3.608 2.881  3.227 2.421 2.001 2.153  2.81  3.201
## sauGPA       2.890 3.780 3.380  2.670 2.950 2.460 2.540  2.63  2.870
## CV residual -0.372 0.172 0.499 -0.557 0.529 0.459 0.387 -0.18 -0.331
##                  95    96    110    118   139    145   159     162     165
## Predicted    2.9407  1.86  3.297  2.534 2.217 2.5178 2.751  1.8508  1.4683
## cvpred       2.9617  1.90  3.303  2.534 2.212 2.4752 2.721  1.8763  1.4592
## sauGPA       2.8700  1.08  3.140  2.390 2.670 2.5600 3.000  1.7900  1.4000
## CV residual -0.0917 -0.82 -0.163 -0.144 0.458 0.0848 0.279 -0.0863 -0.0592
##                176    182    186   199   201    208   212    239
## Predicted    2.588  3.103 2.2307 3.068 2.153  2.145 1.889  2.149
## cvpred       2.593  3.104 2.2027 3.077 2.177  2.175 1.904  2.166
## sauGPA       2.330  2.620 2.2500 3.400 2.270  2.000 2.190  1.280
## CV residual -0.263 -0.484 0.0473 0.323 0.093 -0.175 0.286 -0.886
## 
## Sum of squares = 3.86    Mean square = 0.15    n = 26 
## 
## fold 7 
## Observations in test set: 25 
##                 1    12    19     20   45     52    54     73     74    81
## Predicted   2.163 2.425 2.776  2.923 2.83  2.859  2.90  2.919 2.8078  2.51
## cvpred      2.112 2.376 2.755  2.957 2.85  2.881  2.89  2.923 2.8108  2.52
## sauGPA      2.870 2.470 3.380  2.570 3.05  2.220  2.67  2.420 2.8400  2.42
## CV residual 0.758 0.094 0.625 -0.387 0.20 -0.661 -0.22 -0.503 0.0292 -0.10
##                 85    94    97   112    138   147     156   158    167
## Predicted    2.128 3.163 2.881 2.781  2.736 3.039  3.2923 2.886  2.576
## cvpred       2.104 3.199 2.895 2.768  2.741 3.034  3.3075 2.895  2.545
## sauGPA       1.730 3.400 3.270 3.080  2.470 3.310  3.2500 3.070  2.200
## CV residual -0.374 0.201 0.375 0.312 -0.271 0.276 -0.0575 0.175 -0.345
##              172   185  198    230    251   254
## Predicted   2.58 1.690 1.59 1.5666 3.0478 1.681
## cvpred      2.54 1.646 1.54 1.5002 3.0518 1.623
## sauGPA      2.98 2.360 2.20 1.5800 3.0800 2.620
## CV residual 0.44 0.714 0.66 0.0798 0.0282 0.997
## 
## Sum of squares = 4.78    Mean square = 0.19    n = 25 
## 
## fold 8 
## Observations in test set: 25 
##                 3     5      7        18    21     23    28     30     32
## Predicted   2.549 2.744  3.150  3.628666 3.300  1.900 2.915  3.647  3.958
## cvpred      2.541 2.737  3.173  3.630152 3.297  1.879 2.911  3.686  3.971
## sauGPA      2.670 3.000  2.930  3.630000 3.590  1.070 3.400  3.570  3.670
## CV residual 0.129 0.263 -0.243 -0.000152 0.293 -0.809 0.489 -0.116 -0.301
##                  33     98   111    114   120     124   126    132    133
## Predicted    3.0907  3.604 2.460  3.074 2.769  2.9925 2.908  2.105  2.453
## cvpred       3.0865  3.609 2.461  3.124 2.769  3.0159 2.907  2.098  2.442
## sauGPA       3.0200  3.240 2.760  2.600 3.250  2.9200 3.140  1.850  2.000
## CV residual -0.0665 -0.369 0.299 -0.524 0.481 -0.0959 0.233 -0.248 -0.442
##               135    137    148   152    180   189   248
## Predicted   3.249  2.755  2.966 2.175  2.813 2.683 2.831
## cvpred      3.248  2.783  2.961 2.195  2.841 2.663 2.833
## sauGPA      3.400  1.830  2.650 2.670  2.470 3.470 2.980
## CV residual 0.152 -0.953 -0.311 0.475 -0.371 0.807 0.147
## 
## Sum of squares = 4.35    Mean square = 0.17    n = 25 
## 
## fold 9 
## Observations in test set: 25 
##                10     24    64     67    90     93   108   116   134   140
## Predicted   2.506 1.9830  1.83 3.3982 3.032  1.945  3.06 2.055 1.790 2.001
## cvpred      2.501 1.9711  1.83 3.3633 3.035  1.942  3.06 2.019 1.788 2.001
## sauGPA      3.330 2.0000  0.00 3.4000 3.270  1.670  2.93 2.470 2.670 2.670
## CV residual 0.829 0.0289 -1.83 0.0367 0.235 -0.272 -0.13 0.451 0.882 0.669
##               144   166   169  171   178  184   195  202    203   224
## Predicted   2.702 1.622 2.155 2.78 2.419 2.38 2.101 2.85  2.481 2.594
## cvpred      2.699 1.602 2.136 2.78 2.404 2.37 2.057 2.85  2.484 2.575
## sauGPA      2.920 1.870 2.400 3.15 3.190 3.33 2.260 3.07  1.600 2.730
## CV residual 0.221 0.268 0.264 0.37 0.786 0.96 0.203 0.22 -0.884 0.155
##               231   232  237   238    249
## Predicted   3.327  2.00 2.31 2.511 2.9773
## cvpred      3.314  2.01 2.31 2.508 2.9757
## sauGPA      3.450  0.83 2.54 2.800 3.0600
## CV residual 0.136 -1.18 0.23 0.292 0.0843
## 
## Sum of squares = 9.91    Mean square = 0.4    n = 25 
## 
## fold 10 
## Observations in test set: 25 
##                16    17     35     38      56    60    69     80    88
## Predicted    3.58  2.98  1.708  3.759  2.7714 2.354 3.228 3.1268 2.805
## cvpred       3.63  3.07  1.732  3.779  2.7754 2.284 3.176 3.1396 2.849
## sauGPA       3.47  0.00  1.000  3.500  2.6900 2.870 3.610 3.2000 3.220
## CV residual -0.16 -3.07 -0.732 -0.279 -0.0854 0.586 0.434 0.0604 0.371
##                117    128   129   141   142   154    188   190     193
## Predicted    2.410  2.954  1.79 3.433 2.645 2.326  2.882 2.760  3.3891
## cvpred       2.404  2.959  1.84 3.485 2.659 2.333  2.949 2.823  3.4945
## sauGPA       1.440  2.270  0.27 3.610 3.060 2.540  2.880 2.930  3.4000
## CV residual -0.964 -0.689 -1.57 0.125 0.401 0.207 -0.069 0.107 -0.0945
##               197  209    214    220   226   233   244
## Predicted   2.332 2.82  2.857  1.907 2.849 2.684 1.833
## cvpred      2.383 2.83  2.854  1.941 2.857 2.712 1.863
## sauGPA      3.060 3.20  2.640  1.290 3.270 3.000 1.980
## CV residual 0.677 0.37 -0.214 -0.651 0.413 0.288 0.117
## 
## Sum of squares = 16.2    Mean square = 0.65    n = 25 
## 
## Overall (Sum over all 25 folds) 
##    ms 
## 0.281
## I'm going to make these comments so they won't print out.
## These are the commands I used to get the other avg. mse values
# cross.validated.model <- lm(sauGPA ~ hsGPA + ACTscore + hoursSTUDY + gender, data=gpa)
# CVlm(gpa, cross.validated.model, m=10)

# cross.validated.model <- lm(sauGPA ~ hsGPA + ACTscore + gender, data=gpa)
# CVlm(gpa, cross.validated.model, m=10)

# cross.validated.model <- lm(sauGPA ~ hsGPA + ACTscore, data=gpa)
# CVlm(gpa, cross.validated.model, m=10)

# cross.validated.model <- lm(sauGPA ~ hsGPA, data=gpa)
# CVlm(gpa, cross.validated.model, m=10)

# cross.validated.model <- lm(sauGPA ~ hsGPA*athlete*ACTscore*hoursSTUDY*gender, data=gpa)
# CVlm(gpa, cross.validated.model, m=10)



####################
####################
## Best subsets  ###
####################
####################

# All Subsets Regression
library(leaps)
leaps<-regsubsets(sauGPA ~ hsGPA + athlete + ACTscore + hoursSTUDY + gender,
                  data=gpa, nbest=10)
# view results 
summary(leaps)
## Subset selection object
## Call: regsubsets.formula(sauGPA ~ hsGPA + athlete + ACTscore + hoursSTUDY + 
##     gender, data = gpa, nbest = 10)
## 5 Variables  (and intercept)
##                    Forced in Forced out
## hsGPA                  FALSE      FALSE
## athletenot athlete     FALSE      FALSE
## ACTscore               FALSE      FALSE
## hoursSTUDY             FALSE      FALSE
## gendermale             FALSE      FALSE
## 10 subsets of each size up to 5
## Selection Algorithm: exhaustive
##           hsGPA athletenot athlete ACTscore hoursSTUDY gendermale
## 1  ( 1 )  "*"   " "                " "      " "        " "       
## 1  ( 2 )  " "   " "                "*"      " "        " "       
## 1  ( 3 )  " "   " "                " "      "*"        " "       
## 1  ( 4 )  " "   " "                " "      " "        "*"       
## 1  ( 5 )  " "   "*"                " "      " "        " "       
## 2  ( 1 )  "*"   " "                "*"      " "        " "       
## 2  ( 2 )  "*"   " "                " "      " "        "*"       
## 2  ( 3 )  "*"   " "                " "      "*"        " "       
## 2  ( 4 )  "*"   "*"                " "      " "        " "       
## 2  ( 5 )  " "   " "                "*"      " "        "*"       
## 2  ( 6 )  " "   " "                "*"      "*"        " "       
## 2  ( 7 )  " "   "*"                "*"      " "        " "       
## 2  ( 8 )  " "   " "                " "      "*"        "*"       
## 2  ( 9 )  " "   "*"                " "      "*"        " "       
## 2  ( 10 ) " "   "*"                " "      " "        "*"       
## 3  ( 1 )  "*"   " "                "*"      " "        "*"       
## 3  ( 2 )  "*"   " "                " "      "*"        "*"       
## 3  ( 3 )  "*"   " "                "*"      "*"        " "       
## 3  ( 4 )  "*"   "*"                "*"      " "        " "       
## 3  ( 5 )  "*"   "*"                " "      " "        "*"       
## 3  ( 6 )  "*"   "*"                " "      "*"        " "       
## 3  ( 7 )  " "   " "                "*"      "*"        "*"       
## 3  ( 8 )  " "   "*"                "*"      " "        "*"       
## 3  ( 9 )  " "   "*"                "*"      "*"        " "       
## 3  ( 10 ) " "   "*"                " "      "*"        "*"       
## 4  ( 1 )  "*"   " "                "*"      "*"        "*"       
## 4  ( 2 )  "*"   "*"                "*"      " "        "*"       
## 4  ( 3 )  "*"   "*"                "*"      "*"        " "       
## 4  ( 4 )  "*"   "*"                " "      "*"        "*"       
## 4  ( 5 )  " "   "*"                "*"      "*"        "*"       
## 5  ( 1 )  "*"   "*"                "*"      "*"        "*"
# plot a table of models showing variables in each model.
plot(leaps,scale="r2")

# plot statistic by subset size 
library(car)
#subsets(leaps, statistic="rsq")



####################
####################
#### Stepwise   ####
####################
####################
# Using AIC
library(MASS)
model <- lm(sauGPA ~ hsGPA + athlete + ACTscore + hoursSTUDY + gender, data=gpa)
stepAIC(model, scale = 0,
        direction = c("both"),
        trace = 0, use.start = FALSE,
        k = 2)
## 
## Call:
## lm(formula = sauGPA ~ hsGPA + ACTscore + hoursSTUDY + gender, 
##     data = gpa)
## 
## Coefficients:
## (Intercept)        hsGPA     ACTscore   hoursSTUDY   gendermale  
##     -0.5662       0.6889       0.0425       0.0093      -0.2640
# Note: Change to trace = 1 above to see more
# detailed output
####################
####################
#### Ridge Reg   ###
####################
####################

# Find coefficients of linear model
coef(lm(sauGPA ~ hsGPA + hsRANK + athlete + ACTscore + hoursSTUDY + gender, 
                  data=gpa))
##        (Intercept)              hsGPA             hsRANK 
##           -0.66883            0.75144           -0.00156 
## athletenot athlete           ACTscore         hoursSTUDY 
##           -0.03159            0.04356            0.00933 
##         gendermale 
##           -0.28076
# Run the ridge regression
ridge <- lm.ridge(sauGPA ~ hsGPA + hsRANK + athlete + ACTscore + hoursSTUDY + gender, 
                  data=gpa, lambda = seq(0, 50, .1))

# Find the value of lambda
select(lm.ridge(sauGPA ~ hsGPA + hsRANK + athlete + ACTscore + hoursSTUDY + gender, 
                  data=gpa, lambda = seq(0, 50, .01)))
## modified HKB estimator is 5.45 
## modified L-W estimator is 3.81 
## smallest value of GCV  at 6.43
# Get coefficients for ridge regression using lambda = 6.43
lm.ridge(sauGPA ~ hsGPA + hsRANK + athlete + ACTscore + hoursSTUDY + gender, 
                  data=gpa, lambda = 6.43)
##                                 hsGPA             hsRANK 
##          -0.488398           0.662211           0.000206 
## athletenot athlete           ACTscore         hoursSTUDY 
##          -0.024268           0.043171           0.009273 
##         gendermale 
##          -0.270204
# Load a library to plot the coefficients
library(genridge)
# Plot the coefficients
traceplot(ridge)
abline(v=6.43, lty=1, lw=3)