Load packages
library(mosaic)
library(dplyr)
library(ggplot2)
library(ggvis)
library(parallel)


########################
########################
### Polynomial Reg. ###
########################
########################

## Load data
geiger <- read.csv("http://www.bradthiessen.com/html5/data/geiger.csv")
head(geiger)
##   time counts
## 1    0  126.6
## 2    1  101.8
## 3    2   71.6
## 4    3   85.1
## 5    4  101.6
## 6    5   67.5
tail(geiger)
##    time counts
## 26   25   13.4
## 27   26   26.8
## 28   27    9.8
## 29   28   18.8
## 30   29   25.9
## 31   30   19.3
cor(counts, time, data=geiger)
## [1] -0.8767747
## Fit linear model
lin.mod <- lm(counts ~ time, data=geiger)
summary(lin.mod)
## 
## Call:
## lm(formula = counts ~ time, data = geiger)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -19.045 -10.021  -2.083   5.126  40.461 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  86.1389     5.0272  17.134  < 2e-16 ***
## time         -2.8262     0.2879  -9.818 9.99e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 14.34 on 29 degrees of freedom
## Multiple R-squared:  0.7687, Adjusted R-squared:  0.7608 
## F-statistic:  96.4 on 1 and 29 DF,  p-value: 9.991e-11
anova(lin.mod)
## Analysis of Variance Table
## 
## Response: counts
##           Df  Sum Sq Mean Sq F value    Pr(>F)    
## time       1 19809.5 19809.5  96.397 9.991e-11 ***
## Residuals 29  5959.5   205.5                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## R-squared is 0.7687 (looks good)

## Assumptions check
par(mfrow=c(2,2))    # visualize four graphs at once
plot(lin.mod)

par(mfrow=c(1,1))    # reset the graphics defaults

## Show linear model
geiger %>% 
  ggvis(x=~time, y=~counts, fill := "steelblue", stroke:="steelblue", fillOpacity:=.7, strokeOpacity:=.7) %>%
  layer_points() %>%
    layer_model_predictions(model = "lm", strokeWidth:=5, strokeOpacity:=.7, stroke:="red") %>%
  add_axis("x", title = "time") %>%
  scale_numeric("x", domain=c(0,35), nice=FALSE, clamp=TRUE)  %>%
  add_axis("y", title = "counts") %>%
  scale_numeric("y", domain=c(0, 130), nice=FALSE, clamp=TRUE)

## Show lowess
geiger %>% 
  ggvis(x=~time, y=~counts, fill := "steelblue", stroke:="steelblue", fillOpacity:=.7, strokeOpacity:=.7) %>%
  layer_points() %>%
    layer_smooths(strokeWidth:=5, strokeOpacity:=.7, stroke:="red") %>%
  add_axis("x", title = "time") %>%
  scale_numeric("x", domain=c(0,35), nice=FALSE, clamp=TRUE)  %>%
  add_axis("y", title = "counts") %>%
  scale_numeric("y", domain=c(0, 130), nice=FALSE, clamp=TRUE)

## LOWESS with slider for span
library(shiny)

geiger %>% 
  ggvis(x=~time, y=~counts, fill := "steelblue", stroke:="steelblue", fillOpacity:=.7, strokeOpacity:=.7) %>%
  layer_points() %>%
    layer_smooths(strokeWidth:=5, strokeOpacity:=.7, stroke:="red",
                  span = input_slider(min=0.1, max=1, value=.5, step=.1, 
                                      label="smoothing span")) %>%
  add_axis("x", title = "time") %>%
  scale_numeric("x", domain=c(0,35), nice=FALSE, clamp=TRUE)  %>%
  add_axis("y", title = "counts") %>%
  scale_numeric("y", domain=c(0, 130), nice=FALSE, clamp=TRUE)
## Warning: Can't output dynamic/interactive ggvis plots in a knitr document.
## Generating a static (non-dynamic, non-interactive) version of the plot.

## Add a quadratic term
quad.mod = lm(counts ~ time + I(time^2), data=geiger)
summary(quad.mod)
## 
## Call:
## lm(formula = counts ~ time + I(time^2), data = geiger)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -22.287  -6.047  -1.605   4.231  20.593 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 107.97130    4.65611  23.189  < 2e-16 ***
## time         -7.34330    0.71841 -10.222 5.92e-11 ***
## I(time^2)     0.15057    0.02314   6.507 4.74e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.205 on 28 degrees of freedom
## Multiple R-squared:  0.9079, Adjusted R-squared:  0.9014 
## F-statistic: 138.1 on 2 and 28 DF,  p-value: 3.143e-15
## R-squared = 0.9079

## Check assumptions
par(mfrow=c(2,2))    # visualize four graphs at once
plot(quad.mod)

par(mfrow=c(1,1))    # reset the graphics defaults

## Plot model
plot(counts~time, data=geiger)
curve(107.97130 - 7.3433*x + 0.15057*x^2,add=T, lw=4, col=2)

## Compare to linear model
null.mod <- lm(counts ~ 1, data=geiger)
anova(null.mod, lin.mod, quad.mod)
## Analysis of Variance Table
## 
## Model 1: counts ~ 1
## Model 2: counts ~ time
## Model 3: counts ~ time + I(time^2)
##   Res.Df     RSS Df Sum of Sq       F    Pr(>F)    
## 1     30 25769.0                                   
## 2     29  5959.5  1   19809.5 233.797 4.037e-15 ***
## 3     28  2372.4  1    3587.1  42.335 4.736e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Add a cubic term
cub.mod = lm(counts ~ time + I(time^2) + I(time^3), data=geiger)
summary(cub.mod)
## 
## Call:
## lm(formula = counts ~ time + I(time^2) + I(time^3), data = geiger)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -23.707  -5.615  -1.108   6.758  21.680 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 113.251350   5.760812  19.659  < 2e-16 ***
## time         -9.646057   1.690640  -5.706 4.61e-06 ***
## I(time^2)     0.345644   0.132206   2.614   0.0144 *  
## I(time^3)    -0.004335   0.002894  -1.498   0.1458    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.007 on 27 degrees of freedom
## Multiple R-squared:  0.915,  Adjusted R-squared:  0.9056 
## F-statistic: 96.88 on 3 and 27 DF,  p-value: 1.442e-14
## R-squared = 0.9150

## Check assumptions
par(mfrow=c(2,2))    # visualize four graphs at once
plot(quad.mod)

par(mfrow=c(1,1))    # reset the graphics defaults

## Plot model
plot(counts~time, data=geiger)
curve(113.25135 - 9.646057*x + 0.345644*x^2 - 0.004335*x^3,add=T, lw=4, col=2)

## Compare to other model
anova(null.mod, lin.mod, quad.mod, cub.mod)
## Analysis of Variance Table
## 
## Model 1: counts ~ 1
## Model 2: counts ~ time
## Model 3: counts ~ time + I(time^2)
## Model 4: counts ~ time + I(time^2) + I(time^3)
##   Res.Df     RSS Df Sum of Sq       F    Pr(>F)    
## 1     30 25769.0                                   
## 2     29  5959.5  1   19809.5 244.176 4.763e-15 ***
## 3     28  2372.4  1    3587.1  44.215 3.897e-07 ***
## 4     27  2190.5  1     182.0   2.243    0.1458    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Let's go to a 7th degree model
seven.mod = lm(counts ~ time + I(time^2) + I(time^3) +
               I(time^4) + I(time^5) + I(time^6) + I(time^7), data=geiger)
summary(seven.mod)
## 
## Call:
## lm(formula = counts ~ time + I(time^2) + I(time^3) + I(time^4) + 
##     I(time^5) + I(time^6) + I(time^7), data = geiger)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -16.9967  -5.0016  -0.3222   4.3678  23.4455 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.238e+02  8.224e+00  15.057 2.11e-13 ***
## time        -3.164e+01  1.112e+01  -2.845  0.00917 ** 
## I(time^2)    9.881e+00  4.686e+00   2.109  0.04606 *  
## I(time^3)   -1.712e+00  8.409e-01  -2.036  0.05345 .  
## I(time^4)    1.524e-01  7.576e-02   2.011  0.05614 .  
## I(time^5)   -7.139e-03  3.601e-03  -1.982  0.05951 .  
## I(time^6)    1.677e-04  8.623e-05   1.945  0.06408 .  
## I(time^7)   -1.558e-06  8.191e-07  -1.902  0.06979 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.766 on 23 degrees of freedom
## Multiple R-squared:  0.9314, Adjusted R-squared:  0.9105 
## F-statistic: 44.63 on 7 and 23 DF,  p-value: 6.731e-12
## R-squared = 0.9314

## Check assumptions
par(mfrow=c(2,2))    # visualize four graphs at once
plot(seven.mod)

par(mfrow=c(1,1))    # reset the graphics defaults

## Plot model
coef(seven.mod)
##   (Intercept)          time     I(time^2)     I(time^3)     I(time^4) 
##  1.238349e+02 -3.164351e+01  9.881051e+00 -1.711959e+00  1.523853e-01 
##     I(time^5)     I(time^6)     I(time^7) 
## -7.138673e-03  1.677408e-04 -1.557870e-06
plot(counts~time, data=geiger)
curve(123.8349 - 31.64351*x + 9.881051*x^2 - 1.711959*x^3 +
        0.1523853*x^4 - 0.00713867*x^5 + 0.0001677408*x^6 -
        0.00000155787*x^7 ,add=T, lw=4, col=2)

## Compare to other model
anova(quad.mod, seven.mod)
## Analysis of Variance Table
## 
## Model 1: counts ~ time + I(time^2)
## Model 2: counts ~ time + I(time^2) + I(time^3) + I(time^4) + I(time^5) + 
##     I(time^6) + I(time^7)
##   Res.Df    RSS Df Sum of Sq      F Pr(>F)
## 1     28 2372.4                           
## 2     23 1767.2  5    605.22 1.5754 0.2066
## Overfits the data

## Cross-validate to choose best model
library(DAAG)
geiger$time2 <- geiger$time^2
geiger$time3 <- geiger$time^3
geiger$time4 <- geiger$time^4

cross.validated.model <- lm(counts ~ time + time2 + time3 + time4, data=geiger)
## This command yields ms = 110
# CVlm(geiger, cross.validated.model, m=10)

cross.validated.model <- lm(counts ~ time + time2 + time3, data=geiger)
## This command yields ms = 105
# CVlm(geiger, cross.validated.model, m=10)

cross.validated.model <- lm(counts ~ time + time2, data=geiger)
## This command yields ms = 103
# CVlm(geiger, cross.validated.model, m=10)

cross.validated.model <- lm(counts ~ time, data=geiger)
## This command yields ms = 219
# CVlm(geiger, cross.validated.model, m=10)

## The model with the squared term has the lowest average mean square error



### Another example - speed vs stopping distance
data(cars)
head(cars)
##   speed dist
## 1     4    2
## 2     4   10
## 3     7    4
## 4     7   22
## 5     8   16
## 6     9   10
## Scatterplot with lowess
cars %>% 
  ggvis(x=~speed, y=~dist, fill := "steelblue", stroke:="steelblue", fillOpacity:=.7, strokeOpacity:=.7) %>%
  layer_points() %>%
    layer_smooths(strokeWidth:=5, strokeOpacity:=.7, stroke:="red") %>%
  add_axis("x", title = "speed") %>%
  scale_numeric("x", domain=c(0,26), nice=FALSE, clamp=TRUE)  %>%
  add_axis("y", title = "stopping distance") %>%
  scale_numeric("y", domain=c(0, 125), nice=FALSE, clamp=TRUE)

## Set-up models
null.model <- lm(dist ~ 1, data=cars)
linear.model <- lm(dist ~ speed, data=cars)
quadratic.model <- lm(dist ~ speed + I(speed^2), data=cars)
cubic.model <- lm(dist ~ speed + I(speed^2) + I(speed^3), data=cars)

## Test models
anova(null.model, linear.model, quadratic.model, cubic.model)
## Analysis of Variance Table
## 
## Model 1: dist ~ 1
## Model 2: dist ~ speed
## Model 3: dist ~ speed + I(speed^2)
## Model 4: dist ~ speed + I(speed^2) + I(speed^3)
##   Res.Df   RSS Df Sum of Sq       F    Pr(>F)    
## 1     49 32539                                   
## 2     48 11354  1   21185.5 91.6398 1.601e-12 ***
## 3     47 10825  1     528.8  2.2874    0.1373    
## 4     46 10634  1     190.4  0.8234    0.3689    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
plot(cars, main="lowess cars")

plot(dist~speed, data=cars)
curve(-17.579095 + 3.932409*x , add=T, lw=3)
curve(2.4701378 + 0.9132876*x + 0.0999593*x^2 , add=T, col=2, lw=3)

## Load data
prestige <- read.csv("http://www.bradthiessen.com/html5/data/prestige.csv")

## Set-up models
mod0 <- lm(prestige ~ 1, data=prestige)
mod1 <- lm(prestige ~ income, data=prestige)
mod2 <- lm(prestige ~ income + I(income^2), data=prestige)
mod3 <- lm(prestige ~ income + I(income^2) + I(income^3), data=prestige)

## Test models
anova(mod0, mod1, mod2, mod3)
## Analysis of Variance Table
## 
## Model 1: prestige ~ 1
## Model 2: prestige ~ income
## Model 3: prestige ~ income + I(income^2)
## Model 4: prestige ~ income + I(income^2) + I(income^3)
##   Res.Df   RSS Df Sum of Sq        F    Pr(>F)    
## 1    101 29895                                    
## 2    100 14616  1   15279.3 124.0617 < 2.2e-16 ***
## 3     99 12077  1    2539.3  20.6179 1.597e-05 ***
## 4     98 12070  1       7.4   0.0598    0.8073    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
plot(prestige~income, data=prestige)
curve(27.141176368 + 0.002896799*x , add=T, lw=3)
curve(14.18319 + 0.006153506*x - 0.0000001433097*x^2 , add=T, col=2, lw=3)

## Cross validate
prestige$income2 <- prestige$income^2
prestige$income3 <- prestige$income^3
prestige$income4 <- prestige$income^4

cross.validated.model <- lm(prestige ~ income + income2 + income3 + income4, data=prestige)
## This command yields ms = 134
# CVlm(prestige, cross.validated.model, m=10)

cross.validated.model <- lm(prestige ~ income + income2 + income3, data=prestige)
## This command yields ms = 131
# CVlm(prestige, cross.validated.model, m=10)

cross.validated.model <- lm(prestige ~ income + income2, data=prestige)
## This command yields ms = 130
# CVlm(prestige, cross.validated.model, m=10)

cross.validated.model <- lm(prestige ~ income, data=prestige)
## This command yields ms = 154
# CVlm(prestige, cross.validated.model, m=10)

### Cross-validation indicates to include quadratic term

## Check assumptions
par(mfrow=c(2,2))    # visualize four graphs at once
plot(mod2)

par(mfrow=c(1,1))    # reset the graphics defaults




########################
########################
#### Nonlinear Reg. #### -- Skip?
########################
########################

## Temperature and vapor pressure of mercury (mm of mercury)
# Handbook of Chemistry and Physics, CRC Press (1973)
data(pressure)
head(pressure)
##   temperature pressure
## 1           0   0.0002
## 2          20   0.0012
## 3          40   0.0060
## 4          60   0.0300
## 5          80   0.0900
## 6         100   0.2700
# Change temperature to Kelvin
pressure$temperature = pressure$temperature + 273.15
# Change pressure to kiloPascals
pressure$pressure = pressure$pressure * .1333

# Source:  http://ww2.coastal.edu/kingw/statistics/R-tutorials/simplenonlinear.html
# When a liquid is placed in an evacuated, closed chamber 
# (i.e., in a "vacuum"), it begins to evaporate. This 
# vaporization will continue until the vapor and the liquid 
# reach a dynamic equilibrium (the number of particles of
# liquid that go into the vapor is equal to the number of
# particles of vapor that go into the liquid). At this point,
# the pressure exerted by the vapor on the liquid is called
# the vapor pressure (or equilibrium vapor pressure) of the
# substance. Vapor pressure has a complex relationship with
# temperature, and so far as I know, there is no law in theory
# that specifies this relationship (at least not in the case of
# mercury). Since we are dealing with a liquid/vapor phase
# transition, and not just a gas, the simple gas laws you
# learned back in high school chemistry don't apply. The vapor
# pressure of mercury and its relationship to temperature has
# been an important question, for example, in the calibration
# of scientific instruments like mercury thermometers and barometers.


plot(pressure)

summary(pressure)
##   temperature       pressure        
##  Min.   :273.1   Min.   :  0.00003  
##  1st Qu.:363.1   1st Qu.:  0.02399  
##  Median :453.1   Median :  1.17304  
##  Mean   :453.1   Mean   : 16.57408  
##  3rd Qu.:543.1   3rd Qu.: 16.86245  
##  Max.   :633.1   Max.   :107.43980
# Plot various log transformations
## Identity, log(y), power, log(x)
## Identity, exponential, power, log

par(mfrow=c(1,4))    # one row of four graphs
plot(pressure ~ temperature, data=pressure,
     main="Vapor Pressure\nof Mercury", 
     xlab="Temperature (degrees Kelvin)", ylab="Pressure (kPascals)")
plot(pressure ~ temperature, data=pressure,
     main="Vapor Pressure\nof Mercury", 
     xlab="Temperature (degrees Kelvin)", ylab="Pressure (kPascals)", log="y")
plot(pressure ~ temperature, data=pressure,
     main="Vapor Pressure\nof Mercury", 
     xlab="Temperature (degrees Kelvin)", ylab="Pressure (kPascals)", log="xy")
plot(pressure ~ temperature, data=pressure,
     main="Vapor Pressure\nof Mercury", 
     xlab="Temperature (degrees Kelvin)", ylab="Pressure (kPascals)", log="x")

par(mfrow=c(1,1))    # reset the graphics defaults

## Plot power model
linear.model <- lm(pressure ~ temperature, data=pressure)
summary(linear.model)
## 
## Call:
## lm(formula = pressure ~ temperature, data = pressure)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -21.072 -15.604  -4.378   9.637  54.577 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -74.7835    19.6282  -3.810 0.001400 ** 
## temperature   0.2016     0.0421   4.788 0.000171 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 20.1 on 17 degrees of freedom
## Multiple R-squared:  0.5742, Adjusted R-squared:  0.5492 
## F-statistic: 22.93 on 1 and 17 DF,  p-value: 0.000171
quad.model <- lm(pressure ~ temperature + I(temperature^2), data=pressure)
summary(quad.model)
## 
## Call:
## lm(formula = pressure ~ temperature + I(temperature^2), data = pressure)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -12.6824  -7.2504  -0.1803   6.4301  22.7109 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       2.272e+02  4.229e+01   5.373 6.22e-05 ***
## temperature      -1.214e+00  1.941e-01  -6.255 1.15e-05 ***
## I(temperature^2)  1.562e-03  2.129e-04   7.336 1.67e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.92 on 16 degrees of freedom
## Multiple R-squared:  0.9024, Adjusted R-squared:  0.8902 
## F-statistic:    74 on 2 and 16 DF,  p-value: 8.209e-09
power.model <- lm(log(pressure) ~ log(temperature), data=pressure)
summary(power.model)
## 
## Call:
## lm(formula = log(pressure) ~ log(temperature), data = pressure)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.2390 -0.3594  0.2142  0.4625  0.6045 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      -108.1138     3.1744  -34.06   <2e-16 ***
## log(temperature)   17.6150     0.5212   33.80   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5736 on 17 degrees of freedom
## Multiple R-squared:  0.9853, Adjusted R-squared:  0.9845 
## F-statistic:  1142 on 1 and 17 DF,  p-value: < 2.2e-16
newdf <- data.frame(temperature=seq(min(pressure$temperature), 
                                    max(pressure$temperature), len=19))
plot(pressure ~ temperature, data=pressure)
lines(newdf$temperature, exp(predict(power.model, newdf)), col=2, lw=4)
lines(newdf$temperature, predict(linear.model, newdf), col=3, lw=4)
lines(newdf$temperature, predict(quad.model, newdf), col=4, lw=4)
text(600, .75, substitute(plain("R-square: ") * r2, list(r2=summary(power.model)$r.squared)))

## Try with prestige
par(mfrow=c(1,4))    # one row of four graphs
plot(prestige ~ income, data=prestige,
     xlab="Income", ylab="Prestige")
plot(prestige ~ income, data=prestige,
     xlab="Income", ylab="Prestige", log="x")
plot(prestige ~ income, data=prestige,
     xlab="Income", ylab="Prestige", log="y")
plot(prestige ~ income, data=prestige,
     xlab="Income", ylab="Prestige", log="xy")

par(mfrow=c(1,1))    # reset the graphics defaults


linear.model <- lm(prestige ~ income, data=prestige)
summary(linear.model)
## 
## Call:
## lm(formula = prestige ~ income, data = prestige)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -33.007  -8.378  -2.378   8.432  32.084 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 2.714e+01  2.268e+00   11.97   <2e-16 ***
## income      2.897e-03  2.833e-04   10.22   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12.09 on 100 degrees of freedom
## Multiple R-squared:  0.5111, Adjusted R-squared:  0.5062 
## F-statistic: 104.5 on 1 and 100 DF,  p-value: < 2.2e-16
power.model <- lm(log(prestige) ~ log(income), data=prestige)
summary(power.model)
## 
## Call:
## lm(formula = log(prestige) ~ log(income), data = prestige)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.67077 -0.16653 -0.02778  0.18921  0.61675 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.54292    0.37880  -1.433    0.155    
## log(income)  0.49855    0.04364  11.425   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2594 on 100 degrees of freedom
## Multiple R-squared:  0.5662, Adjusted R-squared:  0.5619 
## F-statistic: 130.5 on 1 and 100 DF,  p-value: < 2.2e-16
logx.model <- lm(prestige ~ log(income), data=prestige)
summary(logx.model)
## 
## Call:
## lm(formula = prestige ~ log(income), data = prestige)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -19.114  -9.342  -1.218   8.101  30.454 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -139.856     16.954  -8.249  6.6e-13 ***
## log(income)   21.556      1.953  11.037  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11.61 on 100 degrees of freedom
## Multiple R-squared:  0.5492, Adjusted R-squared:  0.5447 
## F-statistic: 121.8 on 1 and 100 DF,  p-value: < 2.2e-16
logy.model <- lm(log(prestige) ~ income, data=prestige)
summary(logy.model)
## 
## Call:
## lm(formula = log(prestige) ~ income, data = prestige)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.72380 -0.16408 -0.01177  0.22431  0.64408 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 3.353e+00  5.466e-02  61.338  < 2e-16 ***
## income      6.208e-05  6.829e-06   9.091 9.73e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2914 on 100 degrees of freedom
## Multiple R-squared:  0.4525, Adjusted R-squared:  0.447 
## F-statistic: 82.64 on 1 and 100 DF,  p-value: 9.727e-15
newdf <- data.frame(income=seq(min(prestige$income), 
                                    max(prestige$income), len=100))
plot(prestige ~ income, data=prestige)
lines(newdf$income, exp(predict(power.model, newdf)), col=2, lw=4)
lines(newdf$income, predict(linear.model, newdf), col=3, lw=4)
lines(newdf$income, predict(logx.model, newdf), col=4, lw=4)

## Lowess for prestige
prestige %>% 
  ggvis(x=~income, y=~prestige, fill := "steelblue", stroke:="steelblue", fillOpacity:=.7, strokeOpacity:=.7) %>%
  layer_points() %>%
    layer_smooths(stroke:="red", strokeWidth:=5) %>%
  add_axis("x", title = "Income") %>%
  scale_numeric("x", domain=c(0,25000), nice=FALSE, clamp=TRUE)  %>%
  add_axis("y", title = "Prestige") %>%
  scale_numeric("y", domain=c(0, 100), nice=FALSE, clamp=TRUE)

########################
########################
##### Robust Reg. ######
########################
########################

# First, fit linear model
linear.model <- lm(prestige ~ income + education + percwomn, data=prestige)
summary(linear.model)
## 
## Call:
## lm(formula = prestige ~ income + education + percwomn, data = prestige)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -19.8246  -5.3332  -0.1364   5.1587  17.5045 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -6.7943342  3.2390886  -2.098   0.0385 *  
## income       0.0013136  0.0002778   4.729 7.58e-06 ***
## education    4.1866373  0.3887013  10.771  < 2e-16 ***
## percwomn    -0.0089052  0.0304071  -0.293   0.7702    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.846 on 98 degrees of freedom
## Multiple R-squared:  0.7982, Adjusted R-squared:  0.792 
## F-statistic: 129.2 on 3 and 98 DF,  p-value: < 2.2e-16
# Now, take 5000 bootstrap samples using residual approach
# (described in activity)
library(car)
linear.boot <- Boot(linear.model, R=5000, method="residual")
summary(linear.boot)
##                R   original    bootBias     bootSE    bootMed
## (Intercept) 5000 -6.7943342 -1.6143e-02 3.26057222 -6.8127110
## income      5000  0.0013136 -4.9195e-06 0.00027123  0.0013092
## education   5000  4.1866373  4.3619e-03 0.38286896  4.1985342
## percwomn    5000 -0.0089052 -6.1002e-05 0.03058230 -0.0090783
## Fictitious data with outlier
x <- c(1, 2, 3, 4, 5, 6, 7, 8, 9, 25)
y <- c(1.1, 1.9, 3.1, 3.9, 5.1, 5.9, 7.1, 7.9, 9.1, 0)
df <- data.frame(x,y)

# Create linear model
linear.model <- lm(y ~ x, df)
summary(linear.model)
## 
## Call:
## lm(formula = y ~ x, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.8414 -2.6515 -0.1898  2.2720  4.7338 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept)   5.0133     1.4767   3.395  0.00943 **
## x            -0.0719     0.1548  -0.464  0.65467   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.172 on 8 degrees of freedom
## Multiple R-squared:  0.02626,    Adjusted R-squared:  -0.09546 
## F-statistic: 0.2158 on 1 and 8 DF,  p-value: 0.6547
# Plot linear model
newdf <- data.frame(x=seq(min(df$x), max(df$x), len=100))
plot(y ~ x, data=df)
lines(newdf$x, predict(linear.model, newdf), col=2, lw=4)

# Create bootstrap model with randomly sampled cases
boot2 <- Boot(linear.model, R = 10000, method="case")
summary(boot2)
##                 R  original bootBias  bootSE   bootMed
## (Intercept) 10000  5.013333 -1.49143 2.73935  4.403217
## x           10000 -0.071905  0.33049 0.53793 -0.071905
# Plot bootstrap model
plot(y ~ x, data=df)
curve((5.0133-1.586) + ((-0.0719 + 0.347)*x), add=T, lw=3, col=2)

# Get predicted values from the bootstrap regression
df$pred <- 3.43 + 0.275*x

## Fit lowess
df %>% 
  ggvis(x=~x, y=~y, fill := "steelblue", stroke:="steelblue", fillOpacity:=.7, strokeOpacity:=.7) %>%
  layer_points() %>%
    layer_smooths(strokeWidth:=5, strokeOpacity:=.7, stroke:="red") %>%
  add_axis("x", title = "x") %>%
  scale_numeric("x", domain=c(0,25), nice=FALSE, clamp=TRUE)  %>%
  add_axis("y", title = "y") %>%
  scale_numeric("y", domain=c(0, 10), nice=FALSE, clamp=TRUE)

# Create bootstrap model for prestige data
linear.model <- lm(prestige ~ income + education + percwomn, data=prestige)
boot3 <- Boot(linear.model, R = 5000, method="case")
summary(boot3)
##                R   original    bootBias     bootSE    bootMed
## (Intercept) 5000 -6.7943342 -4.3558e-02 3.16021675 -6.7595832
## income      5000  0.0013136  9.7042e-05 0.00043847  0.0013219
## education   5000  4.1866373 -6.3514e-02 0.46671924  4.1713369
## percwomn    5000 -0.0089052  4.1023e-03 0.03737987 -0.0067788
########################
########################
##### Quantile Reg #####
########################
########################

## Load data
hsb <- read.csv("http://www.bradthiessen.com/html5/data/hsb.csv")
head(hsb)
##   schid minority female    ses mathach size schtype meanses
## 1  1224        0      1 -1.528   5.876  842       0  -0.428
## 2  1224        0      1 -0.588  19.708  842       0  -0.428
## 3  1224        0      0 -0.528  20.349  842       0  -0.428
## 4  1224        0      0 -0.668   8.781  842       0  -0.428
## 5  1224        0      0 -0.158  17.898  842       0  -0.428
## 6  1224        0      0  0.022   4.583  842       0  -0.428
hsb %>%
  group_by(schid) %>%
  summarize(students=n(), school.type=mean(schtype), mean.mathach=mean(mathach))
## Source: local data frame [160 x 4]
## 
##    schid students school.type mean.mathach
## 1   1224       47           0     9.715447
## 2   1288       25           0    13.510800
## 3   1296       48           0     7.635958
## 4   1308       20           1    16.255500
## 5   1317       48           1    13.177687
## 6   1358       30           0    11.206233
## 7   1374       28           0     9.728464
## 8   1433       35           1    19.719143
## 9   1436       44           1    18.111614
## 10  1461       33           0    16.842636
## ..   ...      ...         ...          ...
## Create scatterplot matrix
hsb2 <- data.frame(hsb$minority, hsb$female, scale(hsb$ses), scale(hsb$mathach), hsb$size)
pairs(hsb2, col=rgb(0, 0, 0, 0.01))

# Is math achievement significantly related to SES?
linear.model <- lm(scale(mathach) ~ scale(ses), data=hsb)
summary(linear.model)
## 
## Call:
## lm(formula = scale(mathach) ~ scale(ses), data = hsb)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.82605 -0.69174  0.03393  0.73636  2.31174 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1.116e-15  1.100e-02    0.00        1    
## scale(ses)  3.608e-01  1.100e-02   32.78   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9327 on 7183 degrees of freedom
## Multiple R-squared:  0.1301, Adjusted R-squared:   0.13 
## F-statistic:  1075 on 1 and 7183 DF,  p-value: < 2.2e-16
anova(linear.model)
## Analysis of Variance Table
## 
## Response: scale(mathach)
##              Df Sum Sq Mean Sq F value    Pr(>F)    
## scale(ses)    1    935  934.96  1074.7 < 2.2e-16 ***
## Residuals  7183   6249    0.87                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Show linear model
hsb %>% 
  ggvis(x=~scale(ses), y=~scale(mathach), fill := "steelblue", stroke:="steelblue", fillOpacity:=.05, strokeOpacity:=.1) %>%
  layer_points() %>%
    layer_model_predictions(model = "lm", strokeWidth:=5, strokeOpacity:=.7, stroke:="red", se=T) %>%
  add_axis("x", title = "SES") %>%
  scale_numeric("x", domain=c(-4,3), nice=FALSE, clamp=TRUE)  %>%
  add_axis("y", title = "Math Achievement") %>%
  scale_numeric("y", domain=c(-3, 2), nice=FALSE, clamp=TRUE)

# Quantile regression asks whether a relation
# changes across the distribution of the outcome.
# Is the relation of SES with math achievement
# different across levels of math achievement?

library(quantreg)

# Median regression
q50 <- rq(scale(mathach) ~ scale(ses), tau=0.50, data=hsb)
summary(q50)
## 
## Call: rq(formula = scale(mathach) ~ scale(ses), tau = 0.5, data = hsb)
## 
## tau: [1] 0.5
## 
## Coefficients:
##             Value    Std. Error t value  Pr(>|t|)
## (Intercept)  0.02945  0.01528    1.92685  0.05404
## scale(ses)   0.44536  0.01539   28.93027  0.00000
# Graph median regression
plot(scale(hsb$ses), scale(hsb$mathach),cex=.25,type="n",xlab="SES", ylab="Math Achievement")
points(scale(hsb$ses), scale(hsb$mathach),cex=.5,col=rgb(0, 0, 0, 0.1))
abline(rq(scale(hsb$mathach)~scale(hsb$ses),tau=.5),col=rgb(0, 0, 1, 1), lw=4)
abline(lm(scale(hsb$mathach)~scale(hsb$ses)),lty=2,col=rgb(1, 0, 0, 1), lw=4) #the dreaded ols line 

# 10th-90th percentiles
quantregtens <- rq(scale(mathach) ~ scale(ses), data=hsb,
                   tau=c(0.10, 0.20, 0.30, 0.40, 0.50,
                         0.60, 0.70, 0.80, 0.90))

quantreg.plot <- summary(quantregtens)
## Warning in summary.rq(xi, ...): 1 non-positive fis
## Warning in summary.rq(xi, ...): 1 non-positive fis
plot(quantreg.plot, mfrow=c(1,2))

plot(scale(hsb$ses), scale(hsb$mathach),cex=.25,type="n",xlab="SES", ylab="Math Achievement")
points(scale(hsb$ses), scale(hsb$mathach),cex=.5,col=rgb(0, 0, 0, 0.1))
abline(rq(scale(hsb$mathach)~scale(hsb$ses),tau=.5),col=rgb(0, 0, 1, 1), lw=4)
abline(lm(scale(hsb$mathach)~scale(hsb$ses)),lty=2,col=rgb(1, 0, 0, 1), lw=4) #the dreaded ols line 
taus <- c(.1,.3,.5,.7,.9)
for( i in 1:length(taus)){
  abline(rq(scale(hsb$mathach)~scale(hsb$ses),tau=taus[i]),col=rgb(0.5, 0.5, 0.5, 1), lw=2) 
  }

## Test difference in slopes at 25th and 50th percentiles
q25 <- rq(scale(mathach) ~ scale(ses), tau=0.25, data=hsb)
summary(q25)
## 
## Call: rq(formula = scale(mathach) ~ scale(ses), tau = 0.25, data = hsb)
## 
## tau: [1] 0.25
## 
## Coefficients:
##             Value     Std. Error t value   Pr(>|t|) 
## (Intercept)  -0.68503   0.01659  -41.29596   0.00000
## scale(ses)    0.40577   0.01590   25.52552   0.00000
anova(q25,q50)
## Quantile Regression Analysis of Deviance Table
## 
## Model: scale(mathach) ~ scale(ses)
## Joint Test of Equality of Slopes: tau in {  0.25 0.5  }
## 
##   Df Resid Df F value  Pr(>F)   
## 1  1    14369  7.4503 0.00635 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Minority achievement gap
# 10th-90th percentiles
quantregtens <- rq(scale(mathach) ~ minority, data=hsb,
                   tau=c(0.10, 0.20, 0.30, 0.40, 0.50,
                         0.60, 0.70, 0.80, 0.90))
## Warning in rq.fit.br(x, y, tau = tau, ...): Solution may be nonunique
quantreg.plot <- summary(quantregtens)
plot(quantreg.plot, mfrow=c(1,2))

plot(hsb$minority, scale(hsb$mathach),cex=.25,type="n",xlab="Minority", ylab="Math Achievement")
points(hsb$minority, scale(hsb$mathach),cex=.5,col=rgb(0, 0, 0, 0.1))
abline(rq(scale(hsb$mathach)~hsb$minority,tau=.5),col=rgb(0, 0, 1, 1), lw=4)
## Warning in rq.fit.br(x, y, tau = tau, ...): Solution may be nonunique
abline(lm(scale(hsb$mathach)~hsb$minority),lty=2,col=rgb(1, 0, 0, 1), lw=4) #the dreaded ols line 
taus <- c(.1,.3,.5,.7,.9)
for( i in 1:length(taus)){
  abline(rq(scale(hsb$mathach)~hsb$minority,tau=taus[i]),col=rgb(0.5, 0.5, 0.5, 1), lw=2) 
  }
## Warning in rq.fit.br(x, y, tau = tau, ...): Solution may be nonunique

# Multiple Quantile Regression
quantregtens <- rq(scale(mathach) ~ minority + scale(ses), data=hsb,
                   tau=c(0.10, 0.20, 0.30, 0.40, 0.50,
                         0.60, 0.70, 0.80, 0.90))
## Warning in rq.fit.br(x, y, tau = tau, ...): Solution may be nonunique
quantreg.plot <- summary(quantregtens)
## Warning in summary.rq(xi, ...): 1 non-positive fis
plot(quantreg.plot, mfrow=c(1,3))

# Compare 25th and 50th percentiles
q25<-rq(scale(mathach)~scale(ses)+minority, tau=.25,data=hsb)
q50<-rq(scale(mathach)~scale(ses)+minority, tau=.50,data=hsb)
## Warning in rq.fit.br(x, y, tau = tau, ...): Solution may be nonunique
anova(q25, q50)
## Quantile Regression Analysis of Deviance Table
## 
## Model: scale(mathach) ~ scale(ses) + minority
## Joint Test of Equality of Slopes: tau in {  0.25 0.5  }
## 
##   Df Resid Df F value   Pr(>F)   
## 1  2    14368  5.4878 0.004145 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Plot quantile lines
plot(scale(hsb$ses), scale(hsb$mathach),cex=.25,type="n",xlab="SES", ylab="Math Achievement")
points(scale(hsb$ses), scale(hsb$mathach),cex=.5,col=rgb(0, 0, 0, 0.1))
abline(rq(scale(hsb$mathach)~scale(hsb$ses)+hsb$minority,tau=.5),col=rgb(0, 0, 1, 1), lw=4)
## Warning in rq.fit.br(x, y, tau = tau, ...): Solution may be nonunique
## Warning in abline(rq(scale(hsb$mathach) ~ scale(hsb$ses) + hsb$minority, :
## only using the first two of 3 regression coefficients
abline(lm(scale(hsb$mathach)~scale(hsb$ses)+hsb$minority),lty=2,col=rgb(1, 0, 0, 1), lw=4) #the dreaded ols line 
## Warning in abline(lm(scale(hsb$mathach) ~ scale(hsb$ses) + hsb$minority),
## : only using the first two of 3 regression coefficients
taus <- c(.1,.3,.5,.7,.9)
for( i in 1:length(taus)){
  abline(rq(scale(hsb$mathach)~scale(hsb$ses)+hsb$minority,tau=taus[i]),col=rgb(0.5, 0.5, 0.5, 1), lw=2) 
  }
## Warning in abline(rq(scale(hsb$mathach) ~ scale(hsb$ses) + hsb$minority, :
## only using the first two of 3 regression coefficients
## Warning in abline(rq(scale(hsb$mathach) ~ scale(hsb$ses) + hsb$minority, :
## only using the first two of 3 regression coefficients
## Warning in rq.fit.br(x, y, tau = tau, ...): Solution may be nonunique
## Warning in abline(rq(scale(hsb$mathach) ~ scale(hsb$ses) + hsb$minority, :
## only using the first two of 3 regression coefficients
## Warning in abline(rq(scale(hsb$mathach) ~ scale(hsb$ses) + hsb$minority, :
## only using the first two of 3 regression coefficients
## Warning in abline(rq(scale(hsb$mathach) ~ scale(hsb$ses) + hsb$minority, :
## only using the first two of 3 regression coefficients

########################
########################
##### ANOVA as Reg #####
########################
########################


#### Ambiguous prose
#### Load the data from the web
ambiguous <- read.csv("http://www.bradthiessen.com/html5/data/ambiguousprose.csv")

str(ambiguous)
## 'data.frame':    57 obs. of  2 variables:
##  $ Condition    : Factor w/ 3 levels "After","Before",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Comprehension: int  6 5 4 2 1 3 5 3 3 2 ...
dotPlot( ~ Comprehension | Condition, data = ambiguous, nint = 17,
          endpoints = c(-2, 10), layout = c(1,3), aspect = .35, cex=.7, 
          xlab = "Comprehension score (0-7)")

#### Run an ANOVA modeling comprehension as a function of condition (treatments)
mod1 <- aov(Comprehension ~ Condition, data=ambiguous)
anova(mod1)
## Analysis of Variance Table
## 
## Response: Comprehension
##           Df Sum Sq Mean Sq F value    Pr(>F)    
## Condition  2 35.053 17.5263  10.012 0.0002002 ***
## Residuals 54 94.526  1.7505                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mean(Comprehension ~ Condition, data=ambiguous)
##    After   Before     None 
## 3.210526 4.947368 3.368421
## Create possible coding (ordinal code for condition)
## Note that this is a bad idea!
ambiguous$condcode[ambiguous$Condition=="After"] <- 1
ambiguous$condcode[ambiguous$Condition=="Before"] <- 2
ambiguous$condcode[ambiguous$Condition=="None"] <- 3

## Run a linear regression with this condition code
## Still a bad idea!
cond.code <- lm(Comprehension ~ condcode, data=ambiguous)
summary(cond.code)
## 
## Call:
## lm(formula = Comprehension ~ condcode, data = ambiguous)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.92105 -0.92105  0.07895  1.15789  3.15789 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.68421    0.53740   6.856  6.5e-09 ***
## condcode     0.07895    0.24877   0.317    0.752    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.534 on 55 degrees of freedom
## Multiple R-squared:  0.001828,   Adjusted R-squared:  -0.01632 
## F-statistic: 0.1007 on 1 and 55 DF,  p-value: 0.7522
## Show linear model
ambiguous %>% 
  ggvis(x=~condcode, y=~Comprehension, fill := "steelblue", stroke:="steelblue", fillOpacity:=.3, strokeOpacity:=.5) %>%
  layer_points() %>%
    layer_model_predictions(model = "lm", strokeWidth:=5, strokeOpacity:=.7, stroke:="red") %>%
  add_axis("x", title = "time") %>%
  scale_numeric("x", domain=c(1,3), nice=FALSE, clamp=TRUE)  %>%
  add_axis("y", title = "counts") %>%
  scale_numeric("y", domain=c(0, 7), nice=FALSE, clamp=TRUE)

## Create dummy variables x1, x2, x3
## Note that creating the 3rd dummy variable is a bad idea!
ambiguous$x1 <- ifelse(ambiguous$Condition =="After", 1, 0) 
ambiguous$x2 <- ifelse(ambiguous$Condition =="Before", 1, 0) 
ambiguous$x3 <- ifelse(ambiguous$Condition =="None", 1, 0) 

## Run linear regression with all 3 dummy variables
## Bad idea!
lin.mod <- lm(Comprehension ~ x1 + x2 + x3, data=ambiguous)
anova(lin.mod)
## Analysis of Variance Table
## 
## Response: Comprehension
##           Df Sum Sq Mean Sq F value    Pr(>F)    
## x1         1 11.368 11.3684  6.4944 0.0136980 *  
## x2         1 23.684 23.6842 13.5301 0.0005422 ***
## Residuals 54 94.526  1.7505                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(lin.mod)
## 
## Call:
## lm(formula = Comprehension ~ x1 + x2 + x3, data = ambiguous)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.94737 -0.94737  0.05263  1.05263  2.78947 
## 
## Coefficients: (1 not defined because of singularities)
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   3.3684     0.3035  11.097 1.51e-15 ***
## x1           -0.1579     0.4293  -0.368 0.714436    
## x2            1.5789     0.4293   3.678 0.000542 ***
## x3                NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.323 on 54 degrees of freedom
## Multiple R-squared:  0.2705, Adjusted R-squared:  0.2435 
## F-statistic: 10.01 on 2 and 54 DF,  p-value: 0.0002002
## Run linear regression with the first two dummy variables
## Bad idea!
lin.mod <- lm(Comprehension ~ x1 + x2, data=ambiguous)
anova(lin.mod)
## Analysis of Variance Table
## 
## Response: Comprehension
##           Df Sum Sq Mean Sq F value    Pr(>F)    
## x1         1 11.368 11.3684  6.4944 0.0136980 *  
## x2         1 23.684 23.6842 13.5301 0.0005422 ***
## Residuals 54 94.526  1.7505                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(lin.mod)
## 
## Call:
## lm(formula = Comprehension ~ x1 + x2, data = ambiguous)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.94737 -0.94737  0.05263  1.05263  2.78947 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   3.3684     0.3035  11.097 1.51e-15 ***
## x1           -0.1579     0.4293  -0.368 0.714436    
## x2            1.5789     0.4293   3.678 0.000542 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.323 on 54 degrees of freedom
## Multiple R-squared:  0.2705, Adjusted R-squared:  0.2435 
## F-statistic: 10.01 on 2 and 54 DF,  p-value: 0.0002002
## Run linear regression with factor variable
## Same output as above
lin.mod <- lm(Comprehension ~ Condition, data=ambiguous)
anova(lin.mod)
## Analysis of Variance Table
## 
## Response: Comprehension
##           Df Sum Sq Mean Sq F value    Pr(>F)    
## Condition  2 35.053 17.5263  10.012 0.0002002 ***
## Residuals 54 94.526  1.7505                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(lin.mod)
## 
## Call:
## lm(formula = Comprehension ~ Condition, data = ambiguous)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.94737 -0.94737  0.05263  1.05263  2.78947 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       3.2105     0.3035  10.577 9.03e-15 ***
## ConditionBefore   1.7368     0.4293   4.046 0.000167 ***
## ConditionNone     0.1579     0.4293   0.368 0.714436    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.323 on 54 degrees of freedom
## Multiple R-squared:  0.2705, Adjusted R-squared:  0.2435 
## F-statistic: 10.01 on 2 and 54 DF,  p-value: 0.0002002
##############
## AxB ANOVA #
##############

data(ToothGrowth)

str(ToothGrowth)
## 'data.frame':    60 obs. of  3 variables:
##  $ len : num  4.2 11.5 7.3 5.8 6.4 10 11.2 11.2 5.2 7 ...
##  $ supp: Factor w/ 2 levels "OJ","VC": 2 2 2 2 2 2 2 2 2 2 ...
##  $ dose: num  0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 ...
ToothGrowth$dose <- as.factor(ToothGrowth$dose)

# AxB ANOVA
aov.out = aov(len ~ supp * dose, data=ToothGrowth)
summary(aov.out)
##             Df Sum Sq Mean Sq F value   Pr(>F)    
## supp         1  205.4   205.4  15.572 0.000231 ***
## dose         2 2426.4  1213.2  92.000  < 2e-16 ***
## supp:dose    2  108.3    54.2   4.107 0.021860 *  
## Residuals   54  712.1    13.2                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# As regression
tooth.model <- lm(len ~ supp * dose, data=ToothGrowth)
anova(tooth.model)
## Analysis of Variance Table
## 
## Response: len
##           Df  Sum Sq Mean Sq F value    Pr(>F)    
## supp       1  205.35  205.35  15.572 0.0002312 ***
## dose       2 2426.43 1213.22  92.000 < 2.2e-16 ***
## supp:dose  2  108.32   54.16   4.107 0.0218603 *  
## Residuals 54  712.11   13.19                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(tooth.model)
## 
## Call:
## lm(formula = len ~ supp * dose, data = ToothGrowth)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -8.20  -2.72  -0.27   2.65   8.27 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    13.230      1.148  11.521 3.60e-16 ***
## suppVC         -5.250      1.624  -3.233  0.00209 ** 
## dose1           9.470      1.624   5.831 3.18e-07 ***
## dose2          12.830      1.624   7.900 1.43e-10 ***
## suppVC:dose1   -0.680      2.297  -0.296  0.76831    
## suppVC:dose2    5.330      2.297   2.321  0.02411 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.631 on 54 degrees of freedom
## Multiple R-squared:  0.7937, Adjusted R-squared:  0.7746 
## F-statistic: 41.56 on 5 and 54 DF,  p-value: < 2.2e-16