Load packages
library(mosaic)
library(dplyr)
library(ggplot2)
library(ggvis)
library(parallel)


AxB ANOVA

I’m going to use a dataset that comes with a standard R installation. The dataset is from a study on the effect of vitamin C on tooth growth in guinea pigs

len = tooth length

supp = the way the supplement was administered (OJ = orange juice) (VC = absorbic acid)

dose = amount of supplement in mg (levels are 0.5, 1.0, 2.0)

## Load data
data(ToothGrowth)

## The data has 60 observations on 3 variables:
head(ToothGrowth)
##    len supp dose
## 1  4.2   VC  0.5
## 2 11.5   VC  0.5
## 3  7.3   VC  0.5
## 4  5.8   VC  0.5
## 5  6.4   VC  0.5
## 6 10.0   VC  0.5
## This shows that the "dose" variable is numeric
## We want to change it to a factor variable
  ## where 0.5 = low, 1.0 = med, 2.0 = high
str(ToothGrowth)
## 'data.frame':    60 obs. of  3 variables:
##  $ len : num  4.2 11.5 7.3 5.8 6.4 10 11.2 11.2 5.2 7 ...
##  $ supp: Factor w/ 2 levels "OJ","VC": 2 2 2 2 2 2 2 2 2 2 ...
##  $ dose: num  0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 ...
ToothGrowth$dose = factor(ToothGrowth$dose, levels=c(0.5,1.0,2.0),
                          labels=c("low","med","high"))

## Let's look at the sample data per cell
ToothGrowth %>%
  group_by(supp, dose) %>%
  summarize(n=n(), mean=mean(len), sd=sd(len))
## Source: local data frame [6 x 5]
## Groups: supp
## 
##   supp dose  n  mean       sd
## 1   OJ  low 10 13.23 4.459709
## 2   OJ  med 10 22.70 3.910953
## 3   OJ high 10 26.06 2.655058
## 4   VC  low 10  7.98 2.746634
## 5   VC  med 10 16.77 2.515309
## 6   VC high 10 26.14 4.797731

Here are the means and (standard deviations) for each cell. Note that each cell has a sample size of n=10.

…… | Orange Juice | .AbsorbAcid. | Total

—— | ——————- | —————– | ————————-

Low | 13.23 (4.460) | 07.98 (2.747) | 10.605 (4.4998)

Med | 22.70 (3.911) | 16.77 (2.515) | 19.735 (4.4154)

High | 26.06 (2.655) | 26.14 (4.798) | 26.100 (3.7742)

—— | ——————- | —————– | ————————-

Total | 20.66 (6.606) | 16.96 (8.266) | N=60, 18.813 (7.6493)

There’s an easier way to get these summary statistics, but it requires running the ANOVA first.

### Let's take a look at some visualizations of this data

## Density plots to investigate normality
densityplot(~len|dose*supp, data=ToothGrowth,
    main="Tooth Growth by Dose and Supplement",
   xlab="Tooth Growth", 
   layout=c(3,2))

## Let's also run a test to check for homogeneity of variances
## Bartlett's test (run separately for dose and supplement groups)
bartlett.test(len~dose, data=ToothGrowth)
## 
##  Bartlett test of homogeneity of variances
## 
## data:  len by dose
## Bartlett's K-squared = 0.6655, df = 2, p-value = 0.717
bartlett.test(len~supp, data=ToothGrowth)
## 
##  Bartlett test of homogeneity of variances
## 
## data:  len by supp
## Bartlett's K-squared = 1.4217, df = 1, p-value = 0.2331
## What do those relatively high p-values tell us?


## Boxplots to investigate group differences
bwplot(supp~len|dose, data=ToothGrowth,
    ylab="supplement", xlab="Tooth Growth", 
   main="Tooth Growth by Dose and Supplement", 
   layout=(c(1,3)))

## Let's take a look at a means plot to check for interaction
with(ToothGrowth, interaction.plot(x.factor=dose, trace.factor=supp,
    response=len, fun=mean, type="b", legend=T,
    ylab="Tooth Length", main="Interaction Plot",
    pch=c(1,19)))

** We’re now ready to run the AxB ANOVA**

## AxB ANOVA
# Note that it models the LEN as a function of
#   supp * dose.  This means our model is:
#  len ~ f(supp + dose + suppXdose)
aov.out = aov(len ~ supp * dose, data=ToothGrowth)

## Let's get a summary of the model
summary(aov.out)
##             Df Sum Sq Mean Sq F value   Pr(>F)    
## supp         1  205.4   205.4  15.572 0.000231 ***
## dose         2 2426.4  1213.2  92.000  < 2e-16 ***
## supp:dose    2  108.3    54.2   4.107 0.021860 *  
## Residuals   54  712.1    13.2                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Assuming we're using an alpha of 0.05, we have a significant
## interaction.  We can now split the data by dose and compare
## supplements using a Tukey test
TukeyHSD(aov.out, which=c("supp"), conf.level=.99)
##   Tukey multiple comparisons of means
##     99% family-wise confidence level
## 
## Fit: aov(formula = len ~ supp * dose, data = ToothGrowth)
## 
## $supp
##       diff       lwr       upr     p adj
## VC-OJ -3.7 -6.203448 -1.196552 0.0002312
## The p-value indicates the difference is statistically significant
## The confidence interval for the difference ranges from -1.2 to -6.2.
## We can plot this confidence interval
plot(TukeyHSD(aov.out, which=c("supp")))

## We could have also split our data by supplement and compare doses
TukeyHSD(aov.out, which=c("dose"), conf.level=.99)
##   Tukey multiple comparisons of means
##     99% family-wise confidence level
## 
## Fit: aov(formula = len ~ supp * dose, data = ToothGrowth)
## 
## $dose
##            diff       lwr       upr   p adj
## med-low   9.130  5.637681 12.622319 0.0e+00
## high-low 15.495 12.002681 18.987319 0.0e+00
## high-med  6.365  2.872681  9.857319 2.7e-06
plot(TukeyHSD(aov.out, which=c("dose")))

## Finally, we could have also run Tukey tests to compare all cell means
TukeyHSD(aov.out, conf.level=.99)
##   Tukey multiple comparisons of means
##     99% family-wise confidence level
## 
## Fit: aov(formula = len ~ supp * dose, data = ToothGrowth)
## 
## $supp
##       diff       lwr       upr     p adj
## VC-OJ -3.7 -6.203448 -1.196552 0.0002312
## 
## $dose
##            diff       lwr       upr   p adj
## med-low   9.130  5.637681 12.622319 0.0e+00
## high-low 15.495 12.002681 18.987319 0.0e+00
## high-med  6.365  2.872681  9.857319 2.7e-06
## 
## $`supp:dose`
##                  diff        lwr        upr     p adj
## VC:low-OJ:low   -5.25 -11.012826  0.5128257 0.0242521
## OJ:med-OJ:low    9.47   3.707174 15.2328257 0.0000046
## VC:med-OJ:low    3.54  -2.222826  9.3028257 0.2640208
## OJ:high-OJ:low  12.83   7.067174 18.5928257 0.0000000
## VC:high-OJ:low  12.91   7.147174 18.6728257 0.0000000
## OJ:med-VC:low   14.72   8.957174 20.4828257 0.0000000
## VC:med-VC:low    8.79   3.027174 14.5528257 0.0000210
## OJ:high-VC:low  18.08  12.317174 23.8428257 0.0000000
## VC:high-VC:low  18.16  12.397174 23.9228257 0.0000000
## VC:med-OJ:med   -5.93 -11.692826 -0.1671743 0.0073930
## OJ:high-OJ:med   3.36  -2.402826  9.1228257 0.3187361
## VC:high-OJ:med   3.44  -2.322826  9.2028257 0.2936430
## OJ:high-VC:med   9.29   3.527174 15.0528257 0.0000069
## VC:high-VC:med   9.37   3.607174 15.1328257 0.0000058
## VC:high-OJ:high  0.08  -5.682826  5.8428257 1.0000000
plot(TukeyHSD(aov.out))

## We could also use t-tests with the Bonferroni adjustment
## to compare pairs of means
# This compares tooth lengths by dose levels.  The table reports p-values:
with(ToothGrowth, pairwise.t.test(len, dose, p.adjust.method="bonferroni"))
## 
##  Pairwise comparisons using t tests with pooled SD 
## 
## data:  len and dose 
## 
##      low     med    
## med  2.0e-08 -      
## high 4.4e-16 4.3e-05
## 
## P value adjustment method: bonferroni
# This compares tooth lengths by supplements.  The table reports p-values:
with(ToothGrowth, pairwise.t.test(len, supp, p.adjust.method="bonferroni"))
## 
##  Pairwise comparisons using t tests with pooled SD 
## 
## data:  len and supp 
## 
##    OJ  
## VC 0.06
## 
## P value adjustment method: bonferroni
## Let's take another look at our ANOVA summary table
summary(aov.out)
##             Df Sum Sq Mean Sq F value   Pr(>F)    
## supp         1  205.4   205.4  15.572 0.000231 ***
## dose         2 2426.4  1213.2  92.000  < 2e-16 ***
## supp:dose    2  108.3    54.2   4.107 0.021860 *  
## Residuals   54  712.1    13.2                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Let's calculate an eta-squared for our model
## SStotal = 205 + 2426 + 108 + 712 = 3451
## SSerror = 712
## SSmodel = SS explained by our model = SSsupp + SSdose + SSinteraction
### SSmodel = 205 + 2426 + 108 = 2739
#### eta-squared = 2739 / 3451 = 0.79
### This eta-squared will be reported as R-squared in
### the following table.  We'll understand everything
### about these tables once we get to the next unit.
summary.lm(aov.out)
## 
## Call:
## aov(formula = len ~ supp * dose, data = ToothGrowth)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -8.20  -2.72  -0.27   2.65   8.27 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       13.230      1.148  11.521 3.60e-16 ***
## suppVC            -5.250      1.624  -3.233  0.00209 ** 
## dosemed            9.470      1.624   5.831 3.18e-07 ***
## dosehigh          12.830      1.624   7.900 1.43e-10 ***
## suppVC:dosemed    -0.680      2.297  -0.296  0.76831    
## suppVC:dosehigh    5.330      2.297   2.321  0.02411 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.631 on 54 degrees of freedom
## Multiple R-squared:  0.7937, Adjusted R-squared:  0.7746 
## F-statistic: 41.56 on 5 and 54 DF,  p-value: < 2.2e-16
## Now that we've conducted our AxB ANOVA, let's
## evaluate the assumptions
## This command creates a series of plots...
plot(aov.out)

## The first plot shows residuals (errors) as a function of
## fitted (predicted) values.  We want to see equal variances 
## in our residuals across all levels of the fitted values.
## This plot looks fine.
#### The second plot shows a Q-Q plot of the residuals.  We
#### assume they follow a normal distribution.  This assumption
#### is a bit off, but that's to be expected with such low
#### sample sizes.  We might want to run a different analysis
#### to check our conclusions
## The third and fourth plots go beyond what we'll study in this class.

##### Finally, let's take a look at a plot of our model effects
plot.design(len~supp*dose, data=ToothGrowth)

Randomization-based approach using eta-squared values

Let’s analyze the cholesterol dataset (from class) using a randomization-based approach.

## Cholesterol example from activity
chol <- read.csv("http://www.bradthiessen.com/html5/data/cholesterol.csv")

# Run the AxB ANOVA
chol_model <- lm(cholest ~ drug*obese, data=chol)
anova(chol_model)
## Analysis of Variance Table
## 
## Response: cholest
##            Df  Sum Sq Mean Sq F value   Pr(>F)   
## drug        1   46.34   46.34  0.6322 0.431743   
## obese       1  752.82  752.82 10.2704 0.002831 **
## drug:obese  1  355.24  355.24  4.8465 0.034197 * 
## Residuals  36 2638.77   73.30                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(lm(cholest ~ drug*obese, data=chol))
## Analysis of Variance Table
## 
## Response: cholest
##            Df  Sum Sq Mean Sq F value   Pr(>F)   
## drug        1   46.34   46.34  0.6322 0.431743   
## obese       1  752.82  752.82 10.2704 0.002831 **
## drug:obese  1  355.24  355.24  4.8465 0.034197 * 
## Residuals  36 2638.77   73.30                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Run it again but collect the eta-squared value
chol_aov <- do(1) * lm(cholest ~ drug*obese, data=chol)
observed_r_squared <- chol_aov$r.squared
observed_r_squared
## [1] 0.304337
# Shuffle interaction term 10,000 times and record eta-squared
random_interact <- do(10000) * 
  lm(cholest ~ drug + obese + shuffle(drug):shuffle(obese), data=chol)

# Plot those 10,000 eta-squared values
histogram(~r.squared, data = random_interact, col="grey", xlim=c(.2, .45), xlab = "eta-squared", width=.005)

# Find how unlikely our observed eta-squared was
tally(~ (r.squared >= observed_r_squared), format="prop", data=random_interact)
## 
##   TRUE  FALSE 
## 0.0328 0.9672
## That p-value is similar to what we got from our AxB ANOVA.



# Shuffle drug 
random_drug <- do(10000) * lm(cholest ~ shuffle(drug) + obese + drug:obese, data=chol)

histogram(~r.squared, data = random_drug, col="grey", xlim=c(.2, .45), xlab = "eta-squared", width=.005)

tally(~ (r.squared >= observed_r_squared), format="prop", data=random_drug)
## 
##   TRUE  FALSE 
## 0.3214 0.6786
# Shuffle obese 
random_obese <- do(10000) * lm(cholest ~ drug + shuffle(obese) + drug:obese, data=chol)

histogram(~r.squared, data = random_obese, col="grey", xlim=c(0, .35), xlab = "eta-squared", width=.005)

tally(~ (r.squared >= observed_r_squared), format="prop", data=random_obese)
## 
##   TRUE  FALSE 
## 0.0003 0.9997

Randomization-based approach for guinea pig tooth growth example

# Run the AxB ANOVA
tooth_model = lm(len ~ supp + dose + supp:dose, data=ToothGrowth)
anova(tooth_model)
## Analysis of Variance Table
## 
## Response: len
##           Df  Sum Sq Mean Sq F value    Pr(>F)    
## supp       1  205.35  205.35  15.572 0.0002312 ***
## dose       2 2426.43 1213.22  92.000 < 2.2e-16 ***
## supp:dose  2  108.32   54.16   4.107 0.0218603 *  
## Residuals 54  712.11   13.19                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Run it again but collect the eta-squared value
tooth_aov <- do(1) * lm(len ~ supp + dose + supp:dose, data=ToothGrowth)
observed_r_squared <- tooth_aov$r.squared
observed_r_squared
## [1] 0.7937246
# Shuffle interaction term 10,000 times and record eta-squared
random_interact <- do(10000) * 
  lm(len ~ supp + dose + shuffle(supp):shuffle(dose), data=ToothGrowth)

# Plot those 10,000 eta-squared values
histogram(~r.squared, data = random_interact, col="grey", xlim=c(.7, .9), xlab = "eta-squared", width=.005)

# Find how unlikely our observed eta-squared was
tally(~ (r.squared >= observed_r_squared), format="prop", data=random_interact)
## 
##   TRUE  FALSE 
## 0.1906 0.8094
# Shuffle supplement 
random_supp <- do(10000) * lm(len ~ shuffle(supp) + dose + supp:dose, data=ToothGrowth)

histogram(~r.squared, data = random_supp, col="grey", xlim=c(.75, .85), xlab = "eta-squared", width=.003)

tally(~ (r.squared >= observed_r_squared), format="prop", data=random_supp)
## 
##   TRUE  FALSE 
## 0.9997 0.0003
# Shuffle dose 
random_dose <- do(10000) * lm(len ~ supp + shuffle(dose) + supp:dose, data=ToothGrowth)

histogram(~r.squared, data = random_dose, col="grey", xlim=c(.75, .9), xlab = "eta-squared", width=.005)

tally(~ (r.squared >= observed_r_squared), format="prop", data=random_dose)
## 
##  TRUE FALSE 
##     1     0