# Load packages
library(mosaic)     # tally() and the formula interface for mean() and var()
library(ggplot2)    # ggplot() graphics
library(dplyr)      # %>% pipelines, group_by(), summarize()
library(ggvis)      # ggvis() scatterplots
library(aod)        # wald.test()
library(MASS)       # biopsy data and glm.nb()
library(corrplot)   # corrplot.mixed()
library(car)        # vif()
library(reshape2)   # melt()
library(bestglm)    # best-subset selection with bestglm()

Binge drinking example

# Input binge drinking data
male <- c(rep(1,7152), rep(0,9944))
binge <- c(rep(1,1624), rep(0,5528), rep(1,1690), rep(0,8254))
drink <- data.frame(gender = factor(male, labels=c("female", "male")), 
                    drink = factor(binge, labels=c("Not binge", "binge")))
# Look at data
head(drink)
##   gender drink
## 1   male binge
## 2   male binge
## 3   male binge
## 4   male binge
## 5   male binge
## 6   male binge
tail(drink)
##       gender     drink
## 17091 female Not binge
## 17092 female Not binge
## 17093 female Not binge
## 17094 female Not binge
## 17095 female Not binge
## 17096 female Not binge
# Table of results
tally(drink ~ gender, margins=TRUE, data=drink)
##            gender
## drink       female male
##   Not binge   8254 5528
##   binge       1690 1624
##   Total       9944 7152
# Plot results: jittered 0/1 data with a fitted logistic curve
# (male and binge are the raw 0/1 vectors created above)
ggplot(drink, aes(x=male, y=binge)) + 
  geom_point(shape=1, alpha=.008, position=position_jitter(width=.05, height=.05)) + 
  stat_smooth(method="glm", method.args=list(family=binomial), se=FALSE)


Maximum likelihood estimation

# Set parameters of our binomial distribution
N <- 17096
X <- 3314
# Set the range of probabilities over which to search for the maximum likelihood estimate
p <- seq(.001,1-.001,by=.001)

# Calculate likelihoods of our data across the range of possible probabilities
L <- lapply(p, dbinom, size=N, x=X)

# Plot the likelihoods
plot(p, as.numeric(L), main="max(Likelihood): p=0.19385", type="l", xlab="Probability p", ylab="L(p)")
# Add a line at the maximum
abline(v=X/N,lty=2)

# Let's plot the log-likelihood
plot(p, log(as.numeric(L)), main="max(Log-Likelihood): p=0.19385", type="l",
     xlab="Probability p", ylab="l(p)")
# Add a line at the maximum
abline(v=X/N,lty=2)
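As a quick check (not shown in the original output), the grid value with the largest likelihood should agree with the analytic MLE, the sample proportion X/N:

# Grid point with the largest likelihood (0.194 on this grid)
p[which.max(as.numeric(L))]
# Analytic MLE: 3314/17096 = 0.19385, the value marked by abline() above
X / N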


Calculate probabilities, odds, odds ratios

# Calculate for females
p.binge.female <- 1690/9944
odds.binge.female <- p.binge.female / (1 - p.binge.female)
ln.odds.b.f <- log(odds.binge.female)

# Calculate for males
p.binge.male <- 1624/7152
odds.binge.male <- p.binge.male / (1 - p.binge.male)
ln.odds.b.m <- log(odds.binge.male)

# Calculate odds ratio
odds.ratio <- odds.binge.male / odds.binge.female

# Paste results all together
cat("Females: prob =", round(p.binge.female, 5), "; odds =", round(odds.binge.female, 3),
      "; ln(odds) =", round(ln.odds.b.f, 3), "\n ",
    "males: prob =", round(p.binge.male, 5), "; odds =", round(odds.binge.male, 3),
      "; ln(odds) =", round(ln.odds.b.m, 3), "\nodds ratio =",
    round(odds.binge.male / odds.binge.female, 4), "\nrelative probability",
    round(p.binge.male / p.binge.female, 4))
## Females: prob = 0.16995 ; odds = 0.205 ; ln(odds) = -1.586 
##   males: prob = 0.22707 ; odds = 0.294 ; ln(odds) = -1.225 
## odds ratio = 1.4348 
## relative probability 1.3361
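Note that the log of the odds ratio is simply the difference in log-odds; it will reappear as the gender coefficient in the logistic model below:

# log(odds ratio) = difference in log-odds = 0.361
log(odds.ratio)
ln.odds.b.m - ln.odds.b.f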


Logistic regression model for binge drinking

# Fit model (notice glm and family=binomial)
drink.logmodel <- glm(drink ~ gender, data=drink, family=binomial)
# Summarize model
summary(drink.logmodel)
## 
## Call:
## glm(formula = drink ~ gender, family = binomial, data = drink)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.7177  -0.7177  -0.6104  -0.6104   1.8827  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.58597    0.02670 -59.400   <2e-16 ***
## gendermale   0.36104    0.03885   9.292   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 16814  on 17095  degrees of freedom
## Residual deviance: 16728  on 17094  degrees of freedom
## AIC: 16732
## 
## Number of Fisher Scoring iterations: 4
# Odds ratio (exponential function of coefficients) 
# Notice the odds ratio matches what we calculated earlier
exp(coef(drink.logmodel))
## (Intercept)  gendermale 
##   0.2047492   1.4348145
# Create new dataset of 1 female and 1 male
gender <- c(0,1)
new <- data.frame(gender = factor(gender, labels=c("female", "male")))

# Predict the probability of binge drinking for the new data.frame
# Notice that these match the probabilities we calculated earlier
predict(drink.logmodel, newdata=new, type="response")
##         1         2 
## 0.1699517 0.2270694
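We could also reproduce these probabilities by hand from the coefficients, since p = exp(b0 + b1*male) / (1 + exp(b0 + b1*male)); plogis() does that back-transformation:

# Probability for a female (intercept only): should match 0.16995
plogis(coef(drink.logmodel)[1])
# Probability for a male (intercept + gendermale): should match 0.22707
plogis(coef(drink.logmodel)[1] + coef(drink.logmodel)[2])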




Graduate school admissions example

# Clear out memory
rm(list = ls())

# Load data
admit <- read.csv("http://www.bradthiessen.com/html5/data/admit.csv")
# Look at structure
str(admit)
## 'data.frame':    400 obs. of  4 variables:
##  $ admit: int  0 1 1 1 0 1 1 0 1 0 ...
##  $ gre  : int  380 660 800 640 520 760 560 400 540 700 ...
##  $ gpa  : num  3.61 3.67 4 3.19 2.93 3 2.98 3.08 3.39 3.92 ...
##  $ rank : int  3 3 1 4 4 2 1 2 3 2 ...
# Calculate means
sapply(admit, mean)
##    admit      gre      gpa     rank 
##   0.3175 587.7000   3.3899   2.4850
# Convert rank to a factor
admit$rank <- factor(admit$rank)

# Helper function that draws histograms on the diagonal of the scatterplot matrix
panel.hist <- function(x, ...)
{
    usr <- par("usr"); on.exit(par(usr))
    par(usr = c(usr[1:2], 0, 1.5) )
    h <- hist(x, plot = FALSE)
    breaks <- h$breaks; nB <- length(breaks)
    y <- h$counts; y <- y/max(y)
    rect(breaks[-nB], 0, breaks[-1], y, col = "cyan", ...)
}
pairs(admit,
      cex = 1, pch = 1, bg = "steelblue",
      diag.panel = panel.hist, cex.labels = 1, font.labels = 2)


Fit logistic model with GRE as predictor

# Fit logistic model with GRE as predictor
gre.model <- glm(admit ~ gre, family=binomial, data=admit)
# Summarize model
summary(gre.model)
## 
## Call:
## glm(formula = admit ~ gre, family = binomial, data = admit)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.1623  -0.9052  -0.7547   1.3486   1.9879  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -2.901344   0.606038  -4.787 1.69e-06 ***
## gre          0.003582   0.000986   3.633  0.00028 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 499.98  on 399  degrees of freedom
## Residual deviance: 486.06  on 398  degrees of freedom
## AIC: 490.06
## 
## Number of Fisher Scoring iterations: 4

Calculate deviance manually

# Predicted probabilities from our model
admit$prob <- predict(gre.model, type="response")
# Calculate likelihood of model:  Product of [P^y * (1-P)^(1-y)]
admit$like <- (admit$prob ^ admit$admit) * ((1 - admit$prob) ^ (1-admit$admit))
model.likelihood <- prod(admit$like)
# Calculate deviance:  -2LL
-2 * log(model.likelihood)
## [1] 486.0561
# Fit null model with no predictors
null.model <- glm(admit ~ 1, family=binomial, data=admit)
# Predicted probabilities from our null model
admit$nullprob <- predict(null.model, type="response")
# Calculate likelihood of null model
admit$nulllike <- (admit$nullprob ^ admit$admit) * ((1 - admit$nullprob) ^ (1-admit$admit))
null.likelihood <- prod(admit$nulllike)
# Calculate deviance of model
-2 * log(null.likelihood)
## [1] 499.9765

Notice how those values match the output from the glm() command.
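For binary (0/1) data the deviance is just -2 times the log-likelihood, so we could also pull these values from logLik():

# -2 log-likelihood of each model (should match 486.06 and 499.98 above)
-2 * as.numeric(logLik(gre.model))
-2 * as.numeric(logLik(null.model))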


Chi-square test for improvement in deviance

# Chi-square = difference in deviance (null model - GRE model)
modelChi <- gre.model$null.deviance - gre.model$deviance
# Degrees of freedom = difference in residual df (the extra parameter estimated in the GRE model)
chidf <- gre.model$df.null - gre.model$df.residual
# Calculate p-value for that chi-square statistic
chisq.prob <- 1 - pchisq(modelChi, chidf)
# Paste results
paste("p-value for chi-square test of difference in deviance =", round(chisq.prob, 5))
## [1] "p-value for chi-square test of difference in deviance = 0.00019"
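The built-in anova() method runs the same likelihood-ratio test; its chi-square and p-value should match what we just computed by hand:

# Same test via the deviance table (not run above)
anova(null.model, gre.model, test="Chisq")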


Comparing predictions: GRE of 400 vs 401

# Generate data.frame with GRE scores of 400 and 401
gre.new <- c(400, 401)
gre.new <- data.frame(gre = gre.new)

# Generate predicted log-odds
gre.new$log.odds <- -2.901344 + (0.003582 * gre.new$gre)
# Generate predicted odds
gre.new$odds <- exp(-2.901344 + (0.003582 * gre.new$gre))
# Calculate odds ratio
gre.odds.ratio <- exp(-2.901344 + 0.003582) / exp(-2.901344)
# Generate predicted probabilities
gre.new$predicted <- predict(gre.model, newdata=gre.new, type="response")

# Paste results
cat("GRE of 401: log-odds =", round(gre.new$log.odds[gre.new$gre==401], 5), "; odds =",
    round(gre.new$odds[gre.new$gre==401], 5), "; prob =",
    round(gre.new$predicted[gre.new$gre==401], 5), "\nGRE of 400:",
      "log-odds =", round(gre.new$log.odds[gre.new$gre==400], 5), "; odds =",
    round(gre.new$odds[gre.new$gre==400], 5), "; prob =",
    round(gre.new$predicted[gre.new$gre==400], 5), "\nOdds ratio:",
    round(gre.odds.ratio, 6))
## GRE of 401: log-odds = -1.46496 ; odds = 0.23109 ; prob = 0.18772 
## GRE of 400: log-odds = -1.46854 ; odds = 0.23026 ; prob = 0.18718 
## Odds ratio: 1.003588
# We can get this odds ratio by exponentiating our logistic model
# Here I will get the odds ratio and 95% confidence intervals
# Notice we get the same odds ratio of 1.003588
exp(cbind(OR = coef(gre.model), confint(gre.model)))
## Waiting for profiling to be done...
##                    OR      2.5 %    97.5 %
## (Intercept) 0.0549493 0.01624471 0.1755632
## gre         1.0035886 1.00168137 1.0055682
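Because the coefficient is per GRE point, the odds ratio of 1.003588 looks tiny; scaling it to a 100-point difference may be easier to interpret:

# Odds ratio for a 100-point increase in GRE: roughly 1.43, a 43% increase in odds
exp(coef(gre.model)["gre"] * 100)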


Getting probabilities of admission for various GRE scores

# Create data.frame with various GRE scores
gre.new.2 <- c(200, 300, 400, 500, 600, 700, 800)
gre.new.2 <- data.frame(gre = gre.new.2)
# Generate predicted log-odds
gre.new.2$log.odds <- -2.901344 + (0.003582 * gre.new.2$gre)
# Generate predicted odds
gre.new.2$odds <- exp(-2.901344 + (0.003582 * gre.new.2$gre))
# Generate predicted probabilities
gre.new.2$predicted <- predict(gre.model, newdata=gre.new.2, type="response")

# Table of predicted odds and probabilities
gre.new.2 %>%
  group_by(gre) %>%
  summarize(predicted.Odds = round(mean(odds),3), Probability = round(mean(predicted),3))
## Source: local data frame [7 x 3]
## 
##     gre predicted.Odds Probability
##   (dbl)          (dbl)       (dbl)
## 1   200          0.112       0.101
## 2   300          0.161       0.139
## 3   400          0.230       0.187
## 4   500          0.329       0.248
## 5   600          0.471       0.320
## 6   700          0.674       0.403
## 7   800          0.965       0.491
# Plot logistic curve through proportions
# Summarize proportion admitted by GRE score
admitprop <- admit %>%
  group_by(gre) %>%
  summarize(admitprop=mean(admit))
# Predicted log-odds for these GRE scores (back-transformed to probabilities below)
admitprop$predicted <- predict(gre.model, admitprop)
# Plot proportions admitted by GRE score
plot(admitprop ~ gre, data=admitprop)
# Add a line for the predicted probabilities
lines(admitprop$gre, (exp(admitprop$predicted)/(1+exp(admitprop$predicted))), type="l", col="red", lwd=3)
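If we prefer, predict() will do that back-transformation for us by returning probabilities directly (predicted.prob is just an illustrative column name):

# Same curve on the probability scale, without the manual exp(x)/(1+exp(x)) step
admitprop$predicted.prob <- predict(gre.model, admitprop, type="response")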


Predict admissions using all 3 predictors (GRE, GPA, and rank)

# Fit logistic model with all predictors
admit.model <- glm(admit ~ gre + gpa + rank, family=binomial, data=admit)
# Summarize model
summary(admit.model)
## 
## Call:
## glm(formula = admit ~ gre + gpa + rank, family = binomial, data = admit)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.6268  -0.8662  -0.6388   1.1490   2.0790  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -3.989979   1.139951  -3.500 0.000465 ***
## gre          0.002264   0.001094   2.070 0.038465 *  
## gpa          0.804038   0.331819   2.423 0.015388 *  
## rank2       -0.675443   0.316490  -2.134 0.032829 *  
## rank3       -1.340204   0.345306  -3.881 0.000104 ***
## rank4       -1.551464   0.417832  -3.713 0.000205 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 499.98  on 399  degrees of freedom
## Residual deviance: 458.52  on 394  degrees of freedom
## AIC: 470.52
## 
## Number of Fisher Scoring iterations: 4
# Test improvement in deviance
# Compared to model with only gre as a predictor
modelChi <- gre.model$deviance - admit.model$deviance
chidf <- gre.model$df.residual - admit.model$df.residual
chisq.prob <- 1 - pchisq(modelChi, chidf)
paste("p-value for chi-square test of difference in deviance =", round(chisq.prob, 8))
## [1] "p-value for chi-square test of difference in deviance = 1.547e-05"
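As before, anova() gives this same likelihood-ratio test in one call:

# Same comparison via the deviance table; should reproduce p = 1.547e-05
anova(gre.model, admit.model, test="Chisq")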
# Get the odds ratios and confidence intervals
exp(cbind(OR = coef(admit.model), confint(admit.model)))
## Waiting for profiling to be done...
##                    OR       2.5 %    97.5 %
## (Intercept) 0.0185001 0.001889165 0.1665354
## gre         1.0022670 1.000137602 1.0044457
## gpa         2.2345448 1.173858216 4.3238349
## rank2       0.5089310 0.272289674 0.9448343
## rank3       0.2617923 0.131641717 0.5115181
## rank4       0.2119375 0.090715546 0.4706961
# Create the colorful graph (lots of work)
newdata1 <- with(admit, data.frame(gre = mean(gre), gpa = mean(gpa), rank = factor(1:4)))
newdata1$rankP <- predict(admit.model, newdata = newdata1, type = "response")
newdata2 <- with(admit, data.frame(gre = rep(seq(from = 200, to = 800, length.out = 100),
    4), gpa = mean(gpa), rank = factor(rep(1:4, each = 100))))
newdata3 <- cbind(newdata2, predict(admit.model, newdata = newdata2, type = "link",
    se = TRUE))
newdata3 <- within(newdata3, {
    PredictedProb <- plogis(fit)
    LL <- plogis(fit - (1.96 * se.fit))
    UL <- plogis(fit + (1.96 * se.fit))
})

# view first few rows of final dataset
head(newdata3)
##        gre    gpa rank        fit    se.fit residual.scale        UL
## 1 200.0000 3.3899    1 -0.8114870 0.5147714              1 0.5492064
## 2 206.0606 3.3899    1 -0.7977632 0.5090986              1 0.5498513
## 3 212.1212 3.3899    1 -0.7840394 0.5034491              1 0.5505074
## 4 218.1818 3.3899    1 -0.7703156 0.4978239              1 0.5511750
## 5 224.2424 3.3899    1 -0.7565919 0.4922237              1 0.5518545
## 6 230.3030 3.3899    1 -0.7428681 0.4866494              1 0.5525464
##          LL PredictedProb
## 1 0.1393812     0.3075737
## 2 0.1423880     0.3105042
## 3 0.1454429     0.3134499
## 4 0.1485460     0.3164108
## 5 0.1516973     0.3193867
## 6 0.1548966     0.3223773
# Create the plot
ggplot(newdata3, aes(x = gre, y = PredictedProb)) + geom_ribbon(aes(ymin = LL,
    ymax = UL, fill = rank), alpha = 0.2) + geom_line(aes(colour = rank),
    size = 1)


Wald statistic

# Test rank of undergraduate institution using Wald statistic
# Requires library(aod)
# Jointly test whether terms 4-6 (the three rank coefficients) are all zero
wald.test(b = coef(admit.model), Sigma = vcov(admit.model), Terms = 4:6)
## Wald test:
## ----------
## 
## Chi-squared test:
## X2 = 20.9, df = 3, P(> X2) = 0.00011
# Compare term 4 vs term 5 (rank 2 vs rank 3)
# Set up a contrast (as we did with the Scheffe method)
contrast <- cbind(0, 0, 0, 1, -1, 0)
wald.test(b = coef(admit.model), Sigma = vcov(admit.model), L = contrast)
## Wald test:
## ----------
## 
## Chi-squared test:
## X2 = 5.5, df = 1, P(> X2) = 0.019
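A likelihood-ratio version of the overall rank test is to drop rank and compare deviances; it should roughly agree with the Wald chi-square of 20.9 on 3 df (no.rank is a model name introduced here just for illustration):

# Fit the model without rank and compare it to the full model
no.rank <- glm(admit ~ gre + gpa, family=binomial, data=admit)
anova(no.rank, admit.model, test="Chisq")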




Breast cancer example

# Load data from MASS library
# library(MASS)
data(biopsy)
# Examine structure
str(biopsy)
## 'data.frame':    699 obs. of  11 variables:
##  $ ID   : chr  "1000025" "1002945" "1015425" "1016277" ...
##  $ V1   : int  5 5 3 6 4 8 1 2 2 4 ...
##  $ V2   : int  1 4 1 8 1 10 1 1 1 2 ...
##  $ V3   : int  1 4 1 8 1 10 1 2 1 1 ...
##  $ V4   : int  1 5 1 1 3 8 1 1 1 1 ...
##  $ V5   : int  2 7 2 3 2 7 2 2 2 2 ...
##  $ V6   : int  1 10 2 4 1 10 10 1 1 1 ...
##  $ V7   : int  3 3 3 3 3 9 3 3 1 2 ...
##  $ V8   : int  1 2 1 7 1 7 1 1 1 1 ...
##  $ V9   : int  1 1 1 1 1 1 1 1 5 1 ...
##  $ class: Factor w/ 2 levels "benign","malignant": 1 1 1 1 1 2 1 1 1 1 ...
# Eliminate ID variable
biopsy <- biopsy[, -1]
# Name variables
names(biopsy) <- c("thick", "u.size", "u.shape", "adhsn", "s.size", "nucl", "chrom",
                   "n.nuc", "mit", "class")
# Eliminate cases with missing values
biopsy2 <- na.omit(biopsy)
head(biopsy2)
##   thick u.size u.shape adhsn s.size nucl chrom n.nuc mit     class
## 1     5      1       1     1      2    1     3     1   1    benign
## 2     5      4       4     5      7   10     3     2   1    benign
## 3     3      1       1     1      2    2     3     1   1    benign
## 4     6      8       8     1      3    4     3     7   1    benign
## 5     4      1       1     3      2    1     3     1   1    benign
## 6     8     10      10     8      7   10     9     7   1 malignant
# Construct boxplots of each predictor by each value of the outcome variable
# Melt to long format (one row per variable/value pair) to get faceted boxplots
biop.m <- melt(biopsy2, id.var="class")
# Construct the boxplots
ggplot(data=biop.m, aes(x=class, y=value)) +
  geom_boxplot() +
  facet_wrap(~variable, ncol=3)

# Nucl looks important; mit does not
# Let's look at correlations
library(corrplot)
bc = cor(biopsy2[, 1:9])
corrplot.mixed(bc)

# Collinearity problem?

# Check VIF
# library(car)
# Fit full model
full.model <- glm(class ~ ., family=binomial, data=biopsy2)
# Calculate VIF
round(vif(full.model),3)
##   thick  u.size u.shape   adhsn  s.size    nucl   chrom   n.nuc     mit 
##   1.188   2.819   2.763   1.186   1.347   1.142   1.215   1.220   1.042


Split into training and test datasets

# Create training and test datasets
# Set random number seed
set.seed(71930)
# Randomly assign each row to group 1 (~70% of rows) or group 2 (~30%)
ind <- sample(2, nrow(biopsy2), replace=TRUE, prob=c(.7, .3))
# Training dataset = 1; test = 2
train <- biopsy2[ind==1, ]
test <- biopsy2[ind==2, ]
# Check that malignant cases occur in similar proportions in each set
tally(~class, data=train, format="prop")
## 
##    benign malignant 
## 0.6582278 0.3417722
tally(~class, data=test, format="prop")
## 
##    benign malignant 
## 0.6315789 0.3684211


Fit and test logistic model with all predictors

# Fit logistic model with all predictors
full.model <- glm(class ~ ., family=binomial, data=train)
summary(full.model)
## 
## Call:
## glm(formula = class ~ ., family = binomial, data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.4692  -0.0778  -0.0358   0.0045   2.2562  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -12.07441    1.97761  -6.106 1.02e-09 ***
## thick         0.75495    0.22554   3.347 0.000816 ***
## u.size       -0.26684    0.28954  -0.922 0.356730    
## u.shape       0.22282    0.28379   0.785 0.432360    
## adhsn         0.58666    0.20702   2.834 0.004600 ** 
## s.size       -0.06207    0.23246  -0.267 0.789455    
## nucl          0.52526    0.14448   3.635 0.000277 ***
## chrom         0.71157    0.23284   3.056 0.002243 ** 
## n.nuc         0.44724    0.17153   2.607 0.009125 ** 
## mit           0.44050    0.41143   1.071 0.284318    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 608.809  on 473  degrees of freedom
## Residual deviance:  55.022  on 464  degrees of freedom
## AIC: 75.022
## 
## Number of Fisher Scoring iterations: 9
# Chi-square test for improvement in deviance
modelChi <- full.model$null.deviance - full.model$deviance
chidf <- full.model$df.null - full.model$df.residual
chisq.prob <- 1 - pchisq(modelChi, chidf)
chisq.prob
## [1] 0
# Odds ratios and confidence intervals
exp(cbind(OR = coef(full.model), confint(full.model)))
## Waiting for profiling to be done...
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
##                       OR        2.5 %       97.5 %
## (Intercept) 5.703626e-06 5.036254e-08 0.0001420459
## thick       2.127503e+00 1.434655e+00 3.5320359815
## u.size      7.657944e-01 4.385016e-01 1.4097698097
## u.shape     1.249599e+00 6.721431e-01 2.1223295619
## adhsn       1.797980e+00 1.210043e+00 2.8107870239
## s.size      9.398173e-01 5.959236e-01 1.5117183251
## nucl        1.690893e+00 1.304239e+00 2.3273236618
## chrom       2.037192e+00 1.335426e+00 3.3920793079
## n.nuc       1.563996e+00 1.124172e+00 2.2327291779
## mit         1.553485e+00 8.836812e-01 3.5117318200
# See how it fits our training dataset
train$probs <- predict(full.model, type="response")
train$predict <- rep("benign", 474)
train$predict[train$probs > 0.50] = "malignant"
table(train$predict, train$class)
##            
##             benign malignant
##   benign       306         4
##   malignant      6       158
# Let's get the overall % correct
mean(train$predict == train$class)
## [1] 0.978903
# Let's try it on the test dataset
test$probs <- predict(full.model, newdata=test, type="response")
test$predict <- rep("benign", 209)
test$predict[test$probs > 0.50] = "malignant"
table(test$predict, test$class)
##            
##             benign malignant
##   benign       127         4
##   malignant      5        73
mean(test$predict == test$class)
## [1] 0.9569378
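Overall accuracy hides the two kinds of errors, so we could also break the test-set results down by class using the table above (73/77 malignant cases flagged; 127/132 benign cases cleared):

# Sensitivity: proportion of malignant cases predicted malignant
mean(test$predict[test$class == "malignant"] == "malignant")
# Specificity: proportion of benign cases predicted benign
mean(test$predict[test$class == "benign"] == "benign")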


Cross validation

# Let's try cross-validation
# library(bestglm)
# Recode our outcome variable as 0-1
train$y <- rep(0, 474)
train$y[train$class == "malignant"] <- 1
# Could have used this code instead of the previous 2 lines
# train$y <- as.numeric(train$class)-1

# Remove all extraneous variables and make sure
# outcome variable is in last column
biopsy.cv <- train[ , -10:-12]
head(biopsy.cv)
##    thick u.size u.shape adhsn s.size nucl chrom n.nuc mit y
## 1      5      1       1     1      2    1     3     1   1 0
## 4      6      8       8     1      3    4     3     7   1 0
## 5      4      1       1     3      2    1     3     1   1 0
## 6      8     10      10     8      7   10     9     7   1 1
## 7      1      1       1     1      2   10     3     1   1 0
## 10     4      2       1     1      2    1     2     1   1 0
# Find best model
bestglm(Xy = biopsy.cv, IC="CV", CVArgs = list(Method="HTF", K=10, REP=1), family=binomial)
## Morgan-Tatar search since family is non-gaussian.
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## CV(K = 10, REP = 1)
## BICq equivalent for q in (0.0421013207411016, 0.311136342210368)
## Best Model:
##                Estimate Std. Error   z value     Pr(>|z|)
## (Intercept) -10.6047190  1.4089470 -7.526698 5.203948e-14
## thick         0.7600489  0.1645563  4.618776 3.860096e-06
## nucl          0.5420547  0.1166692  4.646082 3.382978e-06
## chrom         0.7864741  0.1933391  4.067847 4.744949e-05
## n.nuc         0.4006591  0.1256526  3.188625 1.429511e-03
# Fit that reduced model to our training dataset
reduce.model <- glm(class ~ thick + nucl + chrom + n.nuc, family=binomial, data=train)
# Test fit
train$cv.probs <- predict(reduce.model, type="response")
train$cv.predict <- rep("benign", 474)
train$cv.predict[train$cv.probs > 0.50] = "malignant"
table(train$cv.predict, train$class)
##            
##             benign malignant
##   benign       306         7
##   malignant      6       155
# Let's get the overall % correct
mean(train$cv.predict == train$class)
## [1] 0.9725738
# Test reduced model on new data
test$cv.probs <- predict(reduce.model, newdata=test, type="response")
test$cv.predict <- rep("benign", 209)
test$cv.predict[test$cv.probs > 0.50] = "malignant"
table(test$cv.predict, test$class)
##            
##             benign malignant
##   benign       127         6
##   malignant      5        71
mean(test$cv.predict == test$class)
## [1] 0.9473684
# That didn't really improve anything
# Let's try again with BIC as our criterion

bestglm(Xy = biopsy.cv, IC="BIC", family=binomial)
## Morgan-Tatar search since family is non-gaussian.
## BIC
## BICq equivalent for q in (0.311136342210368, 0.896220081100116)
## Best Model:
##                Estimate Std. Error   z value     Pr(>|z|)
## (Intercept) -11.5782399  1.7150431 -6.750991 1.468389e-11
## thick         0.8101726  0.1836012  4.412675 1.021013e-05
## adhsn         0.5290541  0.1932962  2.737012 6.200000e-03
## nucl          0.5007588  0.1269485  3.944582 7.993917e-05
## chrom         0.6695838  0.2124473  3.151764 1.622873e-03
## n.nuc         0.4021677  0.1288032  3.122341 1.794187e-03
BIC.model <- glm(class ~ thick + adhsn + nucl + chrom + n.nuc, family=binomial, data=train)
train$BIC.probs <- predict(BIC.model, type="response")
train$BIC.predict <- rep("benign", 474)
train$BIC.predict[train$BIC.probs > 0.50] = "malignant"
table(train$BIC.predict, train$class)
##            
##             benign malignant
##   benign       306         6
##   malignant      6       156
# Let's get the overall % correct
mean(train$BIC.predict == train$class)
## [1] 0.9746835
# Test reduced model on new data
test$BIC.probs <- predict(BIC.model, newdata=test, type="response")
test$BIC.predict <- rep("benign", 209)
test$BIC.predict[test$BIC.probs > 0.50] = "malignant"
table(test$BIC.predict, test$class)
##            
##             benign malignant
##   benign       127         4
##   malignant      5        73
mean(test$BIC.predict == test$class)
## [1] 0.9569378
# To push classification accuracy further, we could turn to discriminant analysis



Toenail infection

# Clear out memory
rm(list = ls())
# Load data
toenail <- read.csv("http://www.bradthiessen.com/html5/data/toenail.csv")

# Plot proportions over time
toenailprop <- toenail %>%
  group_by(treatment, visit) %>%
  summarize(propinfect=mean(outcome), meanmonth=mean(month))

ggplot(toenailprop, aes(meanmonth, propinfect)) +
  geom_point() + geom_line(size=2) +
  facet_grid(. ~ treatment)

## Fit model
toenail.model <- glm(outcome ~ treatment*month, data=toenail, family="binomial")
summary(toenail.model)
## 
## Call:
## glm(formula = outcome ~ treatment * month, family = "binomial", 
##     data = toenail)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.9519  -0.7892  -0.5080  -0.2532   2.7339  
## 
## Coefficients:
##                              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                -0.5566273  0.1089628  -5.108 3.25e-07 ***
## treatmentTerbinafine       -0.0005817  0.1561463  -0.004   0.9970    
## month                      -0.1703078  0.0236199  -7.210 5.58e-13 ***
## treatmentTerbinafine:month -0.0672216  0.0375235  -1.791   0.0732 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1980.5  on 1907  degrees of freedom
## Residual deviance: 1816.0  on 1904  degrees of freedom
## AIC: 1824
## 
## Number of Fisher Scoring iterations: 5
## CIs using profiled log-likelihood
confint(toenail.model)
## Waiting for profiling to be done...
##                                 2.5 %       97.5 %
## (Intercept)                -0.7719213 -0.344475861
## treatmentTerbinafine       -0.3067395  0.305615999
## month                      -0.2182556 -0.125489310
## treatmentTerbinafine:month -0.1419292  0.005563446
## CIs using standard errors
confint.default(toenail.model)
##                                 2.5 %       97.5 %
## (Intercept)                -0.7701903 -0.343064177
## treatmentTerbinafine       -0.3066229  0.305459549
## month                      -0.2166020 -0.124013574
## treatmentTerbinafine:month -0.1407662  0.006323002
## odds ratios only
exp(coef(toenail.model))
##                (Intercept)       treatmentTerbinafine 
##                  0.5731389                  0.9994185 
##                      month treatmentTerbinafine:month 
##                  0.8434052                  0.9349880
## odds ratios and 95% CI
exp(cbind(OR = coef(toenail.model), confint(toenail.model)))
## Waiting for profiling to be done...
##                                   OR     2.5 %    97.5 %
## (Intercept)                0.5731389 0.4621244 0.7085917
## treatmentTerbinafine       0.9994185 0.7358422 1.3574609
## month                      0.8434052 0.8039199 0.8820652
## treatmentTerbinafine:month 0.9349880 0.8676827 1.0055790
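For the terbinafine arm, the monthly odds ratio combines the month and interaction terms; multiplying the two odds ratios above (0.8434 x 0.9350) gives roughly 0.79:

# Monthly odds ratio for the terbinafine group
exp(coef(toenail.model)["month"] + coef(toenail.model)["treatmentTerbinafine:month"])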
# Plot logistic curves through the observed proportions
# predict() expects a column named "month", so copy meanmonth into one
toenailprop$month <- toenailprop$meanmonth
toenailprop$predicted <- predict(toenail.model, toenailprop)
ggplot(toenailprop, aes(month, propinfect)) +
  geom_point() +
  geom_line(aes(month, (exp(predicted)/(1+exp(predicted)))), 
            size=2, col="red", linetype="dashed") +
  geom_line(aes(month, propinfect), size=2, col="black") +
  facet_grid(. ~ treatment)




Poisson regression: Elephants

# Let's load a dataset on the mating of elephants
# Mating is a rare event that may follow a Poisson distribution.
library(gpk)
data(elephant)
head(elephant)
##   Age_in_Years Number_of_Matings
## 1           27                 0
## 2           28                 1
## 3           28                 1
## 4           28                 1
## 5           28                 3
## 6           29                 0
tail(elephant)
##    Age_in_Years Number_of_Matings
## 36           43                 9
## 37           44                 3
## 38           45                 5
## 39           47                 7
## 40           48                 2
## 41           52                 9
elephant$Age <- elephant$Age_in_Years
elephant$Matings <- elephant$Number_of_Matings

## Scatterplot
elephant %>% 
  ggvis(x=~Age, y=~jitter(Matings), fill := "steelblue", stroke:="steelblue", fillOpacity:=.7, strokeOpacity:=.8) %>%
  layer_points() %>%
  add_axis("x", title = "Age") %>%
  scale_numeric("x", domain=c(25, 55), nice=FALSE, clamp=TRUE)  %>%
  add_axis("y", title = "Matings") %>%
  scale_numeric("y", domain=c(0, 10), nice=FALSE, clamp=TRUE)

# The number of matings tends to be higher for older elephants,
# and the variability in matings also increases with age.

mean(elephant$Matings)
## [1] 2.682927
var(elephant$Matings)
## [1] 5.071951
mean(elephant$Matings)/var(elephant$Matings)
## [1] 0.5289733
mean(elephant$Age)
## [1] 35.85366
var(elephant$Age)
## [1] 43.27805
eg <- elephant %>%
  group_by(Age) %>%
  summarize(mean=mean(Matings), var=var(Matings))

plot(log(eg$mean), log(eg$var), xlab="log(Mean matings)",  ylab="log(Variance of matings)",
     main="Figure 4.1. Mean-Variance Relationship")
mtext("matings by age", padj=-0.5)
abline(0,1)

## Null model
null.model <- glm(Matings ~ 1, family=poisson, data=elephant)
summary(null.model)
## 
## Call:
## glm(formula = Matings ~ 1, family = poisson, data = elephant)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.3164  -1.1799  -0.4368   0.1899   3.0252  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  0.98691    0.09535   10.35   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 75.372  on 40  degrees of freedom
## Residual deviance: 75.372  on 40  degrees of freedom
## AIC: 178.82
## 
## Number of Fisher Scoring iterations: 5
exp(coef(null.model))
## (Intercept) 
##    2.682927
mean(elephant$Matings)
## [1] 2.682927
## Show predictions
elephant$null.pred <- exp(predict(null.model, elephant))
plot(elephant$Age,jitter(elephant$Matings))
lines(elephant$Age, elephant$null.pred, lwd=3, col=1)

## Deviance of the null model
deviance(null.model)
## [1] 75.37174
## Chi-square critical value (alpha = .05) to judge the deviance against
round(qchisq(0.95, df.residual(null.model)), 2)
## [1] 55.76
## Model with age
model <- glm(Matings ~ Age, family=poisson, data=elephant)
summary(model)
## 
## Call:
## glm(formula = Matings ~ Age, family = poisson, data = elephant)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.80798  -0.86137  -0.08629   0.60087   2.17777  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.58201    0.54462  -2.905  0.00368 ** 
## Age          0.06869    0.01375   4.997 5.81e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 75.372  on 40  degrees of freedom
## Residual deviance: 51.012  on 39  degrees of freedom
## AIC: 156.46
## 
## Number of Fisher Scoring iterations: 5
exp(coef(model))
## (Intercept)         Age 
##   0.2055619   1.0711071
## Predict matings from age
elephant$predicted <- exp(predict(model, elephant))
lines(elephant$Age, elephant$predicted, lwd=3, col=2)

## Prediction for 30 year-old elephant
exp(coef(model)[1]+coef(model)[2]*30)
## (Intercept) 
##    1.614098
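The same prediction comes straight from predict() on the response scale:

# Predicted matings for a 30-year-old elephant (should also be 1.614)
predict(model, newdata=data.frame(Age=30), type="response")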
## Test model fit
deviance(null.model)
## [1] 75.37174
deviance(model)
## [1] 51.01163
## Chi-square critical value (alpha = .05) for the age model's residual df
round(qchisq(0.95, df.residual(model)), 2)
## [1] 54.57
1-pchisq(deviance(null.model)-deviance(model), df.residual(null.model)-df.residual(model))
## [1] 7.99062e-07
# Since the p-value is small, there is evidence that adding age
# explains a significant amount of the deviance


Example: Student absences

# Load data
absences <- read.csv("http://www.bradthiessen.com/html5/data/absences.csv")
absences$prog <- factor(absences$prog, levels = 1:3, 
                        labels = c("General", "Academic", "Vocational"))
head(absences)
##     id gender math daysabs     prog
## 1 1001   male   63       4 Academic
## 2 1002   male   27       4 Academic
## 3 1003 female   20       2 Academic
## 4 1004 female   16       3 Academic
## 5 1005 female    2       3 Academic
## 6 1006 female   71      13 Academic
tail(absences)
##       id gender math daysabs       prog
## 309 2152 female   46       1 Vocational
## 310 2153   male   26       1   Academic
## 311 2154 female   79       3 Vocational
## 312 2155 female   59       0   Academic
## 313 2156 female   90       0 Vocational
## 314 2157 female   77       2 Vocational
# Summarize
summary(absences)
##        id          gender         math          daysabs      
##  Min.   :1001   female:160   Min.   : 1.00   Min.   : 0.000  
##  1st Qu.:1079   male  :154   1st Qu.:28.00   1st Qu.: 1.000  
##  Median :1158                Median :48.00   Median : 4.000  
##  Mean   :1576                Mean   :48.27   Mean   : 5.955  
##  3rd Qu.:2078                3rd Qu.:70.00   3rd Qu.: 8.000  
##  Max.   :2157                Max.   :99.00   Max.   :35.000  
##          prog    
##  General   : 40  
##  Academic  :167  
##  Vocational:107  
##                  
##                  
## 
mean(absences$daysabs)
## [1] 5.955414
var(absences$daysabs)
## [1] 49.51877
ggplot(absences, aes(daysabs, fill = prog)) + 
  geom_histogram(binwidth = 1, alpha=.5) + 
  facet_grid(prog ~ ., margins = TRUE, scales = "free")

plot(absences$math, absences$daysabs)

# Let's try a model with all the predictors
pois.model <- glm(daysabs ~ gender + math + prog, family=poisson, data=absences)
summary(pois.model)
## 
## Call:
## glm(formula = daysabs ~ gender + math + prog, family = poisson, 
##     data = absences)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -4.2218  -2.1892  -0.9238   0.7218   7.0048  
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     2.7594786  0.0637731  43.270  < 2e-16 ***
## gendermale     -0.2424762  0.0467765  -5.184 2.18e-07 ***
## math           -0.0069561  0.0009354  -7.437 1.03e-13 ***
## progAcademic   -0.4260327  0.0567308  -7.510 5.92e-14 ***
## progVocational -1.2707199  0.0779143 -16.309  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 2217.7  on 313  degrees of freedom
## Residual deviance: 1746.8  on 309  degrees of freedom
## AIC: 2640.2
## 
## Number of Fisher Scoring iterations: 5
exp(coef(pois.model))
##    (Intercept)     gendermale           math   progAcademic progVocational 
##     15.7916072      0.7846824      0.9930680      0.6530950      0.2806295
## The fit is poor: a Poisson model assumes the mean equals the variance,
## but these data are clearly overdispersed
mean(absences$daysabs ~ absences$prog)
##    General   Academic Vocational 
##  10.650000   6.934132   2.672897
var(absences$daysabs ~ absences$prog)
##    General   Academic Vocational 
##   67.25897   55.44744   13.93916
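One quick way to see the overdispersion in the fitted model itself is the ratio of residual deviance to residual degrees of freedom, which should be near 1 for a well-fitting Poisson model:

# Residual deviance / residual df: about 5.7 here, far above 1
deviance(pois.model) / df.residual(pois.model)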
## What can we do with this overdispersion?
## We could try a negative binomial regression
## It has an extra parameter to model overdispersion
library(MASS)
neg.bin.model <- glm.nb(daysabs ~ gender + math + prog, data=absences)
summary(neg.bin.model)
## 
## Call:
## glm.nb(formula = daysabs ~ gender + math + prog, data = absences, 
##     init.theta = 1.047288915, link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.1567  -1.0761  -0.3810   0.2856   2.7235  
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     2.707484   0.204275  13.254  < 2e-16 ***
## gendermale     -0.211086   0.121989  -1.730   0.0836 .  
## math           -0.006236   0.002492  -2.502   0.0124 *  
## progAcademic   -0.424540   0.181725  -2.336   0.0195 *  
## progVocational -1.252615   0.199699  -6.273 3.55e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(1.0473) family taken to be 1)
## 
##     Null deviance: 431.67  on 313  degrees of freedom
## Residual deviance: 358.87  on 309  degrees of freedom
## AIC: 1740.3
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  1.047 
##           Std. Err.:  0.108 
## 
##  2 x log-likelihood:  -1728.307
exp(coef(neg.bin.model))
##    (Intercept)     gendermale           math   progAcademic progVocational 
##     14.9915148      0.8097046      0.9937839      0.6540708      0.2857565
1-pchisq(deviance(neg.bin.model), 
         df.residual(neg.bin.model))
## [1] 0.02656535
neg.bin.no.prog <- update(neg.bin.model, . ~ . - prog)
anova(neg.bin.model, neg.bin.no.prog)
## Likelihood ratio tests of Negative Binomial Models
## 
## Response: daysabs
##                  Model     theta Resid. df    2 x log-lik.   Test    df
## 1        gender + math 0.8705939       311       -1772.074             
## 2 gender + math + prog 1.0472889       309       -1728.307 1 vs 2     2
##   LR stat.      Pr(Chi)
## 1                      
## 2 43.76737 3.133546e-10
# Refit without gender (its effect was only marginal)
neg.bin.model <- glm.nb(daysabs ~ math + prog, data=absences)
newdata1 <- data.frame(math = mean(absences$math), prog = factor(1:3, levels = 1:3, 
    labels = levels(absences$prog)))
newdata1$phat <- predict(neg.bin.model, newdata1, type = "response")
newdata1
##       math       prog      phat
## 1 48.26752    General 10.236899
## 2 48.26752   Academic  6.587927
## 3 48.26752 Vocational  2.850083
newdata2 <- data.frame(
  math = rep(seq(from = min(absences$math), to = max(absences$math), length.out = 100), 3),
  prog = factor(rep(1:3, each = 100), levels = 1:3, labels =
  levels(absences$prog)))

newdata2 <- cbind(newdata2, predict(neg.bin.model, newdata2, type = "link", se.fit=TRUE))
newdata2 <- within(newdata2, {
  DaysAbsent <- exp(fit)
  LL <- exp(fit - 1.96 * se.fit)
  UL <- exp(fit + 1.96 * se.fit)
})

ggplot(newdata2, aes(math, DaysAbsent)) +
  geom_ribbon(aes(ymin = LL, ymax = UL, fill = prog), alpha = .25) +
  geom_line(aes(colour = prog), size = 2) +
  labs(x = "Math Score", y = "Predicted Days Absent")