Load packages
library(mosaic)
library(dplyr)
library(ggplot2)
library(ggvis)
library(parallel)
library(aod)


Logistic Regression

## Binge drinking data
male <- c(rep(1,7152), rep(0,9944))
binge <- c(rep(1,1624), rep(0,5528), rep(1,1690), rep(0,8254))
drink <- data.frame(male, binge)
table(male, binge)
##     binge
## male    0    1
##    0 8254 1690
##    1 5528 1624
## Logistic regression
drink.logmodel <- glm(binge ~ male, data=drink, family=binomial(link="logit"))
summary(drink.logmodel)
## 
## Call:
## glm(formula = binge ~ male, family = binomial(link = "logit"), 
##     data = drink)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.7177  -0.7177  -0.6104  -0.6104   1.8827  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.58597    0.02670 -59.400   <2e-16 ***
## male         0.36104    0.03885   9.292   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 16814  on 17095  degrees of freedom
## Residual deviance: 16728  on 17094  degrees of freedom
## AIC: 16732
## 
## Number of Fisher Scoring iterations: 4
# Plot result
ggplot(drink, aes(x=male, y=binge)) + 
  geom_point(shape=1, alpha=.01, position=position_jitter(width=.05,height=.05)) + 
  stat_smooth(method="glm", family="binomial", se=FALSE)

### Plot logistic curve through proportions
## Create data containing proportion of binge drinkers by gender
binge.drink.proportions <- drink %>%
  group_by(male) %>%
  summarize(drinkprop=mean(binge))
## Find predicted values from our logistic model
binge.drink.proportions$predicted <- predict(drink.logmodel, binge.drink.proportions)
plot(drinkprop ~ male, data=binge.drink.proportions)
lines(binge.drink.proportions$male, (exp(binge.drink.proportions$predicted)/(1+exp(binge.drink.proportions$predicted))), type="l", col="red", lw=3)

## Grad school admissions
## Example 2. A researcher is interested in how variables,
## such as GRE (Graduate Record Exam scores), 
## GPA (grade point average) and prestige of the undergraduate
## institution, affect admission into graduate school. 
## The response variable, admit/don't admit, is a binary variable.

# Load data
admit <- read.csv("http://www.bradthiessen.com/html5/data/admit.csv")
head(admit)
##   admit gre  gpa rank
## 1     0 380 3.61    3
## 2     1 660 3.67    3
## 3     1 800 4.00    1
## 4     1 640 3.19    4
## 5     0 520 2.93    4
## 6     1 760 3.00    2
# Convert rank to a
str(admit)
## 'data.frame':    400 obs. of  4 variables:
##  $ admit: int  0 1 1 1 0 1 1 0 1 0 ...
##  $ gre  : int  380 660 800 640 520 760 560 400 540 700 ...
##  $ gpa  : num  3.61 3.67 4 3.19 2.93 3 2.98 3.08 3.39 3.92 ...
##  $ rank : int  3 3 1 4 4 2 1 2 3 2 ...
admit$rank <- factor(admit$rank)

# Calculate means and standard deviations
sapply(admit, sd)
##       admit         gre         gpa        rank 
##   0.4660867 115.5165364   0.3805668   0.9444602
sapply(admit, mean)
## Warning in mean.default(evalF$right[, 1], ...): argument is not numeric or
## logical: returning NA
##    admit      gre      gpa     rank 
##   0.3175 587.7000   3.3899       NA
# Plot the scatterplot matrix
panel.hist <- function(x, ...)
{
    usr <- par("usr"); on.exit(par(usr))
    par(usr = c(usr[1:2], 0, 1.5) )
    h <- hist(x, plot = FALSE)
    breaks <- h$breaks; nB <- length(breaks)
    y <- h$counts; y <- y/max(y)
    rect(breaks[-nB], 0, breaks[-1], y, col = "cyan", ...)
}
pairs(admit,
      cex = 1, pch = 1, bg = "steelblue",
      diag.panel = panel.hist, cex.labels = 1, font.labels = 2)

## Predict admission from gre scores
admit.gre <- glm(admit ~ gre, data=admit, family="binomial")
summary(admit.gre)
## 
## Call:
## glm(formula = admit ~ gre, family = "binomial", data = admit)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.1623  -0.9052  -0.7547   1.3486   1.9879  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -2.901344   0.606038  -4.787 1.69e-06 ***
## gre          0.003582   0.000986   3.633  0.00028 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 499.98  on 399  degrees of freedom
## Residual deviance: 486.06  on 398  degrees of freedom
## AIC: 490.06
## 
## Number of Fisher Scoring iterations: 4
## CIs using profiled log-likelihood
confint(admit.gre)
##                    2.5 %       97.5 %
## (Intercept) -4.119988259 -1.739756286
## gre          0.001679963  0.005552748
## CIs using standard errors
confint.default(admit.gre)
##                    2.5 %       97.5 %
## (Intercept) -4.089156159 -1.713532380
## gre          0.001649677  0.005514746
library(aod)
wald.test(b = coef(admit.gre), Sigma = vcov(admit.gre), Terms = 2)
## Wald test:
## ----------
## 
## Chi-squared test:
## X2 = 13.2, df = 1, P(> X2) = 0.00028
## odds ratios only
exp(coef(admit.gre))
## (Intercept)         gre 
##   0.0549493   1.0035886
## odds ratios and 95% CI
exp(cbind(OR = coef(admit.gre), confint(admit.gre)))
##                    OR      2.5 %    97.5 %
## (Intercept) 0.0549493 0.01624471 0.1755632
## gre         1.0035886 1.00168137 1.0055682
# Plot logistic curve through proportions
admitprop <- admit %>%
  group_by(gre) %>%
  summarize(admitprop=mean(admit), meangre=mean(gre), admittotal=sum(admit), count=n(),
            notadmit=count-admittotal)

admitprop$predicted <- predict(admit.gre, admitprop)
plot(admitprop ~ gre, data=admitprop)
lines(admitprop$gre, (exp(admitprop$predicted)/(1+exp(admitprop$predicted))), type="l", col="red", lw=3)

## Challenger example
temp <- c(66, 69, 68, 72, 73, 70, 78, 67, 79, 70, 67, 57, 81,
          76, 76, 63, 70, 53, 67, 75, 70, 75, 58, 80, 31)
damage <- c(rep(0,9), rep(1,14), rep(NA,2))
shuttle <- data.frame(temp, damage)
head(shuttle)
##   temp damage
## 1   66      0
## 2   69      0
## 3   68      0
## 4   72      0
## 5   73      0
## 6   70      0
## Predict damage from temp
shuttle.temp <- glm(damage ~ temp, data=shuttle, family="binomial")
summary(shuttle.temp)
## 
## Call:
## glm(formula = damage ~ temp, family = "binomial", data = shuttle)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.4859  -1.2959   0.7172   0.9978   1.3002  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)
## (Intercept)  5.03663    4.81697   1.046    0.296
## temp        -0.06569    0.06819  -0.963    0.335
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 30.789  on 22  degrees of freedom
## Residual deviance: 29.778  on 21  degrees of freedom
##   (2 observations deleted due to missingness)
## AIC: 33.778
## 
## Number of Fisher Scoring iterations: 4
## CIs using profiled log-likelihood
confint(shuttle.temp)
##                  2.5 %      97.5 %
## (Intercept) -3.7413959 15.89616057
## temp        -0.2179992  0.05973434
## CIs using standard errors
confint.default(shuttle.temp)
##                  2.5 %      97.5 %
## (Intercept) -4.4044608 14.47771468
## temp        -0.1993375  0.06795919
## odds ratios only
exp(coef(shuttle.temp))
## (Intercept)        temp 
## 153.9498532   0.9364219
## odds ratios and 95% CI
exp(cbind(OR = coef(shuttle.temp), confint(shuttle.temp)))
##                      OR      2.5 %       97.5 %
## (Intercept) 153.9498532 0.02372097 8.009674e+06
## temp          0.9364219 0.80412610 1.061554e+00
# Plot proportions and logistic function
shuttleprob <- shuttle %>%
  group_by(temp) %>%
  summarize(damageprop=mean(damage), temperature=mean(temp))

shuttleprob$predicted <- predict(shuttle.temp, shuttleprob)
plot(damageprop ~ temperature, data=shuttleprob)
lines(shuttleprob$temperature, (exp(shuttleprob$predicted)/(1+exp(shuttleprob$predicted))), type="l", col="red", lw=3)