---
title: 'MATH 301: Activity 12'
author: "Brad Thiessen"
date: "October 26, 2014"
output: html_document
---

*****

###### Load packages

```{r, message=FALSE}
library(mosaic)
library(dplyr)
library(ggplot2)
library(ggvis)
library(parallel)
```
```{r, message=FALSE}
########################
########################
###  Polynomial Reg. ###
########################
########################

## Load data
geiger <- read.csv("http://www.bradthiessen.com/html5/data/geiger.csv")
head(geiger)
tail(geiger)
cor(counts ~ time, data=geiger)

## Fit linear model
lin.mod <- lm(counts ~ time, data=geiger)
summary(lin.mod)
anova(lin.mod)
## R-squared is 0.7687 (looks good)

## Assumptions check
par(mfrow=c(2,2))  # visualize four graphs at once
plot(lin.mod)
par(mfrow=c(1,1))  # reset the graphics defaults

## Show linear model
geiger %>%
  ggvis(x=~time, y=~counts, fill := "steelblue", stroke:="steelblue",
        fillOpacity:=.7, strokeOpacity:=.7) %>%
  layer_points() %>%
  layer_model_predictions(model = "lm", strokeWidth:=5, strokeOpacity:=.7,
                          stroke:="red") %>%
  add_axis("x", title = "time") %>%
  scale_numeric("x", domain=c(0,35), nice=FALSE, clamp=TRUE) %>%
  add_axis("y", title = "counts") %>%
  scale_numeric("y", domain=c(0, 130), nice=FALSE, clamp=TRUE)

## Show lowess
geiger %>%
  ggvis(x=~time, y=~counts, fill := "steelblue", stroke:="steelblue",
        fillOpacity:=.7, strokeOpacity:=.7) %>%
  layer_points() %>%
  layer_smooths(strokeWidth:=5, strokeOpacity:=.7, stroke:="red") %>%
  add_axis("x", title = "time") %>%
  scale_numeric("x", domain=c(0,35), nice=FALSE, clamp=TRUE) %>%
  add_axis("y", title = "counts") %>%
  scale_numeric("y", domain=c(0, 130), nice=FALSE, clamp=TRUE)

## LOWESS with a slider for the span
library(shiny)
geiger %>%
  ggvis(x=~time, y=~counts, fill := "steelblue", stroke:="steelblue",
        fillOpacity:=.7, strokeOpacity:=.7) %>%
  layer_points() %>%
  layer_smooths(strokeWidth:=5, strokeOpacity:=.7, stroke:="red",
                span = input_slider(min=0.1, max=1, value=.5, step=.1,
                                    label="smoothing span")) %>%
  add_axis("x", title = "time") %>%
  scale_numeric("x", domain=c(0,35), nice=FALSE, clamp=TRUE) %>%
  add_axis("y", title = "counts") %>%
  scale_numeric("y", domain=c(0, 130), nice=FALSE, clamp=TRUE)

## Add a quadratic term
quad.mod <- lm(counts ~ time + I(time^2), data=geiger)
summary(quad.mod)
## R-squared = 0.9079

## Check assumptions
par(mfrow=c(2,2))  # visualize four graphs at once
plot(quad.mod)
par(mfrow=c(1,1))  # reset the graphics defaults

## Plot model
plot(counts ~ time, data=geiger)
curve(107.97130 - 7.3433*x + 0.15057*x^2, add=TRUE, lwd=4, col=2)

## Compare to the null and linear models
null.mod <- lm(counts ~ 1, data=geiger)
anova(null.mod, lin.mod, quad.mod)

## Add a cubic term
cub.mod <- lm(counts ~ time + I(time^2) + I(time^3), data=geiger)
summary(cub.mod)
## R-squared = 0.9150

## Check assumptions
par(mfrow=c(2,2))  # visualize four graphs at once
plot(cub.mod)
par(mfrow=c(1,1))  # reset the graphics defaults

## Plot model
plot(counts ~ time, data=geiger)
curve(113.25135 - 9.646057*x + 0.345644*x^2 - 0.004335*x^3, add=TRUE, lwd=4, col=2)

## Compare to the previous models
anova(null.mod, lin.mod, quad.mod, cub.mod)

## Let's go to a 7th-degree model
seven.mod <- lm(counts ~ time + I(time^2) + I(time^3) + I(time^4) +
                  I(time^5) + I(time^6) + I(time^7), data=geiger)
summary(seven.mod)
## R-squared = 0.9314

## Check assumptions
par(mfrow=c(2,2))  # visualize four graphs at once
plot(seven.mod)
par(mfrow=c(1,1))  # reset the graphics defaults

## Plot model
coef(seven.mod)
plot(counts ~ time, data=geiger)
curve(123.8349 - 31.64351*x + 9.881051*x^2 - 1.711959*x^3 + 0.1523853*x^4 -
        0.00713867*x^5 + 0.0001677408*x^6 - 0.00000155787*x^7,
      add=TRUE, lwd=4, col=2)

## Compare to the quadratic model
anova(quad.mod, seven.mod)
## Overfits the data
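## (Added sketch, not part of the original activity.) AIC and BIC give a
## quick penalized comparison of the polynomial fits before we cross-validate;
## lower is better, and BIC charges more for extra terms, so it should be
## even more skeptical of the 7th-degree model than AIC is.
AIC(null.mod, lin.mod, quad.mod, cub.mod, seven.mod)
BIC(null.mod, lin.mod, quad.mod, cub.mod, seven.mod)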
## Cross-validate to choose the best model
library(DAAG)
geiger$time2 <- geiger$time^2
geiger$time3 <- geiger$time^3
geiger$time4 <- geiger$time^4
cross.validated.model <- lm(counts ~ time + time2 + time3 + time4, data=geiger)
## This command yields ms = 110
# CVlm(geiger, cross.validated.model, m=10)
cross.validated.model <- lm(counts ~ time + time2 + time3, data=geiger)
## This command yields ms = 105
# CVlm(geiger, cross.validated.model, m=10)
cross.validated.model <- lm(counts ~ time + time2, data=geiger)
## This command yields ms = 103
# CVlm(geiger, cross.validated.model, m=10)
cross.validated.model <- lm(counts ~ time, data=geiger)
## This command yields ms = 219
# CVlm(geiger, cross.validated.model, m=10)
## The model with the squared term has the lowest average mean square error

### Another example: speed vs. stopping distance
data(cars)
head(cars)

## Scatterplot with lowess
cars %>%
  ggvis(x=~speed, y=~dist, fill := "steelblue", stroke:="steelblue",
        fillOpacity:=.7, strokeOpacity:=.7) %>%
  layer_points() %>%
  layer_smooths(strokeWidth:=5, strokeOpacity:=.7, stroke:="red") %>%
  add_axis("x", title = "speed") %>%
  scale_numeric("x", domain=c(0,26), nice=FALSE, clamp=TRUE) %>%
  add_axis("y", title = "stopping distance") %>%
  scale_numeric("y", domain=c(0, 125), nice=FALSE, clamp=TRUE)

## Set up models
null.model <- lm(dist ~ 1, data=cars)
linear.model <- lm(dist ~ speed, data=cars)
quadratic.model <- lm(dist ~ speed + I(speed^2), data=cars)
cubic.model <- lm(dist ~ speed + I(speed^2) + I(speed^3), data=cars)

## Test models
anova(null.model, linear.model, quadratic.model, cubic.model)
plot(cars, main="lowess(cars)")
lines(lowess(cars), col=4, lwd=3)
plot(dist ~ speed, data=cars)
curve(-17.579095 + 3.932409*x, add=TRUE, lwd=3)
curve(2.4701378 + 0.9132876*x + 0.0999593*x^2, add=TRUE, col=2, lwd=3)

## Load data
prestige <- read.csv("http://www.bradthiessen.com/html5/data/prestige.csv")

## Set up models
mod0 <- lm(prestige ~ 1, data=prestige)
mod1 <- lm(prestige ~ income, data=prestige)
mod2 <- lm(prestige ~ income + I(income^2), data=prestige)
mod3 <- lm(prestige ~ income + I(income^2) + I(income^3), data=prestige)

## Test models
anova(mod0, mod1, mod2, mod3)
plot(prestige ~ income, data=prestige)
curve(27.141176368 + 0.002896799*x, add=TRUE, lwd=3)
curve(14.18319 + 0.006153506*x - 0.0000001433097*x^2, add=TRUE, col=2, lwd=3)

## Cross-validate
prestige$income2 <- prestige$income^2
prestige$income3 <- prestige$income^3
prestige$income4 <- prestige$income^4
cross.validated.model <- lm(prestige ~ income + income2 + income3 + income4, data=prestige)
## This command yields ms = 134
# CVlm(prestige, cross.validated.model, m=10)
cross.validated.model <- lm(prestige ~ income + income2 + income3, data=prestige)
## This command yields ms = 131
# CVlm(prestige, cross.validated.model, m=10)
cross.validated.model <- lm(prestige ~ income + income2, data=prestige)
## This command yields ms = 130
# CVlm(prestige, cross.validated.model, m=10)
cross.validated.model <- lm(prestige ~ income, data=prestige)
## This command yields ms = 154
# CVlm(prestige, cross.validated.model, m=10)
### Cross-validation indicates we should include the quadratic term

## Check assumptions
par(mfrow=c(2,2))  # visualize four graphs at once
plot(mod2)
par(mfrow=c(1,1))  # reset the graphics defaults
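## (Added sketch, not part of the original activity.) Raw polynomial terms
## like income and income^2 are highly correlated, which makes the individual
## coefficients unstable. stats::poly() fits the same curve with orthogonal
## polynomial terms; the fitted values match mod2 exactly.
mod2.orth <- lm(prestige ~ poly(income, 2), data=prestige)
summary(mod2.orth)
all.equal(as.numeric(fitted(mod2)), as.numeric(fitted(mod2.orth)))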
########################
########################
#### Nonlinear Reg. ####   -- Skip?
########################
########################

## Temperature and vapor pressure of mercury (mm of mercury)
# Handbook of Chemistry and Physics, CRC Press (1973)
data(pressure)
head(pressure)

# Change temperature to Kelvin
pressure$temperature <- pressure$temperature + 273.15
# Change pressure to kilopascals
pressure$pressure <- pressure$pressure * .1333

# Source: http://ww2.coastal.edu/kingw/statistics/R-tutorials/simplenonlinear.html
# When a liquid is placed in an evacuated, closed chamber (i.e., in a
# "vacuum"), it begins to evaporate. This vaporization continues until the
# vapor and the liquid reach a dynamic equilibrium (the number of particles
# of liquid that go into the vapor equals the number of particles of vapor
# that go into the liquid). At this point, the pressure exerted by the vapor
# on the liquid is called the vapor pressure (or equilibrium vapor pressure)
# of the substance. Vapor pressure has a complex relationship with
# temperature, and so far as I know, there is no theoretical law that
# specifies this relationship (at least not in the case of mercury). Since
# we are dealing with a liquid/vapor phase transition, and not just a gas,
# the simple gas laws you learned back in high school chemistry don't apply.
# The vapor pressure of mercury and its relationship to temperature has been
# an important question, for example, in the calibration of scientific
# instruments like mercury thermometers and barometers.

plot(pressure)
summary(pressure)

# Plot the data on four sets of axes:
# identity, log(y) [exponential model], log(x) and log(y) [power model], log(x)
par(mfrow=c(1,4))  # one row of four graphs
plot(pressure ~ temperature, data=pressure, main="Vapor Pressure\nof Mercury",
     xlab="Temperature (degrees Kelvin)", ylab="Pressure (kPascals)")
plot(pressure ~ temperature, data=pressure, main="Vapor Pressure\nof Mercury",
     xlab="Temperature (degrees Kelvin)", ylab="Pressure (kPascals)", log="y")
plot(pressure ~ temperature, data=pressure, main="Vapor Pressure\nof Mercury",
     xlab="Temperature (degrees Kelvin)", ylab="Pressure (kPascals)", log="xy")
plot(pressure ~ temperature, data=pressure, main="Vapor Pressure\nof Mercury",
     xlab="Temperature (degrees Kelvin)", ylab="Pressure (kPascals)", log="x")
par(mfrow=c(1,1))  # reset the graphics defaults

## Plot power model
linear.model <- lm(pressure ~ temperature, data=pressure)
summary(linear.model)
quad.model <- lm(pressure ~ temperature + I(temperature^2), data=pressure)
summary(quad.model)
power.model <- lm(log(pressure) ~ log(temperature), data=pressure)
summary(power.model)
newdf <- data.frame(temperature=seq(min(pressure$temperature),
                                    max(pressure$temperature), len=19))
plot(pressure ~ temperature, data=pressure)
lines(newdf$temperature, exp(predict(power.model, newdf)), col=2, lwd=4)
lines(newdf$temperature, predict(linear.model, newdf), col=3, lwd=4)
lines(newdf$temperature, predict(quad.model, newdf), col=4, lwd=4)
text(600, .75, substitute(plain("R-square: ") * r2,
                          list(r2=summary(power.model)$r.squared)))
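## (Added sketch, not part of the original activity.) The power model above is
## still linear regression, just on log scales. nls() fits a curve on the
## original scale instead. A minimal sketch, assuming an exponential form
## pressure = a * exp(b * temperature); the starting values come from a
## log-linear fit, since nls() needs sensible starts (tweak them if it fails
## to converge).
start.fit <- lm(log(pressure) ~ temperature, data=pressure)
exp.nls <- nls(pressure ~ a * exp(b * temperature), data=pressure,
               start=list(a=exp(coef(start.fit)[[1]]), b=coef(start.fit)[[2]]))
summary(exp.nls)
lines(newdf$temperature, predict(exp.nls, newdf), col=6, lwd=2, lty=2)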
## Try the same transformations with the prestige data
par(mfrow=c(1,4))  # one row of four graphs
plot(prestige ~ income, data=prestige, xlab="Income", ylab="Prestige")
plot(prestige ~ income, data=prestige, xlab="Income", ylab="Prestige", log="x")
plot(prestige ~ income, data=prestige, xlab="Income", ylab="Prestige", log="y")
plot(prestige ~ income, data=prestige, xlab="Income", ylab="Prestige", log="xy")
par(mfrow=c(1,1))  # reset the graphics defaults

linear.model <- lm(prestige ~ income, data=prestige)
summary(linear.model)
power.model <- lm(log(prestige) ~ log(income), data=prestige)
summary(power.model)
logx.model <- lm(prestige ~ log(income), data=prestige)
summary(logx.model)
logy.model <- lm(log(prestige) ~ income, data=prestige)
summary(logy.model)
newdf <- data.frame(income=seq(min(prestige$income), max(prestige$income), len=100))
plot(prestige ~ income, data=prestige)
lines(newdf$income, exp(predict(power.model, newdf)), col=2, lwd=4)
lines(newdf$income, predict(linear.model, newdf), col=3, lwd=4)
lines(newdf$income, predict(logx.model, newdf), col=4, lwd=4)

## Lowess for prestige
prestige %>%
  ggvis(x=~income, y=~prestige, fill := "steelblue", stroke:="steelblue",
        fillOpacity:=.7, strokeOpacity:=.7) %>%
  layer_points() %>%
  layer_smooths(stroke:="red", strokeWidth:=5) %>%
  add_axis("x", title = "Income") %>%
  scale_numeric("x", domain=c(0,25000), nice=FALSE, clamp=TRUE) %>%
  add_axis("y", title = "Prestige") %>%
  scale_numeric("y", domain=c(0, 100), nice=FALSE, clamp=TRUE)

########################
########################
#####  Robust Reg. #####
########################
########################

# First, fit a linear model
linear.model <- lm(prestige ~ income + education + percwomn, data=prestige)
summary(linear.model)

# Now take 5000 bootstrap samples using the residual approach
# (described in the activity)
library(car)
linear.boot <- Boot(linear.model, R=5000, method="residual")
summary(linear.boot)

## Fictitious data with an outlier
x <- c(1, 2, 3, 4, 5, 6, 7, 8, 9, 25)
y <- c(1.1, 1.9, 3.1, 3.9, 5.1, 5.9, 7.1, 7.9, 9.1, 0)
df <- data.frame(x, y)

# Create linear model
linear.model <- lm(y ~ x, df)
summary(linear.model)

# Plot linear model
newdf <- data.frame(x=seq(min(df$x), max(df$x), len=100))
plot(y ~ x, data=df)
lines(newdf$x, predict(linear.model, newdf), col=2, lwd=4)

# Create bootstrap model with randomly sampled cases
boot2 <- Boot(linear.model, R = 10000, method="case")
summary(boot2)

# Plot the bootstrap model
# (original estimates adjusted by the bootstrap bias, hard-coded from one run
#  of summary(boot2); your numbers will vary slightly)
plot(y ~ x, data=df)
curve((5.0133 - 1.586) + ((-0.0719 + 0.347)*x), add=TRUE, lwd=3, col=2)

# Get predicted values from the bootstrap regression
df$pred <- 3.43 + 0.275*x

## Fit lowess
df %>%
  ggvis(x=~x, y=~y, fill := "steelblue", stroke:="steelblue",
        fillOpacity:=.7, strokeOpacity:=.7) %>%
  layer_points() %>%
  layer_smooths(strokeWidth:=5, strokeOpacity:=.7, stroke:="red") %>%
  add_axis("x", title = "x") %>%
  scale_numeric("x", domain=c(0,25), nice=FALSE, clamp=TRUE) %>%
  add_axis("y", title = "y") %>%
  scale_numeric("y", domain=c(0, 10), nice=FALSE, clamp=TRUE)

# Create a bootstrap model for the prestige data
linear.model <- lm(prestige ~ income + education + percwomn, data=prestige)
boot3 <- Boot(linear.model, R = 5000, method="case")
summary(boot3)
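## (Added sketch, not part of the original activity.) Bootstrapping is one
## route; a more standard "robust regression" tool is an M-estimator via
## MASS::rlm(). The default Huber estimate has little protection against a
## high-leverage point like (25, 0), so this sketch uses high-breakdown
## MM-estimation. (Note: MASS masks dplyr::select(), which isn't used below.)
library(MASS)
ols.model <- lm(y ~ x, data=df)
mm.model <- rlm(y ~ x, data=df, method="MM")
summary(mm.model)
plot(y ~ x, data=df)
abline(ols.model, col=2, lwd=3)  # OLS: dragged toward the outlier
abline(mm.model, col=4, lwd=3)   # MM: follows the other nine points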
########################
########################
##### Quantile Reg #####
########################
########################

## Load data
hsb <- read.csv("http://www.bradthiessen.com/html5/data/hsb.csv")
head(hsb)
hsb %>%
  group_by(schid) %>%
  summarize(students=n(), school.type=mean(schtype), mean.mathach=mean(mathach))

## Create scatterplot matrix
hsb2 <- data.frame(hsb$minority, hsb$female, scale(hsb$ses),
                   scale(hsb$mathach), hsb$size)
pairs(hsb2, col=rgb(0, 0, 0, 0.01))

# Is math achievement significantly related to SES?
linear.model <- lm(scale(mathach) ~ scale(ses), data=hsb)
summary(linear.model)
anova(linear.model)

## Show linear model
hsb %>%
  ggvis(x=~scale(ses), y=~scale(mathach), fill := "steelblue", stroke:="steelblue",
        fillOpacity:=.05, strokeOpacity:=.1) %>%
  layer_points() %>%
  layer_model_predictions(model = "lm", strokeWidth:=5, strokeOpacity:=.7,
                          stroke:="red", se=TRUE) %>%
  add_axis("x", title = "SES") %>%
  scale_numeric("x", domain=c(-4,3), nice=FALSE, clamp=TRUE) %>%
  add_axis("y", title = "Math Achievement") %>%
  scale_numeric("y", domain=c(-3, 2), nice=FALSE, clamp=TRUE)

# Quantile regression asks whether a relationship changes across the
# distribution of the outcome: is the relationship between SES and math
# achievement different at different levels of math achievement?
library(quantreg)

# Median regression
q50 <- rq(scale(mathach) ~ scale(ses), tau=0.50, data=hsb)
summary(q50)

# Graph median regression
plot(scale(hsb$ses), scale(hsb$mathach), cex=.25, type="n",
     xlab="SES", ylab="Math Achievement")
points(scale(hsb$ses), scale(hsb$mathach), cex=.5, col=rgb(0, 0, 0, 0.1))
abline(rq(scale(hsb$mathach) ~ scale(hsb$ses), tau=.5), col=rgb(0, 0, 1, 1), lwd=4)
abline(lm(scale(hsb$mathach) ~ scale(hsb$ses)), lty=2, col=rgb(1, 0, 0, 1), lwd=4)  # the dreaded ols line

# 10th-90th percentiles
quantregtens <- rq(scale(mathach) ~ scale(ses), data=hsb,
                   tau=c(0.10, 0.20, 0.30, 0.40, 0.50, 0.60, 0.70, 0.80, 0.90))
quantreg.plot <- summary(quantregtens)
plot(quantreg.plot, mfrow=c(1,2))
plot(scale(hsb$ses), scale(hsb$mathach), cex=.25, type="n",
     xlab="SES", ylab="Math Achievement")
points(scale(hsb$ses), scale(hsb$mathach), cex=.5, col=rgb(0, 0, 0, 0.1))
abline(rq(scale(hsb$mathach) ~ scale(hsb$ses), tau=.5), col=rgb(0, 0, 1, 1), lwd=4)
abline(lm(scale(hsb$mathach) ~ scale(hsb$ses)), lty=2, col=rgb(1, 0, 0, 1), lwd=4)  # the dreaded ols line
taus <- c(.1, .3, .5, .7, .9)
for (i in 1:length(taus)) {
  abline(rq(scale(hsb$mathach) ~ scale(hsb$ses), tau=taus[i]),
         col=rgb(0.5, 0.5, 0.5, 1), lwd=2)
}

## Test the difference in slopes at the 25th and 50th percentiles
q25 <- rq(scale(mathach) ~ scale(ses), tau=0.25, data=hsb)
summary(q25)
anova(q25, q50)

## Minority achievement gap
# 10th-90th percentiles
quantregtens <- rq(scale(mathach) ~ minority, data=hsb,
                   tau=c(0.10, 0.20, 0.30, 0.40, 0.50, 0.60, 0.70, 0.80, 0.90))
quantreg.plot <- summary(quantregtens)
plot(quantreg.plot, mfrow=c(1,2))
plot(hsb$minority, scale(hsb$mathach), cex=.25, type="n",
     xlab="Minority", ylab="Math Achievement")
points(hsb$minority, scale(hsb$mathach), cex=.5, col=rgb(0, 0, 0, 0.1))
abline(rq(scale(hsb$mathach) ~ hsb$minority, tau=.5), col=rgb(0, 0, 1, 1), lwd=4)
abline(lm(scale(hsb$mathach) ~ hsb$minority), lty=2, col=rgb(1, 0, 0, 1), lwd=4)  # the dreaded ols line
taus <- c(.1, .3, .5, .7, .9)
for (i in 1:length(taus)) {
  abline(rq(scale(hsb$mathach) ~ hsb$minority, tau=taus[i]),
         col=rgb(0.5, 0.5, 0.5, 1), lwd=2)
}

# Multiple quantile regression
quantregtens <- rq(scale(mathach) ~ minority + scale(ses), data=hsb,
                   tau=c(0.10, 0.20, 0.30, 0.40, 0.50, 0.60, 0.70, 0.80, 0.90))
quantreg.plot <- summary(quantregtens)
plot(quantreg.plot, mfrow=c(1,3))

# Compare 25th and 50th percentiles
q25 <- rq(scale(mathach) ~ scale(ses) + minority, tau=.25, data=hsb)
q50 <- rq(scale(mathach) ~ scale(ses) + minority, tau=.50, data=hsb)
anova(q25, q50)
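## (Added sketch, not part of the original activity.) summary.rq() picks its
## standard-error method based on sample size; asking for bootstrapped
## standard errors explicitly (se="boot") is a useful robustness check on
## the coefficient table above. R is the number of bootstrap replications.
summary(rq(scale(mathach) ~ scale(ses) + minority, tau=.50, data=hsb),
        se="boot", R=200)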
# Plot quantile lines
plot(scale(hsb$ses), scale(hsb$mathach), cex=.25, type="n",
     xlab="SES", ylab="Math Achievement")
points(scale(hsb$ses), scale(hsb$mathach), cex=.5, col=rgb(0, 0, 0, 0.1))
abline(rq(scale(hsb$mathach) ~ scale(hsb$ses) + hsb$minority, tau=.5),
       col=rgb(0, 0, 1, 1), lwd=4)
abline(lm(scale(hsb$mathach) ~ scale(hsb$ses) + hsb$minority),
       lty=2, col=rgb(1, 0, 0, 1), lwd=4)  # the dreaded ols line
taus <- c(.1, .3, .5, .7, .9)
for (i in 1:length(taus)) {
  abline(rq(scale(hsb$mathach) ~ scale(hsb$ses) + hsb$minority, tau=taus[i]),
         col=rgb(0.5, 0.5, 0.5, 1), lwd=2)
}

########################
########################
##### ANOVA as Reg #####
########################
########################

#### Ambiguous prose
#### Load the data from the web
ambiguous <- read.csv("http://www.bradthiessen.com/html5/data/ambiguousprose.csv")
str(ambiguous)
dotPlot(~ Comprehension | Condition, data = ambiguous, nint = 17,
        endpoints = c(-2, 10), layout = c(1,3), aspect = .35, cex = .7,
        xlab = "Comprehension score (0-7)")

#### Run an ANOVA modeling comprehension as a function of condition (treatments)
mod1 <- aov(Comprehension ~ Condition, data=ambiguous)
anova(mod1)
mean(Comprehension ~ Condition, data=ambiguous)

## Create a possible coding (ordinal code for condition)
## Note that this is a bad idea!
ambiguous$condcode[ambiguous$Condition=="After"] <- 1
ambiguous$condcode[ambiguous$Condition=="Before"] <- 2
ambiguous$condcode[ambiguous$Condition=="None"] <- 3

## Run a linear regression with this condition code
## Still a bad idea!
cond.code <- lm(Comprehension ~ condcode, data=ambiguous)
summary(cond.code)

## Show linear model
ambiguous %>%
  ggvis(x=~condcode, y=~Comprehension, fill := "steelblue", stroke:="steelblue",
        fillOpacity:=.3, strokeOpacity:=.5) %>%
  layer_points() %>%
  layer_model_predictions(model = "lm", strokeWidth:=5, strokeOpacity:=.7,
                          stroke:="red") %>%
  add_axis("x", title = "Condition code") %>%
  scale_numeric("x", domain=c(1,3), nice=FALSE, clamp=TRUE) %>%
  add_axis("y", title = "Comprehension") %>%
  scale_numeric("y", domain=c(0, 7), nice=FALSE, clamp=TRUE)

## Create dummy variables x1, x2, x3
## Note that creating the 3rd dummy variable is a bad idea!
ambiguous$x1 <- ifelse(ambiguous$Condition == "After", 1, 0)
ambiguous$x2 <- ifelse(ambiguous$Condition == "Before", 1, 0)
ambiguous$x3 <- ifelse(ambiguous$Condition == "None", 1, 0)

## Run a linear regression with all 3 dummy variables
## Bad idea!  (x3 is redundant with the other two, so R drops it)
lin.mod <- lm(Comprehension ~ x1 + x2 + x3, data=ambiguous)
anova(lin.mod)
summary(lin.mod)

## Run a linear regression with the first two dummy variables
## Bad idea only because R's factor coding will do this for you (below)
lin.mod <- lm(Comprehension ~ x1 + x2, data=ambiguous)
anova(lin.mod)
summary(lin.mod)

## Run a linear regression with the factor variable
## Same output as above
lin.mod <- lm(Comprehension ~ Condition, data=ambiguous)
anova(lin.mod)
summary(lin.mod)

##############
# AxB ANOVA  #
##############
data(ToothGrowth)
str(ToothGrowth)
ToothGrowth$dose <- as.factor(ToothGrowth$dose)

# AxB ANOVA
aov.out <- aov(len ~ supp * dose, data=ToothGrowth)
summary(aov.out)

# As regression
tooth.model <- lm(len ~ supp * dose, data=ToothGrowth)
anova(tooth.model)
summary(tooth.model)
```
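
###### One more check: the dummy coding R builds for you

The last section hand-built dummy variables to show that ANOVA is just regression on indicators. As a minimal added sketch (not part of the original activity), `model.matrix()` exposes the design matrix that `lm(Comprehension ~ Condition)` actually uses: an intercept plus indicator columns for two of the three conditions, with the first factor level ("After") serving as the reference group, the same idea as the hand-built dummies above.

```{r}
## Design matrix R constructs from the factor
head(model.matrix(~ Condition, data=ambiguous))
## The treatment-contrast (dummy) coding behind it
contrasts(factor(ambiguous$Condition))
```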