--- title: 'MATH 301: Activity 1' author: "Brad Thiessen" date: "October 17, 2014" output: html_document --- ***** ###### Load packages ```{r, message=FALSE} library(mosaic) library(dplyr) library(ggplot2) library(ggvis) library(parallel) ```
#### St. Ambrose study hours How many hours do you study in a typical week? How does that compare to the hours you spent studying as a freshman? To investigate this, we're going to look at some data collected from students who entered St. Ambrose in Fall 2013. Specifically, we're going to look at student responses to these questions: • `S1_NA150`: In an average week, how many hours do you spend studying? • `NA152`: (The same question asked to the same students the following year) • `S1_D148`: In an average day, how many hours do you spend relaxing or socializing? • `D150`: (The same question asked to the same students the following year) Let's load the data and take a look: ```{r fig.width=8, fig.height=4, message=FALSE} #### Load the data from the web mapworks <- read.csv("http://www.bradthiessen.com/html5/data/f13s14.csv") dim(mapworks) #### The data contains 926 variables for 569 students #### Let's drop all the variables except our 4 questions of interest hours <- mapworks %>% select(S1_NA150, NA152, S1_D148, D150) #### Let's rename the variables hours$fresh_study <- hours$S1_NA150 hours$soph_study <- hours$NA152 hours$fresh_relax <- hours$S1_D148 hours$soph_relax <- hours$D150 #### Now let's take a look at our data head(hours) dim(hours) #### We have 569 observations #### Eliminate students who did not answer any question hours <- hours %>% select(fresh_study, soph_study, fresh_relax, soph_relax) %>% filter(!is.na(fresh_study) | !is.na(soph_study) | !is.na(fresh_relax) | !is.na(soph_relax)) dim(hours) ### Histogram histogram(~fresh_study, data = hours, col="grey", xlim=c(-2, 60), xlab = "Hours per week spent studying (first-year)", width=4, main=paste("mean =", mean(hours$fresh_study, na.rm=T), "; std. dev =", sd(hours$fresh_study, na.rm=T))) ### Density plot ggplot(hours, aes(x=fresh_study)) + geom_density(binwidth=15, colour = "steelblue", fill = "white", size=1) + geom_point(aes(y = -.005), position = position_jitter(height = .0025), size=1) + ggtitle(paste("mean =", mean(hours$fresh_study, na.rm=T), "; std. dev =", sd(hours$fresh_study, na.rm=T))) + xlim(0, 60) + xlab("Hours studying per week (first-year)") ### Comparison with responses the following year histogram(~soph_study, data = hours, col="grey", xlim=c(-2, 60), xlab = "Hours per week spent studying (second-year)", width=4, main=paste("mean =", mean(hours$soph_study, na.rm=T), "; std. dev =", sd(hours$soph_study, na.rm=T))) ### Scatterplot qplot(x = fresh_study, y = soph_study, data = hours, colour = I("steelblue"), geom="point", xlim=c(0,60), ylim=c(0,60), xlab="Initial topics mastered", ylab="Final topics mastered") hours %>% filter(!is.na(fresh_study)) %>% filter(!is.na(soph_study)) %>% ggvis(x=~fresh_study, y=~soph_study, fill := "steelblue", stroke:="steelblue", fillOpacity:=.2) %>% layer_points() %>% add_axis("x", title = "Study hours as freshman") %>% scale_numeric("x", domain=c(0, 60), nice=FALSE, clamp=TRUE) %>% add_axis("y", title = "Study hours as sophomore") %>% scale_numeric("y", domain=c(0, 60), nice=FALSE, clamp=TRUE) hours %>% filter(!is.na(fresh_study)) %>% filter(!is.na(soph_study)) %>% ggvis(x=~fresh_study, y=~soph_study, fill := "steelblue", stroke:="steelblue", strokeOpacity:=.2, fillOpacity:=.2) %>% layer_points() %>% layer_smooths(stroke:="black", strokeOpacity:=.9) %>% layer_model_predictions(model = "lm", stroke:="red", strokeOpacity:=.9) %>% add_axis("x", title = "Study hours as freshman") %>% scale_numeric("x", domain=c(0, 50), nice=FALSE, clamp=TRUE) %>% add_axis("y", title = "Study hours as sophomore") %>% scale_numeric("y", domain=c(0, 55), nice=FALSE, clamp=TRUE) ## Hours studying vs. hours relaxing ## The hours relaxing variable actually represents hours x 2 ## Convert to hours per week hours$fresh_relax_week <- (hours$fresh_relax / 2) * 7 hours$soph_relax_week <- (hours$soph_relax / 2) * 7 head(hours) histogram(~fresh_relax_week, data = hours, col="grey", xlim=c(-2, 104), xlab = "Hours per week spent relaxing (freshmen)", width=4, main=paste("mean =", mean(hours$fresh_relax_week, na.rm=T), "; std. dev =", sd(hours$fresh_relax_week, na.rm=T))) histogram(~soph_relax_week, data = hours, col="grey", xlim=c(-2, 104), xlab = "Hours per week spent relaxing (freshmen)", width=4, main=paste("mean =", mean(hours$fresh_relax_week, na.rm=T), "; std. dev =", sd(hours$fresh_relax_week, na.rm=T))) hours %>% filter(!is.na(fresh_study)) %>% filter(!is.na(fresh_relax_week)) %>% ggvis(x=~fresh_relax_week, y=~fresh_study, fill := "steelblue", stroke:="steelblue", strokeOpacity:=.2, fillOpacity:=.2) %>% layer_points() %>% layer_smooths(stroke:="black", strokeOpacity:=.9) %>% layer_model_predictions(model = "lm", stroke:="red", strokeOpacity:=.9) %>% add_axis("x", title = "Hours relaxing each week") %>% scale_numeric("x", domain=c(0, 100), nice=FALSE, clamp=TRUE) %>% add_axis("y", title = "Hours studying each week") %>% scale_numeric("y", domain=c(0, 55), nice=FALSE, clamp=TRUE) ``` That was (hopefully) interesting. I want to focus on one of the distributions: ```{r fig.width=8, fig.height=4, message=FALSE} histogram(~fresh_study, data = hours, col="grey", xlim=c(-2, 60), xlab = "Hours per week spent studying (freshmen)", width=4, main=paste("mean =", mean(hours$fresh_study, na.rm=T), "; std. dev =", sd(hours$fresh_study, na.rm=T))) ggplot(hours, aes(x=fresh_study)) + geom_density(binwidth=15, colour = "steelblue", fill = "white", size=1) + geom_point(aes(y = -.005), position = position_jitter(height = .0025), size=1) + ggtitle(paste("mean =", mean(hours$fresh_study, na.rm=T), "; std. dev =", sd(hours$fresh_study, na.rm=T))) + xlim(0, 60) + xlab("Hours studying per week") ### The distribution is obviously not a normal distribution histogram(~fresh_study, data = hours, col="grey", xlim=c(-2, 60), xlab = "Hours per week spent studying (freshmen)", width=4, main=paste("mean =", mean(hours$fresh_study, na.rm=T), "; std. dev =", sd(hours$fresh_study, na.rm=T))); plotDist("norm", params=list(mean=mean(hours$fresh_study, na.rm=T), sd=(sd(hours$fresh_study, na.rm=T)/(1)^.5)), col="red", lwd=1, add=TRUE) ## Q-Q Plot qqnorm(hours$fresh_study, ylab = "Hours"); qqline(hours$fresh_study, ylab = "Hours") ``` I want to repeatedly take samples from this distribution and calculate a statistic. I am interested in the distribution of that statistic (if I were to take a large number of samples). ```{r fig.width=8, fig.height=4, message=FALSE} ##### We'll start with the sampling distribution of the sample mean #### Let's remove missing values fresh_study <- hours %>% select(fresh_study) %>% filter(!is.na(fresh_study)) ## Sample n=3 students and calculate average time studying sample_size = 3 mean3<- do(50000) * mean(sample(fresh_study$fresh_study, sample_size, replace=T)) histogram(~result, data = mean3, col="grey", xlim=c(0, 60), xlab = "n=3", width=2, main=paste("mean =", mean(mean3), "; std. dev =", sd(mean3))); plotDist("norm", params=list(mean=mean(fresh_study$fresh_study, na.rm=T), sd=(sd(fresh_study$fresh_study, na.rm=T)/(sample_size)^.5)), col="red", lwd=3, add=TRUE) qqnorm(mean3$result, ylab = "Results"); qqline(mean3$result, ylab = "Results") ## Sample n=30 students and calculate average time studying sample_size = 30 mean3<- do(50000) * mean(sample(fresh_study$fresh_study, sample_size, replace=T)) histogram(~result, data = mean3, col="grey", xlim=c(0, 60), xlab = "n=30", width=.25, main=paste("mean =", mean(mean3), "; std. dev =", sd(mean3))) histogram(~result, data = mean3, col="grey", xlim=c(0, 20), xlab = "n=30", width=.25, main=paste("mean =", mean(mean3), "; std. dev =", sd(mean3))); plotDist("norm", params=list(mean=mean(fresh_study$fresh_study, na.rm=T), sd=(sd(fresh_study$fresh_study, na.rm=T)/(sample_size)^.5)), col="red", lwd=5, add=TRUE) qqnorm(mean3$result, ylab = "Results"); qqline(mean3$result, ylab = "Results") ## Sample n=1000 students and calculate average time studying sample_size = 1000 mean3<- do(50000) * mean(sample(fresh_study$fresh_study, sample_size, replace=T)) histogram(~result, data = mean3, col="grey", xlim=c(0, 20), xlab = "n=1000", width=.05, main=paste("mean =", mean(mean3), "; std. dev =", sd(mean3))); histogram(~result, data = mean3, col="grey", xlim=c(9.5, 12), xlab = "n=1000", width=.025, main=paste("mean =", mean(mean3), "; std. dev =", sd(mean3))); plotDist("norm", params=list(mean=mean(fresh_study$fresh_study), sd=(sd(fresh_study$fresh_study)/(sample_size)^.5)), col="red", lwd=5, add=TRUE) # Bias mean(mean3) - mean(fresh_study$fresh_study) # Standard error sd(mean3) - sd(fresh_study$fresh_study)/(sample_size)^.5 qqnorm(mean3$result, ylab = "Results"); qqline(mean3$result, ylab = "Results") ``` So that's the sampling distribution of the sample mean. This time, suppose we take a sample of n observations and: 1. Convert all the observations to z-scores 2. Square each of those z-scores 3. Find the sum of the squared z-scores What would the distribution look like if we did this for a large number of independent, random samples from a distribution? Let's find out. ```{r fig.width=8, fig.height=4, message=FALSE} ### We'll start with a variable that follows a normal distribution data <- data.frame(rnorm(100000, 100, 16)) colnames(data) <- c("x") histogram(~x, data=data, col="grey", xlim=c(40, 160), width=1, main=paste("mean =", mean(data), "; std. dev =", sd(data))) # The scale() function converts scores to z-scores data$z <- scale(data$x) # I might as well square them before we take a sample data$z2 <- data$z^2 # Now take a sample and find the sum ## Sample n=3 sample_size <- 3 chisquare3<- do(50000) * sum(sample(data$z2, sample_size, replace=T)) histogram(~result, data = chisquare3, col="grey", xlim=c(0, 40), xlab = "n=3", width=.5, main=paste("mean =", mean(chisquare3), "; var =", var(chisquare3))); plotDist("chisq", params=list(df=sample_size), col="red", lwd=3, add=TRUE) qqplot(qchisq(ppoints(500), df = sample_size), chisquare3$result, main = expression("Q-Q plot for" ~~ {chi^2}[nu == 3])) qqline(chisquare3$result, distribution = function(p) qchisq(p, df = sample_size), prob = c(0.1, 0.6), col = 2) ## Sample n=10 sample_size <- 10 chisquare3<- do(50000) * sum(sample(data$z2, sample_size, replace=T)) histogram(~result, data = chisquare3, col="grey", xlim=c(0, 40), xlab = "n=10", width=.5, main=paste("mean =", mean(chisquare3), "; var =", var(chisquare3))); plotDist("chisq", params=list(df=sample_size), col="red", lwd=3, add=TRUE) qqplot(qchisq(ppoints(500), df = sample_size), chisquare3$result, main = expression("Q-Q plot for" ~~ {chi^2}[nu == 10])) qqline(chisquare3$result, distribution = function(p) qchisq(p, df = sample_size), prob = c(0.1, 0.6), col = 2) ## Sample n=100 sample_size <- 100 chisquare3<- do(50000) * sum(sample(data$z2, sample_size, replace=T)) histogram(~result, data = chisquare3, col="grey", xlim=c(0, 200), xlab = "n=100", width=2, main=paste("mean =", mean(chisquare3), "; var =", var(chisquare3))); plotDist("chisq", params=list(df=sample_size), col="red", lwd=3, add=TRUE) qqplot(qchisq(ppoints(500), df = sample_size), chisquare3$result, main = expression("Q-Q plot for" ~~ {chi^2}[nu == 100])) qqline(chisquare3$result, distribution = function(p) qchisq(p, df = sample_size), prob = c(0.1, 0.6), col = 2) ``` What if we calculate the variance of each sample? ```{r fig.width=8, fig.height=4, message=FALSE} ggplot(data, aes(x=x)) + geom_density(binwidth=10, colour = "steelblue", fill = "white", size=2, alpha=.25) + ggtitle(paste("mean =", mean(data$x, na.rm=T), "; var =", var(data$x, na.rm=T))) + xlim(40, 160) + xlab("x") ### We'll start with a variable that follows a normal distribution histogram(~x, data=data, col="grey", xlim=c(40, 160), width=1, main=paste("mean =", mean(data), "; std. dev =", sd(data))) # Take samples and calculate the variance of each sample ## Sample n=3 sample_size <- 3 chisquare3<- do(50000) * var(sample(data$x, sample_size, replace=T)) histogram(~result, data = chisquare3, col="grey", xlim=c(0, 2500), xlab = "n=3", width=25, main=paste("mean =", mean(chisquare3), "; var =", var(chisquare3))); plotDist("chisq", params=list(df=sample_size), col="red", lwd=3, add=TRUE) qqplot(qchisq(ppoints(500), df = sample_size), chisquare3$result, main = expression("Q-Q plot for" ~~ {chi^2}[nu == 3])) qqline(chisquare3$result, distribution = function(p) qchisq(p, df = sample_size), prob = c(0.1, 0.6), col = 2) ## It's not a chi-squared distribution ## Let's transform each variance that we calculate ## (n-1)s^2 / popvar(x) sample_size <- 3 chisquare3<- do(50000) * ((sample_size - 1) *var(sample(data$x, sample_size, replace=T))/(var(data$x))) histogram(~result, data = chisquare3, col="grey", xlim=c(0, 40), xlab = "n=3", width=.5, main=paste("mean =", mean(chisquare3), "; var =", var(chisquare3))); plotDist("chisq", params=list(df=sample_size), col="red", lwd=3, add=TRUE) ## It looks like a chi-squared distribution, but the degrees-of-freedom are off ## The mean is 2 and the variance is 4, so the df = 2 = 3-1 histogram(~result, data = chisquare3, col="grey", xlim=c(0, 40), xlab = "n=3", width=.5, main=paste("mean =", mean(chisquare3), "; var =", var(chisquare3))); plotDist("chisq", params=list(df=sample_size-1), col="red", lwd=3, add=TRUE) qqplot(qchisq(ppoints(500), df = sample_size-1), chisquare3$result, main = expression("Q-Q plot for" ~~ {chi^2}[nu == 2])) qqline(chisquare3$result, distribution = function(p) qchisq(p, df = sample_size-1), prob = c(0.1, 0.6), col = 2) ## Let's try it with n = 10 sample_size <- 10 chisquare3<- do(50000) * ((sample_size - 1) *var(sample(data$x, sample_size, replace=T))/var(data$x)) histogram(~result, data = chisquare3, col="grey", xlim=c(0, 40), xlab = "n=10", width=.5, main=paste("mean =", mean(chisquare3), "; var =", var(chisquare3))); plotDist("chisq", params=list(df=sample_size-1), col="red", lwd=3, add=TRUE) qqplot(qchisq(ppoints(500), df = sample_size-1), chisquare3$result, main = expression("Q-Q plot for" ~~ {chi^2}[nu == 9])) qqline(chisquare3$result, distribution = function(p) qchisq(p, df = sample_size-1), prob = c(0.1, 0.6), col = 2) ``` That worked with a normal population. What about our skewed "hours studying" distribution? ```{r fig.width=8, fig.height=4, message=FALSE} ggplot(hours, aes(x=fresh_study)) + geom_density(binwidth=15, colour = "steelblue", fill = "white", size=2, alpha=.25) + geom_point(aes(y = -.005), position = position_jitter(height = .0025), size=1) + ggtitle(paste("mean =", mean(hours$fresh_study, na.rm=T), "; std. dev =", sd(hours$fresh_study, na.rm=T))) + xlim(0, 60) + xlab("Hours studying per week (first-year)") histogram(~fresh_study, data = fresh_study, col="grey", xlim=c(-2, 60), xlab = "Hours per week spent studying", width=4, main=paste("mean =", mean(hours$fresh_study, na.rm=T), "; std. dev =", sd(hours$fresh_study, na.rm=T))) # Take samples and calculate the variance of each sample ## Sample n=3 sample_size <- 3 chisquare3<- do(50000) * ((sample_size - 1) *var(sample(fresh_study$fresh_study, sample_size, replace=T))/var(fresh_study$fresh_study)) histogram(~result, data = chisquare3, col="grey", xlim=c(0, 10), xlab = "n=3", width=.5, main=paste("mean =", mean(chisquare3), "; var =", var(chisquare3))); plotDist("chisq", params=list(df=sample_size-1), col="red", lwd=3, add=TRUE) qqplot(qchisq(ppoints(500), df = sample_size-1), chisquare3$result, main = expression("Q-Q plot for" ~~ {chi^2}[nu == 2])) qqline(chisquare3$result, distribution = function(p) qchisq(p, df = sample_size-1), prob = c(0.1, 0.6), col = 2) ### Hmm... it didn't work. Let's try a bigger sample size ## Let's try it with n = 100 sample_size <- 100 chisquare3<- do(50000) * ((sample_size - 1) *var(sample(fresh_study$fresh_study, sample_size, replace=T))/var(fresh_study$fresh_study)) histogram(~result, data = chisquare3, col="grey", xlim=c(0, 250), xlab = "n=100", width=5, main=paste("mean =", mean(chisquare3), "; var =", var(chisquare3))); plotDist("chisq", params=list(df=sample_size-1), col="red", lwd=3, add=TRUE) qqplot(qchisq(ppoints(500), df = sample_size-1), chisquare3$result, main = expression("Q-Q plot for" ~~ {chi^2}[nu == 99])) qqline(chisquare3$result, distribution = function(p) qchisq(p, df = sample_size-1), prob = c(0.1, 0.6), col = 2) ## Still didn't work ``` ----- Extra example in in-class activity: test <- c(125, 123, 117, 123, 112, 128, 118, 124, 116, 109, 125, 120, 123, 112, 118, 121, 122, 115, 105, 118, 115, 111, 113, 118, 131) test <- data.frame(test) ggplot(test, aes(x=test)) + geom_density(binwidth=15, colour = "steelblue", fill = "white", size=2, alpha=.25) + geom_point(aes(y = -.005), position = position_jitter(height = .0025), size=2) + ggtitle(paste("mean =", mean(test$test, na.rm=T), "; std. dev =", sd(test$test, na.rm=T))) + xlim(95, 140) + xlab("Results in mg/dl") trials <- do(100000) * sd(resample(test)) with(trials, quantile(result, c(0.025, 0.975))) ----- What if we took samples from two populations, calculated the variance of each sample, and then found the ratio of the variances? What would that distribution look like? ```{r fig.width=8, fig.height=4, message=FALSE} ### We'll start with two populations that follow normal distributions ##### Different means but same variances data <- data.frame(rnorm(10000, 100, 16)) colnames(data) <- c("x") data2 <- data.frame(rnorm(100000, 80, 16)) colnames(data2) <- c("x") histogram(~x, data=data, col="grey", xlim=c(20, 160), width=1, main=paste("mean =", mean(data), "; std. dev =", sd(data))) histogram(~x, data=data2, col="grey", xlim=c(20, 160), width=1, main=paste("mean =", mean(data2), "; std. dev =", sd(data2))) ## Take a sample of n=100 sample_size <- 100 # Take 25,000 samples of size n and calculate the variance of each sample sample_1 <- do(25000) * var(sample(data$x, sample_size, replace=T)) sample_2 <- do(25000) * var(sample(data2$x, sample_size, replace=T)) F3 <- data.frame(sample_1, sample_2) # Identify the bigger and smaller variances F3$max <- apply(F3[, 1:2], 1, max) F3$min <- apply(F3[, 1:2], 1, min) head(F3) # Calculate the ratio of variances (bigger / smaller) F3$ratio <- F3$max / F3$min head(F3) # Plot sampling distribution of the ratios histogram(~ratio, data = F3, col="grey", xlim=c(0.5, 3), xlab = "n=100", width=.05, main=paste("mean =", mean(F3$ratio), "; var =", var(F3$ratio))) ggplot(F3, aes(x=ratio)) + geom_density(binwidth=.1, colour = "steelblue", fill = "white", size=2, alpha=.25) + geom_point(aes(y = -.005), position = position_jitter(height = .0025), size=1) + ggtitle(paste("mean =", mean(F3$ratio, na.rm=T), "; std. dev =", sd(F3$ratio, na.rm=T))) + xlim(0.9, 2.5) + xlab("Ratio of variances (larger / smaller)") ### What if the second sample comes from a distribution with a different variance? ##### Different means and different variances data <- data.frame(rnorm(10000, 100, 16)) colnames(data) <- c("x") data2 <- data.frame(rnorm(100000, 110, 30)) colnames(data2) <- c("x") histogram(~x, data=data, col="grey", xlim=c(20, 210), width=1, main=paste("mean =", mean(data), "; std. dev =", sd(data))) histogram(~x, data=data2, col="grey", xlim=c(20, 210), width=2, main=paste("mean =", mean(data2), "; std. dev =", sd(data2))) ## Take a sample of n=100 sample_size <- 100 # Take 25,000 samples of size n and calculate the variance of each sample sample_1 <- do(25000) * var(sample(data$x, sample_size, replace=T)) sample_2 <- do(25000) * var(sample(data2$x, sample_size, replace=T)) F3 <- data.frame(sample_1, sample_2) # Identify the bigger and smaller variances F3$max <- apply(F3[, 1:2], 1, max) F3$min <- apply(F3[, 1:2], 1, min) head(F3) # Calculate the ratio of variances (bigger / smaller) F3$ratio <- F3$max / F3$min head(F3) # Plot sampling distribution of the ratios histogram(~ratio, data = F3, col="grey", xlim=c(0.5, 8), xlab = "n=100", width=.05, main=paste("mean =", mean(F3$ratio), "; var =", var(F3$ratio))) ggplot(F3, aes(x=ratio)) + geom_density(binwidth=.1, colour = "steelblue", fill = "white", size=2, alpha=.25) + geom_point(aes(y = -.005), position = position_jitter(height = .0025), size=1) + ggtitle(paste("mean =", mean(F3$ratio, na.rm=T), "; std. dev =", sd(F3$ratio, na.rm=T))) + xlim(0.9, 8) + xlab("Ratio of variances") ``` #### Eliminate students who did not answer all questions hours_all <- mapworks %>% select(S1_NA150, NA152, S1_D148, D150) %>% filter(!is.na(S1_NA150) & !is.na(NA152) & !is.na(S1_D148) & !is.na(D150)) dim(hours)