Load packages
library(mosaic)
library(dplyr)
library(ggplot2)
library(ggvis)
library(parallel)


St. Ambrose study hours

How many hours do you study in a typical week? How does that compare to the hours you spent studying as a freshman?

To investigate this, we’re going to look at some data collected from students who entered St. Ambrose in Fall 2013. Specifically, we’re going to look at student responses to these questions:

S1_NA150: In an average week, how many hours do you spend studying?

NA152: (The same question asked to the same students the following year)

S1_D148: In an average day, how many hours do you spend relaxing or socializing?

D150: (The same question asked to the same students the following year)

Let’s load the data and take a look:

#### Load the data from the web
mapworks <- read.csv("http://www.bradthiessen.com/html5/data/f13s14.csv")
dim(mapworks)
## [1] 569 926
#### The data contains 926 variables for 569 students
#### Let's drop all the variables except our 4 questions of interest 
hours <- mapworks %>%
  select(S1_NA150, NA152, S1_D148, D150)

#### Let's rename the variables
hours$fresh_study <- hours$S1_NA150
hours$soph_study <- hours$NA152
hours$fresh_relax <- hours$S1_D148
hours$soph_relax <- hours$D150

#### Now let's take a look at our data
head(hours)
##   S1_NA150 NA152 S1_D148 D150 fresh_study soph_study fresh_relax
## 1       15    NA       6   NA          15         NA           6
## 2       20    NA       2   NA          20         NA           2
## 3        5    NA       6   NA           5         NA           6
## 4        7    NA       4   NA           7         NA           4
## 5       15    NA       4   NA          15         NA           4
## 6       15    NA       4   NA          15         NA           4
##   soph_relax
## 1         NA
## 2         NA
## 3         NA
## 4         NA
## 5         NA
## 6         NA
dim(hours)
## [1] 569   8
#### We have 569 observations
#### Eliminate students who did not answer any question
hours <- hours %>%
  select(fresh_study, soph_study, fresh_relax, soph_relax) %>%
  filter(!is.na(fresh_study) | !is.na(soph_study) | !is.na(fresh_relax) | !is.na(soph_relax))
dim(hours)
## [1] 528   4
### Histogram
histogram(~fresh_study, data = hours, col="grey", xlim=c(-2, 60), 
          xlab = "Hours per week spent studying", width=4,
          main=paste("mean =", mean(hours$fresh_study, na.rm=T), "; std. dev =", sd(hours$fresh_study, na.rm=T)))

### Density plot
ggplot(hours, aes(x=fresh_study)) + 
  geom_density(binwidth=15, colour = "steelblue", fill = "white", size=1) + 
  geom_point(aes(y = -.005), position = position_jitter(height = .0025), size=1) +
  ggtitle(paste("mean =", mean(hours$fresh_study, na.rm=T), "; std. dev =", sd(hours$fresh_study, na.rm=T))) +
  xlim(0, 60) +
  xlab("Hours studying per week")
## Warning: Removed 27 rows containing non-finite values (stat_density).
## Warning: Removed 31 rows containing missing values (geom_point).

### Comparison with responses the following year
histogram(~soph_study, data = hours, col="grey", xlim=c(-2, 60), 
          xlab = "Hours per week spent studying", width=4,
          main=paste("mean =", mean(hours$soph_study, na.rm=T), "; std. dev =", sd(hours$soph_study, na.rm=T)))

### Scatterplot
qplot(x = fresh_study, y = soph_study, data = hours, colour = I("steelblue"), geom="point", xlim=c(0,60), ylim=c(0,60), xlab="Initial topics mastered", ylab="Final topics mastered")
## Warning: Removed 309 rows containing missing values (geom_point).

hours %>% 
  filter(!is.na(fresh_study)) %>%
  filter(!is.na(soph_study)) %>%
  ggvis(x=~fresh_study, y=~soph_study,
        fill := "steelblue", stroke:="steelblue", fillOpacity:=.2) %>%
  layer_points() %>%
  add_axis("x", title = "Study hours as freshman") %>%
  scale_numeric("x", domain=c(0, 60), nice=FALSE, clamp=TRUE)  %>%
  add_axis("y", title = "Study hours as sophomore") %>%
  scale_numeric("y", domain=c(0, 60), nice=FALSE, clamp=TRUE) 

hours %>% 
  filter(!is.na(fresh_study)) %>%
  filter(!is.na(soph_study)) %>%
  ggvis(x=~fresh_study, y=~soph_study,
        fill := "steelblue", stroke:="steelblue", strokeOpacity:=.2, fillOpacity:=.2) %>%
  layer_points() %>%
  layer_smooths(stroke:="black", strokeOpacity:=.9) %>%
  layer_model_predictions(model = "lm", stroke:="red", strokeOpacity:=.9) %>%
  add_axis("x", title = "Study hours as freshman") %>%
  scale_numeric("x", domain=c(0, 60), nice=FALSE, clamp=TRUE)  %>%
  add_axis("y", title = "Study hours as sophomore") %>%
  scale_numeric("y", domain=c(0, 60), nice=FALSE, clamp=TRUE) 

## Hours studying vs. hours relaxing
## The hours relaxing variable actually represents hours x 2
## Convert to hours per week
hours$fresh_relax_week <- (hours$fresh_relax / 2) * 7
hours$soph_relax_week <- (hours$soph_relax / 2) * 7
head(hours)
##   fresh_study soph_study fresh_relax soph_relax fresh_relax_week
## 1          15         NA           6         NA               21
## 2          20         NA           2         NA                7
## 3           5         NA           6         NA               21
## 4           7         NA           4         NA               14
## 5          15         NA           4         NA               14
## 6          15         NA           4         NA               14
##   soph_relax_week
## 1              NA
## 2              NA
## 3              NA
## 4              NA
## 5              NA
## 6              NA
histogram(~fresh_relax_week, data = hours, col="grey", xlim=c(-2, 104), 
          xlab = "Hours per week spent relaxing (freshmen)", width=4,
          main=paste("mean =", mean(hours$fresh_relax_week, na.rm=T), "; std. dev =", sd(hours$fresh_relax_week, na.rm=T)))

histogram(~soph_relax_week, data = hours, col="grey", xlim=c(-2, 104), 
          xlab = "Hours per week spent relaxing (freshmen)", width=4,
          main=paste("mean =", mean(hours$fresh_relax_week, na.rm=T), "; std. dev =", sd(hours$fresh_relax_week, na.rm=T)))

hours %>% 
  filter(!is.na(fresh_study)) %>%
  filter(!is.na(fresh_relax_week)) %>%
  ggvis(x=~fresh_relax_week, y=~fresh_study,
        fill := "steelblue", stroke:="steelblue", strokeOpacity:=.2, fillOpacity:=.2) %>%
  layer_points() %>%
  layer_smooths(stroke:="black", strokeOpacity:=.9) %>%
  layer_model_predictions(model = "lm", stroke:="red", strokeOpacity:=.9) %>%
  add_axis("x", title = "Relax hours as freshman") %>%
  scale_numeric("x", domain=c(0, 104), nice=FALSE, clamp=TRUE)  %>%
  add_axis("y", title = "Study hours as freshman") %>%
  scale_numeric("y", domain=c(0, 60), nice=FALSE, clamp=TRUE) 

That was (hopefully) interesting. I want to focus on one of the distributions:

histogram(~fresh_study, data = hours, col="grey", xlim=c(-2, 60), 
          xlab = "Hours per week spent studying (freshmen)", width=4,
          main=paste("mean =", mean(hours$fresh_study, na.rm=T), "; std. dev =",
                     sd(hours$fresh_study, na.rm=T)))

ggplot(hours, aes(x=fresh_study)) + 
  geom_density(binwidth=15, colour = "steelblue", fill = "white", size=1) + 
  geom_point(aes(y = -.005), position = position_jitter(height = .0025), size=1) +
  ggtitle(paste("mean =", mean(hours$fresh_study, na.rm=T), "; std. dev =", sd(hours$fresh_study, na.rm=T))) +
  xlim(0, 60) +
  xlab("Hours studying per week")
## Warning: Removed 27 rows containing non-finite values (stat_density).
## Warning: Removed 29 rows containing missing values (geom_point).

### The distribution is obviously not a normal distribution
histogram(~fresh_study, data = hours, col="grey", xlim=c(-2, 60), 
          xlab = "Hours per week spent studying (freshmen)", width=4,
          main=paste("mean =", mean(hours$fresh_study, na.rm=T), "; std. dev =",
                     sd(hours$fresh_study, na.rm=T)));

plotDist("norm", params=list(mean=mean(hours$fresh_study, na.rm=T), sd=(sd(hours$fresh_study, na.rm=T)/(1)^.5)), col="red", lwd=1, add=TRUE)

## Q-Q Plot
qqnorm(hours$fresh_study, ylab = "Hours"); qqline(hours$fresh_study, ylab = "Hours")

I want to repeatedly take samples from this distribution and calculate a statistic. I am interested in the distribution of that statistic (if I were to take a large number of samples).

##### We'll start with the sampling distribution of the sample mean
#### Let's remove missing values
fresh_study <- hours %>%
  select(fresh_study) %>%
  filter(!is.na(fresh_study))

## Sample n=3 students and calculate average time studying
sample_size = 3

mean3<- do(50000) * mean(sample(fresh_study$fresh_study, sample_size, replace=T))

histogram(~result, data = mean3, col="grey", xlim=c(0, 60), xlab = "n=3", width=2,
          main=paste("mean =", mean(mean3), "; std. dev =", sd(mean3))); 

plotDist("norm", params=list(mean=mean(fresh_study$fresh_study, na.rm=T),
                             sd=(sd(fresh_study$fresh_study, na.rm=T)/(sample_size)^.5)),
         col="red", lwd=3, add=TRUE)

qqnorm(mean3$result, ylab = "Results"); qqline(mean3$result, ylab = "Results")

## Sample n=30 students and calculate average time studying
sample_size = 30

mean3<- do(50000) * mean(sample(fresh_study$fresh_study, sample_size, replace=T))

histogram(~result, data = mean3, col="grey", xlim=c(0, 60), xlab = "n=30", width=.25,
          main=paste("mean =", mean(mean3), "; std. dev =", sd(mean3)))

histogram(~result, data = mean3, col="grey", xlim=c(0, 20), xlab = "n=30", width=.25,
          main=paste("mean =", mean(mean3), "; std. dev =", sd(mean3))); 

plotDist("norm", params=list(mean=mean(fresh_study$fresh_study, na.rm=T), sd=(sd(fresh_study$fresh_study, na.rm=T)/(sample_size)^.5)), col="red", lwd=5, add=TRUE)

qqnorm(mean3$result, ylab = "Results"); qqline(mean3$result, ylab = "Results")

## Sample n=100 students and calculate average time studying
sample_size = 1000

mean3<- do(50000) * mean(sample(fresh_study$fresh_study, sample_size, replace=T))

histogram(~result, data = mean3, col="grey", xlim=c(0, 20), xlab = "n=1000", width=.05,
          main=paste("mean =", mean(mean3), "; std. dev =", sd(mean3))); 

histogram(~result, data = mean3, col="grey", xlim=c(9.5, 12), xlab = "n=1900", width=.025,
          main=paste("mean =", mean(mean3), "; std. dev =", sd(mean3))); 

plotDist("norm", params=list(mean=mean(fresh_study$fresh_study), sd=(sd(fresh_study$fresh_study)/(sample_size)^.5)), col="red", lwd=5, add=TRUE)

# Bias
mean(mean3) - mean(fresh_study$fresh_study)
## [1] -0.001426863
# Standard error
sd(mean3) - sd(fresh_study$fresh_study)/(sample_size)^.5
## [1] -0.001313933
qqnorm(mean3$result, ylab = "Results"); qqline(mean3$result, ylab = "Results")

So that’s the sampling distribution of the sample mean.

This time, suppose we take a sample of n observations and:

  1. Convert all the observations to z-scores

  2. Square each of those z-scores

  3. Find the sum of the squared z-scores

What would the distribution look like if we did this for a large number of independent, random samples from a distribution? Let’s find out.

### We'll start with a variable that follows a normal distribution
data <- data.frame(rnorm(10000, 100, 16))
colnames(data) <- c("x")

histogram(~x, data=data, col="grey", xlim=c(40, 160), width=1,
          main=paste("mean =", mean(data), "; std. dev =", sd(data)))

# The scale() function converts scores to z-scores
data$z <- scale(data$x)
# I might as well square them before we take a sample
data$z2 <- data$z^2

# Now take a sample and find the sum
## Sample n=3
sample_size <- 3

chisquare3<- do(50000) * sum(sample(data$z2, sample_size, replace=T))

histogram(~result, data = chisquare3, col="grey", xlim=c(0, 40), xlab = "n=3", width=.5,
          main=paste("mean =", mean(chisquare3), "; var =", var(chisquare3))); 

plotDist("chisq", params=list(df=sample_size),  col="red", lwd=3, add=TRUE)

qqplot(qchisq(ppoints(500), df = sample_size), chisquare3$result,
       main = expression("Q-Q plot for" ~~ {chi^2}[nu == 3]))
qqline(chisquare3$result, distribution = function(p) qchisq(p, df = sample_size),
       prob = c(0.1, 0.6), col = 2)

## Sample n=10
sample_size <- 10

chisquare3<- do(50000) * sum(sample(data$z2, sample_size, replace=T))

histogram(~result, data = chisquare3, col="grey", xlim=c(0, 40), xlab = "n=10", width=.5,
          main=paste("mean =", mean(chisquare3), "; var =", var(chisquare3))); 

plotDist("chisq", params=list(df=sample_size),  col="red", lwd=3, add=TRUE)

qqplot(qchisq(ppoints(500), df = sample_size), chisquare3$result,
       main = expression("Q-Q plot for" ~~ {chi^2}[nu == 10]))
qqline(chisquare3$result, distribution = function(p) qchisq(p, df = sample_size),
       prob = c(0.1, 0.6), col = 2)

## Sample n=100
sample_size <- 100

chisquare3<- do(50000) * sum(sample(data$z2, sample_size, replace=T))

histogram(~result, data = chisquare3, col="grey", xlim=c(0, 200), xlab = "n=100", width=2,
          main=paste("mean =", mean(chisquare3), "; var =", var(chisquare3))); 

plotDist("chisq", params=list(df=sample_size),  col="red", lwd=3, add=TRUE)

qqplot(qchisq(ppoints(500), df = sample_size), chisquare3$result,
       main = expression("Q-Q plot for" ~~ {chi^2}[nu == 100]))
qqline(chisquare3$result, distribution = function(p) qchisq(p, df = sample_size),
       prob = c(0.1, 0.6), col = 2)

What if we calculate the variance of each sample?

### We'll start with a variable that follows a normal distribution
histogram(~x, data=data, col="grey", xlim=c(40, 160), width=1,
          main=paste("mean =", mean(data), "; std. dev =", sd(data)))
## Warning in maggregate(formula = ~(data), data = <environment>, FUN =
## base::mean, : Too many variables in rhs; ignoring all but first.
## Warning in maggregate(formula = ~(data), data = <environment>, FUN =
## stats::sd, : Too many variables in rhs; ignoring all but first.

# Take samples and calculate the variance of each sample
## Sample n=3 
sample_size <- 3

chisquare3<- do(50000) * var(sample(data$x, sample_size, replace=T))

histogram(~result, data = chisquare3, col="grey", xlim=c(0, 4000), xlab = "n=3", width=.5,
          main=paste("mean =", mean(chisquare3), "; var =", var(chisquare3))); 

plotDist("chisq", params=list(df=sample_size),  col="red", lwd=3, add=TRUE)

qqplot(qchisq(ppoints(500), df = sample_size), chisquare3$result,
       main = expression("Q-Q plot for" ~~ {chi^2}[nu == 3]))
qqline(chisquare3$result, distribution = function(p) qchisq(p, df = sample_size),
       prob = c(0.1, 0.6), col = 2)

## It's not a chi-squared distribution
## Let's transform each variance that we calculate
## (n-1)s^2 / popvar(x)
sample_size <- 3

chisquare3<- do(50000) * ((sample_size - 1) *var(sample(data$x, sample_size, replace=T))/(var(data$x)))

histogram(~result, data = chisquare3, col="grey", xlim=c(0, 40), xlab = "n=3", width=.5,
          main=paste("mean =", mean(chisquare3), "; var =", var(chisquare3))); 

plotDist("chisq", params=list(df=sample_size),  col="red", lwd=3, add=TRUE)

## It looks like a chi-squared distribution, but the degrees-of-freedom are off
## The mean is 2 and the variance is 4, so the df = 2 = 3-1

histogram(~result, data = chisquare3, col="grey", xlim=c(0, 40), xlab = "n=3", width=.5,
          main=paste("mean =", mean(chisquare3), "; var =", var(chisquare3))); 

plotDist("chisq", params=list(df=sample_size-1),  col="red", lwd=3, add=TRUE)

qqplot(qchisq(ppoints(500), df = sample_size-1), chisquare3$result,
       main = expression("Q-Q plot for" ~~ {chi^2}[nu == 2]))
qqline(chisquare3$result, distribution = function(p) qchisq(p, df = sample_size-1),
       prob = c(0.1, 0.6), col = 2)

## Let's try it with n = 10
sample_size <- 10

chisquare3<- do(50000) * ((sample_size - 1) *var(sample(data$x, sample_size, replace=T))/var(data$x))

histogram(~result, data = chisquare3, col="grey", xlim=c(0, 40), xlab = "n=10", width=.5,
          main=paste("mean =", mean(chisquare3), "; var =", var(chisquare3))); 

plotDist("chisq", params=list(df=sample_size-1),  col="red", lwd=3, add=TRUE)

qqplot(qchisq(ppoints(500), df = sample_size-1), chisquare3$result,
       main = expression("Q-Q plot for" ~~ {chi^2}[nu == 2]))
qqline(chisquare3$result, distribution = function(p) qchisq(p, df = sample_size-1),
       prob = c(0.1, 0.6), col = 2)

That worked with a normal population. What about our skewed “hours studying” distribution?

histogram(~fresh_study, data = fresh_study, col="grey", xlim=c(-2, 60), 
          xlab = "Hours per week spent studying", width=4,
          main=paste("mean =", mean(hours$fresh_study, na.rm=T), "; std. dev =", sd(hours$fresh_study, na.rm=T)))

# Take samples and calculate the variance of each sample
## Sample n=3 
sample_size <- 3

chisquare3<- do(50000) * ((sample_size - 1) *var(sample(fresh_study$fresh_study, sample_size, replace=T))/var(fresh_study$fresh_study))

histogram(~result, data = chisquare3, col="grey", xlim=c(0, 10), xlab = "n=3", width=.5,
          main=paste("mean =", mean(chisquare3), "; var =", var(chisquare3))); 

plotDist("chisq", params=list(df=sample_size-1),  col="red", lwd=3, add=TRUE)

qqplot(qchisq(ppoints(500), df = sample_size-1), chisquare3$result,
       main = expression("Q-Q plot for" ~~ {chi^2}[nu == 2]))
qqline(chisquare3$result, distribution = function(p) qchisq(p, df = sample_size-1),
       prob = c(0.1, 0.6), col = 2)

### Hmm... it didn't work.  Let's try a bigger sample size
## Let's try it with n = 100
sample_size <- 100

chisquare3<- do(50000) * ((sample_size - 1) *var(sample(fresh_study$fresh_study, sample_size, replace=T))/var(fresh_study$fresh_study))

histogram(~result, data = chisquare3, col="grey", xlim=c(0, 250), xlab = "n=100", width=5,
          main=paste("mean =", mean(chisquare3), "; var =", var(chisquare3))); 

plotDist("chisq", params=list(df=sample_size-1),  col="red", lwd=3, add=TRUE)

qqplot(qchisq(ppoints(500), df = sample_size-1), chisquare3$result,
       main = expression("Q-Q plot for" ~~ {chi^2}[nu == 99]))
qqline(chisquare3$result, distribution = function(p) qchisq(p, df = sample_size-1),
       prob = c(0.1, 0.6), col = 2)

## Still didn't work

What if we took samples from two populations, calculated the variance of each sample, and then found the ratio of the variances? What would that distribution look like?

### We'll start with two populations that follow normal distributions
##### Different means but same variances
data <- data.frame(rnorm(10000, 100, 16))
colnames(data) <- c("x")
data2 <- data.frame(rnorm(10000, 80, 16))
colnames(data2) <- c("x")

histogram(~x, data=data, col="grey", xlim=c(40, 160), width=1,
          main=paste("mean =", mean(data), "; std. dev =", sd(data)))

histogram(~x, data=data2, col="grey", xlim=c(40, 160), width=1,
          main=paste("mean =", mean(data2), "; std. dev =", sd(data2)))

## Take a sample of n=100
sample_size <- 100

# Take 25,000 samples of size n and calculate the variance of each sample
sample_1 <- do(25000) * var(sample(data$x, sample_size, replace=T))
sample_2 <- do(25000) * var(sample(data2$x, sample_size, replace=T))
F3 <- data.frame(sample_1, sample_2)

# Identify the bigger and smaller variances
F3$max <- apply(F3[, 1:2], 1, max)
F3$min <- apply(F3[, 1:2], 1, min)
head(F3)
##     result result.1      max      min
## 1 254.7380 220.7760 254.7380 220.7760
## 2 265.8503 247.5478 265.8503 247.5478
## 3 259.0388 215.7104 259.0388 215.7104
## 4 290.3282 247.8766 290.3282 247.8766
## 5 250.7863 258.0033 258.0033 250.7863
## 6 221.5918 209.2571 221.5918 209.2571
# Calculate the ratio of variances (bigger / smaller)
F3$ratio <- F3$max / F3$min
head(F3)
##     result result.1      max      min    ratio
## 1 254.7380 220.7760 254.7380 220.7760 1.153830
## 2 265.8503 247.5478 265.8503 247.5478 1.073935
## 3 259.0388 215.7104 259.0388 215.7104 1.200864
## 4 290.3282 247.8766 290.3282 247.8766 1.171261
## 5 250.7863 258.0033 258.0033 250.7863 1.028778
## 6 221.5918 209.2571 221.5918 209.2571 1.058945
# Plot sampling distribution of the ratios
histogram(~ratio, data = F3, col="grey", xlim=c(0.5, 3), xlab = "n=100", width=.05,
          main=paste("mean =", mean(F3$ratio), "; var =", var(F3$ratio)))

ggplot(F3, aes(x=ratio)) + 
  geom_density(binwidth=.1, colour = "steelblue", fill = "white", size=1) + 
  geom_point(aes(y = -.005), position = position_jitter(height = .0025), size=1) +
  ggtitle(paste("mean =", mean(F3$ratio, na.rm=T), "; std. dev =", sd(F3$ratio, na.rm=T))) +
  xlim(0.9, 2.5) +
  xlab("Ratio of variances")

### What if the second sample comes from a distribution with a different variance?
##### Different means and different variances
data <- data.frame(rnorm(10000, 100, 16))
colnames(data) <- c("x")
data2 <- data.frame(rnorm(10000, 110, 20))
colnames(data2) <- c("x")

histogram(~x, data=data, col="grey", xlim=c(40, 160), width=1,
          main=paste("mean =", mean(data), "; std. dev =", sd(data)))

histogram(~x, data=data2, col="grey", xlim=c(20, 210), width=2,
          main=paste("mean =", mean(data2), "; std. dev =", sd(data2)))

## Take a sample of n=100
sample_size <- 100

# Take 25,000 samples of size n and calculate the variance of each sample
sample_1 <- do(25000) * var(sample(data$x, sample_size, replace=T))
sample_2 <- do(25000) * var(sample(data2$x, sample_size, replace=T))
F3 <- data.frame(sample_1, sample_2)

# Identify the bigger and smaller variances
F3$max <- apply(F3[, 1:2], 1, max)
F3$min <- apply(F3[, 1:2], 1, min)
head(F3)
##     result result.1      max      min
## 1 323.7467 421.1862 421.1862 323.7467
## 2 289.9975 400.0285 400.0285 289.9975
## 3 309.0911 434.5638 434.5638 309.0911
## 4 240.4700 430.0943 430.0943 240.4700
## 5 202.1981 337.3503 337.3503 202.1981
## 6 311.9610 389.1376 389.1376 311.9610
# Calculate the ratio of variances (bigger / smaller)
F3$ratio <- F3$max / F3$min
head(F3)
##     result result.1      max      min    ratio
## 1 323.7467 421.1862 421.1862 323.7467 1.300975
## 2 289.9975 400.0285 400.0285 289.9975 1.379420
## 3 309.0911 434.5638 434.5638 309.0911 1.405941
## 4 240.4700 430.0943 430.0943 240.4700 1.788557
## 5 202.1981 337.3503 337.3503 202.1981 1.668415
## 6 311.9610 389.1376 389.1376 311.9610 1.247392
# Plot sampling distribution of the ratios
histogram(~ratio, data = F3, col="grey", xlim=c(0.5, 5), xlab = "n=100", width=.05,
          main=paste("mean =", mean(F3$ratio), "; var =", var(F3$ratio)))

ggplot(F3, aes(x=ratio)) + 
  geom_density(binwidth=.1, colour = "steelblue", fill = "white", size=1) + 
  geom_point(aes(y = -.005), position = position_jitter(height = .0025), size=1) +
  ggtitle(paste("mean =", mean(F3$ratio, na.rm=T), "; std. dev =", sd(F3$ratio, na.rm=T))) +
  xlim(0.9, 5) +
  xlab("Ratio of variances")

Eliminate students who did not answer all questions

hours_all <- mapworks %>% select(S1_NA150, NA152, S1_D148, D150) %>% filter(!is.na(S1_NA150) & !is.na(NA152) & !is.na(S1_D148) & !is.na(D150)) dim(hours)