Loading the Mosaic Package

Let’s load the Mosaic package:

require(mosaic)



Examples from Activity #16

1) Describe the distribution of sample means I would get by repeatedly sampling 2 babies. How would this compare to the distribution I would get by repeatedly sampling 100 babies?

I put the baby weight dataset on my website, so let’s download that data:

babyweights <- read.csv(url("http://bradthiessen.com/html5/stats/m300/babyweights.csv"))
head(babyweights)
##   mother birthorder birthwt age child
## 1     80          1    3175  18     1
## 2     80          2    3572  21     2
## 3     80          3    3317  24     3
## 4     80          4    4281  26     4
## 5     80          5    3827  28     5
## 6     84          1    2892  14     6

You can see the dataset contains:
mother = an ID number for each mother
birthorder = child number for each mother (1 = first born child)
birthwt = weight of baby, in grams
ch = age of mother when baby was born
child = an ID number for each child

Let’s plot the birthweights:

histogram(~birthwt, data = babyweights, col="grey", xlim=c(-1, 6001), xlab = "Baby weights, in grams", width=130)

plot of chunk unnamed-chunk-4

Now let’s get sampling distributions for the mean birthweight using sample sizes of 2, 16, and 100 (to match the activity):

# Create the estimated sampling distributions (10,000 samples) for each sample size
meanweight2<- do(10000) * mean(sample(babyweights$birthwt, 2))
## Loading required package: parallel
meanweight16<- do(10000) * mean(sample(babyweights$birthwt, 16))
meanweight100<- do(10000) * mean(sample(babyweights$birthwt, 100))

# Create histograms to display sampling distributions
histogram(~result, data = meanweight2, col="grey", xlim=c(1000, 5000), xlab = "n=2", width=100)

plot of chunk unnamed-chunk-5

plotDist("norm", params=list(mean=mean(babyweights$birthwt), sd=(sd(babyweights$birthwt)/(2)^.5)), col="red", lwd=1, add=TRUE)

plot of chunk unnamed-chunk-5

histogram(~result, data = meanweight16, col="grey", xlim=c(1000, 5000), xlab = "n=16", width=25)

plot of chunk unnamed-chunk-5

plotDist("norm", params=list(mean=mean(babyweights$birthwt), sd=(sd(babyweights$birthwt)/(16)^.5)), col="red", lwd=1, add=TRUE)

plot of chunk unnamed-chunk-5

histogram(~result, data = meanweight100, col="grey", xlim=c(1000, 5000), xlab = "n=100", width=5)

plot of chunk unnamed-chunk-5

plotDist("norm", params=list(mean=mean(babyweights$birthwt), sd=(sd(babyweights$birthwt)/(100)^.5)), col="red", lwd=1, add=TRUE)

plot of chunk unnamed-chunk-5

# Change axis
histogram(~result, data = meanweight100, col="grey", xlim=c(2900, 3400), xlab = "n=100", width=5)

plot of chunk unnamed-chunk-5

plotDist("norm", params=list(mean=mean(babyweights$birthwt), sd=(sd(babyweights$birthwt)/(100)^.5)), col="red", lwd=1, add=TRUE)

plot of chunk unnamed-chunk-5

# Q-Q plot to investigate normality
qqnorm(meanweight2$result, ylab = "Results"); qqline(meanweight2$result, ylab = "Results")

plot of chunk unnamed-chunk-5

qqnorm(meanweight16$result, ylab = "Results"); qqline(meanweight16$result, ylab = "Results")

plot of chunk unnamed-chunk-5

qqnorm(meanweight100$result, ylab = "Results"); qqline(meanweight100$result, ylab = "Results")

plot of chunk unnamed-chunk-5

We can now estimate the means and standard deviations of these distributions:

paste("(n=2) Mean: ", mean(meanweight2))
## [1] "(n=2) Mean:  3159.53205"
paste("(n=2) SD: ", sd(meanweight2))
## [1] "(n=2) SD:  403.106275087227"
paste("(n=16) Mean: ", mean(meanweight16))
## [1] "(n=16) Mean:  3156.25655"
paste("(n=16) SD: ", sd(meanweight16))
## [1] "(n=16) SD:  142.768550499571"
paste("(n=100) Mean: ", mean(meanweight100))
## [1] "(n=100) Mean:  3156.901646"
paste("(n=100) SD: ", sd(meanweight100))
## [1] "(n=100) SD:  55.8722978675263"


3) Do our results hold for a population that does not have a normal distribution?

This question deals with high school GPAs from St. Ambrose students in 2012, so let’s download that data:

hsgpa <- read.csv(url("http://bradthiessen.com/html5/stats/m300/sauhsgpa.csv"))
histogram(~hsgpa, data = hsgpa, col="grey", xlim=c(-.1, 4.1), xlab = "High school GPAs", width=.08)

plot of chunk unnamed-chunk-7

paste("Mean: ", mean(hsgpa))
## [1] "Mean:  3.27071942589928"
paste("SD: ", sd(hsgpa))
## [1] "SD:  0.552701660333363"

Let’s estimate the sampling distributions for sizes n=2, 16, and 100:

# Create the estimated sampling distributions (10,000 samples) for each sample size
meanhsgpa2<- do(10000) * mean(sample(hsgpa$hsgpa, 2))
meanhsgpa16<- do(10000) * mean(sample(hsgpa$hsgpa, 16))
meanhsgpa100<- do(10000) * mean(sample(hsgpa$hsgpa, 100))

# Create histograms to display sampling distributions
histogram(~result, data = meanhsgpa2, col="grey", xlim=c(2, 4.05), xlab = "n=2", width=.03)

plot of chunk unnamed-chunk-8

plotDist("norm", params=list(mean=mean(hsgpa$hsgpa), sd=(sd(hsgpa$hsgpa)/(2)^.5)), col="red", lwd=1, add=TRUE)

plot of chunk unnamed-chunk-8

histogram(~result, data = meanhsgpa16, col="grey", xlim=c(2, 4.05), xlab = "n=16", width=.02)

plot of chunk unnamed-chunk-8

plotDist("norm", params=list(mean=mean(hsgpa$hsgpa), sd=(sd(hsgpa$hsgpa)/(16)^.5)), col="red", lwd=1, add=TRUE)

plot of chunk unnamed-chunk-8

histogram(~result, data = meanhsgpa100, col="grey", xlim=c(2, 4.05), xlab = "n=100", width=.01)

plot of chunk unnamed-chunk-8

plotDist("norm", params=list(mean=mean(hsgpa$hsgpa), sd=(sd(hsgpa$hsgpa)/(100)^.5)), col="red", lwd=1, add=TRUE)

plot of chunk unnamed-chunk-8

# Q-Q Plots
qqnorm(meanhsgpa2$result, ylab = "Results"); qqline(meanhsgpa2$result, ylab = "Results")

plot of chunk unnamed-chunk-8

qqnorm(meanhsgpa16$result, ylab = "Results"); qqline(meanhsgpa16$result, ylab = "Results")

plot of chunk unnamed-chunk-8

qqnorm(meanhsgpa100$result, ylab = "Results"); qqline(meanhsgpa100$result, ylab = "Results")

plot of chunk unnamed-chunk-8

As the sample size increases, the sampling distribution approaches a normal distribution. We can now estimate the means and standard deviations of these distributions:

paste("(n=2) Mean: ", mean(meanhsgpa2))
## [1] "(n=2) Mean:  3.27327650154"
paste("(n=2) SD: ", sd(meanhsgpa2))
## [1] "(n=2) SD:  0.389843061393316"
paste("(n=16) Mean: ", mean(meanhsgpa16))
## [1] "(n=16) Mean:  3.27027700123688"
paste("(n=16) SD: ", sd(meanhsgpa16))
## [1] "(n=16) SD:  0.13687828669586"
paste("(n=100) Mean: ", mean(meanhsgpa100))
## [1] "(n=100) Mean:  3.2703688114778"
paste("(n=100) SD: ", sd(meanhsgpa100))
## [1] "(n=100) SD:  0.049995418671005"


Rather than going through another example, let’s look at the CLT in action.

First, suppose we have a population that follows a normal distribution with a mean of 50 and a standard deviation of 20.

plotDist("norm", params=list(mean=50, sd=20), col="steelblue", lwd=2)

plot of chunk unnamed-chunk-10

Let’s repeatedly take samples of size n=25 from this population. Under the CLT, we’d expect the distribution of sample means to have: * an approximately normal shape * a mean of 50 * a standard error of 20/√25 = 4

# Let's sample 25 observations 100,000 times
means1<- do(100000) * mean(rnorm(n=25, mean=50, sd=20))
# Let's plot the sampling distribution
densityplot(~result, data=means1, col="steelblue", plot.points = FALSE, lwd=1)
# and add the theoretical distribution we should get from the CLT
plotDist("norm", params=list(mean=50, sd=4), col="red", lwd=1, add=TRUE)

plot of chunk unnamed-chunk-11

# Let's find the mean and standard error
mean(means1)
## [1] 50.01
sd(means1)
## [1] 3.997

Our simulated results appear in blue, while the theoretical results appear in red.


This time, let’s start with a non-normal distribution.

Suppose we have a skewed distribution with a mean of 3 and a standard deviation of 2.4495 (square root of 6).

plotDist("chisq", params=list(df=3), col="steelblue", lwd=2)

plot of chunk unnamed-chunk-12

Let’s repeatedly take samples of size n=9 from this population. Since our population distribution is NOT normal and our sample size is small, we don’t expect the CLT to hold.

If the CLT did hold, we’d have a normal distribution with a mean of 3 and a standard error of 0.8165 (=√6/3). Let’s see what we get:

# Let's sample 9 observations 100,000 times
means2<- do(100000) * mean(rchisq(n=9, df=3))
# Let's plot the sampling distribution
densityplot(~result, data=means2, col="steelblue", plot.points = FALSE, lwd=1)
# and add the theoretical distribution we should get from the CLT
plotDist("norm", params=list(mean=3, sd=(6^.5/3)), col="red", lwd=1, add=TRUE)

plot of chunk unnamed-chunk-13

# Let's find the mean and standard error
mean(means2)
## [1] 3
sd(means2)
## [1] 0.8174

The (red) theoretical results (assuming the CLT holds) differ from what we got from our simulation. This demonstrates that the CLT did not hold in this case.

This time, let’s repeatedly take samples of size n=100 from this population. Since our sample size is relatively large, we expect the CLT to hold. We expect a mean of 3 and a standard error of .24495 (√6/10).

# Let's sample 100 observations 100,000 times
means3<- do(100000) * mean(rchisq(n=100, df=3))
# Let's plot the sampling distribution
densityplot(~result, data=means3, col="steelblue", plot.points = FALSE, lwd=1)
# and add the theoretical distribution we should get from the CLT
plotDist("norm", params=list(mean=3, sd=(6^.5/10)), col="red", lwd=1, add=TRUE)

plot of chunk unnamed-chunk-14

# Let's find the mean and standard error
mean(means3)
## [1] 3
sd(means3)
## [1] 0.2449

The theoretical results are very close to our simulated results. This shows a sample size of n=100 may be enough to give us a good approximation to a normal distribution.