Loading the Mosaic Package

Let’s load some packages we’ll use in these assignments:

require(ggplot2)
require(dplyr)
require(mosaic)
require(parallel)



Assignment #1: Simulation Assignment


Do couples tend to lean to the right when kissing?
Kissing <- do(50000) * rflip(12)
histogram( ~ heads, data=Kissing, v=8, width=1, col="lightgrey")

plot of chunk unnamed-chunk-3

prop( ~(heads >= 8), data=Kissing)
##   TRUE 
## 0.1932
Kissing <- do(50000) * rflip(124)
histogram( ~ heads, data=Kissing, v=80, width=1, col="lightgrey")

plot of chunk unnamed-chunk-4

prop( ~(heads >= 80), data=Kissing)
##  TRUE 
## 8e-04


Did Phil Mickelson putt worse than should be expected in 2011?
Putts <- do(50000) * rflip(28, prob=0.412)
histogram( ~ heads, data=Putts, v=7, width=1, col="lightgrey")

plot of chunk unnamed-chunk-5

prop( ~(heads <= 7), data=Putts)
##   TRUE 
## 0.0574




Assignment #2: Tricky Problems


What’s the probability that Amy has 2 female children?

The solutions show how to solve this problem through exact enumeration. Let’s try a simulation approach by first simulating 50,000 women each having two children:

## Create a vector containing the gender choices for each child
PossibleKids <- c("boy", "girl")
## Sample two children 50,000 times (to simulate 50,000 women) 
TwoKids <- do(50000) * sample(PossibleKids, 2, replace=TRUE)
## Show a table of the results
table(TwoKids$V1, TwoKids$V2)
##       
##          boy  girl
##   boy  12507 12442
##   girl 12401 12650
## Given the youngest child (V1) is a girl, estimate the probability that both children are girls
# First, let's select the simulated women with youngest children = girls
FirstGirl <- subset(TwoKids, TwoKids$V1=="girl")
table(FirstGirl)
##       V2
## V1       boy  girl
##   boy      0     0
##   girl 12401 12650
# From these results, find the proportion where the other child (V2) was also a girl
# The answer will be close to 0.50.  That's the probability for Amy.
prop( ~(V2=="girl"), data=TwoKids)
##   TRUE 
## 0.5018
## To find the probability for Betty, we need to select the simulated women
## who have one girl (either the younger or older child):
OneGirl <- subset(TwoKids, TwoKids$V1=="girl" | TwoKids$V2=="girl")
table(OneGirl)
##       V2
## V1       boy  girl
##   boy      0 12442
##   girl 12401 12650
# From these results, find the proportion where both children are girls
# The answer will be close to 0.33.  That's the probability for Betty.
prop( ~(V1=="girl" & V2=="girl"), data=OneGirl)
##   TRUE 
## 0.3374
What’s the probability that Chris has two boys?

Let’s also simulate this situation. I’ll re-use the 50,000 simulated pairs of children from the previous question, but I’ll add to this 50,000 randomly selected days of the week:

## Create a vector of days of the week
PossibleDays <- c("Sun", "Mon", "Tues", "Wed", "Thurs", "Fri", "Sat")

## Sample two days of the week (birth days for both kids) 
TwoDays <- do(50000) * sample(PossibleDays, 2, replace=TRUE)

## Merge the days of the week with the kids
TwoKids$V1day <- TwoDays$V1
TwoKids$V2day <- TwoDays$V2
colnames(TwoKids) <- c("younger", "older", "youngerDay", "olderDay")
## Let's take a look at the first several rows of our simulated data
head(TwoKids)
##   younger older youngerDay olderDay
## 1    girl   boy        Mon      Fri
## 2    girl   boy        Fri     Tues
## 3     boy   boy        Sun     Tues
## 4     boy   boy        Fri    Thurs
## 5    girl  girl        Mon      Sun
## 6     boy   boy       Tues      Fri
## Given one child is a boy born on Tuesday, estimate the probability that both children are boys
# First, let's select the subset of data where at least one child is a boy
OneBoy <- subset(TwoKids, TwoKids$younger=="boy" | TwoKids$older=="boy")
# Next, let's subset this data to only include cases in which at least
# one of those boys was born on a Tuesday
OneBoyTues <- subset(OneBoy, (OneBoy$younger=="boy" & OneBoy$youngerDay=="Tues") | (OneBoy$older=="boy" & OneBoy$olderDay=="Tues"))
## Let's take a look at some of this data to make sure each row has at least
## one boy born on a Tuesday
head(OneBoyTues)
##    younger older youngerDay olderDay
## 2     girl   boy        Fri     Tues
## 3      boy   boy        Sun     Tues
## 6      boy   boy       Tues      Fri
## 15     boy  girl       Tues      Sat
## 16     boy   boy       Tues      Wed
## 18     boy  girl       Tues     Tues
# From these results, find the proportion where both children are boys
# The answer will be close to 13/27 = 0.48.  That's the probability for Chris.
prop( ~(younger=="boy" & older=="boy"), data=OneBoyTues)
##   TRUE 
## 0.4833


Let’s make a deal problem.

Let’s try to simulate this situation.

## Create a vector of possible prizes (1 car and 2 goats)
Prizes <- c("car", "goat", "goat")

## Sample 3 prizes to behind 3 doors (sampling without replacement
## to ensure we always get 1 car and 2 goats
ThreeDoors <- do(50000) * sample(Prizes, 3, replace=FALSE)
colnames(ThreeDoors) <- c("door1", "door2", "door3")
## Take a look at some of our data
head(ThreeDoors)
##   door1 door2 door3
## 1  goat  goat   car
## 2  goat  goat   car
## 3  goat   car  goat
## 4  goat   car  goat
## 5   car  goat  goat
## 6   car  goat  goat
## It doesn't matter which door is initially selected, so let's
## arbitrarily choose Door #1 as the initial selection.  If we
## chose to NOT switch doors, the probability of winning a car
## is simply the proportion of cars behind Door #1.
## This answer will be, approximately, 0.33
prop( ~(door1=="car"), data=ThreeDoors)
##  TRUE 
## 0.334
## Now let's have the game show host show us a door hiding a goat.
## We'll call this door "X"
## If a goat is behind door #2, let's replace it with an X
ThreeDoors$door2 <-as.character(ThreeDoors$door2)
ThreeDoors$door2[ThreeDoors$door2 == "goat"] <- "X"
## If a car is behind door #3 and a goat is behind door #3,
## let's replace door #3 with an X
ThreeDoors$door3 <-as.character(ThreeDoors$door3)
ThreeDoors$door3[ThreeDoors$door3 == "goat" & ThreeDoors$door2 == "car"] <- "X"
## Let's look at our dataset one more time...
head(ThreeDoors)
##   door1 door2 door3
## 1  goat     X   car
## 2  goat     X   car
## 3  goat   car     X
## 4  goat   car     X
## 5   car     X  goat
## 6   car     X  goat
## If we switch, we'll switch from door #1 to the other door that's not an X
## (the door we haven't seen yet).
## That means a switch wins if our simulated doors are...
## 1 = Goat, 2 = car, 3 = X
## or
## 1 = Goat, 2 = X, 3 = car
## Let's find the proportion of times this happened.
## This answer will be close to 0.67
prop( ~(door1=="goat" & door2=="car" & door3=="X"), data=ThreeDoors) + prop( ~(door1=="goat" & door2=="X" & door3=="car"), data=ThreeDoors)
##  TRUE 
## 0.666


Probability of 2+ students (out of 25) having the same birthday.

Let’s try to simulate this situation.

## We'll use a for-loop for this simulation

## Set the parameters; we'll simulate 50,000 classes of 25 students
students <- 25
replications <- 50000
x <- as.numeric(replications)

## Create the for-loop
for (i in 1:replications)  {
    b <- sample(1:365, students, replace=TRUE)
    x[i] <- students - length(unique(b))  }

## Find the proportion of cases with no birthdays in common
## This answer should be close to 0.43
mean(x==0)
## [1] 0.4297
## We can create a histogram to show the probabiity
## of finding 0, 1, 2, 3, ... common birthdays from 25 students
cut <- (0:(max(x) + 1)) - 0.5
hist(x, breaks=cut, freq=F, col=8)

plot of chunk unnamed-chunk-9

## We can also plot the probability of at least two common birthdays
## as a function of the number of students in the class
p <- numeric(50)
for (n in 1:50)      {
        q <- 1 - (0:(n - 1))/365
            p[n] <- 1 - prod(q)   }
plot(p)

plot of chunk unnamed-chunk-9




Assignment #3: School bus inspector


Given 20 buses with 3 unsafe. You sample 5 buses. What’s the probability you find at least one unsafe bus?

Once again, we’ll show how a simulation can estimate the solution.

## Create a vector containing 20 buses (17 safe, 3 unsafe).
## To simplify this simulation, let's use
##   unsafe bus = 1  and  safe bus = 0
Buses <- c(rep(0, 17), rep(1, 3))
Buses
##  [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1
## Sample 5 buses 50,000 times 
InspectFive <- do(50000) * sample(Buses, 5, replace=FALSE)
head(InspectFive)
##   V1 V2 V3 V4 V5
## 1  0  0  1  0  0
## 2  0  0  1  0  0
## 3  0  0  1  0  0
## 4  0  0  0  0  0
## 5  0  0  0  0  0
## 6  0  0  0  0  0
## Create a column to find the sum of the number of unsafe buses
InspectFive$sum <- rowSums(InspectFive)
head(InspectFive)
##   V1 V2 V3 V4 V5 sum
## 1  0  0  1  0  0   1
## 2  0  0  1  0  0   1
## 3  0  0  1  0  0   1
## 4  0  0  0  0  0   0
## 5  0  0  0  0  0   0
## 6  0  0  0  0  0   0
## Find the proportion of times we found at least one unsafe bus
## (the proportion of times the sum is at least 1)
## This answer will be close to our answer of 0.601
prop( ~(sum>=1), data=InspectFive)
##   TRUE 
## 0.6054
## We can also see the proportion of times we found
## 0, 1, 2, and 3 unsafe buses in our sample
ggplot(InspectFive, aes(x=sum)) +
  geom_histogram(aes(y=..count../sum(..count..)))
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

plot of chunk unnamed-chunk-10

## Now let's do the same thing for a sample of n=3
InspectThree <- do(50000) * sample(Buses, 3, replace=FALSE)
InspectThree$sum <- rowSums(InspectThree)
## We'll get an answer close to our solution of 0.404
prop( ~sum>=1, data=InspectThree)
##  TRUE 
## 0.404




Assignment #3b: Probability & Permutations


You have 50 cents and roll two dice. The price of ice cream will be the larger number followed by the smaller number (in cents). What’s the probability you can afford the ice cream?

The assignment shows some R code that will work. I’ll use a slightly different approach here.

## Simulate 10,000 rolls of two dice (d1 and d2)
d1 <- sample(1:6,10000, replace=TRUE)
d2 <- sample(1:6,10000, replace=TRUE)

## The price = 10 x (bigger number) + (smaller number)
price <- 10*pmax(d1,d2) + pmin(d1,d2)

## We can afford it if the price is <= 50
afford <- (price<=50)
## Sum the number of times we can afford the ice cream
sum(afford)
## [1] 4493
## Convert this to a probability
sum(afford) / nrow(afford)
## numeric(0)


The Mind Reader. Pick a two-digit number where both digits are unique odd numbers. I correctly guess the number of 350 people in the audience (out of 1000 people total). How likely was that to have happened (assuming each person chooses a number at random)?

Let’s once again simulate this scenario.

## Create a vector of the possible 2-digit numbers
Choices <- c(13, 15, 17, 19, 31, 35, 37, 39, 51, 53, 57, 59, 71, 73, 75, 79, 91, 93, 95, 97)

## Simulate 10,000 audiences of size 1000.  Each audience member chooses a number at random.
Audience <- do(10000) * sample(Choices, 1000, replace=TRUE)

## Suppose I guess the number 35.  Let's assign a value of zero to all my incorrect guesses.
Audience[Audience != 35] <- 0
## Let's assign a value of 1 to all my correct guesses
Audience[Audience == 35] <- 1

## Now we simply find the row sums to determine the number of correct guesses
Audience$sum <- rowSums(Audience)

## Let's create a histogram of the number of correct guesses
ggplot(Audience, aes(x=sum)) +
  geom_histogram(aes(y=..count..))
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

plot of chunk unnamed-chunk-12

## Finally, let's find the proportion of times I correctly guessed
## the number of at least 350 people
## Chances are, this simulation will never correctly guess 350+ numbers
prop( ~sum>=350, data=Audience)
## TRUE 
##    0




Assignment #3c: Simulation Methods


Dr. Flipper

Here’s our dataset:

Observed Dolphin Therapy Control Group Total
Improved 10 3 13
Did not Improve 5 12 17
Total 15 15 30

Our sample of 30 individuals includes 13 people who improved and 17 people who did not improve. Under our null hypothesis, we can assume these same 13 people would have improved regardless of whether they received dolphin therapy.

We can now simulate this study.

## Create 30 people (13 who will improve and 17 who will not improve)
##    Improve = 1   and   Not improve = 0
People <- c(rep(1, 13), rep(0, 17))
People
##  [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## Let's select 15 of these people at random to receive dolphin therapy
## We'll simulate this process 50,000 times
Therapy <- do(50000) * sample(People, 15, replace=FALSE) 

## Now, let's count the number of people who improved in each replication
Therapy$sum <- rowSums(Therapy)

## Let's create a histogram showing the number of people who improved
histogram( ~sum, width=1, main="People who improved in the Therapy Group", v=9.5, col="lightgrey", data=Therapy)

plot of chunk unnamed-chunk-13

## Find the p-value -- P(10 or more improve | therapy has no effect)
## The answer will be close to our solution of 0.0127
prop( ~(sum>=10), data=Therapy)
##    TRUE 
## 0.01292


Nurse Gilbert

Here’s our dataset:

Observed Gilbert Shift No Gilbert Shift Total
Death 40 34 74
No Death 217 1350 1567
Total 257 1384 1641

Our sample of 1641 shifts includes 74 deaths. Under our null hypothesis, these 74 people would have died whether or not Nurse Gilbert was working.

We can now simulate this study.

## Create 1641 people (74 who will die and 1567 who will not die)
##    Die = 1   and   Not die = 0
Patients <- c(rep(1, 74), rep(0, 1567))

## Let's select 257 of these people at random to be in the hospital when
## Nurse Gilbert is working.  We'll simulate this process 30,000 times
Gilbert <- do(30000) * sample(Patients, 257, replace=FALSE)

## Now, let's count the number of people who died in each replication
Gilbert$sum <- rowSums(Gilbert)

## Let's create a histogram showing the number of people who improved
histogram( ~sum, width=1, main="People who died in a Nurse Gilbert Shift", v=73.5, col="lightgrey", data=Gilbert)

plot of chunk unnamed-chunk-14

## Find the p-value -- P(74 or more die | Nurse Gilbert has no effect)
## The answer will, most likely, be zero.
prop( ~(sum>=74), data=Gilbert)
## TRUE 
##    0