# Load the tidyverse and mosaic packages
library(tidyverse)
library(mosaic)24 arachnophobe volunteers are randomly assigned to spend 10 minutes one of two rooms:
Click the tabs to see what’s in each room…
The room contains a real spider (in a cage).
The room contains only a photo of that same spider.
     Click code to see how the data were entered –>
# Enter the data manually
spider <- tibble(
  group = c( rep("photo", 12), rep("real", 12) ),
  anxiety = c(30, 35, 45, 40, 50, 35, 55, 25, 30, 45, 40, 50, 
              40, 35, 50, 55, 65, 55, 50, 35, 30, 50, 60, 39))
# Display some data (ordered by anxiety levels)
spider %>%
  arrange(anxiety)
#    # A tibble: 24 × 2
#       group anxiety
#       <chr>   <dbl>
#    1  photo      25
#    2  photo      30
#    3  photo      30
#    4   real      30
#    5  photo      35
#    6  photo      35
#    7   real      35
#    8   real      35
#    9   real      39
#    10 photo      40
#    # ... with 14 more rows# Display summary statistics
spider %>%
  group_by(group) %>%
  summarize(mean = mean(anxiety),
            median = median(anxiety),
            sd = sd(anxiety),
            n = n())
#    # A tibble: 2 × 5
#      group  mean median        sd     n
#      <chr> <dbl>  <dbl>     <dbl> <int>
#    1 photo    40     40  9.293204    12
#    2  real    47     50 11.028888    12| Group | n | mean | sd | 
|---|---|---|---|
| Real | 12 | 47 | 11.03 | 
| Photo | 12 | 40 | 9.29 | 
| Difference | – | 7 | – | 
spider %>%
  ggplot(aes(x = group, y = anxiety)) +
  geom_dotplot(binaxis = "y", binpositions="all", stackdir = "center", fill = "steelblue", color = "white") +
  scale_y_continuous(breaks=seq(20, 70, 10), minor_breaks=NULL) +
  theme(axis.title.x = element_text(size = 12, color = "#777777")) +
  theme(axis.text.x = element_text(size = 12)) +
  theme(axis.title.y = element_text(size = 12, color="#777777")) +
  theme(axis.text.y = element_text(size = 12)) +
  labs(
    title = "Anxiety by group"
  )
spider %>%
  ggplot(aes(y = anxiety, x = group)) +
  geom_boxplot(fill = "white", color = "black", alpha = 0.9) +
  geom_dotplot(binaxis = "y", stackdir = "center", fill = "steelblue", color = "white", alpha = 0.6) +
  scale_y_continuous(breaks=seq(20, 70, 10), minor_breaks=NULL) +
  theme(axis.title.x = element_text(size = 12, color = "#777777")) +
  theme(axis.text.x = element_text(size = 12)) +
  theme(axis.title.y = element_text(size = 12, color="#777777")) +
  theme(axis.text.y = element_text(size = 12)) +
  labs(
    title = "Anxiety by group"
  )Let’s construct a model that could have generated the data in this study. First, we’ll define:
• \(y_{ig}=\) the anxiety of subject i in group g
• \(\mu=\) a constant, overall mean anxiety level all humans possess
• \(\alpha _{g}=(\mu _{g}-\mu )=\) the change in anxiety associated with being assigned to group g
• \(\epsilon_{ig}=(y_{ig}-\mu _{g})=\) errors or deviations in anxiety among individuals in a group (due to other factors not analyzed in this study).
With this notation, we can write our full model: \(y_{ig}=\mu +\alpha _{g}+\epsilon_{ig}\).
To estimate \(\alpha _{g}=(\mu _{g}-\mu )\) – or to decide if the real spider elicits greater anxiety – we will use methods that can be classified according to their goals and frameworks:
| Framework: | Goal: Choose model | Goal: Estimate effect | 
|---|---|---|
| Randomization-based framework | Randomized NHST | Bootstrap confidence interval | 
| Theory-based framework | t-test | Confidence interval; Effect size | 
We want to estimate the increased anxiety resulting from the real spider.
From our data, we can calculate: \(\overline{X}_{real}-\overline{X}_{photo}=47-7=7.\)
We can calculate: \(\mathrm{Cohen's \ d}=\frac{\overline{X}_{1}-\overline{X}_{2}}{\sigma}\), where \(\sigma\) represents a standard deviation.
Idea #1: We could simply calculate the standard deviation of all our observations, regardless of group.
This would give us: \({\sigma}=10.595\)
sd(spider$anxiety)
#    [1] 10.59532Idea #2: We could calculate a weighted average of the group standard deviations.
\(\sigma_{pooled}=\sqrt{\frac{(n_{1}-1)s_{1}^{2}+(n_{2}-1)s_{2}^{2}}{n_{1}+n_{2}-2}}=\sqrt{\frac{(12-1)11.03^{2}+(12-1)9.29^{2}}{12+12-2}}=10.2\)
# Std. dev. for the real spider group
s_real <- spider %>%
  filter(group == "real") %>%
  summarize(sd(anxiety))
# Std. dev. for the photo group
s_photo <- spider %>%
  filter(group == "photo") %>%
  summarize(sd(anxiety))
# Note:
# We could have just used this code:
# sd(spider$anxiety[spider$group=="real"])
# sd(spider$anxiety[spider$group=="photo"])
# We know the sample size for each group is 12
n <- 12
# Weighted average standard deviation
numerator <- ( (n-1) * (s_real^2) ) + ( (n-1) * (s_photo^2) )
denominator <- n+n-2
spooled <- sqrt(numerator / denominator)
spooled
#         sd(anxiety)
#    [1,]    10.19804# Here's a function I wrote to calculate
# Cohen's d from a data frame with two groups
cohens_d <- function (x, ..., data = parent.frame(), only.2 = TRUE) 
{
    m <- mean_(x, ..., data = data)      # means
    v1 <- var(x, ..., data = data)[1]    # variance of group 1
    v2 <- var(x, ..., data = data)[2]    # variance of group 2
    nms <- names(m)                      # names of groups
    nn <- aggregate(x, ..., data = data, length) # sample sizes
    n1 <- nn[1,2]-1                      # n1 - 1
    n2 <- nn[2,2]-1                      # n2 - 1
    md <- abs(diff(m))                   # difference in means
    sp <- (n1 * v1) + (n2 * v2)          # spooled step 1
    sp <- sp / (n1 + n2)                 # spooled step 2
    sp <- sqrt(sp)                       # spooled step 3
    cd  <- md/sp                         # cohen's d
    if (length(nms) > 2 && only.2) {
        stop("You have more than two groups")
    }
    paste("Cohen's d = ", round(cd,3))
}
# To use this function, we call cohens_d() like this
cohens_d(anxiety ~ group, data=spider)
#    [1] "Cohen's d =  0.686"Instead of estimating the magnitude of the difference in means, we might be interested in comparing individual subjects across groups. If we randomly select one subject from each group, what’s the probability the subject with the real spider exhibits higher anxiety than a subject with the photo of the spider?
In other words, we want to estimate: \(P(x_{real}>x_{photo})\) or \(P(x_{real}-x_{photo}>0)\).
To do this, we need to derive some characteristics of the distribution of \(x_{1}-x_{2}\):
If we also assume both populations follow normal distributions, we can calculate:
\(D = P(x_{1}-x_{2}>0)=P\left ( z>\frac{0-(\overline{X}_1-\overline{X}_2)}{\sigma_x\sqrt{2}}\right )=P\left ( z<\frac{\overline{X}_2-\overline{X}_1}{\sigma_x\sqrt{2}}\right )\)
# Calculate difference in means (47-40 = 7)
meandiff <- diffmean(anxiety ~ group, data=spider)
# Calculate denominator using the spooled we calculated earlier
denom <- spooled * sqrt(2)
# Calculate fraction
Deffect <- meandiff / denom
# Calculate probability under normal distribution
pnorm(Deffect, mean=0, sd = 1, lower.tail=TRUE)
#         sd(anxiety)
#    [1,]   0.6862904
# Here's a function to calculate this directly
# from a data frame
D_effect <- function (x, ..., data = parent.frame(), only.2 = TRUE) 
{
    m <- mean_(x, ..., data = data)      # means
    v1 <- var(x, ..., data = data)[1]    # variance of group 1
    v2 <- var(x, ..., data = data)[2]    # variance of group 2
    nms <- names(m)                      # names of groups
    nn <- aggregate(x, ..., data = data, length) # sample sizes
    n1 <- nn[1,2]-1                      # n1 - 1
    n2 <- nn[2,2]-1                      # n2 - 1
    md <- abs(diff(m))                   # difference in means
    sp <- (n1 * v1) + (n2 * v2)          # spooled step 1
    sp <- sp / (n1 + n2)                 # spooled step 2
    sp <- sqrt(sp)                       # spooled step 3
    DD  <- md / (sp * sqrt(2))           
    PD  <- pnorm(DD, mean=0, sd=1, lower.tail=TRUE)  # Calculate prob.
    if (length(nms) > 2 && only.2) {
        stop("You have more than two groups")
    }
    paste("D = ", round(PD,3))
}
D_effect(anxiety ~ group, data = spider)
#    [1] "D =  0.686"
# Can we simulate this effect size?  Sure!
# Let's take a random sample from each group and see
# which individual has higher anxiety.  We can then
# repeat this 10,000 times and estimate D.
# Why won't this estimate agree with our calculation of D?
sum( Do(10000) * (sample(spider$anxiety[spider$group=="real"], 1) > sample(spider$anxiety[spider$group=="photo"], 1)) / 10000)
#    [1] 0.618Subjects in this study were randomly assigned to rooms. If our null hypothesis were true, the real spider and the photo elicit identical levels of anxiety.
How likely were we to get a 7-point difference in means if our null hypothesis were true? To estimate this, we can:
Expand the code to see how we do this in R.
# Calculate the test statistic
# (the difference between the group means)
test.stat <- diffmean(anxiety ~ group, data=spider)
test.stat
#    diffmean 
#           7
# To simulate random assignment of subjects to groups
# we can shuffle the group assignments.
# Let's do this a couple times to verify the test statistic differs
diffmean(anxiety ~ shuffle(group), data=spider)
#     diffmean 
#    0.3333333
diffmean(anxiety ~ shuffle(group), data=spider)
#    diffmean 
#    1.166667
# We want to "Do" this a large number of times
randomized_mean_diffs <- Do(10000) * diffmean(anxiety ~ shuffle(group), data=spider)
# We could also use dplyr for this, but it's slower
# r_m_d <- Do(10000) * spider %>%
#   mutate(shuffled = sample(anxiety)) %>%
#   group_by(group) %>%
#   summarize(mean = mean(shuffled)) %>%
#   summarize(diffmean = diff(mean))# Histogram of the randomized sampling distribution
# Here's the code to create a histogram using lattice graphics
# histogram(~diffmean, data=randomized_mean_diffs,
#           xlim=c(-15,15), # Set the x-axis range
#           width=2,      # Set the width of each bar
#           xlab="Possible mean differences assuming null hypothesis is true", 
#           groups=diffmean >= test.stat)   # Highlight values > test statistic
# ladd(panel.abline(v=test.stat))   # Add vertical line at test statistic
# Here's the ggplot code I used to create the histogram
# that appears in the activity
ggplot(data = randomized_mean_diffs, aes(x = diffmean)) +
  geom_histogram(binwidth = 2, fill="lightblue", color="white", alpha = 0.8) +
  annotate("segment", x = 7, xend = 7, y = 0, yend = 1200, color = "red") +
  labs(
      title = "Randomized mean differences",
      x = "difference between group means"
      ) +
  scale_x_continuous(breaks=seq(-15, 15, 5), minor_breaks=NULL) +
    theme(
    axis.text.x = element_text(size = 11, color="grey10"),
    legend.position = "none",
    panel.grid.major.y = element_line(colour = "white"),
    panel.grid.major.x = element_line(colour = "white", size=.15),
    panel.grid.minor = element_blank(),
    panel.background = element_rect(fill = "grey93")
  )# Calculate the proportion of our randomized mean
# differences that are more extreme than our test.stat
prop(~ (diffmean >= test.stat), data=randomized_mean_diffs)
#      TRUE 
#    0.0584choose(24,12) / factorial(2)
#    [1] 1352078Instead of calculating the likelihood of observing a mean difference of 7 (or more) assuming a true null hypothesis, we might be interested in a range of likely values for that mean difference.
Whether we can actually do that is a question we’ll deal with in a bit. For now, let’s construct a bootstrap confidence interval for \(\mu_{real}-\mu_{photo}\).
To do this, we’ll repeatedly resample our data with replacement and calculate \(\overline{X}_{real}-\overline{X}_{photo}\). In this way, we’re converting our sample data into a “population” (and taking random samples from this “population”).
# Let's start by taking 3 bootstrap samples from our data
# and calculating the mean difference in each bootstrap sample
# Notice the difference between means differs for each 
# bootstrap sample.  Why is this?
Do(3) * diffmean( anxiety ~ group, data = resample(spider) )
#    # A tibble: 3 × 1
#      diffmean
#         <dbl>
#    1  7.12500
#    2  9.31250
#    3 12.28671
# Now let's "Do" it 10,000 times
# Sample data with replacement
# Calculate the difference between the means
boots <- Do(10000) * diffmean( anxiety ~ group, data=resample(spider) )
# The next line calculates the endpoints that capture
# 95% of the bootstrap mean differences
bootstrapCI <- confint(boots, level = 0.95, method = "quantile")
# I'll store the lower and upper bounds of this confidence interval
# so I can put them on the plot
lower <- as.numeric(bootstrapCI[2]) # Store lower CI bound
upper <- as.numeric(bootstrapCI[3]) # Store upper CI bound
# Density plot
ggplot(data = boots, aes(x = diffmean)) +
  geom_density(fill="lightblue", color="white", alpha = 0.8) +
  labs(
      title = "Bootstrap distribution of mean differences",
      x = "difference between group means (from resampled data)"
      ) +
  scale_x_continuous(breaks=seq(-15, 25, 5), minor_breaks=NULL) +
    theme(
    axis.text.x = element_text(size = 11, color="grey10"),
    legend.position = "none",
    panel.grid.major.y = element_line(colour = "white"),
    panel.grid.major.x = element_line(colour = "white", size=.15),
    panel.grid.minor = element_blank(),
    panel.background = element_rect(fill = "grey93")
  ) +
  annotate("text", x = lower, y = .005, label = round(lower,3)) +
  annotate("text", x = upper, y = .005, label = round(upper,3)) +
  annotate("text", x = mean(boots$diffmean), y = .015, label = paste("95%")) +
  annotate("segment", x = lower, xend = upper, y = 0.01, yend = .01, color = "red")Our test statistic (stored as test.stat) is \(\overline{X}_{1}-\overline{X}_{2}\). For our sample of data, we found this test statistic was equal to 7. How likely were we to obtain a value of 7 (or greater) if the null hypothesis were true?
Earlier, we used randomization-based methods to repeatedly assign subjects randomly into the two groups. For each of these randomized samples, we calculated our test statistic. This gave us a distribution of possible values of \(\overline{X}_{1}-\overline{X}_{2}\) under a true null hypothesis. Recall that this randomized sampling distribution was unimodal and symmetric.
If we used this randomization-based method on other data, we would notice that most of the sampling distributions would look similar (unimodal and symmetric). If this is true, then we might be able to construct these sampling distributions without resampling our data. All we really need to know are the mean and standard deviation of our test statistic (under a true null hypothesis). With these two values (the mean and standard deviation), we can construct our sampling distribution and estimate p-values or construct confidence intervals.
How will we get the mean and standard deviation of \(\overline{X}_{1}-\overline{X}_{2}\)? We’ll need to make some assumptions.
With these assumptions, we know the center (zero) and shape (unimodal, symetric) of our sampling distribution. We only need to figure out its standard error.
The Central Limit Theorem tells us the sampling distribution of \(\overline{X}\) has a standard error of \(\sigma _{\overline{x}}=\frac{\sigma _{x}}{\sqrt{n}}\).
If we assume independence, we can derive the Satterthwaite approximation to the standard error of our test statistic: \(\sigma _{(\overline{x}_{1}-{\overline{x}_{2})}}=\sqrt{\frac{s_{1}^{2}}{n_{1}}+\frac{s_{2}^{2}}{n_{2}}}\). For our data, we can calculate:
\(\sigma _{(\overline{x}_{1}-{\overline{x}_{2})}}=\sqrt{\frac{11.03^{2}}{12}+\frac{9.29^{2}}{12}}=4.163\)
# Satterthwaite approximation
# When I calculated Cohen's d at the beginning of this activity,
# I stored the standard deviations as s_photo and s_real
# I also stored the sample sizes (12) as n
satterthwaite <- sqrt( (s_real^2 / n) + (s_photo^2 / n)  )
satterthwaite
#         sd(anxiety)
#    [1,]    4.163332If we assume independence and the population variances for our two groups are equal, we can use a different formula. SEpooled pools (or finds the weighted average) of the standard deviations from each group: \(SE_{pooled}=s_{pooled}\sqrt{\frac{1}{n_{1}}+\frac{1}{n_{2}}}=\sqrt{\frac{(n_{1}-1)s_{1}^{2}+(n_{2}-1)s_{2}^{2}}{n_{1}+n_{2}-2}}\sqrt{\frac{1}{n_{1}}+\frac{1}{n_{2}}}\). For our data, we can calculate:
\(SE_{pooled}=\sqrt{\frac{(12-1)11.03^{2}+(12-1)9.29^{2}}{12+12-2}}\sqrt{\frac{1}{12}+\frac{1}{12}}=4.163\)
# We already calculated spooled before and stored the result
# as spooled.  We only need to calculate the second radical.
SEpooled <- spooled * sqrt( (1/12) + (1/12) )
SEpooled
#         sd(anxiety)
#    [1,]    4.163332# Standard errors of each group
SD1 <- sd(spider$anxiety[spider$group=="real"])
SD2 <- sd(spider$anxiety[spider$group=="photo"])
# Calculate sample size in each group
n1 <- length(spider$anxiety[spider$group=="real"])
n2 <- length(spider$anxiety[spider$group=="photo"])
# Satterthwaite's approximation
sqrt( ((SD1^2)/(n1)) + ((SD2^2)/(n2)) )
#    [1] 4.163332
# SEpooled
Spooled <- sqrt( (((n1 - 1) * (SD1^2)) + ((n2 - 1) * (SD2^2))) / (n1 + n2 - 2) )
sqrt( (1/n1) + (1/n2) ) * Spooled
#    [1] 4.163332# There are lots of ways to plot this sampling distribution
# It's not technically a normal distribution, but this plot
# will look close enough:
# plotDist("norm", mean=0, sd=4.1633, lw=5, col="steelblue")
# We could also plot the t-distribution directly:
# curve(dt(x-0,4.16), from=-6, to=6 )
# Here's the ggplot2 code I used to create the plot
# appearing in the activity
ggplot(data.frame(x = c(-3.5, 3.5)), aes(x)) +
  stat_function(fun = dt, color="steelblue", args = list(df = 22), size = 2) +
  annotate("segment", x = 7/4.163, xend = 7/4.163, y = 0, yend = 0.23, color = "red") +
  annotate("text", x = 7/4.163, y = .25, label = "observed = 7") +
  labs(
      title = "Sampling distribution of mean differences",
      x = "",
      y = NULL
      ) +
    scale_x_continuous(breaks = c(-3, -2, -1, 0, 1, 2, 3), 
                       labels = c("-_____", "-_____", "-_____", rep("_____", 4))) +
    scale_y_continuous(breaks = NULL) + 
    theme(
    axis.text.x = element_text(size = 11, color="grey10"),
    legend.position = "none",
    panel.grid.major.y = element_line(colour = "white"),
    panel.grid.major.x = element_line(colour = "white", size=.15),
    panel.grid.minor = element_blank(),
    panel.background = element_rect(fill = "grey93")
  )Estimating the p-value
From what you learned in a previous statistics course, you know that we’re conducting an independent samples t-test and the sampling distribution follows a t-distribution with \(n_{1}+n_{2}-2\) degrees-of-freedom.
# We could use:
# plotDist("t", df=22, lw=5, col="steelblue")
# Here's the code I used to create the plot from the activity
ggplot(data.frame(x = c(-3.5, 3.5)), aes(x)) +
  stat_function(fun = dt, color="steelblue", args = list(df = 22), size = 2) +
  annotate("segment", x = 7/4.163, xend = 7/4.163, y = 0, yend = 0.23, color = "red") +
  annotate("text", x = 2.4, y = .25, label = "observed = _______________") +
  labs(
      title = "t-distribution (df = 22)",
      x = "",
      y = NULL
      ) +
    scale_x_continuous(breaks = c(-3, -2, -1, 0, 1, 2, 3)) +
    scale_y_continuous(breaks = NULL) + 
    theme(
    axis.text.x = element_text(size = 11, color="grey10"),
    legend.position = "none",
    panel.grid.major.y = element_line(colour = "white"),
    panel.grid.major.x = element_line(colour = "white", size=.15),
    panel.grid.minor = element_blank(),
    panel.background = element_rect(fill = "grey93")
  )As the plot implies, we need to convert our observed test statistic – \(\overline{X}_{real}-\overline{X}_{photo}=7\) – into an observed t-statistic. To do this, we use:
\(\mathrm{t_{observed}} = \frac{\overline{X}_{1}-\overline{X}_{2}}{\mathrm{SE_{pooled}}}=\frac{47-40}{4.163}=1.68=t_{df=22}\).
t_obs <- test.stat / SEpooled
t_obs
#         sd(anxiety)
#    [1,]    1.681346# P(t with df of 22 > 1.6813)
pt(1.6813, df = 22, lower.tail=FALSE)
#    [1] 0.05342408
# The following function (from the mosaic package)
# will calculate the p-value and display it on the
# sampling distribution:
# xpt(c(1.6813), df=22)Function to conduct independent samples t-test
Expand the code to see how to use this function:
# Welch-Satterthwaite method
# (t-test with no equal variance assumption)
# Notice the weird degrees of freedom in the output
t.test(anxiety ~ group, data=spider, 
       alternative = c("less"), var.equal = FALSE, conf.level = 0.95)
#    
#       Welch Two Sample t-test
#    
#    data:  anxiety by group
#    t = -1.6813, df = 21.385, p-value = 0.05362
#    alternative hypothesis: true difference in means is less than 0
#    95 percent confidence interval:
#          -Inf 0.1580825
#    sample estimates:
#    mean in group photo  mean in group real 
#                     40                  47
# t-test with equal variance assumption
# (using SEpooled, like we did in the activity)
t.test(anxiety ~ group, data=spider, 
       alternative = c("less"), var.equal = TRUE, conf.level = 0.95)
#    
#       Two Sample t-test
#    
#    data:  anxiety by group
#    t = -1.6813, df = 22, p-value = 0.05342
#    alternative hypothesis: true difference in means is less than 0
#    95 percent confidence interval:
#          -Inf 0.1490421
#    sample estimates:
#    mean in group photo  mean in group real 
#                     40                  47Since we already know the standard error of our test statistic is 4.163, constructing a 95% confidence interval is simple:
95% CI: \((\overline{X}_{1}-\overline{X}_{2})\pm t_{n_{1}+n_{2}-2}^{\alpha /2}(\mathrm{SE})=(47-40)\pm -2.074(4.163)=7\pm8.634=(-1.634,\space15.634)\)
# These intervals will have the signs flipped
# This is because the groups are defined alphabetically
# by default (so "photo" is group 1 and "real" is group 2).
# Satterthwaite approximation (no equal variance assumption)
confint(t.test(anxiety ~ group, data=spider, conf.level = 0.95, var.equal=FALSE))
#    # A tibble: 1 × 5
#      `mean in group photo` `mean in group real`     lower    upper level
#                      <dbl>                <dbl>     <dbl>    <dbl> <dbl>
#    1                    40                   47 -15.64864 1.648641  0.95
# Assuming equal variances (SEpooled)
confint(t.test(anxiety ~ group, data=spider, conf.level = 0.95, var.equal=TRUE))
#    # A tibble: 1 × 5
#      `mean in group photo` `mean in group real`     lower    upper level
#                      <dbl>                <dbl>     <dbl>    <dbl> <dbl>
#    1                    40                   47 -15.63422 1.634222  0.95Let’s summarize the p-values and confidence intervals from the randomization- and theory-based methods:
Cohen’s d (effect size): 0.686
D (effect size): 0.686
Randomized p-value: 0.058
t-test (equal variances assumed) p-value: 0.053
t-test (equal variances not assumed) p-value: 0.054
Bootstrap 95% confidence interval: (-0.916 , 15.143)
Theory-based 95% confidence interval (equal variances assumed): (-1.634, 15.634)
Theory-based 95% confidence interval (equal variances not assumed): (-1.649, 15.649)
bfeed with variables: group: identifies whether each child was breastfed or notscore: each child’s score on the intelligence test.#    # A tibble: 2 × 4
#          group     n  mean       sd
#         <fctr> <int> <dbl>    <dbl>
#    1 breastfed   237 105.3 14.49998
#    2       not    85 100.9 13.99997simdata data frame contains simulated data from a fictitious study on weightloss. Subjects were randomly assigned to one of two diet groups (Diet A or Diet B). The number of pounds each subject lost (weightloss) was then recorded.# Simulate the data
set.seed(1234)  # Set random number generator to replicate this data
simdata <- tibble(
  diet = c(rep("A", 20000), rep("B", 20000)),
  weightloss = c(rnorm(20000, 2, 20), rnorm(20000, 1, 20))
)
# Summary statistics
simdata %>%
  group_by(diet) %>%
  summarize(n = n(),
            mean = mean(weightloss),
            sd = sd(weightloss))
#    # A tibble: 2 × 4
#       diet     n     mean       sd
#      <chr> <int>    <dbl>    <dbl>
#    1     A 20000 1.983689 19.93599
#    2     B 20000 1.150867 20.03774
# Effect sizes
cohens_d(weightloss ~ diet, data = simdata)
#    [1] "Cohen's d =  0.042"
D_effect(weightloss ~ diet, data = simdata)
#    [1] "D =  0.512"
# t-test
t.test(weightloss ~ diet, data = simdata, alternative="greater")
#    
#       Welch Two Sample t-test
#    
#    data:  weightloss by diet
#    t = 4.1668, df = 39997, p-value = 1.548e-05
#    alternative hypothesis: true difference in means is greater than 0
#    95 percent confidence interval:
#     0.5040582       Inf
#    sample estimates:
#    mean in group A mean in group B 
#           1.983689        1.150867spider data frame. Use bootstrap methods to construct a 95% confidence interval for Cohen’s d (the effect size of the difference in anxiety levels). To do this, you’ll want to use the cohens_d_2 function I wrote (instead of the diffmeans function we used to construct a confidence interval for the mean difference in the activity). Plot the distribution of bootstrap estimates and report the confidence interval limits. From this confidence interval, what can you conclude about the effect size?# Function to calculate Cohen's d (and return numeric result)
cohens_d_2 <- function (x, ..., data = parent.frame(), only.2 = TRUE) 
{
    m <- mean_(x, ..., data = data)      # means
    v1 <- var(x, ..., data = data)[1]    # variance of group 1
    v2 <- var(x, ..., data = data)[2]    # variance of group 2
    nms <- names(m)                      # names of groups
    nn <- aggregate(x, ..., data = data, length) # sample sizes
    n1 <- nn[1,2]-1                      # n1 - 1
    n2 <- nn[2,2]-1                      # n2 - 1
    md <- abs(diff(m))                   # difference in means
    sp <- (n1 * v1) + (n2 * v2)          # spooled step 1
    sp <- sp / (n1 + n2)                 # spooled step 2
    sp <- sqrt(sp)                       # spooled step 3
    cd  <- md/sp                         # cohen's d
    if (length(nms) > 2 && only.2) {
        stop("You have more than two groups")
    }
    cd
}spider data frame. Use randomization-based NHST methods to compare the medians of the real spider vs. photo groups. Plot the randomized sampling distribution and estimate the p-value. To calculate the difference in medians, you can use a diffmedian() function I wrote for this lesson.# We no longer have two independent groups.  We needed that independence
# assumption to calculate either SEpooled or the Satterthwaite approximation.
# What can we do with two dependent groups (the same group measured twice)?
# Let's separate our data into the two repeated measures
# First, let's get anxiety scores related to the real spider
real <- spider %>%
  filter(group == "real") %>%
  transmute(real = anxiety)
# Now, we'll get anxiety scores for those same people with the photo
photo <- spider %>%
  filter(group == "photo") %>%
  transmute(photo = anxiety)
# We can combine these into a single data frame
spider2 <- data.frame(real, photo)
# Let's look at this data
spider2
#    # A tibble: 12 × 2
#        real photo
#       <dbl> <dbl>
#    1     40    30
#    2     35    35
#    3     50    45
#    4     55    40
#    5     65    50
#    6     55    35
#    7     50    55
#    8     35    25
#    9     30    30
#    10    50    45
#    11    60    40
#    12    39    50
# This shows that instead of having 24 independent anxiety scores
# we have a pair of anxiety scores for each of 12 subjects.
# The first subject had an anxiety of 40 with the real spider
#  and 30 with the photo.
# Now, let's calculate the difference in anxiety scores for each subject
spider2 <- spider2 %>%
  mutate(difference = real - photo)
# We can see this difference column now...
spider2
#    # A tibble: 12 × 3
#        real photo difference
#       <dbl> <dbl>      <dbl>
#    1     40    30         10
#    2     35    35          0
#    3     50    45          5
#    4     55    40         15
#    5     65    50         15
#    6     55    35         20
#    7     50    55         -5
#    8     35    25         10
#    9     30    30          0
#    10    50    45          5
#    11    60    40         20
#    12    39    50        -11
# Under a true null hypothesis, we would expect this difference to be zero
# Let's see what our sample mean difference equals
mean(spider2$difference)
#    [1] 7
# How likely were we to get an average difference score of 7
# if the null hypothesis were true?  Let's use a theory-based
# null hypothesis significance test
# Run the following code:
# t_test(anxiety ~ group, data=spider, paired=TRUE, alternative="less")Once you’re satisfied with your solutions, email that html file to thiessenbradleya@sau.edu or give me a printed copy.
If you run into any problems, email me or work with other students in the class.
Sources
Spider dataset: Field, Andy (2012). Discovering Statistics Using R. ISBN: 978-1446200469
This document is released under a Creative Commons Attribution-ShareAlike 3.0 Unported license.