# Your Turn #8
# Input data
crimes <- c(328, 334, 372, 327)

# Get expectation
sum(crimes)/4
expected <- c(rep(340.25, 4))

# Calculate chi-square
chi <- (crimes - expected)^2 / expected
chi2 <- sum(chi)

# Get p-value
help(pchisq)
pchisq(chi2, df=3, lower.tail=FALSE)
library(mosaic)
xpchisq(chi2, df=3)

## Run chi-square test
probs <- c(1/4, 1/4, 1/4, 1/4)
chisq.test(crimes, p = probs, rescale.p=TRUE)
# Randomization-based
chisq.test(crimes, p = probs, rescale.p=TRUE, simulate.p.value = TRUE, B=5000)

# Why significant?
chi
# more in 3rd season
probabilities <- crimes/sum(crimes)
probabilities
probabilities[3]/probabilities[1]
# 13% more likely in 3rd season

# What happens to the p-value with smaller sample size?
crimesmall <- crimes/10
chisq.test(crimesmall, p = probs, rescale.p=TRUE)
# What???


##################################################################
### Test for normality
diet <- read.csv("http://www.bradthiessen.com/html5/data/diet.csv")
weight <- diet$WeightLoss
histogram(~weight, col="lightblue", border="white", width=3)
densityplot(~weight, lw=5)
qqnorm(weight, col="darkorchid", cex=1.1, pch=16); qqline(weight, col="darkcyan", lw=3)
shapiro.test(weight)

# Cut data into 6 bins
cuts <- cut(weight, breaks = c(-Inf, -4, 0, 4, 8, 12, Inf), labels=c("gained 4+ lbs", "gained 0-4 lbs ", "lost 0-4 lbs", "lost 4-8 lbs", "lost 8-12 lbs", "lost 12+ lbs"))
tally(~cuts, margins=TRUE)

# Calculate normal probabilities
# Weights ~ normal(mean, sd)
p1 <- pnorm(-4, mean=mean(weight), sd=sd(weight), lower.tail = TRUE)  #P(x < -4)
p2 <- pnorm(0, mean=mean(weight), sd=sd(weight))-pnorm(-4, mean=mean(weight), sd=sd(weight))
p3 <- pnorm(4, mean=mean(weight), sd=sd(weight))-pnorm(0, mean=mean(weight), sd=sd(weight))
p4 <- pnorm(8, mean=mean(weight), sd=sd(weight))-pnorm(4, mean=mean(weight), sd=sd(weight))
p5 <- pnorm(12, mean=mean(weight), sd=sd(weight))-pnorm(8, mean=mean(weight), sd=sd(weight))
p6 <- pnorm(12, mean=mean(weight), sd=sd(weight), lower.tail = FALSE) #P(X > 12)
p1+p2+p3+p4+p5+p6

# Store all the probabilities as "probs"
probs <- c(p1, p2, p3, p4, p5, p6)

# I'll store those frequencies in a vector called "weightbins"
# This code does it automatically
weightbins <- c(length(cuts[cuts=="gained 4+ lbs"]),
                length(cuts[cuts=="gained 0-4 lbs"]), 
                length(cuts[cuts=="lost 0-4 lbs"]),
                length(cuts[cuts=="lost 4-8 lbs"]),
                length(cuts[cuts=="lost 8-12 lbs"]),
                length(cuts[cuts=="lost 12+ lbs"]))

# Now I run the chi-squared test
chisq.test(weightbins, p = probs)

#  Next with randomization-based methods
chisq.test(weightbins, p = probs, simulate.p.value = TRUE, B=5000)


##################################################################
### Independence test
pain <- c(rep("reduced", 184), rep("not", 203), 
          rep("reduced", 171), rep("not", 216), 
          rep("reduced", 106), rep("not", 282))

treatment <- c(rep("verum", 387), 
               rep("sham", 387), 
               rep("traditional", 388))

acupuncture <- data.frame(treatment, pain)

tally(pain ~ treatment, data=acupuncture, margins=TRUE)

chisq.test(acupuncture$treatment, acupuncture$pain, correct=FALSE)

preducedsham = 171/387
preducedtraditional = 282/388
preducedverum = 203/387

preducedtraditional/preducedsham


##################################################################
### Race

race <- c(rep("white", 151), 
          rep("black", 63))
death <- c(rep("yes", 19), rep("no", 132), 
           rep("yes", 11), rep("no", 52))
deathpenalty <- data.frame(race, death)

tally(death ~ race, data=deathpenalty, margins=TRUE)

# P(11 or more black death | race has no impact)
help(phyper)
1 - phyper(10, 63, 151, 30, lower.tail=TRUE)

fisher.test(deathpenalty$race, deathpenalty$death, alternative="less")

chisq.test(deathpenalty$race, deathpenalty$death, correct=FALSE)