Load packages
library(mosaic)
library(dplyr)
library(ggplot2)
library(ggvis)
library(parallel)


Correlation: Malevolent Uniforms

## Load data 
NFL <- read.csv("http://www.bradthiessen.com/html5/data/NFLuniforms.csv")
head(NFL)
##       NFLTeam NFL_Malevolence ZPenYds
## 1  LA Raiders            5.10    1.19
## 2  Pittsburgh            5.00    0.48
## 3  Cincinnati            4.97    0.27
## 4 New Orleans            4.83    0.10
## 5     Chicago            4.68    0.29
## 6 Kansas City            4.58   -0.19
## Scatterplot
NFL %>% 
  ggvis(x=~NFL_Malevolence, y=~ZPenYds, fill := "steelblue", stroke:="steelblue", fillOpacity:=.6, strokeOpacity:=.6) %>%
  layer_points() %>%
  add_axis("x", title = "Malevolence") %>%
  scale_numeric("x", domain=c(2.5, 5.5), nice=FALSE, clamp=TRUE)  %>%
  add_axis("y", title = "Penalty yards (z-score)") %>%
  scale_numeric("y", domain=c(-2, 2), nice=FALSE, clamp=TRUE)

## Scatterplot with trend
NFL %>% 
  ggvis(x=~NFL_Malevolence, y=~ZPenYds, fill := "steelblue", stroke:="steelblue", fillOpacity:=.6, strokeOpacity:=.6) %>%
  layer_points() %>%
  layer_smooths() %>%
  add_axis("x", title = "Malevolence") %>%
  scale_numeric("x", domain=c(2.5, 5.5), nice=FALSE, clamp=TRUE)  %>%
  add_axis("y", title = "Penalty yards (z-score)") %>%
  scale_numeric("y", domain=c(-2, 2), nice=FALSE, clamp=TRUE)

## Calculate Pearson's r
cor(NFL_Malevolence, ZPenYds, data=NFL)
## [1] 0.429796
## Test correlation (null: r=0)
cor.test(NFL$NFL_Malevolence, NFL$ZPenYds, alternative="greater")
## 
##  Pearson's product-moment correlation
## 
## data:  NFL$NFL_Malevolence and NFL$ZPenYds
## t = 2.4272, df = 26, p-value = 0.01122
## alternative hypothesis: true correlation is greater than 0
## 95 percent confidence interval:
##  0.1299369 1.0000000
## sample estimates:
##      cor 
## 0.429796
## Calculate Spearman's rho
cor(NFL_Malevolence, ZPenYds, data=NFL, method="spearman")
## [1] 0.2634173
cor.test(NFL$NFL_Malevolence, NFL$ZPenYds, method="spearman", alternative="greater")
## Warning in cor.test.default(NFL$NFL_Malevolence, NFL$ZPenYds, method =
## "spearman", : Cannot compute exact p-value with ties
## 
##  Spearman's rank correlation rho
## 
## data:  NFL$NFL_Malevolence and NFL$ZPenYds
## S = 2691.473, p-value = 0.08781
## alternative hypothesis: true rho is greater than 0
## sample estimates:
##       rho 
## 0.2634173
## Calculate Kendall's tau
cor(NFL_Malevolence, ZPenYds, data=NFL, method="kendall")
## [1] 0.1861702
cor.test(NFL$NFL_Malevolence, NFL$ZPenYds, method="kendall", alternative="greater")
## Warning in cor.test.default(NFL$NFL_Malevolence, NFL$ZPenYds, method =
## "kendall", : Cannot compute exact p-value with ties
## 
##  Kendall's rank correlation tau
## 
## data:  NFL$NFL_Malevolence and NFL$ZPenYds
## z = 1.384, p-value = 0.08317
## alternative hypothesis: true tau is greater than 0
## sample estimates:
##       tau 
## 0.1861702
## Randomization-based test for r
test.stat <- cor(NFL_Malevolence, ZPenYds, data=NFL)
randomize <- do(10000) * cor(NFL_Malevolence, shuffle(ZPenYds), data=NFL)

histogram(~result, data = randomize, col="grey", xlim=c(-1, 1), xlab = "Randomization - Correlation", width=.01,
          main=paste("Observed r =", cor(NFL_Malevolence, ZPenYds, data=NFL)))

tally((~result >= test.stat), data=randomize, format="prop")
## 
##   TRUE  FALSE 
## 0.0101 0.9899
## Bootstrap estimate for r
boot.cor = function(x, y, n=5000, p=0.9, method="pearson"){ 
w = length(x); x.r = x; y.r = y 
sm = 1:w; cor.b = 1:n 
for (k in 1:n){ 
s = sample(sm, w, replace = T) 
for (i in 1:w) 
{ 
x.r[i] = x[s[i]] 
y.r[i] = y[s[i]] 
} 
cor.b[k] = cor(x.r, y.r, use = "pairwise", method) 
} 
cor.b = sort(cor.b); a = round(n*(1-p)/2,0); b = round(n*(p+1)/2,0) 
vec = c(cor.b[a+1], cor(x, y, use = "pairwise", method), cor.b[b-1]) 
n.r = c("value"); n.c = c("lower_bound", "correlation", "upper_bound") 
matrix(vec,1,3,dimnames = list(n.r,n.c)) 
}

boot.cor(NFL$NFL_Malevolence, NFL$ZPenYds)
##       lower_bound correlation upper_bound
## value 0.008809799    0.429796    0.685738
### Load a built-in dataset about Midwest states
data(midwest)
midwest <- midwest
names(midwest)
##  [1] "PID"                  "county"               "state"               
##  [4] "area"                 "poptotal"             "popdensity"          
##  [7] "popwhite"             "popblack"             "popamerindian"       
## [10] "popasian"             "popother"             "percwhite"           
## [13] "percblack"            "percamerindan"        "percasian"           
## [16] "percother"            "popadults"            "perchsd"             
## [19] "percollege"           "percprof"             "poppovertyknown"     
## [22] "percpovertyknown"     "percbelowpoverty"     "percchildbelowpovert"
## [25] "percadultpoverty"     "percelderlypoverty"   "inmetro"             
## [28] "category"
midwest %>% 
  ggvis(x=~percollege, y=~percbelowpoverty, fill := "steelblue", stroke:="steelblue", fillOpacity:=.1, strokeOpacity:=.3) %>%
  layer_points() %>%
  add_axis("x", title = "% attending college") %>%
  scale_numeric("x", domain=c(0, 50), nice=FALSE, clamp=TRUE)  %>%
  add_axis("y", title = "% below poverty") %>%
  scale_numeric("y", domain=c(0, 50), nice=FALSE, clamp=TRUE)

## We'll look at the correlation between
##### percbelowpoverty (% of population below poverty)
##### percollege (% of population who attended college)
## First let's calculate Pearson's r
cor(percbelowpoverty, percollege, data=midwest)
## [1] -0.2814064
## Now, let's run a test for that correlation
cor.test(midwest$percbelowpoverty, midwest$percollege, alternative="less")
## 
##  Pearson's product-moment correlation
## 
## data:  midwest$percbelowpoverty and midwest$percollege
## t = -6.1164, df = 435, p-value = 1.067e-09
## alternative hypothesis: true correlation is less than 0
## 95 percent confidence interval:
##  -1.0000000 -0.2072089
## sample estimates:
##        cor 
## -0.2814064
## We get a p-value of 0.000000001067
## Let's try a randomization-based test
test.stat <- cor(midwest$percbelowpoverty, midwest$percollege)
randomize <- do(10000) * cor(midwest$percbelowpoverty, shuffle(midwest$percollege))
histogram(~result, data = randomize, col="grey", xlim=c(-.5, .5), 
          xlab = "Randomization - Correlation", width=.01,
          main=paste("Observed r =", cor(midwest$percbelowpoverty, midwest$percollege)))

tally((~result <= test.stat), data=randomize, format="prop")
## 
##  TRUE FALSE 
##     0     1
## Let's try Spearman's rho
cor(midwest$percbelowpoverty, midwest$percollege, method="spearman")
## [1] -0.3334071
cor.test(midwest$percbelowpoverty, midwest$percollege, method="spearman", alternative="less")
## 
##  Spearman's rank correlation rho
## 
## data:  midwest$percbelowpoverty and midwest$percollege
## S = 18546140, p-value = 5.626e-13
## alternative hypothesis: true rho is less than 0
## sample estimates:
##        rho 
## -0.3334071
## ... and Kendall's tau
cor(midwest$percbelowpoverty, midwest$percollege, method="kendall")
## [1] -0.2334516
cor.test(midwest$percbelowpoverty, midwest$percollege, method="kendall", alternative="less")
## 
##  Kendall's rank correlation tau
## 
## data:  midwest$percbelowpoverty and midwest$percollege
## z = -7.2911, p-value = 1.537e-13
## alternative hypothesis: true tau is less than 0
## sample estimates:
##        tau 
## -0.2334516