Baseball Database


0) Load packages and set working directory
require(mosaic)
require(dplyr)
require(ggplot2)
require(RSQLite)
require(pitchRx)
## Can update pitchRx with devtools::install_github("cpsievert/pitchRx")
setwd("~/Desktop/R Sandbox/Baseball")
1) Create database with pitchRx data from a range of dates
## Create new database: db = MaxScherzer.sqlite3
db <- src_sqlite("MaxScherzer.sqlite3", create = TRUE)
## Scrape data from a range of dates
## Could also use Game ID with...
### GameData <- scrape(game.ids="gid_2011_05_07_detmlb_tormlb_1", suffix = "inning/inning_all.xml")
## This gets us 5 dataframes: atbat, action, pitch, po, runner
scrape(start = "2014-06-12", end = "2014-06-12", connect = db$con)
## http://gd2.mlb.com/components/game/mlb/year_2014/month_06/day_12/gid_2014_06_12_lanmlb_cinmlb_1/inning/inning_all.xml 
## http://gd2.mlb.com/components/game/mlb/year_2014/month_06/day_12/gid_2014_06_12_sdnmlb_phimlb_1/inning/inning_all.xml 
## http://gd2.mlb.com/components/game/mlb/year_2014/month_06/day_12/gid_2014_06_12_atlmlb_colmlb_1/inning/inning_all.xml 
## http://gd2.mlb.com/components/game/mlb/year_2014/month_06/day_12/gid_2014_06_12_wasmlb_sfnmlb_1/inning/inning_all.xml 
## http://gd2.mlb.com/components/game/mlb/year_2014/month_06/day_12/gid_2014_06_12_chnmlb_pitmlb_1/inning/inning_all.xml 
## http://gd2.mlb.com/components/game/mlb/year_2014/month_06/day_12/gid_2014_06_12_tormlb_balmlb_1/inning/inning_all.xml 
## http://gd2.mlb.com/components/game/mlb/year_2014/month_06/day_12/gid_2014_06_12_clemlb_bosmlb_1/inning/inning_all.xml 
## http://gd2.mlb.com/components/game/mlb/year_2014/month_06/day_12/gid_2014_06_12_milmlb_nynmlb_1/inning/inning_all.xml 
## http://gd2.mlb.com/components/game/mlb/year_2014/month_06/day_12/gid_2014_06_12_arimlb_houmlb_1/inning/inning_all.xml 
## http://gd2.mlb.com/components/game/mlb/year_2014/month_06/day_12/gid_2014_06_12_detmlb_chamlb_1/inning/inning_all.xml 
## http://gd2.mlb.com/components/game/mlb/year_2014/month_06/day_12/gid_2014_06_12_nyamlb_seamlb_1/inning/inning_all.xml
## NULL
2) Join atbat and pitch tables; select Max Scherzer; Query database
atbats <- tbl(db, 'atbat') %.%
  filter(date >= '2014_06_12' & date <= '2014_06_12') %.%
  filter(pitcher_name == 'Max Scherzer')
pitches <- tbl(db, 'pitch')  
MaxScherzerJune12 <- collect(inner_join(pitches, atbats, by = c('num', 'gameday_link')))
MaxS <- collect(MaxScherzerJune12)
Slower way to do step 2…
## Select the atbat dataframe
# atbats <- tbl(db, 'atbat')
## Select the pitch table
# pitches <- tbl(db, 'pitch')
## Filter pitches thrown by Max Scherzer
# max <- filter(atbats, pitcher_name == 'Max Scherzer')
## Query the database to load this data
# dbSendQuery(db$con, 'CREATE INDEX pitcher_idx ON atbat(pitcher_name)')
## Join the atbat table with the pitches table
# MaxScherzerJune12 <- inner_join(pitches, max, by = c('num', 'gameday_link'))
## Query the database
# dbSendQuery(db$con, 'CREATE INDEX pitch_idx ON pitch(gameday_link, num)')
## Collect the joined tables into a dataframe
# MaxS <- collect(MaxScherzerJune12)
3) Visualize pitches
## Density - all pitches
strikeFX(MaxS, geom="tile", layer=facet_grid(.~stand))
## Warning: Removed 16 rows containing non-finite values (stat_density2d).
## Warning: Removed 12 rows containing non-finite values (stat_density2d).

plot of chunk unnamed-chunk-5

## Just the called strikes
Maxstrikes <- subset(MaxS, des == "Called Strike")
strikeFX(Maxstrikes, geom="tile", layer=facet_grid(.~stand))

plot of chunk unnamed-chunk-5

## Just the swinging strikes
Maxswingstrikes <- subset(MaxS, des == "Swinging Strike")
strikeFX(Maxswingstrikes, geom="tile", layer=facet_grid(.~stand))

plot of chunk unnamed-chunk-5

## Just the balls
Maxballs <- subset(MaxS, des == "Ball")
strikeFX(Maxballs, geom="tile", layer=facet_grid(.~stand))
## Warning: Removed 4 rows containing non-finite values (stat_density2d).
## Warning: Removed 8 rows containing non-finite values (stat_density2d).

plot of chunk unnamed-chunk-5

## The probability of a strike based on location
noswing <- subset(MaxS, des %in% c("Ball", "Called Strike"))
noswing$strike <- as.numeric(noswing$des %in% "Called Strike")
require(mgcv)
m1 <- bam(strike ~ s(px, pz, by=factor(stand)) +
          factor(stand), data=noswing, family = binomial(link='logit'))
strikeFX(noswing, model=m1, layer=facet_grid(.~stand))

plot of chunk unnamed-chunk-5

## Animated pitches (averaged by pitch type)
library(animation)
saveHTML(
  animateFX(MaxS, avg.by = 'pitch_types', layer = list(theme_bw(), facet_grid(.~stand))),
  img.name = "MaxS"
)



Pitch data without creating database


0) Load packages and set working directory
require(mosaic)
require(dplyr)
require(ggplot2)
require(pitchRx)
require(plyr)
## Can update pitchRx with devtools::install_github("cpsievert/pitchRx")
setwd("~/Desktop/R Sandbox/Baseball")
1) Grab pitchRx data; join pitch/atbat tables; filter Justin Verlander
# Scrape data (can use GameID or range of dates)
GameData <- scrape(game.ids="gid_2011_05_07_detmlb_tormlb_1", suffix = "inning/inning_all.xml")
## http://gd2.mlb.com/components/game/mlb/year_2011/month_05/day_07/gid_2011_05_07_detmlb_tormlb_1/inning/inning_all.xml
# Combine pitch and at-bat data
pitchFX <- join(GameData$pitch, GameData$atbat, by = c("num", "url"), type = "inner")
# This creates a dataframe with 69 columns
names(pitchFX)
##  [1] "des"             "des_es"          "id"             
##  [4] "type"            "tfs"             "tfs_zulu"       
##  [7] "x"               "y"               "cc"             
## [10] "mt"              "url"             "inning_side"    
## [13] "inning"          "next_"           "num"            
## [16] "sv_id"           "start_speed"     "end_speed"      
## [19] "sz_top"          "sz_bot"          "pfx_x"          
## [22] "pfx_z"           "px"              "pz"             
## [25] "x0"              "y0"              "z0"             
## [28] "vx0"             "vy0"             "vz0"            
## [31] "ax"              "ay"              "az"             
## [34] "break_y"         "break_angle"     "break_length"   
## [37] "pitch_type"      "type_confidence" "zone"           
## [40] "nasty"           "spin_dir"        "spin_rate"      
## [43] "on_1b"           "on_2b"           "on_3b"          
## [46] "gameday_link"    "count"           "pitcher"        
## [49] "batter"          "b"               "s"              
## [52] "o"               "start_tfs"       "start_tfs_zulu" 
## [55] "stand"           "b_height"        "p_throws"       
## [58] "atbat_des"       "atbat_des_es"    "event"          
## [61] "inning_side"     "inning"          "next_"          
## [64] "score"           "home_team_runs"  "away_team_runs" 
## [67] "batter_name"     "pitcher_name"    "gameday_link"   
## [70] "date"
# Keep only the pitches thrown by Justin Verlander
pitches <- subset(pitchFX, pitcher_name == "Justin Verlander")
2) Visualize pitches
# Graph all the pitches thrown by JV
strikeFX(pitches, geom="tile", layer=facet_grid(.~stand))
## Warning: Removed 1 rows containing non-finite values (stat_density2d).
## Warning: Removed 2 rows containing non-finite values (stat_density2d).