Baseball Database


0) Load packages and set working directory
require(mosaic)
require(dplyr)
require(ggplot2)
require(RSQLite)
require(pitchRx)
## Can update pitchRx with devtools::install_github("cpsievert/pitchRx")
setwd("~/Desktop/R Sandbox/Baseball")
1) Create database with pitchRx data from a range of dates
## Create new database: db = MaxScherzer.sqlite3
db <- src_sqlite("MaxScherzer.sqlite3", create = TRUE)
## Scrape data from a range of dates
## Could also use Game ID with...
### GameData <- scrape(game.ids="gid_2011_05_07_detmlb_tormlb_1", suffix = "inning/inning_all.xml")
## This gets us 5 dataframes: atbat, action, pitch, po, runner
scrape(start = "2014-06-12", end = "2014-06-12", connect = db$con)
## http://gd2.mlb.com/components/game/mlb/year_2014/month_06/day_12/gid_2014_06_12_lanmlb_cinmlb_1/inning/inning_all.xml 
## http://gd2.mlb.com/components/game/mlb/year_2014/month_06/day_12/gid_2014_06_12_sdnmlb_phimlb_1/inning/inning_all.xml 
## http://gd2.mlb.com/components/game/mlb/year_2014/month_06/day_12/gid_2014_06_12_atlmlb_colmlb_1/inning/inning_all.xml 
## http://gd2.mlb.com/components/game/mlb/year_2014/month_06/day_12/gid_2014_06_12_wasmlb_sfnmlb_1/inning/inning_all.xml 
## http://gd2.mlb.com/components/game/mlb/year_2014/month_06/day_12/gid_2014_06_12_chnmlb_pitmlb_1/inning/inning_all.xml 
## http://gd2.mlb.com/components/game/mlb/year_2014/month_06/day_12/gid_2014_06_12_tormlb_balmlb_1/inning/inning_all.xml 
## http://gd2.mlb.com/components/game/mlb/year_2014/month_06/day_12/gid_2014_06_12_clemlb_bosmlb_1/inning/inning_all.xml 
## http://gd2.mlb.com/components/game/mlb/year_2014/month_06/day_12/gid_2014_06_12_milmlb_nynmlb_1/inning/inning_all.xml 
## http://gd2.mlb.com/components/game/mlb/year_2014/month_06/day_12/gid_2014_06_12_arimlb_houmlb_1/inning/inning_all.xml 
## http://gd2.mlb.com/components/game/mlb/year_2014/month_06/day_12/gid_2014_06_12_detmlb_chamlb_1/inning/inning_all.xml 
## http://gd2.mlb.com/components/game/mlb/year_2014/month_06/day_12/gid_2014_06_12_nyamlb_seamlb_1/inning/inning_all.xml
## NULL
2) Join atbat and pitch tables; select Max Scherzer; Query database
atbats <- tbl(db, 'atbat') %.%
  filter(date >= '2014_06_12' & date <= '2014_06_12') %.%
  filter(pitcher_name == 'Max Scherzer')
pitches <- tbl(db, 'pitch')  
MaxScherzerJune12 <- collect(inner_join(pitches, atbats, by = c('num', 'gameday_link')))
MaxS <- collect(MaxScherzerJune12)
Slower way to do step 2…
## Select the atbat dataframe
# atbats <- tbl(db, 'atbat')
## Select the pitch table
# pitches <- tbl(db, 'pitch')
## Filter pitches thrown by Max Scherzer
# max <- filter(atbats, pitcher_name == 'Max Scherzer')
## Query the database to load this data
# dbSendQuery(db$con, 'CREATE INDEX pitcher_idx ON atbat(pitcher_name)')
## Join the atbat table with the pitches table
# MaxScherzerJune12 <- inner_join(pitches, max, by = c('num', 'gameday_link'))
## Query the database
# dbSendQuery(db$con, 'CREATE INDEX pitch_idx ON pitch(gameday_link, num)')
## Collect the joined tables into a dataframe
# MaxS <- collect(MaxScherzerJune12)
3) Visualize pitches
## Density - all pitches
strikeFX(MaxS, geom="tile", layer=facet_grid(.~stand))
## Warning: Removed 16 rows containing non-finite values (stat_density2d).
## Warning: Removed 12 rows containing non-finite values (stat_density2d).

plot of chunk unnamed-chunk-5

## Just the called strikes
Maxstrikes <- subset(MaxS, des == "Called Strike")
strikeFX(Maxstrikes, geom="tile", layer=facet_grid(.~stand))

plot of chunk unnamed-chunk-5

## Just the swinging strikes
Maxswingstrikes <- subset(MaxS, des == "Swinging Strike")
strikeFX(Maxswingstrikes, geom="tile", layer=facet_grid(.~stand))

plot of chunk unnamed-chunk-5

## Just the balls
Maxballs <- subset(MaxS, des == "Ball")
strikeFX(Maxballs, geom="tile", layer=facet_grid(.~stand))
## Warning: Removed 4 rows containing non-finite values (stat_density2d).
## Warning: Removed 8 rows containing non-finite values (stat_density2d).

plot of chunk unnamed-chunk-5

## The probability of a strike based on location
noswing <- subset(MaxS, des %in% c("Ball", "Called Strike"))
noswing$strike <- as.numeric(noswing$des %in% "Called Strike")
require(mgcv)
m1 <- bam(strike ~ s(px, pz, by=factor(stand)) +
          factor(stand), data=noswing, family = binomial(link='logit'))
strikeFX(noswing, model=m1, layer=facet_grid(.~stand))

plot of chunk unnamed-chunk-5

## Animated pitches (averaged by pitch type)
library(animation)
saveHTML(
  animateFX(MaxS, avg.by = 'pitch_types', layer = list(theme_bw(), facet_grid(.~stand))),
  img.name = "MaxS"
)



Pitch data without creating database


0) Load packages and set working directory
require(mosaic)
require(dplyr)
require(ggplot2)
require(pitchRx)
require(plyr)
## Can update pitchRx with devtools::install_github("cpsievert/pitchRx")
setwd("~/Desktop/R Sandbox/Baseball")
1) Grab pitchRx data; join pitch/atbat tables; filter Justin Verlander
# Scrape data (can use GameID or range of dates)
GameData <- scrape(game.ids="gid_2011_05_07_detmlb_tormlb_1", suffix = "inning/inning_all.xml")
## http://gd2.mlb.com/components/game/mlb/year_2011/month_05/day_07/gid_2011_05_07_detmlb_tormlb_1/inning/inning_all.xml
# Combine pitch and at-bat data
pitchFX <- join(GameData$pitch, GameData$atbat, by = c("num", "url"), type = "inner")
# This creates a dataframe with 69 columns
names(pitchFX)
##  [1] "des"             "des_es"          "id"             
##  [4] "type"            "tfs"             "tfs_zulu"       
##  [7] "x"               "y"               "cc"             
## [10] "mt"              "url"             "inning_side"    
## [13] "inning"          "next_"           "num"            
## [16] "sv_id"           "start_speed"     "end_speed"      
## [19] "sz_top"          "sz_bot"          "pfx_x"          
## [22] "pfx_z"           "px"              "pz"             
## [25] "x0"              "y0"              "z0"             
## [28] "vx0"             "vy0"             "vz0"            
## [31] "ax"              "ay"              "az"             
## [34] "break_y"         "break_angle"     "break_length"   
## [37] "pitch_type"      "type_confidence" "zone"           
## [40] "nasty"           "spin_dir"        "spin_rate"      
## [43] "on_1b"           "on_2b"           "on_3b"          
## [46] "gameday_link"    "count"           "pitcher"        
## [49] "batter"          "b"               "s"              
## [52] "o"               "start_tfs"       "start_tfs_zulu" 
## [55] "stand"           "b_height"        "p_throws"       
## [58] "atbat_des"       "atbat_des_es"    "event"          
## [61] "inning_side"     "inning"          "next_"          
## [64] "score"           "home_team_runs"  "away_team_runs" 
## [67] "batter_name"     "pitcher_name"    "gameday_link"   
## [70] "date"
# Keep only the pitches thrown by Justin Verlander
pitches <- subset(pitchFX, pitcher_name == "Justin Verlander")
2) Visualize pitches
# Graph all the pitches thrown by JV
strikeFX(pitches, geom="tile", layer=facet_grid(.~stand))
## Warning: Removed 1 rows containing non-finite values (stat_density2d).
## Warning: Removed 2 rows containing non-finite values (stat_density2d).

plot of chunk unnamed-chunk-8

The above graph shows the location of all the pitches Justin Verlander threw that day (to right- and left-handed batters). Let’s look at just the called strikes:

strikes <- subset(pitches, des == "Called Strike")
strikeFX(strikes, geom="tile", layer=facet_grid(.~stand))

plot of chunk unnamed-chunk-9

… and the swinging strikes:

swingstrikes <- subset(pitches, des == "Swinging Strike")
strikeFX(swingstrikes, geom="tile", layer=facet_grid(.~stand))

plot of chunk unnamed-chunk-10

… and the balls:

balls <- subset(pitches, des == "Ball")
strikeFX(balls, geom="tile", layer=facet_grid(.~stand))
## Warning: Removed 1 rows containing non-finite values (stat_density2d).
## Warning: Removed 2 rows containing non-finite values (stat_density2d).

plot of chunk unnamed-chunk-11

Finally, here’s an animation of all the pitches from Justin Verlander during this game:

animateFX(pitches, layer=list(facet_grid(pitcher_name~stand, labeller = label_both), theme_bw(), coord_equal()))

require(animation)
ani.options(interval = 0.05)
saveHTML({animateFX(pitches, layer=list(facet_grid(pitcher_name~stand, labeller = label_both), theme_bw(), coord_equal()))}, img.name="JVpitches")

Animated GIF



Fielding data and openWAR


# The Sxslt package is required in order to download new game data from MLBAM
install.packages("Sxslt", repos = "http://www.omegahat.org/R", type = "source")
# Install openWAR package
require(devtools)
install_github("openWAR", "beanumber")
# Load package
require(openWAR)

## Get GameIDs for a particular date
getGameIds(date=as.Date("2014-06-12"))
## "gid_2014_06_12_detmlb_chamlb_1" is a DET vs CLE game

## Load data
data(MLBAM2013)

## Get MLBAM player ID for a particular player
playerNames <- unique(MLBAM2013$batterName)
playerNames[grep("Jackson", playerNames)]
## Austin Jackson is "Jackson, A"
jacksonId = unique(subset(MLBAM2013, batterName == "Jackson, A")$batterId)
jackson = subset(MLBAM2013, playerId.CF == jacksonId)

## Calculate the number of flyballs caught by Jackson in CF in 2013
require(mosaic)
head(sort(tally(~event, data=subset(jackson, fielderId == playerId.CF)), decreasing=TRUE))

## Other statistics, like
#### groundout-to-air out ratio of the Tigers;
#### batting average on balls in play of their opponents.
require(plyr)
ddply(jackson, ~playerId.CF, summarise,
      N = length(playerId.CF), G = length(unique(gameId)),
      BIP = sum(isBIP), PO = sum(fielderId == playerId.CF, na.rm=TRUE), 
      "GO/AO" = length(grep("Ground", event)) / length(grep("Fly", event)), 
      BABIP = sum(isHit) / sum(isAB) 
      )

## Visualize Jackson's catches on the field
plot(subset(jackson, fielderId == playerId.CF))

## Hexbin plot
require(hexbin)
my.colors <- function (n) {
  rev(heat.colors(n))
}
hexbinplot(our.y ~ our.x, data=subset(jackson, fielderId == playerId.CF), xbins = 10
         , panel = function(x,y, ...) {
           panel.baseball()
           panel.hexbinplot(x,y,  ...)
         }
       , xlim = c(-350, 350), ylim = c(-20, 525)
       , xlab = "Horizontal Distance from Home Plate (ft.)"
       , ylab = "Vertical Distance from Home Plate (ft.)"
       , colramp = my.colors, colorcut = seq(0, 1, length = 10)
  )

## Comparing Jackson to all other CF players
hexbinplot(our.y ~ our.x, data=subset(MLBAM2013, fielderId == playerId.CF), xbins = 50
         , panel = function(x,y, ...) {
           panel.baseball()
           panel.hexbinplot(x,y,  ...)
         }
       , xlim = c(-350, 350), ylim = c(-20, 525)
       , xlab = "Horizontal Distance from Home Plate (ft.)"
       , ylab = "Vertical Distance from Home Plate (ft.)"
       , colramp = my.colors, colorcut = seq(0, 1, length = 10)
  )

## Compare Jackson to a selected group of players
key = unique(subset(MLBAM2013, batterName %in% c("Trout", "Upton, B", "Jackson, A", "Ellsbury"), select=c("batterId", "batterName")))
comp = subset(MLBAM2013, playerId.CF %in% key$batterId & fielderId == playerId.CF)
 
hexbinplot(our.y ~ our.x | as.factor(playerId.CF), data=comp, xbins = 10
         , panel = function(x,y, ...) {
           panel.baseball()
           panel.hexbinplot(x,y,  ...)
         }
       , xlim = c(-350, 350), ylim = c(-20, 525)
       , xlab = "Horizontal Distance from Home Plate (ft.)"
       , ylab = "Vertical Distance from Home Plate (ft.)"
       , colramp = my.colors, colorcut = seq(0, 1, length = 10)
       , strip = strip.custom(factor.levels = as.character(key$batterName))
  )


************************************** MARCEL: http://www.tangotiger.net/marcel/ **************************************

require(Lahman)
## We'll calculate MARCEL for 2004
pred.year = 2004

## Collect and aggregate data for 2001-03 (adding plate appearances)
library(dplyr)
data(Batting)
A <- Batting %.%
  group_by(playerID, yearID) %.%
  filter(yearID >= pred.year - 3 & yearID < pred.year) %.%
  select(playerID, yearID, AB, R, H, X2B, X3B, HR, RBI, SB, CS, BB, SO, IBB, HBP, SH, SF, GIDP) %.%
  mutate(PA = AB + BB + HBP + SF + SH) %.%
  summarize(PA = sum(PA), AB = sum(AB), R = sum(R),
            H = sum(H), X2B = sum(X2B), X3B = sum(X3B), HR = sum(HR),
            RBI = sum(RBI), SB = sum(SB), CS = sum(CS), BB = sum(BB),
            SO = sum(SO), IBB = sum(IBB), HBP = sum(HBP), SH = sum(SH),
            SF = sum(SF), GIDP = sum(GIDP))

## We could also use plyr for this
# B = subset(Batting, yearID >= pred.year - 3 & yearID < pred.year)
# B = transform(B, PA = AB + BB + HBP + SF + SH)
# require(plyr)
# stats = c("PA", "AB", "R", "H", "X2B", "X3B", "HR", "RBI", "SB",
#           "CS", "BB", "SO", "IBB", "HBP", "SH", "SF", "GIDP")
# B = ddply(B[, c("playerID", "yearID", stats)], ~playerID + yearID, summarise
#            , PA = sum(PA), AB = sum(AB), R = sum(R)
#            , H = sum(H), X2B = sum(X2B), X3B = sum(X3B), HR = sum(HR)
#            , RBI = sum(RBI), SB = sum(SB), CS = sum(CS), BB = sum(BB)
#            , SO = sum(SO), IBB = sum(IBB), HBP = sum(HBP), SH = sum(SH)
#            , SF = sum(SF), GIDP = sum(GIDP))

## Carlos Beltran home run projection
## Get HR and PA totals 
x = subset(A, playerID == "beltrca01")$HR
x
## Plate appearances
n = subset(A, playerID == "beltrca01")$PA
n
## Using dplyr results in data.frames
# x <- A %.%
#      filter(playerID == "beltrca01"); q$HR
# n <- A %.%
#     filter(playerID == "beltrca01"); q$PA

## MARCEL weighs statistics over time
## The season immediately preceding the current one is given
## a weight of 5; the season before that=4; the season two before=3. 
## Create a vector, t, with these weights
A$t = with(A, ifelse(pred.year - yearID == 1, 5, ifelse(pred.year - yearID == 2, 4, 3)))
## Beltran’s time-weighted home run totals is an inner product
t = subset(A, playerID == "beltrca01")$t
t
t %*% x

## remove pitchers from before we compute the league averages
P = subset(Pitching, yearID >= pred.year - 3 & yearID < pred.year)
A.pos = subset(merge(x=A, y=P[, c("playerID", "yearID", "BFP")], by=c("playerID", "yearID"), all.x=TRUE), PA > BFP | is.na(BFP))

## League averages are computed by simply summing over all the rows
p0 <- A.pos %.%
  group_by(yearID) %.%
  summarize(lgPA = sum(PA), lgAB = sum(AB)/sum(PA), lgR = sum(R)/sum(PA),
            lgH = sum(H)/sum(PA), lgX2B = sum(X2B)/sum(PA), lgX3B = sum(X3B)/sum(PA),
            lgHR = sum(HR)/sum(PA), lgRBI = sum(RBI)/sum(PA), lgSB = sum(SB)/sum(PA), 
            lgCS = sum(CS)/sum(PA), lgBB = sum(BB)/sum(PA), lgSO = sum(SO)/sum(PA), 
            lgIBB = sum(IBB)/sum(PA), lgHBP = sum(HBP)/sum(PA), lgSH = sum(SH)/sum(PA),
            lgSF = sum(SF)/sum(PA), lgGIDP = sum(GIDP)/sum(PA))
p0[, c("yearID", "lgHR")]

## Using plyr
# p0 = ddply(B.pos, ~yearID, summarise, lgPA = sum(PA), lgAB = sum(AB)/sum(PA), lgR = sum(R)/sum(PA)
#            , lgH = sum(H)/sum(PA), lgX2B = sum(X2B)/sum(PA), lgX3B = sum(X3B)/sum(PA), lgHR = sum(HR)/sum(PA)
#            , lgRBI = sum(RBI)/sum(PA), lgSB = sum(SB)/sum(PA), lgCS = sum(CS)/sum(PA), lgBB = sum(BB)/sum(PA)
#            , lgSO = sum(SO)/sum(PA), lgIBB = sum(IBB)/sum(PA), lgHBP = sum(HBP)/sum(PA), lgSH = sum(SH)/sum(PA)
#            , lgSF = sum(SF)/sum(PA), lgGIDP = sum(GIDP)/sum(PA))
# p0[, c("yearID", "lgHR")]

## Calculating MARCEL
# Merge league averages with player statistics
M = merge(x=A, y=p0, by="yearID")
# List of statistics to get projected
stats.lg = paste("lg", stats, sep="")
# X is a matrix of statistics
X = M[, stats]
# P0 is a matrix of league average statistics
P0 = M[, stats.lg]
# Carry out the calculations
t.X = M$t * X
t.n = M$t * M$PA
t.n0 = M$t * 100
t.P0 = M$t * M$PA * P0
# add Tango’s estimate for each player’s plate appearance projection
mPA = with(M, ifelse(pred.year - yearID == 1, 0.5 * PA, ifelse(pred.year - yearID == 2, 0.1 * PA, 200)))
# add these to our data frame and aggregate it by player
Q = cbind(M[, c("playerID", "yearID")], t.n, t.X, t.n0, t.P0, mPA)

## Using plyr
res = ddply(Q, ~playerID, summarise, numSeasons = length(t.n), reliability = sum(t.n)/(sum(t.n) + sum(t.n0))
            , tn = sum(t.n)
            , PA = sum(PA), lgPA = sum(lgPA), mPA = sum(mPA)
            , AB = sum(AB), lgAB = sum(lgAB), R = sum(R), lgR = sum(lgR)
            , H = sum(H), lgH = sum(lgH), X2B = sum(X2B), lgX2B = sum(lgX2B)
            , X3B = sum(X3B), lgX3B = sum(lgX3B), HR = sum(HR),  lgHR = sum(lgHR)
            , RBI = sum(RBI), lgRBI = sum(lgRBI), SB = sum(SB),  lgSB = sum(lgSB)
            , CS = sum(CS), lgCS = sum(lgCS), BB = sum(BB),  lgBB = sum(lgBB)
            , SO = sum(SO), lgSO = sum(lgSO), IBB = sum(IBB),  lgIBB = sum(lgIBB)
            , HBP = sum(HBP), lgHBP = sum(lgHBP), SH = sum(SH),  lgSH = sum(lgSH)
            , SF = sum(SF), lgSF = sum(lgSF), GIDP = sum(GIDP),  lgGIDP = sum(lgGIDP))


## Using dplyr
# res <- Q %.%
#  group_by(playerID) %.%
#  mutate(numSeasons = length(t.n), reliability = sum(t.n)/(sum(t.n) + sum(t.n0))) %.%
#  summarize(tn = sum(t.n)
#            , PA = sum(PA), lgPA = sum(lgPA), mPA = sum(mPA)
#            , AB = sum(AB), lgAB = sum(lgAB), R = sum(R), lgR = sum(lgR)
#            , H = sum(H), lgH = sum(lgH), X2B = sum(X2B), lgX2B = sum(lgX2B)
#            , X3B = sum(X3B), lgX3B = sum(lgX3B), HR = sum(HR),  lgHR = sum(lgHR)
#            , RBI = sum(RBI), lgRBI = sum(lgRBI), SB = sum(SB),  lgSB = sum(lgSB)
#            , CS = sum(CS), lgCS = sum(lgCS), BB = sum(BB),  lgBB = sum(lgBB)
#            , SO = sum(SO), lgSO = sum(lgSO), IBB = sum(IBB),  lgIBB = sum(lgIBB)
#            , HBP = sum(HBP), lgHBP = sum(lgHBP), SH = sum(SH),  lgSH = sum(lgSH)
#            , SF = sum(SF), lgSF = sum(lgSF), GIDP = sum(GIDP),  lgGIDP = sum(lgGIDP))



## The MARCEL projectiions are then the weighted averages
## of the player’s stats and the league averages,
## with the weights given by the reliability
stats.proj = setdiff(stats, "PA")
stats.m = paste("m", stats.proj, sep="")
stats.lg.proj = paste("lg", stats.proj, sep="")
res[, stats.m] = with(res, (reliability * res[, stats.proj]) / tn + (1 - reliability) * res[, stats.lg.proj] / tn)

subset(res, playerID == "beltrca01", select=c("reliability", "tn", "mPA", "HR", "lgHR", "mHR"))
res = merge(x=res, y=Master[,c("playerID", "birthYear")], by="playerID")
res = transform(res, age = pred.year - birthYear)
res$age.adj = with(res, ifelse(age > 29, 0.003 * (age - 29), 0.006 * (age - 29)))
res[, stats.m] = res[, stats.m] * (1 + res$age.adj)
subset(res, playerID == "beltrca01", select=c("reliability", "age.adj", "tn", "mPA", "HR", "lgHR", "mHR"))


************************************** Top batting averages over time **************************************

### Source:  http://lahman.r-forge.r-project.org/doc/Batting.html
## Create a plot of top batting averages over time
# Loads a baseball dataset
library(Lahman)
# Loads batting data
data(Batting)
# Loads a package that helps manipulate/reshape data
require('plyr')
# Look at the first several rows of data
head(Batting)
##    playerID yearID stint teamID lgID  G G_batting AB R H X2B X3B HR RBI SB
## 1 aardsda01   2004     1    SFN   NL 11        11  0 0 0   0   0  0   0  0
## 2 aardsda01   2006     1    CHN   NL 45        43  2 0 0   0   0  0   0  0
## 3 aardsda01   2007     1    CHA   AL 25         2  0 0 0   0   0  0   0  0
## 4 aardsda01   2008     1    BOS   AL 47         5  1 0 0   0   0  0   0  0
## 5 aardsda01   2009     1    SEA   AL 73         3  0 0 0   0   0  0   0  0
## 6 aardsda01   2010     1    SEA   AL 53         4  0 0 0   0   0  0   0  0
##   CS BB SO IBB HBP SH SF GIDP G_old
## 1  0  0  0   0   0  0  0    0    11
## 2  0  0  0   0   0  1  0    0    45
## 3  0  0  0   0   0  0  0    0     2
## 4  0  0  1   0   0  0  0    0     5
## 5  0  0  0   0   0  0  0    0    NA
## 6  0  0  0   0   0  0  0    0    NA
# calculate batting average and other stats
batting <- battingStats()

# add salary to Batting data; need to match by player, year and team
batting <- merge(batting, 
                 Salaries[,c("playerID", "yearID", "teamID", "salary")], 
                 by=c("playerID", "yearID", "teamID"), all.x=TRUE)
# Add name, age and bat hand information:
masterInfo <- Master[, c('playerID', 'birthYear', 'birthMonth',
                         'nameLast', 'nameFirst', 'bats')]
batting <- merge(batting, masterInfo, all.x = TRUE)
batting$age <- with(batting, yearID - birthYear -
                      ifelse(birthMonth < 10, 0, 1))
batting <- arrange(batting, playerID, yearID, stint)

## Generate a plot of batting average over time
# Restrict the pool of eligible players to the years after 1899 and
# players with a minimum of 450 plate appearances (this covers the
# strike year of 1994 when Tony Gwynn hit .394 before play was suspended
# for the season - in a normal year, the minimum number of plate appearances is 502)
eligibleHitters <- subset(batting, yearID >= 1900 & PA > 450)

# Find the hitters with the highest BA in MLB each year (there are a
# few ties).  Include all players with BA > .400
topHitters <- ddply(eligibleHitters, .(yearID), subset, (BA == max(BA))|BA > .400)

# Create a factor variable to distinguish the .400 hitters
topHitters$ba400 <- with(topHitters, BA >= 0.400)

# Sub-data frame for the .400 hitters plus the outliers after 1950
# (averages above .380) - used to produce labels in the plot below
bignames <- rbind(subset(topHitters, ba400),
                  subset(topHitters, yearID > 1950 & BA > 0.380))
# Cut to the relevant set of variables
bignames <- subset(bignames, select = c('playerID', 'yearID', 'nameLast',
                                        'nameFirst', 'BA'))

# Ditto for the original data frame
topHitters <- subset(topHitters, select = c('playerID', 'yearID', 'BA', 'ba400'))

# Positional offsets to spread out certain labels
#                     NL TC JJ TC GS TC RH GS HH RH RH BT TW TW  RC GB TG 
bignames$xoffset <- c(0, 0, 0, 0, 0, 0, 0, 0, -8, 0, 3, 3, 0, 0, -2, 0, 0)
bignames$yoffset <- c(0, 0, -0.003, 0, 0, 0, 0, 0, -0.004, 0, 0, 0, 0, 0, -0.003, 0, 0)  +  0.002

# Load package for creating visualizations
require('ggplot2')     
# Create the visualization
ggplot(topHitters, aes(x = yearID, y = BA)) +
  geom_point(aes(colour = ba400), size = 2.5) +
  geom_hline(yintercept = 0.400, size = 1) +
  geom_text(data = bignames, aes(x = yearID + xoffset, y = BA + yoffset,
                                 label = nameLast), size = 3) +
  scale_colour_manual(values = c('FALSE' = 'black', 'TRUE' = 'red')) +
  ylim(0.330, 0.430) +
  xlab('Year') +
  scale_y_continuous('Batting average',
                     breaks = seq(0.34, 0.42, by = 0.02),
                     labels = c('.340', '.360', '.380', '.400', '.420')) +
  geom_smooth() +
  theme(legend.position = 'none')

plot of chunk unnamed-chunk-15


************************************** Homerun Trend **************************************

### Source: http://lahman.r-forge.r-project.org/doc/Batting.html

# Total home runs by year
totalHR <- ddply(Batting, .(yearID), summarise,
                 HomeRuns = sum(as.numeric(HR), na.rm=TRUE),
                 Games = sum(as.numeric(G_batting), na.rm=TRUE),
                 HRperGame = HomeRuns/Games
)
totalHR <- totalHR[ which(totalHR$Games>0), ]

# Quick look at the data
head(totalHR)
##   yearID HomeRuns Games HRperGame
## 1   1871       47  2296  0.020470
## 2   1872       35  3307  0.010584
## 3   1873       46  3603  0.012767
## 4   1874       40  4199  0.009526
## 5   1875       40  6249  0.006401
## 6   1876       40  4696  0.008518
# Plot trend (total homeruns / total games played)
# Add lowess smoothed line to see trend
ggplot(totalHR, aes(x=yearID, y=HRperGame)) +
  geom_point(shape=1, alpha=0.8) +    # Use hollow circles
  geom_smooth(alpha=0.3)            # Add a loess smoothed fit curve with confidence region

plot of chunk unnamed-chunk-16


************************************** dplyr demonstrations **************************************

### Lists datasets available in the Lahman package
data(package="Lahman")
# Loads Batting data
data(Batting)
# Gets dimensions of Batting data
dim(Batting)
## [1] 96600    24

This Batting dataset has 96,600 rows and 24 columns. Let’s get an idea of what those rows and columns represent:

head(Batting, 8)
##    playerID yearID stint teamID lgID   G G_batting  AB  R   H X2B X3B HR
## 1 aardsda01   2004     1    SFN   NL  11        11   0  0   0   0   0  0
## 2 aardsda01   2006     1    CHN   NL  45        43   2  0   0   0   0  0
## 3 aardsda01   2007     1    CHA   AL  25         2   0  0   0   0   0  0
## 4 aardsda01   2008     1    BOS   AL  47         5   1  0   0   0   0  0
## 5 aardsda01   2009     1    SEA   AL  73         3   0  0   0   0   0  0
## 6 aardsda01   2010     1    SEA   AL  53         4   0  0   0   0   0  0
## 7 aardsda01   2012     1    NYA   AL   1        NA  NA NA  NA  NA  NA NA
## 8 aaronha01   1954     1    ML1   NL 122       122 468 58 131  27   6 13
##   RBI SB CS BB SO IBB HBP SH SF GIDP G_old
## 1   0  0  0  0  0   0   0  0  0    0    11
## 2   0  0  0  0  0   0   0  1  0    0    45
## 3   0  0  0  0  0   0   0  0  0    0     2
## 4   0  0  0  0  1   0   0  0  0    0     5
## 5   0  0  0  0  0   0   0  0  0    0    NA
## 6   0  0  0  0  0   0   0  0  0    0    NA
## 7  NA NA NA NA NA  NA  NA NA NA   NA    NA
## 8  69  2  2 28 39  NA   3  6  4   13   122

Each row represents the batting statistics for a baseball player in a single year playing for a single team. So, for example, if a player played for 2 teams in a single season, that player would have two rows (one for each “stint”).

From the header displayed above, we can see the player “aardsda01” has data from the 2004-2012 seasons (where he played for San Francisco, both Chicago teams, Boston, Seattle, and the Yankees).

Suppose I want to know which 5 players in MLB history have batted in the most games. To do this, I need to add up the games played (G) for all the rows for each player.

The dplyr package allows us to do these kind of data manipulations easily. We want to:

To do this step-by-step, we would use:

## Group rows of data by playerID
players <- group_by(Batting, playerID)
## Summarize the groups by taking the sum of games played
games <- summarize(players, total = sum(G))
## Arrange the data by games played (in descending order) and list the top 5
head(arrange(games, desc(total)), 5)
##     total
## 1 4988101

Those commands took a small fraction of a second to complete. From the output, we can see the top five players with most games played are: Pete Rose, Carl Yastrzemski, Hank Aaron, Ricky Henderson, and Ty Cobb.

We could have also done this manipulation by chaining operations together with the %.% operator:

Batting %.%
  group_by(playerID) %.%
  summarize(total = sum(G)) %.%
  arrange(desc(total)) %.%
  head(5)
##     total
## 1 4988101

You can think of %.% as an arrow pointing to the right –>. It tells the computer to move on to the next command in the pipeline.

Let’s see some other manipulation commands we may want to use:

Select columns (variables) of interest

This Batting dataset has 24 variables. Suppose we’re only interested in 14 of those columns: playerID, yearID, teamID, at-bats, hits, doubles, triples, homeruns, stolen bases, number of times caught stealing, strikeouts, walks, hit-by-pitch numbers, and sacrifice flies.

We can select a subset of columns from our dataset with the select command:

Batting %.%
  select(playerID, yearID, teamID, AB, H, X2B, X3B, HR, SB, CS, SO, BB, HBP, SF) %.%
  head(10)
##     playerID yearID teamID  AB   H X2B X3B HR SB CS SO BB HBP SF
## 1  aardsda01   2004    SFN   0   0   0   0  0  0  0  0  0   0  0
## 2  aardsda01   2006    CHN   2   0   0   0  0  0  0  0  0   0  0
## 3  aardsda01   2007    CHA   0   0   0   0  0  0  0  0  0   0  0
## 4  aardsda01   2008    BOS   1   0   0   0  0  0  0  1  0   0  0
## 5  aardsda01   2009    SEA   0   0   0   0  0  0  0  0  0   0  0
## 6  aardsda01   2010    SEA   0   0   0   0  0  0  0  0  0   0  0
## 7  aardsda01   2012    NYA  NA  NA  NA  NA NA NA NA NA NA  NA NA
## 8  aaronha01   1954    ML1 468 131  27   6 13  2  2 39 28   3  4
## 9  aaronha01   1955    ML1 602 189  37   9 27  3  1 61 49   3  4
## 10 aaronha01   1956    ML1 609 200  34  14 26  2  4 54 37   2  7

As you can see, our dataset now has these 14 columns. Suppose we wanted to add some new columns:

  • Batting average = hits / at-bats

  • On-base percentage = (hits + walks + hit-by-pitch) / (at-bats + walks + hbp + sacrifice flies)

  • Slugging percentage = (singles + 2doubles + 3triples + 4HRs) / (at-bats)

It’s easy to add columns using the mutate command in dplyr:

## Add new columns (BA, OBP, SLG)
Batting %.%
  select(playerID, yearID, teamID, AB, H, X2B, X3B, HR, SB, CS, SO, BB, HBP, SF) %.%
  mutate(Avg = H/AB,
         OBP = (H + BB + HBP)/(AB+BB+HBP+SF),
         SLG = (((H-X2B-X3B-HR)+(2*X2B)+(3*X3B)+(4*HR))/AB)) %.%
  head(12)
##     playerID yearID teamID  AB   H X2B X3B HR SB CS SO BB HBP SF    Avg
## 1  aardsda01   2004    SFN   0   0   0   0  0  0  0  0  0   0  0    NaN
## 2  aardsda01   2006    CHN   2   0   0   0  0  0  0  0  0   0  0 0.0000
## 3  aardsda01   2007    CHA   0   0   0   0  0  0  0  0  0   0  0    NaN
## 4  aardsda01   2008    BOS   1   0   0   0  0  0  0  1  0   0  0 0.0000
## 5  aardsda01   2009    SEA   0   0   0   0  0  0  0  0  0   0  0    NaN
## 6  aardsda01   2010    SEA   0   0   0   0  0  0  0  0  0   0  0    NaN
## 7  aardsda01   2012    NYA  NA  NA  NA  NA NA NA NA NA NA  NA NA     NA
## 8  aaronha01   1954    ML1 468 131  27   6 13  2  2 39 28   3  4 0.2799
## 9  aaronha01   1955    ML1 602 189  37   9 27  3  1 61 49   3  4 0.3140
## 10 aaronha01   1956    ML1 609 200  34  14 26  2  4 54 37   2  7 0.3284
## 11 aaronha01   1957    ML1 615 198  27   6 44  1  1 58 57   0  3 0.3220
## 12 aaronha01   1958    ML1 601 196  34   4 30  4  1 49 59   1  3 0.3261
##       OBP    SLG
## 1     NaN    NaN
## 2  0.0000 0.0000
## 3     NaN    NaN
## 4  0.0000 0.0000
## 5     NaN    NaN
## 6     NaN    NaN
## 7      NA     NA
## 8  0.3221 0.4466
## 9  0.3663 0.5399
## 10 0.3649 0.5583
## 11 0.3778 0.6000
## 12 0.3855 0.5458

As you can see, the command worked for rows 8-12. We can see, for example, that Hank Aaron (aaronha01) batted .279 in 1954. We can also see the command gave us “NaN” and “NA” in several of those rows.

The NA represents missing data, while NaN means “not a number.” We got NaN results when we tried to divide by zero in our calculations. For example, the first row lists a player with no at-bats. When we tried to calculate a batting average for that row, the calculation was 0/0 = NaN. To get around this problem, we can filter the dataset to include only players who have at least one at-bat in a season:

## Add new columns (BA, OBP, SLG)
Batting %.%
  select(playerID, yearID, teamID, AB, H, X2B, X3B, HR, SB, CS, SO, BB, HBP, SF) %.%
  filter(AB>0) %.%
  mutate(Avg = H/AB,
         OBP = (H + BB + HBP)/(AB+BB+HBP+SF),
         SLG = (((H-X2B-X3B-HR)+(2*X2B)+(3*X3B)+(4*HR))/AB)) %.%
  head(5)
##    playerID yearID teamID  AB   H X2B X3B HR SB CS SO BB HBP SF    Avg
## 1 aardsda01   2006    CHN   2   0   0   0  0  0  0  0  0   0  0 0.0000
## 2 aardsda01   2008    BOS   1   0   0   0  0  0  0  1  0   0  0 0.0000
## 3 aaronha01   1954    ML1 468 131  27   6 13  2  2 39 28   3  4 0.2799
## 4 aaronha01   1955    ML1 602 189  37   9 27  3  1 61 49   3  4 0.3140
## 5 aaronha01   1956    ML1 609 200  34  14 26  2  4 54 37   2  7 0.3284
##      OBP    SLG
## 1 0.0000 0.0000
## 2 0.0000 0.0000
## 3 0.3221 0.4466
## 4 0.3663 0.5399
## 5 0.3649 0.5583

Let’s find the top 10 season slugging percentages in MLB history:

## Add new columns (BA, OBP, SLG)
Batting %.%
  select(playerID, yearID, teamID, AB, H, X2B, X3B, HR, SB, CS, SO, BB, HBP, SF) %.%
  filter(AB>0) %.%
  mutate(Avg = H/AB,
         OBP = (H + BB + HBP)/(AB+BB+HBP+SF),
         SLG = (((H-X2B-X3B-HR)+(2*X2B)+(3*X3B)+(4*HR))/AB)) %.%
  arrange(desc(SLG)) %.%
  head(11)
##     playerID yearID teamID AB H X2B X3B HR SB CS SO BB HBP SF Avg OBP SLG
## 1  chacigu01   2010    HOU  1 1   0   0  1  0  0  0  0   0  0   1   1   4
## 2  hernafe02   2008    SEA  1 1   0   0  1  0  0  0  0   0  0   1   1   4
## 3  lefebbi01   1938    BOS  1 1   0   0  1  0  0  0  0   0 NA   1  NA   4
## 4   motagu01   1999    MON  1 1   0   0  1  0  0  0  0   0  0   1   1   4
## 5  narumbu01   1963    BAL  1 1   0   0  1  0  0  0  0   0  0   1   1   4
## 6  perrypa02   1988    CHN  1 1   0   0  1  0  0  0  0   0  0   1   1   4
## 7  quirkja01   1984    CLE  1 1   0   0  1  0  0  0  0   0  0   1   1   4
## 8  rogered01   2005    BAL  1 1   0   0  1  0  2  0  0   0  0   1   1   4
## 9  sleatlo01   1958    DET  1 1   0   0  1  0  0  0  0   0  0   1   1   4
## 10   yanes01   2000    TBA  1 1   0   0  1  0  0  0  0   0  0   1   1   4
## 11 altroni01   1924    WS1  1 1   0   1  0  0  0  0  0   0 NA   1  NA   3

As you can see, there were 10 players who had a homerun in their single at-bat during the season (which yields a slugging percentage of 4.00). Let’s filter these results to only include players with at least 100 at-bats in a season:

## Add new columns (BA, OBP, SLG)
Batting %.%
  select(playerID, yearID, teamID, AB, H, X2B, X3B, HR, SB, CS, SO, BB, HBP, SF) %.%
  filter(AB>100) %.%
  mutate(Avg = H/AB,
         OBP = (H + BB + HBP)/(AB+BB+HBP+SF),
         SLG = (((H-X2B-X3B-HR)+(2*X2B)+(3*X3B)+(4*HR))/AB)) %.%
  arrange(desc(SLG)) %.%
  head(10)
##     playerID yearID teamID  AB   H X2B X3B HR SB CS  SO  BB HBP SF    Avg
## 1  bondsba01   2001    SFN 476 156  32   2 73 13  3  93 177   9  2 0.3277
## 2   ruthba01   1920    NYA 457 172  36   9 54 14 14  80 150   3 NA 0.3764
## 3   ruthba01   1921    NYA 540 204  44  16 59 17 13  81 145   4 NA 0.3778
## 4  bondsba01   2004    SFN 373 135  27   3 45  6  1  41 232   9  3 0.3619
## 5  bondsba01   2002    SFN 403 149  31   2 46  9  2  47 198   9  2 0.3697
## 6   ruthba01   1927    NYA 540 192  29   8 60  7  6  89 137   0 NA 0.3556
## 7  gehrilo01   1927    NYA 584 218  52  18 47 10  8  84 109   3 NA 0.3733
## 8   ruthba01   1923    NYA 522 205  45  13 41 17 21  93 170   4 NA 0.3927
## 9  hornsro01   1925    SLN 504 203  41  10 39  5  3  39  83   2 NA 0.4028
## 10 mcgwima01   1998    SLN 509 152  21   0 70  1  0 155 162   6  4 0.2986
##       OBP    SLG
## 1  0.5151 0.8634
## 2      NA 0.8490
## 3      NA 0.8463
## 4  0.6094 0.8123
## 5  0.5817 0.7990
## 6      NA 0.7722
## 7      NA 0.7654
## 8      NA 0.7644
## 9      NA 0.7560
## 10 0.4699 0.7525

From this, we see Barry Bonds had the highest slugging percentage in 2001 (when he hit 73 home runs). Let’s use a similar set of commands to see which players had the highest number of strikeouts:

## Add new columns (BA, OBP, SLG)
Batting %.%
  select(playerID, yearID, teamID, SO) %.%
  arrange(desc(SO)) %.%
  head(5)
##    playerID yearID teamID  SO
## 1 reynoma01   2009    ARI 223
## 2  dunnad01   2012    CHA 222
## 3 reynoma01   2010    ARI 211
## 4 stubbdr01   2011    CIN 205
## 5 reynoma01   2008    ARI 204

Mark Reynolds, in 2009, struck-out 223 times. Adam Dunn was a close second, with 222 strikeouts in 2012 for the White Sox.

Who had the most career strikeouts? Let’s see:

## Add new columns (BA, OBP, SLG)
Batting %.%
  select(playerID, yearID, teamID, SO) %.%
  group_by(playerID) %.%
  summarize(totalSO = sum(SO)) %.%
  arrange(desc(totalSO)) %.%
  head(5)
##   totalSO
## 1      NA

Reggie Jackson has struck-out more times (2597 times) than any other player in MLB history. Does he have the highest strike-out rate (strikeouts per at-bat)? Let’s see (for players with at least 2000 at-bats in their careers):

## Add new columns (BA, OBP, SLG)
Batting %.%
  select(playerID, yearID, teamID, AB, SO) %.%
  group_by(playerID) %.%
  summarize(totalSO = sum(SO),
            totalAB = sum(AB)) %.%
  filter(totalAB>2000) %.%
  mutate(Krate = totalSO/totalAB) %.%
  arrange(desc(Krate)) %.%
  head(5)
## [1] totalSO totalAB Krate  
## <0 rows> (or 0-length row.names)

A Google search tells me the leader, striking out nearly 39% of the time, is Jack Cust (who amassed most of his strikeouts in Oakland from 2007-2009).

I wonder who had the highest strikeout rate while playing for the Detroit Tigers…

## Add new columns (BA, OBP, SLG)
Batting %.%
  select(playerID, yearID, teamID, AB, SO) %.%
  group_by(playerID) %.%
  filter(teamID=="DET") %.%
  summarize(totalSO = sum(SO),
            totalAB = sum(AB)) %.%
  filter(totalAB>2000) %.%
  mutate(Krate = totalSO/totalAB) %.%
  arrange(desc(Krate)) %.%
  head(5)
## [1] totalSO totalAB Krate  
## <0 rows> (or 0-length row.names)

Brandon Inge. What about the lowest strike-out rate? To do this, I just sort the data in ascending order (deleting the desc command):

## Add new columns (BA, OBP, SLG)
Batting %.%
  select(playerID, yearID, teamID, AB, SO) %.%
  group_by(playerID) %.%
  filter(teamID=="DET") %.%
  summarize(totalSO = sum(SO),
            totalAB = sum(AB)) %.%
  filter(totalAB>2000) %.%
  mutate(Krate = totalSO/totalAB) %.%
  arrange(Krate) %.%
  head(5)
## [1] totalSO totalAB Krate  
## <0 rows> (or 0-length row.names)

Google tells me it’s Doc Cramer, who played for Detroit from 1942-1948.


My favorite baseball player is Alan Trammell, a shortstop for the Detroit Tigers. To access his data, I need to know his playerID. I’ll look it up by typing in the first letters of his last name:

playerInfo("tramm")
##        playerID nameFirst nameLast
## 14308 trammal01      Alan Trammell
## 14309 trammbu01     Bubba Trammell

From this, we see there were two Trammells in MLB: Alan and Bubba. I want playerID trammal01. Let’s take a look at his batting statistics each season. Since I already know his playerID and teamID, I’ll eliminate those columns (using a minus - in the select command):

Batting %.%
  filter(playerID=="trammal01") %.%
  select(-playerID, -teamID, -stint, -lgID, -G_batting, -G_old)
##    yearID   G  AB   R   H X2B X3B HR RBI SB CS BB SO IBB HBP SH SF GIDP
## 1    1977  19  43   6   8   0   0  0   0  0  0  4 12   0   0  1  0    1
## 2    1978 139 448  49 120  14   6  2  34  3  1 45 56   0   2  6  3   12
## 3    1979 142 460  68 127  11   4  6  50 17 14 43 55   0   0 12  5    6
## 4    1980 146 560 107 168  21   5  9  65 12 12 69 63   2   3 13  7   10
## 5    1981 105 392  52 101  15   3  2  31 10  3 49 31   2   3 16  3   10
## 6    1982 157 489  66 126  34   3  9  57 19  8 52 47   0   0  9  6    5
## 7    1983 142 505  83 161  31   2 14  66 30 10 57 64   2   0 15  4    7
## 8    1984 139 555  85 174  34   5 14  69 19 13 60 63   2   3  6  2    8
## 9    1985 149 605  79 156  21   7 13  57 14  5 50 71   4   2 11  9    6
## 10   1986 151 574 107 159  33   7 21  75 25 12 59 57   4   5 11  4    7
## 11   1987 151 597 109 205  34   3 28 105 21  2 60 47   8   3  2  6   11
## 12   1988 128 466  73 145  24   1 15  69  7  4 46 46   8   4  0  7   14
## 13   1989 121 449  54 109  20   3  5  43 10  2 45 45   1   4  3  5    9
## 14   1990 146 559  71 170  37   1 14  89 12 10 68 55   7   1  3  6   11
## 15   1991 101 375  57  93  20   0  9  55 11  2 37 39   1   3  5  1    7
## 16   1992  29 102  11  28   7   1  1  11  2  2 15  4   0   1  1  1    6
## 17   1993 112 401  72 132  25   3 12  60 12  8 38 38   2   2  4  2    7
## 18   1994  76 292  38  78  17   1  8  28  3  0 16 35   1   1  2  0    8
## 19   1995  74 223  28  60  12   0  2  23  3  1 27 19   4   0  3  2    8
## 20   1996  66 193  16  45   2   0  1  16  6  0 10 27   0   0  1  3    3

From this, we can see he played 20 seasons from 1977-1996. We can list his 5 best seasons in terms of on-base percentage:

## Add new columns (BA, OBP, SLG)
Batting %.%
  filter(playerID=="trammal01") %.%
  select(-playerID, -teamID, -stint, -lgID, -G_batting, -G_old) %.%
  mutate(Avg = H/AB,
         OBP = (H + BB + HBP)/(AB+BB+HBP+SF)) %.%
  arrange(desc(OBP)) %.%
  head(5)
##   yearID   G  AB   R   H X2B X3B HR RBI SB CS BB SO IBB HBP SH SF GIDP
## 1   1987 151 597 109 205  34   3 28 105 21  2 60 47   8   3  2  6   11
## 2   1993 112 401  72 132  25   3 12  60 12  8 38 38   2   2  4  2    7
## 3   1983 142 505  83 161  31   2 14  66 30 10 57 64   2   0 15  4    7
## 4   1984 139 555  85 174  34   5 14  69 19 13 60 63   2   3  6  2    8
## 5   1990 146 559  71 170  37   1 14  89 12 10 68 55   7   1  3  6   11
##      Avg    OBP
## 1 0.3434 0.4024
## 2 0.3292 0.3883
## 3 0.3188 0.3852
## 4 0.3135 0.3823
## 5 0.3041 0.3770

We can also get his career totals:

Batting %.%
  filter(playerID=="trammal01") %.%
  group_by(playerID) %.%
  dplyr::summarize(seasons = n(),
            games = sum(G),
            AB = sum(AB),
            hits = sum(H), 
            dbl = sum(X2B),
            trpl = sum(X3B),
            HR = sum(HR),
            Avg = (sum(H) / sum(AB)),
            OBP = ((sum(H)+sum(BB)+sum(HBP))/(sum(AB)+sum(BB)+sum(HBP)+sum(SF))),
            SB = sum(SB),
            SBpct = (sum(SB)/(sum(CS)+sum(SB))))
## Source: local data frame [1 x 12]
## 
##    playerID seasons games   AB hits dbl trpl  HR    Avg    OBP  SB  SBpct
## 1 trammal01      20  2293 8288 2365 412   55 185 0.2854 0.3515 236 0.6841

Over his 20 seasons, Alan Trammell batted .285, hit 185 HR, and successfully stole bases 68.4% of the time.


Visualizations

Let’s see the relationship between career homeruns and strikeouts for every player in MLB history
### Minimum 400 games over 5 seasons with at least one SO and HR
### na.rm=TRUE is a command to remove NA values from each variable
hrSO <- Batting %.%
  group_by(playerID) %.%
  dplyr::summarize(seasons = n(),
            games = sum(G, na.rm=TRUE),
            atbats = sum(AB, na.rm=TRUE),
            HR = sum(HR, na.rm=TRUE),
            Avg = (sum(H, na.rm=TRUE) / sum(AB, na.rm=TRUE)),
            SO = sum(SO, na.rm=TRUE)) %.%
  filter(seasons>5 & games>400 & atbats>1000 & SO>0 & HR>0)

### Create plot
ggplot(hrSO, aes(HR, SO)) +
  geom_point(alpha = 1/2) +
  geom_smooth() +
  scale_size_area()
## geom_smooth: method="auto" and size of largest group is >=1000, so using gam with formula: y ~ s(x, bs = "cs"). Use 'method = x' to change the smoothing method.

plot of chunk unnamed-chunk-35

This plot shows the general positive relationship between strikeouts and homeruns. This could be simply due to some players having more at-bats than others, so let’s plot the relationship between “home runs per at-bat” and “strikeouts per at-bat”

### Minimum 400 games over 5 seasons with at least one SO and HR
### na.rm=TRUE is a command to remove NA values from each variable
hrSO2 <- Batting %.%
  group_by(playerID) %.%
  dplyr::summarize(seasons = n(),
            games = sum(G, na.rm=TRUE),
            atbats = sum(AB, na.rm=TRUE),
            HR = sum(HR, na.rm=TRUE),
            Avg = (sum(H, na.rm=TRUE) / sum(AB, na.rm=TRUE)),
            SO = sum(SO, na.rm=TRUE),
            HRperAB = (HR/atbats),
            SOperAB = (SO/atbats)) %.%
  filter(seasons>5 & games>400 & atbats>1000 & SO>0 & HR>0)

### Create plot
ggplot(hrSO2, aes(HRperAB, SOperAB)) +
  geom_point(alpha = 1/2) +
  geom_smooth() +
  scale_size_area()
## geom_smooth: method="auto" and size of largest group is >=1000, so using gam with formula: y ~ s(x, bs = "cs"). Use 'method = x' to change the smoothing method.

plot of chunk unnamed-chunk-36

I wonder if this same relationship holds for pitchers. Do pitchers with more career strikeouts give up more career homeruns? Let’s see (controlling for innings pitched)…

### Minimum 3000 innings pitched
### na.rm=TRUE is a command to remove NA values from each variable
PitcherHRso <- Pitching %.%
  group_by(playerID) %.%
  dplyr::summarize(seasons = n(),
            IPouts = sum(IPouts, na.rm=TRUE),
            HR = sum(HR, na.rm=TRUE),
            SO = sum(SO, na.rm=TRUE),
            HRperIP = (HR/IPouts),
            SOperIP = (SO/IPouts)) %.%
  filter(IPouts > 3000)

### Create plot
ggplot(PitcherHRso, aes(HRperIP, SOperIP)) +
  geom_point(aes(size=IPouts), alpha = .4) +
  geom_smooth() +
  scale_size_area()
## geom_smooth: method="auto" and size of largest group is >=1000, so using gam with formula: y ~ s(x, bs = "cs"). Use 'method = x' to change the smoothing method.

plot of chunk unnamed-chunk-37

The size of the dots indicate the number of innings pitched for each pitcher. This relationship is a bit more complicated than it was for batters.


Final example - team statistics

See if you can figure out what this plot shows:

TeamSummary <- Batting %.%
  group_by(teamID) %.%
  dplyr::summarize(games = sum(G, na.rm=TRUE),
            atbats = sum(AB, na.rm=TRUE),
            HR = sum(HR, na.rm=TRUE),
            Avg = (sum(H, na.rm=TRUE) / sum(AB, na.rm=TRUE)),
            SO = sum(SO, na.rm=TRUE),
            HRperGame = (HR/games),
            SOperGame = (SO/games)) %.%
            filter(HRperGame>0 & SOperGame>0)

ggplot(TeamSummary, aes(HRperGame, SOperGame)) +
  geom_point(aes(size=Avg), alpha = 1/2) +
  geom_smooth() +
  scale_size_area()
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.

plot of chunk unnamed-chunk-38