Data-Informed Thinking + Doing

Feature Engineering

Creating new and insightful baseball pitching stats (features or independent/predictor variables).

Getting Started

# Show the version string of the running R session so the results below can
# be tied to a specific R release.
R.version.string
[1] "R version 4.1.1 (2021-08-10)"
# Install pinned versions of the packages used in this walkthrough, attach
# them, then load the Lahman `Pitching` table and print its structure.
#
# library() is used instead of require(): require() only returns FALSE when
# the package is missing, whereas library() stops with an error right away.
library(devtools)
devtools::install_version("Lahman", version = "9.0-0", repos = "http://cran.us.r-project.org")
devtools::install_version("dplyr", version = "1.0.4", repos = "http://cran.us.r-project.org")
# The package attached below is "ggplot2"; installing under the name "ggplot"
# does not provide it, so the pinned install must use the real package name.
devtools::install_version("ggplot2", version = "3.3.5", repos = "http://cran.us.r-project.org")
devtools::install_version("caret", version = "6.0-90", repos = "http://cran.us.r-project.org")
library(Lahman)
library(dplyr)
library(ggplot2)
library(caret)
# Load the `Pitching` data set shipped with Lahman and inspect its columns.
data(Pitching)
str(Pitching)
'data.frame':   48399 obs. of  30 variables:
 $ playerID: chr  "bechtge01" "brainas01" "fergubo01" "fishech01" ...
 $ yearID  : int  1871 1871 1871 1871 1871 1871 1871 1871 1871 1871 ...
 $ stint   : int  1 1 1 1 1 1 1 1 1 1 ...
 $ teamID  : Factor w/ 149 levels "ALT","ANA","ARI",..: 97 142 90 111 90 136 111 56 97 136 ...
 $ lgID    : Factor w/ 7 levels "AA","AL","FL",..: 4 4 4 4 4 4 4 4 4 4 ...
 $ W       : int  1 12 0 4 0 0 0 6 18 12 ...
 $ L       : int  2 15 0 16 1 0 1 11 5 15 ...
 $ G       : int  3 30 1 24 1 1 3 19 25 29 ...
 $ GS      : int  3 30 0 24 1 0 1 19 25 29 ...
 $ CG      : int  2 30 0 22 1 0 1 19 25 28 ...
 $ SHO     : int  0 0 0 1 0 0 0 1 0 0 ...
 $ SV      : int  0 0 0 0 0 0 0 0 0 0 ...
 $ IPouts  : int  78 792 3 639 27 3 39 507 666 747 ...
 $ H       : int  43 361 8 295 20 1 20 261 285 430 ...
 $ ER      : int  23 132 3 103 10 0 5 97 113 153 ...
 $ HR      : int  0 4 0 3 0 0 0 5 3 4 ...
 $ BB      : int  11 37 0 31 3 0 3 21 40 75 ...
 $ SO      : int  1 13 0 15 0 0 1 17 15 12 ...
 $ BAOpp   : num  NA NA NA NA NA NA NA NA NA NA ...
 $ ERA     : num  7.96 4.5 27 4.35 10 0 3.46 5.17 4.58 5.53 ...
 $ IBB     : int  NA NA NA NA NA NA NA NA NA NA ...
 $ WP      : int  7 7 2 20 0 0 1 15 3 44 ...
 $ HBP     : int  NA NA NA NA NA NA NA NA NA NA ...
 $ BK      : int  0 0 0 0 0 0 0 2 0 0 ...
 $ BFP     : int  146 1291 14 1080 57 3 70 876 1059 1334 ...
 $ GF      : int  0 0 0 1 0 1 1 0 0 0 ...
 $ R       : int  42 292 9 257 21 0 30 243 223 362 ...
 $ SH      : int  NA NA NA NA NA NA NA NA NA NA ...
 $ SF      : int  NA NA NA NA NA NA NA NA NA NA ...
 $ GIDP    : int  NA NA NA NA NA NA NA NA NA NA ...
# The 25 lowest-ERA pitching rows for teamID "NYN" among rows with at least
# 600 outs recorded (IPouts >= 600; the Lahman documentation describes IPouts
# as innings pitched x 3, so this corresponds to 200+ innings).
#
# Rows are sorted by ERA ascending, so the best seasons come first.
# slice_head() replaces the superseded do(head(., 25)) and yields the same
# first 25 rows of the sorted table.
top25_nyn_pitching_era <- Pitching %>%
    dplyr::filter(teamID == "NYN", IPouts >= 600) %>%
    dplyr::arrange(ERA) %>%
    dplyr::slice_head(n = 25)

# Print the resulting leaderboard.
top25_nyn_pitching_era
    playerID yearID stint teamID lgID  W  L  G GS CG SHO SV IPouts   H ER HR BB  SO BAOpp ERA IBB WP HBP BK  BFP GF   R SH SF GIDP
1  goodedw01   1985     1    NYN   NL 24  4 35 35 16   8  0    830 198 47 13 69 268  0.20 1.5   4  6   2  2 1065  0  51  6  2   21
2  degroja01   2018     1    NYN   NL 10  9 32 32  1   0  0    651 152 41 10 46 269  0.20 1.7   3  2   5  0  835  0  48  3  5   13
3  seaveto01   1971     1    NYN   NL 20 10 36 35 21   4  0    859 210 56 18 61 289  0.21 1.8   2  5   4  1 1103  1  61 11  6   NA
4  koosmje01   1968     1    NYN   NL 19 12 35 34 17   7  0    791 221 61 16 69 178  0.22 2.1   7  8   8  0 1058  0  72 NA NA   NA
5  seaveto01   1973     1    NYN   NL 19 10 36 36 18   3  0    870 219 67 23 64 251  0.21 2.1   5  5   4  0 1147  0  74 10  5   17
6  seaveto01   1968     1    NYN   NL 16 12 36 35 14   5  1    833 224 68 15 48 205  0.22 2.2   5  8   8  1 1088  1  73 NA NA   NA
7  seaveto01   1969     1    NYN   NL 25  7 36 35 18   5  0    820 202 67 24 82 208  0.20 2.2   9  8   7  1 1089  1  75 NA NA   NA
8   coneda01   1988     1    NYN   NL 20  3 35 28  8   4  0    694 178 57 10 80 213  0.21 2.2   7 10   4 10  936  0  67 11  5   11
9  koosmje01   1969     1    NYN   NL 17  9 32 32 16   6  0    723 187 61 14 68 180  0.21 2.3  11  7   4  2  957  0  66 NA NA   NA
10 matlajo01   1972     1    NYN   NL 15 10 34 32  8   4  0    732 215 63 14 71 169  0.23 2.3  14  7   2  1 1003  1  79  8  4   NA
11 seaveto01   1975     1    NYN   NL 22  9 36 36 15   5  0    841 217 74 11 88 243  0.21 2.4   6  7   4  1 1115  0  81  9  2   18
12 matlajo01   1974     1    NYN   NL 13 15 34 34 14   7  0    796 221 71  8 76 195  0.23 2.4  11  4   5  1 1076  0  82  8  7   18
13  swancr01   1978     1    NYN   NL  9  6 29 28  5   1  0    622 164 56 12 58 125  0.22 2.4   8  1   2  0  819  0  62  5  4   12
14 degroja01   2019     1    NYN   NL 11  8 32 32  0   0  0    612 154 55 19 44 255  0.21 2.4   1  2   7  0  804  0  59  5  3   10
15 santajo01   2008     1    NYN   NL 16  7 34 34  3   2  0    703 206 66 23 63 206  0.23 2.5   5  9   4  2  964  0  74  9  1   12
16 ojedabo01   1986     1    NYN   NL 18  5 32 30  7   2  0    652 185 62 15 52 148  0.23 2.6   3  2   2  1  871  1  72 10  3   19
17 seaveto01   1976     1    NYN   NL 14 11 35 34 13   5  0    813 211 78 14 77 235  0.21 2.6   9 12   4  0 1079  0  83  7  2   18
18 goodedw01   1984     1    NYN   NL 17  9 31 31  7   3  0    654 161 63  7 73 276  0.20 2.6   2  3   2  7  879  0  72  3  2    4
19 violafr01   1990     1    NYN   NL 20 12 35 35  7   3  0    749 227 74 15 60 182  0.24 2.7   2 11   2  0 1016  0  83 13  3   17
20 koosmje01   1976     1    NYN   NL 21 10 34 32 17   3  0    742 205 74 19 66 200  0.23 2.7   7  3   1  1  994  0  81 15  4   11
21 fernasi01   1992     1    NYN   NL 14 11 32 32  5   2  0    644 162 65 12 67 193  0.21 2.7   4  0   4  0  865  0  67 12 11    5
22 dickera01   2012     1    NYN   NL 20  6 34 33  5   3  0    701 192 71 24 54 230  0.23 2.7   2  4   9  1  927  1  78  9  7   25
23 seaveto01   1967     1    NYN   NL 16 13 35 34 18   2  0    753 224 77 19 78 170  0.24 2.8   6  5   5  0 1029  1  85 NA NA   NA
24 darliro01   1986     1    NYN   NL 15  6 34 34  4   2  0    711 203 74 21 81 184  0.23 2.8   2  7   3  3  967  0  84 10  6   18
25 seaveto01   1970     1    NYN   NL 18 12 37 36 19   2  0    872 230 91 21 83 283  0.21 2.8   8  6   4  0 1173  1 103  9  4   NA
# Subset of `Pitching` restricted to rows whose lgID is "NL" and whose yearID
# falls in 2000..2021 (both ends inclusive). Comma-separated conditions in a
# single filter() call are combined with a logical AND.
pitching_nl_2000_2021 <- dplyr::filter(
    Pitching,
    lgID == "NL",
    yearID >= 2000,
    yearID <= 2021
)

Bucketing and Binning

# Keep only the BB, WP and HBP columns of `Pitching`; these three columns are
# the ones plotted and transformed in the rest of this section.
pitching_bb_wp_hbp <- dplyr::select(Pitching, BB, WP, HBP)

# Confirm the subset has the expected three columns and types.
str(pitching_bb_wp_hbp)
'data.frame':   48399 obs. of  3 variables:
 $ BB : int  11 37 0 31 3 0 3 21 40 75 ...
 $ WP : int  7 7 2 20 0 0 1 15 3 44 ...
 $ HBP: int  NA NA NA NA NA NA NA NA NA NA ...
# Density plot of each untransformed column, one plot per variable, to show
# the shape of the raw distributions before any transformation.
ggplot2::ggplot(data = pitching_bb_wp_hbp, mapping = ggplot2::aes(x = BB)) +
    ggplot2::geom_density()

ggplot2::ggplot(data = pitching_bb_wp_hbp, mapping = ggplot2::aes(x = WP)) +
    ggplot2::geom_density()

# HBP contains NA values (visible in the str() output), which ggplot2 drops
# with a warning when computing the density.
ggplot2::ggplot(data = pitching_bb_wp_hbp, mapping = ggplot2::aes(x = HBP)) +
    ggplot2::geom_density()
Warning: Removed 734 rows containing non-finite values (stat_density).

Box-Cox Transformation

# Estimate a Box-Cox transformation from the BB/WP/HBP subset.
# NOTE(review): BB and WP contain zeros (see the str() output), and caret
# documents Box-Cox as requiring strictly positive values -- confirm these
# columns are actually transformed rather than skipped.
preprocessed_box_cox <- caret::preProcess(
    x = pitching_bb_wp_hbp,
    method = "BoxCox"
)

# Apply the fitted pre-processing; note it is applied to the full `Pitching`
# table, not to the three-column subset it was estimated from.
transformed_box_cox <- predict(object = preprocessed_box_cox, newdata = Pitching)

# Density of each column after the Box-Cox step, for comparison with the raw
# density plots.
ggplot2::ggplot(data = transformed_box_cox, mapping = ggplot2::aes(x = BB)) +
    ggplot2::geom_density()

ggplot2::ggplot(data = transformed_box_cox, mapping = ggplot2::aes(x = WP)) +
    ggplot2::geom_density()

ggplot2::ggplot(data = transformed_box_cox, mapping = ggplot2::aes(x = HBP)) +
    ggplot2::geom_density()
Warning: Removed 734 rows containing non-finite values (stat_density).

Yeo-Johnson Transformation

# Estimate a Yeo-Johnson transformation from the BB/WP/HBP subset.
preprocessed_yeo_johnson <- caret::preProcess(
    x = pitching_bb_wp_hbp,
    method = "YeoJohnson"
)

# Apply the fitted pre-processing; note it is applied to the full `Pitching`
# table, not to the three-column subset it was estimated from.
transformed_yeo_johnson <- predict(object = preprocessed_yeo_johnson, newdata = Pitching)

# Density of each column after the Yeo-Johnson step, for comparison with the
# raw density plots.
ggplot2::ggplot(data = transformed_yeo_johnson, mapping = ggplot2::aes(x = BB)) +
    ggplot2::geom_density()

ggplot2::ggplot(data = transformed_yeo_johnson, mapping = ggplot2::aes(x = WP)) +
    ggplot2::geom_density()

ggplot2::ggplot(data = transformed_yeo_johnson, mapping = ggplot2::aes(x = HBP)) +
    ggplot2::geom_density()
Warning: Removed 734 rows containing non-finite values (stat_density).

Principal Component Analysis

Applied Advanced Analytics & AI in Sports