Data-Informed Thinking + Doing
Feature Engineering
Creating new and insightful baseball pitching stats (features or independent/predictor variables).
Getting Started
# Record the R version the results below were produced with.
R.version$version.string
[1] "R version 4.1.1 (2021-08-10)"
# Pin every package to the exact version used in this walkthrough so the
# printed output stays reproducible.
# library() rather than require(): a missing devtools should stop the script
# with an error here, not return FALSE and carry on.
library(devtools)

# Single source of truth for the mirror; HTTPS so package sources are not
# fetched over an unencrypted connection.
cran_repo <- "https://cran.us.r-project.org"

devtools::install_version("Lahman", version = "9.0-0", repos = cran_repo)
devtools::install_version("dplyr", version = "1.0.4", repos = cran_repo)
# The package loaded later via library(ggplot2) is "ggplot2", not "ggplot".
devtools::install_version("ggplot2", version = "3.3.5", repos = cran_repo)
devtools::install_version("caret", version = "6.0-90", repos = cran_repo)
library(Lahman)
library(dplyr)
library(ggplot2)
library(caret)
# Load the Pitching table shipped with the Lahman package into the workspace
# and print a compact summary of its columns.
data("Pitching", package = "Lahman")
utils::str(Pitching)
'data.frame': 48399 obs. of 30 variables:
$ playerID: chr "bechtge01" "brainas01" "fergubo01" "fishech01" ...
$ yearID : int 1871 1871 1871 1871 1871 1871 1871 1871 1871 1871 ...
$ stint : int 1 1 1 1 1 1 1 1 1 1 ...
$ teamID : Factor w/ 149 levels "ALT","ANA","ARI",..: 97 142 90 111 90 136 111 56 97 136 ...
$ lgID : Factor w/ 7 levels "AA","AL","FL",..: 4 4 4 4 4 4 4 4 4 4 ...
$ W : int 1 12 0 4 0 0 0 6 18 12 ...
$ L : int 2 15 0 16 1 0 1 11 5 15 ...
$ G : int 3 30 1 24 1 1 3 19 25 29 ...
$ GS : int 3 30 0 24 1 0 1 19 25 29 ...
$ CG : int 2 30 0 22 1 0 1 19 25 28 ...
$ SHO : int 0 0 0 1 0 0 0 1 0 0 ...
$ SV : int 0 0 0 0 0 0 0 0 0 0 ...
$ IPouts : int 78 792 3 639 27 3 39 507 666 747 ...
$ H : int 43 361 8 295 20 1 20 261 285 430 ...
$ ER : int 23 132 3 103 10 0 5 97 113 153 ...
$ HR : int 0 4 0 3 0 0 0 5 3 4 ...
$ BB : int 11 37 0 31 3 0 3 21 40 75 ...
$ SO : int 1 13 0 15 0 0 1 17 15 12 ...
$ BAOpp : num NA NA NA NA NA NA NA NA NA NA ...
$ ERA : num 7.96 4.5 27 4.35 10 0 3.46 5.17 4.58 5.53 ...
$ IBB : int NA NA NA NA NA NA NA NA NA NA ...
$ WP : int 7 7 2 20 0 0 1 15 3 44 ...
$ HBP : int NA NA NA NA NA NA NA NA NA NA ...
$ BK : int 0 0 0 0 0 0 0 2 0 0 ...
$ BFP : int 146 1291 14 1080 57 3 70 876 1059 1334 ...
$ GF : int 0 0 0 1 0 1 1 0 0 0 ...
$ R : int 42 292 9 257 21 0 30 243 223 362 ...
$ SH : int NA NA NA NA NA NA NA NA NA NA ...
$ SF : int NA NA NA NA NA NA NA NA NA NA ...
$ GIDP : int NA NA NA NA NA NA NA NA NA NA ...
# The 25 lowest single-season ERAs among rows with teamID "NYN".
# IPouts is outs pitched (innings pitched x 3), so the 600-out cutoff keeps
# only seasons of at least 200 innings.
# slice_head() replaces the superseded do(head(., 25)) idiom; on an ungrouped
# data frame it returns the same first 25 rows.
top25_nyn_pitching_era <- Pitching %>%
  dplyr::filter(teamID == "NYN", IPouts >= 600) %>%
  dplyr::arrange(ERA) %>%
  dplyr::slice_head(n = 25)
top25_nyn_pitching_era
playerID yearID stint teamID lgID W L G GS CG SHO SV IPouts H ER HR BB SO BAOpp ERA IBB WP HBP BK BFP GF R SH SF GIDP
1 goodedw01 1985 1 NYN NL 24 4 35 35 16 8 0 830 198 47 13 69 268 0.20 1.5 4 6 2 2 1065 0 51 6 2 21
2 degroja01 2018 1 NYN NL 10 9 32 32 1 0 0 651 152 41 10 46 269 0.20 1.7 3 2 5 0 835 0 48 3 5 13
3 seaveto01 1971 1 NYN NL 20 10 36 35 21 4 0 859 210 56 18 61 289 0.21 1.8 2 5 4 1 1103 1 61 11 6 NA
4 koosmje01 1968 1 NYN NL 19 12 35 34 17 7 0 791 221 61 16 69 178 0.22 2.1 7 8 8 0 1058 0 72 NA NA NA
5 seaveto01 1973 1 NYN NL 19 10 36 36 18 3 0 870 219 67 23 64 251 0.21 2.1 5 5 4 0 1147 0 74 10 5 17
6 seaveto01 1968 1 NYN NL 16 12 36 35 14 5 1 833 224 68 15 48 205 0.22 2.2 5 8 8 1 1088 1 73 NA NA NA
7 seaveto01 1969 1 NYN NL 25 7 36 35 18 5 0 820 202 67 24 82 208 0.20 2.2 9 8 7 1 1089 1 75 NA NA NA
8 coneda01 1988 1 NYN NL 20 3 35 28 8 4 0 694 178 57 10 80 213 0.21 2.2 7 10 4 10 936 0 67 11 5 11
9 koosmje01 1969 1 NYN NL 17 9 32 32 16 6 0 723 187 61 14 68 180 0.21 2.3 11 7 4 2 957 0 66 NA NA NA
10 matlajo01 1972 1 NYN NL 15 10 34 32 8 4 0 732 215 63 14 71 169 0.23 2.3 14 7 2 1 1003 1 79 8 4 NA
11 seaveto01 1975 1 NYN NL 22 9 36 36 15 5 0 841 217 74 11 88 243 0.21 2.4 6 7 4 1 1115 0 81 9 2 18
12 matlajo01 1974 1 NYN NL 13 15 34 34 14 7 0 796 221 71 8 76 195 0.23 2.4 11 4 5 1 1076 0 82 8 7 18
13 swancr01 1978 1 NYN NL 9 6 29 28 5 1 0 622 164 56 12 58 125 0.22 2.4 8 1 2 0 819 0 62 5 4 12
14 degroja01 2019 1 NYN NL 11 8 32 32 0 0 0 612 154 55 19 44 255 0.21 2.4 1 2 7 0 804 0 59 5 3 10
15 santajo01 2008 1 NYN NL 16 7 34 34 3 2 0 703 206 66 23 63 206 0.23 2.5 5 9 4 2 964 0 74 9 1 12
16 ojedabo01 1986 1 NYN NL 18 5 32 30 7 2 0 652 185 62 15 52 148 0.23 2.6 3 2 2 1 871 1 72 10 3 19
17 seaveto01 1976 1 NYN NL 14 11 35 34 13 5 0 813 211 78 14 77 235 0.21 2.6 9 12 4 0 1079 0 83 7 2 18
18 goodedw01 1984 1 NYN NL 17 9 31 31 7 3 0 654 161 63 7 73 276 0.20 2.6 2 3 2 7 879 0 72 3 2 4
19 violafr01 1990 1 NYN NL 20 12 35 35 7 3 0 749 227 74 15 60 182 0.24 2.7 2 11 2 0 1016 0 83 13 3 17
20 koosmje01 1976 1 NYN NL 21 10 34 32 17 3 0 742 205 74 19 66 200 0.23 2.7 7 3 1 1 994 0 81 15 4 11
21 fernasi01 1992 1 NYN NL 14 11 32 32 5 2 0 644 162 65 12 67 193 0.21 2.7 4 0 4 0 865 0 67 12 11 5
22 dickera01 2012 1 NYN NL 20 6 34 33 5 3 0 701 192 71 24 54 230 0.23 2.7 2 4 9 1 927 1 78 9 7 25
23 seaveto01 1967 1 NYN NL 16 13 35 34 18 2 0 753 224 77 19 78 170 0.24 2.8 6 5 5 0 1029 1 85 NA NA NA
24 darliro01 1986 1 NYN NL 15 6 34 34 4 2 0 711 203 74 21 81 184 0.23 2.8 2 7 3 3 967 0 84 10 6 18
25 seaveto01 1970 1 NYN NL 18 12 37 36 19 2 0 872 230 91 21 83 283 0.21 2.8 8 6 4 0 1173 1 103 9 4 NA
# Pitching rows with lgID "NL" for the 2000 through 2021 seasons (both bounds
# inclusive). Comma-separated conditions in filter() are combined with AND.
pitching_nl_2000_2021 <- dplyr::filter(
  Pitching,
  lgID == "NL",
  yearID >= 2000,
  yearID <= 2021
)
Bucketing and Binning
# Narrow the table to the three count columns explored below:
# walks (BB), wild pitches (WP) and batters hit by pitch (HBP).
pitching_bb_wp_hbp <- dplyr::select(Pitching, BB, WP, HBP)
utils::str(pitching_bb_wp_hbp)
'data.frame': 48399 obs. of 3 variables:
$ BB : int 11 37 0 31 3 0 3 21 40 75 ...
$ WP : int 7 7 2 20 0 0 1 15 3 44 ...
$ HBP: int NA NA NA NA NA NA NA NA NA NA ...
# Density curve of each untransformed column, one plot per column, to inspect
# the shape of the raw distributions before any transformation.
ggplot2::ggplot(data = pitching_bb_wp_hbp, mapping = ggplot2::aes(x = BB)) +
  ggplot2::geom_density()

ggplot2::ggplot(data = pitching_bb_wp_hbp, mapping = ggplot2::aes(x = WP)) +
  ggplot2::geom_density()

# HBP contains NA values (see the str() output), which the density stat drops
# with a warning.
ggplot2::ggplot(data = pitching_bb_wp_hbp, mapping = ggplot2::aes(x = HBP)) +
  ggplot2::geom_density()
Warning: Removed 734 rows containing non-finite values (stat_density).
Box-Cox Transformation
# Estimate a Box-Cox transformation from the three selected columns, then
# apply it to the full Pitching table (columns are matched by name, so only
# BB, WP and HBP can be affected) and re-plot each density.
# NOTE(review): Box-Cox is defined only for strictly positive values, and
# these columns contain zeros (see the str() output). Print
# preprocessed_box_cox to confirm a lambda was actually estimated for each
# column before relying on these plots.
preprocessed_box_cox <- caret::preProcess(
  pitching_bb_wp_hbp,
  method = "BoxCox"
)
transformed_box_cox <- predict(preprocessed_box_cox, newdata = Pitching)

ggplot2::ggplot(data = transformed_box_cox, mapping = ggplot2::aes(x = BB)) +
  ggplot2::geom_density()

ggplot2::ggplot(data = transformed_box_cox, mapping = ggplot2::aes(x = WP)) +
  ggplot2::geom_density()

ggplot2::ggplot(data = transformed_box_cox, mapping = ggplot2::aes(x = HBP)) +
  ggplot2::geom_density()
Warning: Removed 734 rows containing non-finite values (stat_density).
Yeo-Johnson Transformation
# Estimate a Yeo-Johnson transformation from the three selected columns, then
# apply it to the full Pitching table (columns are matched by name, so only
# BB, WP and HBP can be affected) and re-plot each density.
preprocessed_yeo_johnson <- caret::preProcess(
  pitching_bb_wp_hbp,
  method = "YeoJohnson"
)
transformed_yeo_johnson <- predict(preprocessed_yeo_johnson, newdata = Pitching)

ggplot2::ggplot(
  data = transformed_yeo_johnson,
  mapping = ggplot2::aes(x = BB)
) +
  ggplot2::geom_density()

ggplot2::ggplot(
  data = transformed_yeo_johnson,
  mapping = ggplot2::aes(x = WP)
) +
  ggplot2::geom_density()

ggplot2::ggplot(
  data = transformed_yeo_johnson,
  mapping = ggplot2::aes(x = HBP)
) +
  ggplot2::geom_density()
Warning: Removed 734 rows containing non-finite values (stat_density).
Principal Component Analysis
Applied Advanced Analytics & AI in Sports