library(tidyverse)
library(baseballr)
library(ggthemes)
library(ggimage)
library(gt)
library(readr)
# Strongly prefer fixed over scientific notation when numbers are printed or
# converted to text.
options(scipen = 9999)

# First and last calendar day of the download window (both inclusive).
start_date <- as.Date("2022-04-07", format = "%Y-%m-%d")
end_date <- as.Date("2022-10-05", format = "%Y-%m-%d")

# One "YYYY-MM-DD" string per day in the window. The dates are stored as
# character so that iterating over them with for() yields date strings rather
# than the bare day counts a Date vector would be reduced to.
date_seq <- as.character(seq(start_date, end_date, by = "day"))
# Download Statcast data one calendar day at a time (one
# statcast_search_pitchers() call per entry of `date_seq`) and stack the daily
# results into `statcast_df`.
# 30 min runtime for full season.
#
# Each day's result is stored in a preallocated list and combined with a single
# rbind() at the end. Growing `statcast_df` with rbind() inside the loop would
# re-copy every previously downloaded row on each iteration.
daily_statcast <- vector("list", length(date_seq))
for (day_idx in seq_along(date_seq)) {
  game_date <- date_seq[[day_idx]]
  # Warnings raised during the download are intentionally suppressed.
  suppressWarnings({
    statcast <- statcast_search_pitchers(
      start_date = game_date,
      end_date = game_date
    )
  })
  # Keep only days that returned at least one row. The is.data.frame() guard
  # skips a non-tabular return value (e.g. NULL) instead of failing inside
  # the if() condition.
  if (is.data.frame(statcast) && nrow(statcast) > 0) {
    daily_statcast[[day_idx]] <- statcast
  }
}
# Slots for skipped days are still NULL; drop them before binding.
daily_statcast <- Filter(Negate(is.null), daily_statcast)
# The leading empty data.frame() makes the result an empty data frame when no
# day returned any rows.
statcast_df <- do.call(rbind, c(list(data.frame()), daily_statcast))
# Fetch the 2022 schedule and keep only regular-season rows whose detailed
# status is neither "Postponed" nor "Cancelled". Collapse to one row per
# game_pk; `count` holds the number of schedule rows seen for that game.
schedule <- mlb_schedule(season = "2022") %>%
  filter(
    status_detailed_state != "Postponed",
    status_detailed_state != "Cancelled",
    series_description == "Regular Season"
  ) %>%
  group_by(game_pk) %>%
  summarise(count = n())

# Unique game identifiers as a list, one element per game.
games_list <- as.list(schedule[["game_pk"]])
# Download play-by-play data for every game in `games_list` (one mlb_pbp()
# call per game), keep only the rows flagged as pitches, and stack them into
# `pbp_df`.
# 120 min runtime for full season.
#
# Per-game results go into a preallocated list and are combined with a single
# bind_rows() call. Appending to `pbp_df` inside the loop would re-copy all
# previously downloaded rows on every iteration.
pbp_by_game <- vector("list", length(games_list))
for (game_idx in seq_along(games_list)) {
  pbp_by_game[[game_idx]] <- mlb_pbp(games_list[[game_idx]]) %>%
    filter(isPitch == "TRUE")
}
# The leading empty data.frame() keeps `pbp_df` a base data frame (bind_rows()
# returns the type of its first input) and gives an empty data frame when
# `games_list` is empty.
pbp_df <- bind_rows(c(list(data.frame()), pbp_by_game))
# Build a common "<game_pk>-<at bat>-<pitch>" key on both tables so they can
# be joined pitch by pitch. The at-bat and pitch counters are overwritten in
# place with strings zero-padded to at least two digits.

# Statcast side: counters are used as-is.
statcast_df[["at_bat_number"]] <- sprintf(
  "%02d", statcast_df[["at_bat_number"]]
)
statcast_df[["pitch_number"]] <- sprintf(
  "%02d", statcast_df[["pitch_number"]]
)
statcast_df[["pitch_id"]] <- with(
  statcast_df,
  paste(game_pk, at_bat_number, pitch_number, sep = "-")
)

# Play-by-play side: counters are parsed to integers first, and atBatIndex is
# shifted up by one before padding.
# NOTE(review): presumably atBatIndex is zero-based while the Statcast
# at_bat_number starts at 1 -- confirm against the data.
pbp_df[["atBatIndex"]] <- sprintf(
  "%02d", strtoi(pbp_df[["atBatIndex"]]) + 1
)
pbp_df[["pitchNumber"]] <- sprintf(
  "%02d", strtoi(pbp_df[["pitchNumber"]])
)
pbp_df[["pitch_id"]] <- with(
  pbp_df,
  paste(game_pk, atBatIndex, pitchNumber, sep = "-")
)
# Full outer join of the play-by-play and Statcast pitches on `pitch_id`;
# all = TRUE keeps pitches that appear in only one of the two sources.
joined_df <- merge(pbp_df, statcast_df, by = "pitch_id", all = TRUE)

# Remove the reviewDetails.additionalReviews column before export. It is
# dropped by name only when present, so the script no longer aborts with
# "object not found" if the downloaded data did not include that column.
# NOTE(review): presumably this is a list-column that write.csv() cannot
# serialize -- confirm.
drop_cols <- "reviewDetails.additionalReviews"
joined_df <- joined_df[, !(names(joined_df) %in% drop_cols), drop = FALSE]

# 3 min runtime for full season. 1.2GB file size.
output_path <-
  "C:\\Users\\chris\\Documents\\Datasets\\BaseballR\\2022\\2022merged.csv"
write.csv(joined_df, output_path, row.names = FALSE)