From b161f750efe1b182ad49d09d77f1bffd614fd365 Mon Sep 17 00:00:00 2001 From: Sebastian Carl Date: Wed, 30 Aug 2023 13:23:28 +0200 Subject: [PATCH 1/4] Compute driveSequenceNumber if missing --- R/helper_scrape_nfl.R | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/R/helper_scrape_nfl.R b/R/helper_scrape_nfl.R index 1887f970..87d3bb2f 100644 --- a/R/helper_scrape_nfl.R +++ b/R/helper_scrape_nfl.R @@ -108,6 +108,32 @@ get_pbp_nfl <- function(id, dir = NULL, qs = FALSE, ...) { plays <- raw_data$data$viewer$gameDetail$plays %>% dplyr::mutate(game_id = as.character(game_id)) + # We have this issue https://github.com/nflverse/nflfastR/issues/309 with 2013 postseason games + # where the driveSequenceNumber in the plays df is NA for all plays. That prevents drive information + # from being joined. + # In this case, we compute our own driveSequenceNumber buy incrementing a counter depending on the + # value of driveTimeOfPossession. + # driveTimeOfPossession will be a constant value during a drive so this should actually be accurate + if (all(is.na(plays$driveSequenceNumber))){ + plays <- plays %>% + dplyr::mutate( + # First, create a trigger for cumsum + drive_trigger = dplyr::case_when( + # this is the first play of the first drive + is.na(dplyr::lag(driveTimeOfPossession)) & !is.na(driveTimeOfPossession) ~ 1, + # if driveTimeOfPossession changes, there is a new drive + dplyr::lag(driveTimeOfPossession) != driveTimeOfPossession ~ 1, + TRUE ~ 0 + ), + # Now create the drive number by accumulating triggers + driveSequenceNumber = cumsum(drive_trigger), + # driveSequenceNumber should be NA on plays where driveTimeOfPossession is NA + driveSequenceNumber = ifelse(is.na(driveTimeOfPossession), NA_real_, driveSequenceNumber), + # drop the helper + drive_trigger = NULL + ) + } + #fill missing posteam info for this if ( ((home_team %in% c("JAC", "JAX") | away_team %in% c("JAC", "JAX")) & season <= 2015) | From e96e150fc82dee3aa4ebf6bea9316025819b8772 
Mon Sep 17 00:00:00 2001 From: Sebastian Carl Date: Wed, 30 Aug 2023 13:25:52 +0200 Subject: [PATCH 2/4] version --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index da0aacc2..f8ce77ab 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: nflfastR Title: Functions to Efficiently Access NFL Play by Play Data -Version: 4.5.1.9004 +Version: 4.5.1.9006 Authors@R: c(person(given = "Sebastian", family = "Carl", From b36847f7f8e3a03ee4184f0f405a999174f311af Mon Sep 17 00:00:00 2001 From: Sebastian Carl Date: Wed, 30 Aug 2023 13:31:04 +0200 Subject: [PATCH 3/4] typo and news bullet --- NEWS.md | 1 + R/helper_scrape_nfl.R | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index cb3db312..f6698880 100644 --- a/NEWS.md +++ b/NEWS.md @@ -5,6 +5,7 @@ - The function `calculate_player_stats()` now returns the opponent team when called with argument `weekly = TRUE` (#414) - The function `calculate_player_stats_def()` no longer errors when small subsets of pbp data are missing stats. (#415) - The function `calculate_series_conversion_rates()` no longer returns `NA` values if a small subset of pbp data is missing series on offense or defense. (#417) +- nflfastR now fixes missing drive number counts in raw pbp data in order to provide accurate drive information. (#420) # nflfastR 4.5.1 diff --git a/R/helper_scrape_nfl.R b/R/helper_scrape_nfl.R index 87d3bb2f..67b434a9 100644 --- a/R/helper_scrape_nfl.R +++ b/R/helper_scrape_nfl.R @@ -111,7 +111,7 @@ get_pbp_nfl <- function(id, dir = NULL, qs = FALSE, ...) { # We have this issue https://github.com/nflverse/nflfastR/issues/309 with 2013 postseason games # where the driveSequenceNumber in the plays df is NA for all plays. That prevents drive information # from being joined. 
- # In this case, we compute our own driveSequenceNumber buy incrementing a counter depending on the + # In this case, we compute our own driveSequenceNumber by incrementing a counter depending on the # value of driveTimeOfPossession. # driveTimeOfPossession will be a constant value during a drive so this should actually be accurate if (all(is.na(plays$driveSequenceNumber))){ From 53fdc2125a9cc83225700319764688ecfde42118 Mon Sep 17 00:00:00 2001 From: Sebastian Carl Date: Wed, 30 Aug 2023 13:39:03 +0200 Subject: [PATCH 4/4] no need to patch any IDs. We decode them as usual --- R/helper_additional_functions.R | 51 --------------------------------- 1 file changed, 51 deletions(-) diff --git a/R/helper_additional_functions.R b/R/helper_additional_functions.R index 1d572c16..374c9636 100644 --- a/R/helper_additional_functions.R +++ b/R/helper_additional_functions.R @@ -55,57 +55,6 @@ clean_pbp <- function(pbp, ...) { } else { user_message("Cleaning up play-by-play...", "todo") - if(any(pbp$season >= 2022)){ - - # user_message("Loading pbp player ID patch files", "info") - - patch_seasons <- unique(pbp$season[pbp$season >= 2022]) - - patch_ids <- nflreadr::load_from_url( - glue::glue("https://github.com/nflverse/nflverse-data/releases/download/misc/pbp_patch_ids_{patch_seasons}.rds") - ) %>% suppressMessages() - - patchable_ids <- pbp %>% - dplyr::select( - dplyr::any_of(c( - "game_id", "play_id", - "passer_id", "passer_name" = "passer", - "receiver_id", "receiver_name" = "receiver", - "rusher_id", "rusher_name" = "rusher", - "fantasy_id", "fantasy_name" = "fantasy", - "fantasy_player_name" - )), - dplyr::matches("player_id|player_name") - ) %>% - tidyr::pivot_longer( - cols = -c("game_id","play_id"), - names_to = c("stat",".value"), - names_pattern = c("(.+)_(id|name)"), - values_drop_na = TRUE - ) %>% - dplyr::filter(is.na(.data$id)) %>% - dplyr::left_join(patch_ids, by = c("game_id","play_id","name")) %>% - dplyr::mutate( - id = 
dplyr::coalesce(.data$id,.data$gsis_id), - gsis_id = NULL, - club_code = NULL, - name = NULL - ) %>% - tidyr::pivot_wider( - names_from = "stat", - values_from = "id", - names_glue = "{stat}_id" - ) - - if(nrow(patchable_ids) > 0){ - pbp <- tibble::tibble(pbp) %>% - dplyr::rows_patch(patchable_ids, by = c("game_id","play_id")) - } - - # cli::cli_alert_success("{my_time()} | Patched {nrow(patchable_ids)} missing gsis_id field{?s}") - - } - # drop existing values of clean_pbp pbp <- pbp %>% dplyr::select(-tidyselect::any_of(drop.cols))