Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

aborted snap and d.johnson fixes #202

Merged
merged 8 commits into from
Mar 1, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,9 @@
* Fixed a bug where `fixed_drive` and `series` were falsely incrementing on muffed punts recovered by the punting team for a touchdown
* Fixed a bug where `add_xpass()` crashed when run with data already including xpass variables.
* Fixed a bug in `epa` when a safety is scored by the team beginning the play in possession of the ball (#186)
* Change `name`, `id`, `rusher`, and `rusher_id` to be the player charged with the fumble on aborted snaps when the QB is unable to make a play (i.e. pass, sack, or scramble) (#162)
* Fix some bugs related to David and Duke Johnson on the Texans in 2020 (#163)
* Fix yet another bug related to correctly identifying possession team on kickoffs nullified by penalty (#199)
* The function `calculate_player_stats()` now adds the variable `dakota`, the `epa` + `cpoe` composite, for players with a minimum of 5 pass attempts.
* Fixed a bug where `calculate_player_stats()` forgot to clean player names by using their IDs
* Fixed a bug where special teams touchdowns were missing in the output of `calculate_player_stats()` (#203)
Expand Down
191 changes: 107 additions & 84 deletions R/helper_add_nflscrapr_mutations.R
Original file line number Diff line number Diff line change
Expand Up @@ -32,84 +32,6 @@ add_nflscrapr_mutations <- function(pbp) {
dplyr::arrange(.data$order_sequence, .data$quarter, !is.na(.data$quarter_seconds_remaining), -.data$quarter_seconds_remaining, !is.na(.data$drive), .data$drive, .data$index, .by_group = TRUE) %>%
dplyr::mutate(

# Make the possession team for kickoffs be the return team, since that is
# more intuitive from the EPA / WPA point of view:
posteam = dplyr::case_when(
# kickoff_finder is defined below
(.data$kickoff_attempt == 1 | stringr::str_detect(.data$play_description, kickoff_finder)) & .data$posteam == .data$home_team ~ .data$away_team,
(.data$kickoff_attempt == 1 | stringr::str_detect(.data$play_description, kickoff_finder)) & .data$posteam == .data$away_team ~ .data$home_team,
TRUE ~ .data$posteam
),

# Fill in the rows with missing posteam with the lead:
posteam = dplyr::if_else(
(.data$quarter_end == 1 | .data$posteam == ""),
dplyr::lead(.data$posteam),
.data$posteam),
posteam_id = dplyr::if_else(
(.data$quarter_end == 1 | .data$posteam_id == ""),
dplyr::lead(.data$posteam_id),
.data$posteam_id),

# remove posteam from END Q2 plays or END Q4 plays (when game goes in OT)
# because it doesn't make sense and breaks fixed_drive and fixed_drive_result
posteam = dplyr::if_else(
stringr::str_detect(.data$play_description, "(END QUARTER 2)|(END QUARTER 4)"),
NA_character_, .data$posteam
),

# Denote whether the home or away team has possession:
posteam_type = dplyr::if_else(.data$posteam == .data$home_team, "home", "away"),

# Column denoting which team is on defense:
defteam = dplyr::if_else(
.data$posteam == .data$home_team,
.data$away_team, .data$home_team
),

yardline = dplyr::if_else(.data$yardline == "50", "MID 50", .data$yardline),
yardline = dplyr::if_else(
nchar(.data$yardline) == 0 | is.null(.data$yardline) | .data$yardline == "NULL" | is.na(.data$yardline),
dplyr::lead(.data$yardline), .data$yardline
),
yardline_number = dplyr::if_else(
.data$yardline == "MID 50", 50, .data$yardline_number
),
yardline_100 = dplyr::if_else(
.data$yardline_side == .data$posteam | .data$yardline == "MID 50",
100 - .data$yardline_number, .data$yardline_number
),
# Create a column with the time in seconds remaining for each half:
half_seconds_remaining = dplyr::if_else(
.data$quarter %in% c(1, 3),
.data$quarter_seconds_remaining + 900,
.data$quarter_seconds_remaining),
# Create a column with the time in seconds remaining for the game:
game_seconds_remaining = dplyr::if_else(
.data$quarter %in% c(1, 2, 3, 4),
.data$quarter_seconds_remaining + (900 * (4 - as.numeric(.data$quarter))),
.data$quarter_seconds_remaining
),
# Add column for replay or challenge:
replay_or_challenge = stringr::str_detect(
.data$play_description, "(Replay Official reviewed)|( challenge(d)? )|(Challenged)") %>%
as.numeric(),
# Result of replay or challenge:
replay_or_challenge_result = dplyr::if_else(
.data$replay_or_challenge == 1,
dplyr::if_else(
stringr::str_detect(
tolower(.data$play_description),
"( upheld)|( reversed)|( confirmed)"
),
stringr::str_extract(
tolower(.data$play_description),
"( upheld)|( reversed)|( confirmed)"
) %>%
stringr::str_trim(), "denied"
),
NA_character_
),
# Using the various two point indicators, create a column denoting the result
# outcome for two point conversions:
two_point_conv_result = dplyr::if_else(
Expand Down Expand Up @@ -173,12 +95,7 @@ add_nflscrapr_mutations <- function(pbp) {
.data$field_goal_blocked == 1,
"blocked", .data$field_goal_result
),
# Set the kick_distance for extra points by adding 18 to the yardline_100:
kick_distance = dplyr::if_else(
.data$extra_point_attempt == 1,
.data$yardline_100 + 18,
.data$kick_distance
),

# Using the indicators make a column with the extra point result:
extra_point_result = dplyr::if_else(
.data$extra_point_attempt == 1 &
Expand All @@ -205,6 +122,112 @@ add_nflscrapr_mutations <- function(pbp) {
.data$extra_point_aborted == 1,
"aborted", .data$extra_point_result
),

# find kickoffs with penalty: a play where the next play is a kickoff
# and the prior play wasn't a safety or PAT
lead_ko = case_when(
dplyr::lead(.data$kickoff_attempt) == 1 &
.data$game_id == dplyr::lead(.data$game_id) &
!stringr::str_detect(tolower(.data$play_description), "(injured sf )|(tonight's attendance )|(injury update )|(end quarter)|(timeout)|( captains:)|( captains )|( captians:)|( humidity:)|(note - )|( deferred)|(game start )") &
!stringr::str_detect(.data$play_description, "GAME ") &
!.data$play_description %in% c("GAME", "Two-Minute Warning", "The game has resumed.") &
is.na(.data$two_point_conv_result) &
is.na(.data$extra_point_result) &
is.na(.data$field_goal_result) &
(.data$safety == 0 | is.na(.data$safety)) &
# only applied to seasons after 2000 because earlier data are too messed up
.data$season > 2000 ~ 1,
TRUE ~ 0),

kickoff_attempt = dplyr::if_else(
.data$lead_ko == 1, 1, .data$kickoff_attempt
),

# Make the possession team for kickoffs be the return team, since that is
# more intuitive from the EPA / WPA point of view:
posteam = dplyr::case_when(
# kickoff_finder is defined below
(.data$lead_ko == 1 | .data$kickoff_attempt == 1 | stringr::str_detect(.data$play_description, kickoff_finder)) & .data$posteam == .data$home_team ~ .data$away_team,
(.data$lead_ko == 1 | .data$kickoff_attempt == 1 | stringr::str_detect(.data$play_description, kickoff_finder)) & .data$posteam == .data$away_team ~ .data$home_team,
TRUE ~ .data$posteam
),

# Fill in the rows with missing posteam with the lead:
posteam = dplyr::if_else(
(.data$quarter_end == 1 | .data$posteam == ""),
dplyr::lead(.data$posteam),
.data$posteam),
posteam_id = dplyr::if_else(
(.data$quarter_end == 1 | .data$posteam_id == ""),
dplyr::lead(.data$posteam_id),
.data$posteam_id),

# remove posteam from END Q2 plays or END Q4 plays (when game goes in OT)
# because it doesn't make sense and breaks fixed_drive and fixed_drive_result
posteam = dplyr::if_else(
stringr::str_detect(.data$play_description, "(END QUARTER 2)|(END QUARTER 4)"),
NA_character_, .data$posteam
),

# Denote whether the home or away team has possession:
posteam_type = dplyr::if_else(.data$posteam == .data$home_team, "home", "away"),

# Column denoting which team is on defense:
defteam = dplyr::if_else(
.data$posteam == .data$home_team,
.data$away_team, .data$home_team
),

yardline = dplyr::if_else(.data$yardline == "50", "MID 50", .data$yardline),
yardline = dplyr::if_else(
nchar(.data$yardline) == 0 | is.null(.data$yardline) | .data$yardline == "NULL" | is.na(.data$yardline),
dplyr::lead(.data$yardline), .data$yardline
),
yardline_number = dplyr::if_else(
.data$yardline == "MID 50", 50, .data$yardline_number
),
yardline_100 = dplyr::if_else(
.data$yardline_side == .data$posteam | .data$yardline == "MID 50",
100 - .data$yardline_number, .data$yardline_number
),
# Set the kick_distance for extra points by adding 18 to the yardline_100:
kick_distance = dplyr::if_else(
.data$extra_point_attempt == 1,
.data$yardline_100 + 18,
.data$kick_distance
),
# Create a column with the time in seconds remaining for each half:
half_seconds_remaining = dplyr::if_else(
.data$quarter %in% c(1, 3),
.data$quarter_seconds_remaining + 900,
.data$quarter_seconds_remaining),
# Create a column with the time in seconds remaining for the game:
game_seconds_remaining = dplyr::if_else(
.data$quarter %in% c(1, 2, 3, 4),
.data$quarter_seconds_remaining + (900 * (4 - as.numeric(.data$quarter))),
.data$quarter_seconds_remaining
),
# Add column for replay or challenge:
replay_or_challenge = stringr::str_detect(
.data$play_description, "(Replay Official reviewed)|( challenge(d)? )|(Challenged)") %>%
as.numeric(),
# Result of replay or challenge:
replay_or_challenge_result = dplyr::if_else(
.data$replay_or_challenge == 1,
dplyr::if_else(
stringr::str_detect(
tolower(.data$play_description),
"( upheld)|( reversed)|( confirmed)"
),
stringr::str_extract(
tolower(.data$play_description),
"( upheld)|( reversed)|( confirmed)"
) %>%
stringr::str_trim(), "denied"
),
NA_character_
),

# Create the column denoting the categorical description of the pass length:
pass_length = dplyr::if_else(
.data$two_point_attempt == 0 &
Expand Down
25 changes: 18 additions & 7 deletions R/helper_additional_functions.R
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ clean_pbp <- function(pbp, ...) {

r <- pbp %>%
dplyr::mutate(
aborted_play = dplyr::if_else(stringr::str_detect(.data$desc, 'Aborted'), 1, 0),
#get rid of extraneous spaces that mess with player name finding
#if there is a space or dash, and then a capital letter, and then a period, and then a space, take out the space
desc = stringr::str_replace_all(.data$desc, "(((\\s)|(\\-))[A-Z]\\.)\\s+", "\\1"),
Expand Down Expand Up @@ -131,6 +132,8 @@ clean_pbp <- function(pbp, ...) {
TRUE ~ .data$passer
),
rusher = dplyr::case_when(
rusher == "D.Johnson" & posteam == "HOU" & season == 2020 & rusher_jersey_number == 31 ~ "Da.Johnson",
rusher == "D.Johnson" & posteam == "HOU" & season == 2020 & rusher_jersey_number == 25 ~ "Du.Johnson",
rusher == "Jos.Allen" ~ "J.Allen",
rusher == "Alex Smith" | rusher == "Ale.Smith" ~ "A.Smith",
rusher == "Ryan" & .data$posteam == "ATL" ~ "M.Ryan",
Expand All @@ -152,7 +155,6 @@ clean_pbp <- function(pbp, ...) {
TRUE ~ receiver
),
first_down = dplyr::if_else(.data$first_down_rush == 1 | .data$first_down_pass == 1 | .data$first_down_penalty == 1, 1, 0),
aborted_play = dplyr::if_else(stringr::str_detect(.data$desc, 'Aborted'), 1, 0),
# easy filter: play is 1 if a "special teams" play, or 0 otherwise
# with thanks to Lee Sharpe for the code
special = dplyr::if_else(.data$play_type %in%
Expand All @@ -179,33 +181,42 @@ clean_pbp <- function(pbp, ...) {

dplyr::group_by(.data$passer, .data$posteam, .data$season) %>%
dplyr::mutate(
passer_id = dplyr::if_else(is.na(.data$passer), NA_character_, custom_mode(.data$passer_player_id)),
passer_jersey_number = dplyr::if_else(is.na(.data$passer), NA_integer_, custom_mode(.data$passer_jersey_number))
passer_id = dplyr::if_else(is.na(.data$passer), NA_character_, custom_mode(.data$passer_player_id))
) %>%

dplyr::group_by(.data$passer_id) %>%
dplyr::mutate(passer = dplyr::if_else(is.na(.data$passer_id), NA_character_, custom_mode(.data$passer))) %>%

dplyr::group_by(.data$rusher, .data$posteam, .data$season) %>%
dplyr::mutate(
rusher_id = dplyr::if_else(is.na(.data$rusher), NA_character_, custom_mode(.data$rusher_player_id)),
rusher_jersey_number = dplyr::if_else(is.na(.data$rusher), NA_integer_, custom_mode(.data$rusher_jersey_number))
rusher_id = dplyr::if_else(is.na(.data$rusher), NA_character_, custom_mode(.data$rusher_player_id))
) %>%

dplyr::group_by(.data$rusher_id) %>%
dplyr::mutate(rusher = dplyr::if_else(is.na(.data$rusher_id), NA_character_, custom_mode(.data$rusher))) %>%

dplyr::group_by(.data$receiver, .data$posteam, .data$season) %>%
dplyr::mutate(
receiver_id = dplyr::if_else(is.na(.data$receiver), NA_character_, custom_mode(.data$receiver_player_id)),
receiver_jersey_number = dplyr::if_else(is.na(.data$receiver), NA_integer_, custom_mode(.data$receiver_jersey_number))
receiver_id = dplyr::if_else(is.na(.data$receiver), NA_character_, custom_mode(.data$receiver_player_id))
) %>%

dplyr::group_by(.data$receiver_id) %>%
dplyr::mutate(receiver = dplyr::if_else(is.na(.data$receiver_id), NA_character_, custom_mode(.data$receiver))) %>%

dplyr::ungroup() %>%
dplyr::mutate(
# if there's an aborted snap and qb didn't get a pass off,
# then charge it to whoever was charged with the fumble
# this has to go after all the custom_mode stuff or it gets messed up
rusher = dplyr::if_else(
.data$aborted_play == 1 & is.na(.data$passer) & !is.na(.data$fumbled_1_player_name),
.data$fumbled_1_player_name, .data$rusher
),
rusher_id = dplyr::if_else(
.data$aborted_play == 1 & is.na(.data$passer) & !is.na(.data$fumbled_1_player_id),
.data$fumbled_1_player_id, .data$rusher_id
),

name = dplyr::if_else(!is.na(.data$passer), .data$passer, .data$rusher),
jersey_number = dplyr::if_else(!is.na(.data$passer_jersey_number), .data$passer_jersey_number, .data$rusher_jersey_number),
id = dplyr::if_else(!is.na(.data$passer_id), .data$passer_id, .data$rusher_id)
Expand Down
25 changes: 16 additions & 9 deletions data-raw/compare_dfs.R
Original file line number Diff line number Diff line change
@@ -1,17 +1,18 @@
library(tidyverse)
future::plan("multisession")

# function for comparing revisions against data in repo
# make sure to build package first
compare_pbp <- function(id, cols) {

s <- substr(id, 1, 4) %>% as.integer()
s <- substr(id[1], 1, 4) %>% as.integer()
# no idea why this is necessary
game <- id
games <- id

new_pbp <- build_nflfastR_pbp(
id
# comment this out to use the "normal" way
, dir = "../nflfastR-raw/raw_old"
# , dir = "../nflfastR-raw/raw"
) %>%
filter(!stringr::str_detect(desc, "GAME")) %>%
select(all_of(cols)) %>%
Expand All @@ -23,7 +24,7 @@ compare_pbp <- function(id, cols) {
)

repo_pbp <- readRDS(url(glue::glue("https://raw.githubusercontent.com/guga31bb/nflfastR-data/master/data/play_by_play_{s}.rds"))) %>%
filter(game_id == game) %>%
filter(game_id %in% games) %>%
filter(!stringr::str_detect(desc, "GAME")) %>%
select(all_of(cols)) %>%
mutate(
Expand Down Expand Up @@ -51,21 +52,27 @@ compare_pbp <- function(id, cols) {

cols <- c(
# DO NOT REMOVE THESE ONES OR THE COMPARISON WILL BREAK
"play_id", "desc", "ep", "epa", "vegas_home_wp",
"game_id", "play_id", "desc", "ep", "epa", "vegas_home_wp",

# here is stuff you can choose whether to include
"posteam", "home_team", "away_team"
"posteam", "home_team", "away_team", "name", "rusher"
# , "posteam_timeouts_remaining", "defteam_timeouts_remaining"
)
)

id <- "2002_05_PHI_JAX"
id <- "2006_01_MIA_PIT"
id <- "2006_02_PIT_JAX"
id <- "2006_03_JAX_IND"
id <- "2017_08_LAC_NE"
id <- "2006_04_JAX_WAS"
id <- "2019_01_SF_TB"
id <- "2017_12_JAX_ARI"

ids <- nflfastR::fast_scraper_schedules(2020) %>%
dplyr::slice(11:20) %>%
pull(game_id)

compared <- compare_pbp(
id = id,
id = ids,
cols = cols
)

Expand Down