|
| 1 | +#' Prepare xG data |
| 2 | +#' |
| 3 | +#' @description Helper function to prepare hockeyR pbp data for xG calculations |
| 4 | +#' |
| 5 | +#' @param x A play-by-play data frame generated by hockeyR before xG is calculated |
| 6 | +#' |
| 7 | +#' @return A tibble; pbp data with xG model mutations along with identifiers for game and strength state |
| 8 | +#' |
| 9 | +#' @examples |
| 10 | +#' \dontrun{ |
| 11 | +#' pbp <- load_pbp(2022) |> dplyr::select(-xg) |
| 12 | +#' model_data <- prepare_xg_data(pbp) |
| 13 | +#' } |
| 14 | +prepare_xg_data <- function(x){ |
| 15 | + |
| 16 | + model_df <- x |> |
| 17 | + # filter out shootouts |
| 18 | + dplyr::filter(period_type != "SHOOTOUT") |> |
| 19 | + # remove penalty shots |
| 20 | + dplyr::filter(secondary_type != "Penalty Shot" | is.na(secondary_type)) |> |
| 21 | + # remove shift change events, which were excluded from model |
| 22 | + dplyr::filter(event_type != "CHANGE") |> |
| 23 | + # add model feature variables |
| 24 | + dplyr::group_by(game_id) |> |
| 25 | + dplyr::mutate( |
| 26 | + last_event_type = dplyr::lag(event_type), |
| 27 | + last_event_team = dplyr::lag(event_team), |
| 28 | + time_since_last = game_seconds - dplyr::lag(game_seconds), |
| 29 | + last_x = dplyr::lag(x), |
| 30 | + last_y = dplyr::lag(y), |
| 31 | + distance_from_last = round(sqrt(((y - last_y)^2) + ((x - last_x)^2)),1), |
| 32 | + event_zone = dplyr::case_when( |
| 33 | + x >= -25 & x <= 25 ~ "NZ", |
| 34 | + (x_fixed < -25 & event_team == home_name) | |
| 35 | + (x_fixed > 25 & event_team == away_name) ~ "DZ", |
| 36 | + (x_fixed > 25 & event_team == home_name) | |
| 37 | + (x_fixed < -25 & event_team == away_name) ~ "OZ" |
| 38 | + ), |
| 39 | + last_event_zone = dplyr::lag(event_zone) |
| 40 | + ) |> |
| 41 | + dplyr::ungroup() |> |
| 42 | + # filter to only unblocked shots |
| 43 | + dplyr::filter(event_type %in% c("SHOT","MISSED_SHOT","GOAL")) |> |
| 44 | + # get rid off oddball last_events |
| 45 | + # ie "EARLY_INTERMISSION_START" |
| 46 | + dplyr::filter( |
| 47 | + last_event_type %in% c( |
| 48 | + "FACEOFF","GIVEAWAY","TAKEAWAY","BLOCKED_SHOT","HIT", |
| 49 | + "MISSED_SHOT","SHOT","STOP","PENALTY","GOAL" |
| 50 | + ) |
| 51 | + ) |> |
| 52 | + # add more feature variables |
| 53 | + dplyr::mutate( |
| 54 | + era_2011_2013 = ifelse( |
| 55 | + season %in% c("20102011","20112012","20122013"), |
| 56 | + 1, 0 |
| 57 | + ), |
| 58 | + era_2014_2018 = ifelse( |
| 59 | + season %in% c("20132014","20142015","20152016","20162017","20172018"), |
| 60 | + 1, 0 |
| 61 | + ), |
| 62 | + era_2019_2021 = ifelse( |
| 63 | + season %in% c("20182019","20192020","20202021"), |
| 64 | + 1, 0 |
| 65 | + ), |
| 66 | + era_2022_on = ifelse( |
| 67 | + as.numeric(season) > 20202021, 1, 0 |
| 68 | + ), |
| 69 | + # these are only for the ST model |
| 70 | + event_team_skaters = ifelse(event_team == home_name, home_skaters, away_skaters), |
| 71 | + opponent_team_skaters = ifelse(event_team == home_name, away_skaters, home_skaters), |
| 72 | + total_skaters_on = event_team_skaters + opponent_team_skaters, |
| 73 | + event_team_advantage = event_team_skaters - opponent_team_skaters, |
| 74 | + # these are in 5v5 model |
| 75 | + rebound = ifelse(last_event_type %in% c("SHOT","MISSED_SHOT","GOAL") & time_since_last <= 2, 1, 0), |
| 76 | + rush = ifelse(last_event_zone %in% c("NZ","DZ") & time_since_last <= 4, 1, 0), |
| 77 | + cross_ice_event = ifelse( |
| 78 | + # indicates goalie had to move from one post to the other |
| 79 | + last_event_zone == "OZ" & |
| 80 | + ((last_y > 3 & y < -3) | (last_y < -3 & y > 3)) & |
| 81 | + # need some sort of time frame here to indicate shot was quick after goalie had to move |
| 82 | + time_since_last <= 2, 1, 0 |
| 83 | + ), |
| 84 | + # fix missing empty net vars |
| 85 | + empty_net = ifelse(is.na(empty_net) | empty_net == FALSE, FALSE, TRUE), |
| 86 | + shot_type = secondary_type, |
| 87 | + goal = ifelse(event_type == "GOAL", 1, 0) |
| 88 | + ) |> |
| 89 | + dplyr::select(season, game_id, event_id, strength_state, shot_distance, shot_angle, empty_net, last_event_type:goal) |> |
| 90 | + # one-hot encode some categorical vars |
| 91 | + dplyr::mutate(type_value = 1, last_value = 1) |> |
| 92 | + tidyr::pivot_wider(names_from = shot_type, values_from = type_value, values_fill = 0) |> |
| 93 | + tidyr::pivot_wider( |
| 94 | + names_from = last_event_type, values_from = last_value, values_fill = 0, names_prefix = "last_" |
| 95 | + ) |> |
| 96 | + janitor::clean_names() |> |
| 97 | + dplyr::select( |
| 98 | + -last_event_team, -event_zone, -last_event_zone, -event_team_skaters, -opponent_team_skaters |
| 99 | + ) |
| 100 | + |
| 101 | + if("na" %in% names(model_df)){ |
| 102 | + model_df <- dplyr::select(model_df, -na) |
| 103 | + } |
| 104 | + |
| 105 | + `%not_in%` <- purrr::negate(`%in%`) |
| 106 | + |
| 107 | + missing_feats <- dplyr::tibble( |
| 108 | + feature = xg_model_5v5$feature_names |
| 109 | + ) |> |
| 110 | + dplyr::filter(feature %not_in% names(model_df)) |> |
| 111 | + dplyr::mutate(val = 0) |> |
| 112 | + tidyr::pivot_wider(names_from = feature, values_from = val) |
| 113 | + |
| 114 | + if(length(missing_feats) > 0){ |
| 115 | + model_df <- dplyr::bind_cols(model_df, missing_feats) |
| 116 | + } |
| 117 | + |
| 118 | + return(model_df) |
| 119 | + |
| 120 | +} |
0 commit comments