Downloading and Prepping the Data

The play-by-play data used for the strategy simulations comes from nflscrapR and is read into R using the NFLSimulatoR function download_nflscrapR_data(). The prep_pbp_data() function will then clean and prepare the data for use in the sample_drives() function. Furthermore, we wanted our results to represent “normal” NFL drives where score differential and time remaining would not influence play calling. This was accomplished by filtering the play-by-play data to require a score differential of less than 28 points and greater than 2 minutes remaining in a half.

df <- dplyr::bind_cols(
  nflfastR::load_pbp(2018),
  nflfastR::load_pbp(2019))
  
pbp_data <- df %>% 
  prep_pbp_data(.) %>%
  filter(abs(score_differential) < 28, half_seconds_remaining/60 > 2)

pbp_data_18 <- pbp_data %>% 
  filter(.,substr(game_date,0,4) == 2018)

pbp_data_19 <- pbp_data %>% 
  filter(.,substr(game_date,0,4) == 2019)

Simulating Drives based on Pass Proportion

With the sample_drives() function, drives can be simulated according to a chosen proportion of pass plays by selecting the passes_rushes strategy. To improve performance when running thousands of simulated drives, the foreach and doParallel packages are used to run the simulations on multiple cores. The following simulates drives for a pass proportion of 0 to 1 (by .1) and stores the results in a data frame results.

# Pass Proportion 2019
drives <- NULL
results_pass_19 <- NULL
df_drives <- NULL
registerDoParallel(cores = 4)
prop <- seq(0,1, by = .1)
results_pass_19 <- foreach (i= 1:11, .combine = rbind, .packages = c("NFLSimulatoR", "progress","dplyr", "tidyverse"))  %dopar% {
  set.seed(i)
  drives <- sample_drives(n_sims = 10, 
                          from_yard_line = 25, 
                          play_by_play_data = pbp_data_19, 
                          strategy = "passes_rushes",
                          single_drive = T,
                          progress = F,
                          prop_passes = prop[i])
  df_drives <- drives %>%
  #add additional identifiers below as needed i.e. year, etc
  mutate(proportion = prop[i],year = 19)
}

Simulating Drives based on Fourth Down Strategies

We can also compare various fourth down strategies using the sample_drives() function. Simply pass “fourth_downs” to the strategy argument and a vector storing selected strategies to the fourth_down_strategy argument.

# 4th down strategies
  drives <- NULL
  results_fourths_1 <- NULL
  df_drives <- NULL
  registerDoParallel(cores = 4)
  strats <- c("always_go_for_it","empirical","exp_pts","never_go_for_it", "yds_less_than")
  results_fourths_1 <- foreach (i = 3:4, .combine = rbind, .packages = c("NFLSimulatoR", "progress","dplyr", "tidyverse"))  %dopar% {
    set.seed(i)
    drives <- sample_drives(n_sims = 10000, 
                            from_yard_line = 25, 
                            play_by_play_data = pbp_data, 
                            strategy = "fourth_downs",
                            fourth_down_strategy = strats[i],
                            single_drive = T,
                            progress = F
                            )
    df_drives <- drives %>%
    #add additional identifiers below as needed i.e. year, etc
    mutate(Scenario = strats[i])
  }

Adding a Layer to the Passes Rushes Simulations

Downloading and filtering data based on team Passer Rating (RTG)

To further analyze passing vs. rushing we can run the simulations based on a team’s ability to pass the football. This is accomplished by dividing the play-by-play data into groups three groups (low, mid, high) based on a team’s respective Passer Rating (RTG) relative to the league average over the last three seasons (2017-2019). This file can be downloaded from Google Drive here. The six datasets, three for 2018 and 2019 respectively, are stored in the list object RTG_list.

# RTG Data
# Team RTG read in (2017-2019)
RTG <- read.csv("path/to/file/given/above/Team_Passing_Offense.csv")

#Store Tercile Cutoffs (2017-2019)

cutoffs <- quantile(RTG$Rate,probs = c(0:3/3)) 

# Passer Rate Terciles

RTG_list <- list()
years <- c("2018","2019")
terciles <- c("Low","Mid","High")
for (j in 1:2){
  list_year <- list()
  for (i in 1:3){
    teams <- RTG %>% 
      filter(.,Year == years[j],
             Rate >= cutoffs[i] & Rate < cutoffs[(i+1)] ) %>% 
      select(.,Team)
    list_year[[paste(terciles[i],years[j],sep = "_")]] <- pbp_data %>% 
      filter(.,substr(game_date,0,4) == years[j],
             posteam %in% as.matrix(teams))
  }
  RTG_list <- append(RTG_list,list_year)
}

Running the RTG Simulations

Using the same structure as the pass vs. rush simulations above, we can simulate drives using each of the 6 subsets of data.

# Passer Rating - RTG
drives <- NULL
df_drives <- NULL
RTG_thirds_sims <- NULL
registerDoParallel(cores = 4)
prop <- seq(0,1, by = .1)
RTG_thirds_sims <- foreach (j = 1:6, .combine = rbind ) %:%
  foreach (i= 1:11, .combine = rbind, .packages = c("NFLSimulatoR", "progress","dplyr", "tidyverse"))  %dopar% {
    set.seed(i)
    drives <- sample_drives(n_sims = 10, 
                          from_yard_line = 25, 
                          play_by_play_data = RTG_list[[j]], 
                          strategy = "passes_rushes",
                          single_drive = T,
                          progress = F,
                          prop_passes = prop[i])
    df_drives <- drives %>%
    #add additional identifiers below as needed i.e. year, etc
    mutate(proportion = prop[i],
           RTG = names(RTG_list[j]),
           year = substr(RTG, nchar(RTG)-1, nchar(RTG)))
}