cmu-delphi · dshemetov · Apr 4, 2025 · Mar 26, 2025 · Apr 4, 2025 · Apr 4, 2025
diff --git a/Makefile b/Makefile
@@ -20,7 +20,17 @@ prod-flu:
 	export TAR_RUN_PROJECT=flu_hosp_prod; \
 	Rscript scripts/run.R
 
-prod: prod-covid prod-flu update_site netlify
+prod: prod-covid prod-flu update-site netlify
+
+explore-covid:
+	export TAR_RUN_PROJECT=covid_hosp_explore; \
+	Rscript scripts/run.R
+
+explore-flu:
+	export TAR_RUN_PROJECT=flu_hosp_explore; \
+	Rscript scripts/run.R
+
+explore: explore-covid explore-flu update-site netlify
 
 submit-covid:
 	cd ../covid19-forecast-hub; \

diff --git a/R/aux_data_utils.R b/R/aux_data_utils.R
@@ -90,8 +90,7 @@ add_pop_and_density <-
         )
     }
     if (!("agg_level" %in% names(original_dataset))) {
-      original_dataset %<>%
-        mutate(agg_level = ifelse(grepl("[0-9]{2}", geo_value), "hhs_region", ifelse(("us" == geo_value) | ("usa" == geo_value), "nation", "state")))
+      original_dataset %<>% add_agg_level()
     }
     original_dataset %>%
       mutate(year = year(time_value)) %>%
@@ -106,6 +105,15 @@ add_pop_and_density <-
       fill(population, density)
   }
 
+add_agg_level <- function(data) {
+  data %>%
+    mutate(agg_level = case_when(
+      grepl("[0-9]{2}", geo_value) ~ "hhs_region",
+      geo_value %in% c("us", "usa") ~ "nation",
+      .default = "state"
+    ))
+}
+
 gen_pop_and_density_data <-
   function(apportion_filename = here::here("aux_data", "flusion_data", "apportionment.csv"),
            state_code_filename = here::here("aux_data", "flusion_data", "state_codes_table.csv"),
@@ -188,13 +196,15 @@ gen_pop_and_density_data <-
 daily_to_weekly <- function(epi_df, agg_method = c("sum", "mean"), keys = "geo_value", values = c("value")) {
   agg_method <- arg_match(agg_method)
   epi_df %>%
+    arrange(across(all_of(c(keys, "time_value")))) %>%
     mutate(epiweek = epiweek(time_value), year = epiyear(time_value)) %>%
     group_by(across(any_of(c(keys, "epiweek", "year")))) %>%
     summarize(
       across(all_of(values), ~ sum(.x, na.rm = TRUE)),
       time_value = floor_date(max(time_value), "weeks", week_start = 7) + 3,
       .groups = "drop"
     ) %>%
+    arrange(across(all_of(c(keys, "time_value")))) %>%
     select(-epiweek, -year)
 }
 
@@ -336,9 +346,7 @@ add_hhs_region_sum <- function(archive_data_raw, hhs_region_table) {
   archive_data_raw %<>%
     filter(agg_level != "state") %>%
     mutate(hhs_region = hhs) %>%
-    bind_rows(
-      hhs_region_agg_state
-    )
+    bind_rows(hhs_region_agg_state)
   if (need_agg_level) {
     archive_data_raw %<>% select(-agg_level)
   }
@@ -401,11 +409,32 @@ get_health_data <- function(as_of, disease = c("covid", "flu")) {
     # Get something sort of compatible with that by summing to national with
     # na.omit = TRUE. As otherwise we have some NAs from probably territories
     # propagated to US level.
-    bind_rows(
-      (.) %>%
-        group_by(time_value) %>%
-        summarize(geo_value = "us", hhs = sum(hhs, na.rm = TRUE))
-    )
+    append_us_aggregate("hhs")
+}
+
+#' Append a national aggregate to a dataframe
+#'
+#' Computes national values by summing all the values per group_keys.
+#' Removes pre-existing national values.
+#'
+#' @param df A dataframe with a `geo_value` column.
+#' @param cols A character vector of column names to aggregate.
+#' @param group_keys A character vector of column names to group by.
+#' @return A dataframe with a `geo_value` column.
+append_us_aggregate <- function(df, cols = NULL, group_keys = c("time_value")) {
+  if (!(is.data.frame(df))) {
+    cli::cli_abort("df must be a data.frame", call = rlang::caller_env())
+  }
+  national_col_names <- c("us", "usa", "national", "nation", "US", "USA")
+  df1 <- df %>% filter(geo_value %nin% national_col_names)
+  if (is.null(cols)) {
+    df2 <- df1 %>%
+      summarize(geo_value = "us", across(where(is.numeric), ~ sum(.x, na.rm = TRUE)), .by = all_of(group_keys))
+  } else {
+    df2 <- df1 %>%
+      summarize(geo_value = "us", across(all_of(cols), ~ sum(.x, na.rm = TRUE)), .by = all_of(group_keys))
+  }
+  bind_rows(df1, df2)
 }
 
 calculate_burden_adjustment <- function(flusurv_latest) {
@@ -718,7 +747,7 @@ up_to_date_nssp_state_archive <- function(disease = c("covid", "influenza")) {
     wait_seconds = 1,
     fn = pub_covidcast,
     source = "nssp",
-    signal = glue::glue("pct_ed_visits_{disease}"),
+    signals = glue::glue("pct_ed_visits_{disease}"),
     time_type = "week",
     geo_type = "state",
     geo_values = "*",
@@ -727,7 +756,7 @@ up_to_date_nssp_state_archive <- function(disease = c("covid", "influenza")) {
   nssp_state %>%
     select(geo_value, time_value, issue, nssp = value) %>%
     as_epi_archive(compactify = TRUE) %>%
-    `$`("DT") %>%
+    extract2("DT") %>%
     # End of week to midweek correction.
     mutate(time_value = time_value + 3) %>%
     as_epi_archive(compactify = TRUE)

diff --git a/R/forecasters/data_transforms.R b/R/forecasters/data_transforms.R
@@ -104,17 +104,14 @@ rolling_sd <- function(epi_data, sd_width = 29L, mean_width = NULL, cols_to_sd =
 #' @importFrom tidyr drop_na
 #' @importFrom epiprocess as_epi_df
 #' @export
-clear_lastminute_nas <- function(epi_data, outcome, extra_sources) {
+clear_lastminute_nas <- function(epi_data, cols) {
   meta_data <- attr(epi_data, "metadata")
-  if (extra_sources == c("")) {
-    extra_sources <- character(0L)
-  }
   as_of <- attributes(epi_data)$metadata$as_of
   other_keys <- attributes(epi_data)$metadata$other_keys %||% character()
   epi_data %>% na.omit()
   # make sure at least one column is not NA
   epi_data %<>%
-    filter(if_any(c(!!outcome, !!extra_sources), ~ !is.na(.x))) %>%
+    filter(if_any(c(!!cols), ~ !is.na(.x))) %>%
     as_epi_df(as_of = as_of, other_keys = other_keys)
   attr(epi_data, "metadata") <- meta_data
   return(epi_data)

diff --git a/R/forecasters/data_validation.R b/R/forecasters/data_validation.R
@@ -30,8 +30,7 @@ sanitize_args_predictors_trainer <- function(epi_data,
   if (!is.null(trainer) && !epipredict:::is_regression(trainer)) {
     cli::cli_abort("{trainer} must be a `{parsnip}` model of mode 'regression'.")
   } else if (inherits(trainer, "rand_forest") && trainer$engine == "grf_quantiles") {
-    trainer %<>%
-      set_engine("grf_quantiles", quantiles = args_list$quantile_levels)
+    trainer %<>% set_engine("grf_quantiles", quantiles = args_list$quantile_levels)
   } else if (inherits(trainer, "quantile_reg")) {
     # add all quantile_levels to the trainer and update args list
     quantile_levels <- sort(epipredict:::compare_quantile_args(
@@ -97,3 +96,17 @@ filter_extraneous <- function(epi_data, filter_source, filter_agg_level) {
   }
   return(epi_data)
 }
+
+#' Unwrap an argument if it's a list of length 1
+#'
+#' Many of our arguments to the forecasters come as lists not because we expect
+#' them that way, but as a byproduct of tibble and expand_grid.
+unwrap_argument <- function(arg, default_trigger = "", default = character(0L)) {
+  if (is.list(arg) && length(arg) == 1) {
+    arg <- arg[[1]]
+  }
+  if (identical(arg, default_trigger)) {
+    return(default)
+  }
+  return(arg)
+}
diff --git a/R/forecasters/forecaster_climatological.R b/R/forecasters/forecaster_climatological.R
@@ -22,6 +22,8 @@ climate_linear_ensembled <- function(epi_data,
   nonlin_method <- arg_match(nonlin_method)
 
   epi_data <- validate_epi_data(epi_data)
+  extra_sources <- unwrap_argument(extra_sources)
+  trainer <- unwrap_argument(trainer)
 
   args_list <- list(...)
   ahead <- as.integer(ahead / 7)

diff --git a/R/forecasters/forecaster_flatline.R b/R/forecasters/forecaster_flatline.R
@@ -18,10 +18,13 @@ flatline_fc <- function(epi_data,
                         filter_agg_level = "",
                         ...) {
   epi_data <- validate_epi_data(epi_data)
+  extra_sources <- unwrap_argument(extra_sources)
+  trainer <- unwrap_argument(trainer)
+
   # perform any preprocessing not supported by epipredict
   epi_data %<>% filter_extraneous(filter_source, filter_agg_level)
   # this is a temp fix until a real fix gets put into epipredict
-  epi_data <- clear_lastminute_nas(epi_data, outcome, extra_sources)
+  epi_data <- clear_lastminute_nas(epi_data, cols = c(outcome, extra_sources))
   # one that every forecaster will need to handle: how to manage max(time_value)
   # that's older than the `as_of` date
   c(epi_data, ahead) %<-% extend_ahead(epi_data, ahead)

diff --git a/R/forecasters/forecaster_flusion.R b/R/forecasters/forecaster_flusion.R
@@ -24,6 +24,8 @@ flusion <- function(epi_data,
   derivative_estimator <- arg_match(derivative_estimator)
 
   epi_data <- validate_epi_data(epi_data)
+  extra_sources <- unwrap_argument(extra_sources)
+  trainer <- unwrap_argument(trainer)
 
   # perform any preprocessing not supported by epipredict
   args_input <- list(...)

diff --git a/R/forecasters/forecaster_no_recent_outcome.R b/R/forecasters/forecaster_no_recent_outcome.R
@@ -24,6 +24,8 @@ no_recent_outcome <- function(epi_data,
   week_method <- arg_match(week_method)
 
   epi_data <- validate_epi_data(epi_data)
+  extra_sources <- unwrap_argument(extra_sources)
+  trainer <- unwrap_argument(trainer)
 
   # this is for the case where there are multiple sources in the same column
   epi_data %<>% filter_extraneous(filter_source, filter_agg_level)
@@ -62,10 +64,10 @@ no_recent_outcome <- function(epi_data,
   args_input[["quantile_levels"]] <- quantile_levels
   args_list <- do.call(default_args_list, args_input)
   # if you want to hardcode particular predictors in a particular forecaster
-  if (identical(extra_sources[[1]], "")) {
+  if (identical(extra_sources, character(0L))) {
     predictors <- character()
   } else {
-    predictors <- extra_sources[[1]]
+    predictors <- extra_sources
   }
   c(args_list, tmp_pred, trainer) %<-% sanitize_args_predictors_trainer(epi_data, outcome, predictors, trainer, args_list)
 

diff --git a/R/forecasters/forecaster_scaled_pop.R b/R/forecasters/forecaster_scaled_pop.R
@@ -64,6 +64,8 @@ scaled_pop <- function(epi_data,
   nonlin_method <- arg_match(nonlin_method)
 
   epi_data <- validate_epi_data(epi_data)
+  extra_sources <- unwrap_argument(extra_sources)
+  trainer <- unwrap_argument(trainer)
 
   # perform any preprocessing not supported by epipredict
   #
@@ -94,7 +96,7 @@ scaled_pop <- function(epi_data,
   args_input[["nonneg"]] <- scale_method == "none"
   args_list <- inject(default_args_list(!!!args_input))
   # if you want to hardcode particular predictors in a particular forecaster
-  predictors <- c(outcome, extra_sources[[1]])
+  predictors <- c(outcome, extra_sources)
   c(args_list, predictors, trainer) %<-% sanitize_args_predictors_trainer(epi_data, outcome, predictors, trainer, args_list)
   # end of the copypasta
   # finally, any other pre-processing (e.g. smoothing) that isn't performed by

diff --git a/R/forecasters/forecaster_scaled_pop_seasonal.R b/R/forecasters/forecaster_scaled_pop_seasonal.R
@@ -58,11 +58,9 @@ scaled_pop_seasonal <- function(epi_data,
   nonlin_method <- arg_match(nonlin_method)
 
   epi_data <- validate_epi_data(epi_data)
+  extra_sources <- unwrap_argument(extra_sources)
+  trainer <- unwrap_argument(trainer)
 
-  # TODO: handle this when creating param grid?
-  if (typeof(seasonal_method) == "list") {
-    seasonal_method <- seasonal_method[[1]]
-  }
   if (all(seasonal_method == c("none", "flu", "covid", "indicator", "window", "climatological"))) {
     seasonal_method <- "none"
   }
@@ -71,7 +69,7 @@ scaled_pop_seasonal <- function(epi_data,
   # this is for the case where there are multiple sources in the same column
   epi_data %<>% filter_extraneous(filter_source, filter_agg_level)
   # this is a temp fix until a real fix gets put into epipredict
-  epi_data <- clear_lastminute_nas(epi_data, outcome, extra_sources)
+  epi_data <- clear_lastminute_nas(epi_data, cols = c(outcome, extra_sources))
   # this next part is basically unavoidable boilerplate you'll want to copy
   args_input <- list(...)
   # edge case where there is no data or less data than the lags; eventually epipredict will handle this
@@ -100,7 +98,7 @@ scaled_pop_seasonal <- function(epi_data,
   args_input[["seasonal_forward_window"]] <- seasonal_forward_window + ahead
   args_list <- inject(default_args_list(!!!args_input))
   # if you want to hardcode particular predictors in a particular forecaster
-  predictors <- c(outcome, extra_sources[[1]])
+  predictors <- c(outcome, extra_sources)
   c(args_list, predictors, trainer) %<-% sanitize_args_predictors_trainer(epi_data, outcome, predictors, trainer, args_list)
 
   if ("season_week" %nin% names(epi_data)) {

diff --git a/R/forecasters/forecaster_smoothed_scaled.R b/R/forecasters/forecaster_smoothed_scaled.R
@@ -73,13 +73,15 @@ smoothed_scaled <- function(epi_data,
   nonlin_method <- arg_match(nonlin_method)
 
   epi_data <- validate_epi_data(epi_data)
+  extra_sources <- unwrap_argument(extra_sources)
+  trainer <- unwrap_argument(trainer)
 
   # perform any preprocessing not supported by epipredict
   #
   # this is for the case where there are multiple sources in the same column
   epi_data %<>% filter_extraneous(filter_source, filter_agg_level)
   # this is a temp fix until a real fix gets put into epipredict
-  epi_data <- clear_lastminute_nas(epi_data, outcome, extra_sources)
+  epi_data <- clear_lastminute_nas(epi_data, cols = c(outcome, extra_sources))
   # see latency_adjusting for other examples
   args_input <- list(...)
   # edge case where there is no data or less data than the lags; eventually epipredict will handle this
@@ -106,14 +108,12 @@ smoothed_scaled <- function(epi_data,
   args_list <- inject(default_args_list(!!!args_input))
   # `extra_sources` sets which variables beyond the outcome are lagged and used as predictors
   # any which are modified by `rolling_mean` or `rolling_sd` have their original values dropped later
-  predictors <- c(outcome, extra_sources[[1]])
-  predictors <- predictors[predictors != ""]
+  predictors <- c(outcome, extra_sources)
   # end of the copypasta
   # finally, any other pre-processing (e.g. smoothing) that isn't performed by
   # epipredict
 
 
-
   #######################
   # robust whitening
   #######################