Climatological backtest (#190)

dsweber2 · web-flow · commit b43cd7d21752 · 2025-04-03T16:58:46.000-07:00
diff --git a/R/aux_data_utils.R b/R/aux_data_utils.R
@@ -304,7 +304,7 @@ drop_non_seasons <- function(epi_data, min_window = 12) {
         (forecast_date - time_value < as.difftime(min_window, units = "weeks")),
       season != "2020/21",
       # season != "2021/22", # keeping this because whitening otherwise gets really bad with the single season of data
-      (season != "2019/20") | (time_value < "2020-03-01"),
+      (season != "2019/20"),
       season != "2008/09"
     )
 }
diff --git a/R/forecasters/climatological_model.R b/R/forecasters/climatological_model.R
@@ -2,7 +2,8 @@
 #' @param epi_data expected to have columns time_value, geo_value, season, value,
 climatological_model <- function(epi_data, ahead, window_size = 3,
                                  recent_window = 3, quantile_method = c("baseR", "epipredict"),
-                                 quant_type = 8, geo_agg = FALSE) {
+                                 quant_type = 8, geo_agg = FALSE,
+                                 floor_value = 0, pop_scale = FALSE, include_forecast_date = TRUE) {
   quantile_method <- arg_match(quantile_method)
   forecast_date <- attributes(epi_data)$metadata$as_of
   forecast_week <- epiweek(forecast_date)
@@ -20,17 +21,23 @@ climatological_model <- function(epi_data, ahead, window_size = 3,
   # drop weird years
   filtered %<>% filter((season != "2020/21") & (season != "2021/22"))
   # keep data either within the window, or within the past window weeks
-  filtered %<>% filter(
-    (abs(forecast_week + ahead - epiweek) <= window_size) |
-      (last_date_data - time_value <= recent_window * 7)
-  )
-
-  if (geo_agg) {
+  if (include_forecast_date) {
+    filtered %<>% filter(
+      (abs(forecast_week + ahead - epiweek) <= window_size) |
+        (last_date_data - time_value <= recent_window * 7)
+    )
+  } else {
+    filtered %<>% filter(
+      (abs(forecast_week + ahead - epiweek) <= window_size)
+    )
+  }
+  # filtered %>% ggplot(aes(x = epiweek, y = value, color = source)) + geom_point() + facet_wrap(~geo_value); epi_data %>% autoplot(value, .facet_by = "geo_value", color = "source")
+  if (geo_agg && pop_scale) {
     filtered %<>%
       add_pop_and_density() %>%
       mutate(value = value / population * 1e5) %>%
-      select(geo_value, epiweek, epiyear, season, season_week, value, population)
-  } else {
+      select(any_of(c("geo_value", "epiweek", "epiyear", "season", "season_week", "value", "population")))
+  } else if (!geo_agg) {
     filtered %<>%
       group_by(geo_value)
   }
@@ -56,18 +63,26 @@ climatological_model <- function(epi_data, ahead, window_size = 3,
       summarize(.dist_quantile = dist_quantiles(value, quantile), .groups = "keep") %>%
       reframe(tibble(quantile = covidhub_probs(), value = quantile(.dist_quantile, p = covidhub_probs())[[1]]))
   }
-  naive_preds %<>% mutate(value = pmax(0, value))
+  naive_preds %<>% mutate(value = pmax(floor_value, value))
   if (geo_agg) {
     naive_preds %<>%
       expand_grid(
-        filtered %>% distinct(geo_value, population)
-      ) %>%
-      mutate(value = value * population / 1e5) %>%
-      select(-population) %>%
+        filtered %>% distinct(geo_value)
+      )
+    if (pop_scale) {
+      naive_preds %<>%
+        left_join(
+          filtered %>%
+            distinct(geo_value, population)
+        ) %>%
+        mutate(value = value * population / 1e5)
+    }
+    naive_preds %<>%
+      select(-any_of("population")) %>%
       select(geo_value, forecast_date, target_end_date, quantile, value) %>%
       arrange(geo_value, forecast_date, target_end_date)
   }
   naive_preds %>%
-    mutate(value = pmax(0, value)) %>%
+    mutate(value = pmax(floor_value, value)) %>%
     ungroup()
 }
diff --git a/R/forecasters/ensemble_linear_climate.R b/R/forecasters/ensemble_linear_climate.R
@@ -4,8 +4,6 @@
 #' forecasts and averages them on a per-quantile basis. By default the average
 #' used is the median, but it can accept any vectorized function.
 #'
-#' @param epi_data The data for fitting. Currently unused, but matches interface
-#' of other forecasters.
 #' @param forecasts A tibble of quantile forecasts to aggregate. They should
 #'   be tibbles with columns `(geo_value, forecast_date, target_end_date,
 #'   quantile, value)`, preferably in that order.
diff --git a/R/forecasters/forecaster_baseline_linear.R b/R/forecasters/forecaster_baseline_linear.R
@@ -1,13 +1,17 @@
 #' epi_data is expected to have: geo_value, time_value, and value columns.
-forecaster_baseline_linear <- function(epi_data, ahead, log = FALSE, sort = FALSE, residual_tail = 0.85, residual_center = 0.085, no_intercept = FALSE) {
+forecaster_baseline_linear <- function(epi_data, ahead, log = FALSE, sort = FALSE, residual_tail = 0.85, residual_center = 0.085, no_intercept = TRUE, floor_value = 0, population_scale = TRUE) {
   epi_data <- validate_epi_data(epi_data)
   forecast_date <- attributes(epi_data)$metadata$as_of
   population_data <- get_population_data() %>%
     rename(geo_value = state_id) %>%
     distinct(geo_value, population)
+  if (population_scale) {
   df_processed <- epi_data %>%
     left_join(population_data, by = "geo_value") %>%
     mutate(value = value / population * 10**5)
+  } else {
+    df_processed <- epi_data
+  }
 
   if (log) {
     df_processed <- df_processed %>% mutate(value = log(value))
@@ -112,21 +116,27 @@ forecaster_baseline_linear <- function(epi_data, ahead, log = FALSE, sort = FALS
     pivot_quantiles_longer(dist) %>%
     rename(quantile_levels = dist_quantile_level, values = dist_value) %>%
     select(-value) %>%
-    left_join(population_data, by = "geo_value") %>%
     rename(quantile = quantile_levels) %>%
     {
       if (log) {
         (.) %>% mutate(values = exp(values))
       } else {
         .
       }
-    } %>%
-    mutate(
-      value = values * population / 10**5,
+    }
+  if (population_scale) {
+    quantile_forecast %<>%
+      left_join(population_data, by = "geo_value") %>%
+      mutate(
+        value = values * population / 10**5
+      ) %>%
+      select(-population)
+  }
+  quantile_forecast %<>% mutate(
       target_end_date = reference_date + ahead * 7,
       forecast_date = forecast_date,
     ) %>%
-    select(-model, -values, -population, -season_week) %>%
-    mutate(value = pmax(0, value))
+    select(-model, -values, -season_week) %>%
+    mutate(value = pmax(floor_value, value))
   quantile_forecast
 }
diff --git a/R/forecasters/forecaster_climatological.R b/R/forecasters/forecaster_climatological.R
@@ -1,15 +1,19 @@
+#' @param model_used the model used. "climate" means just climatological_model, "linear" means just forecaster_baseline_linear, "climate_linear" means the weighted ensemble of the climatological model with the linear model, "climatological_forecaster" means using the model from epipredict
+#'
 climate_linear_ensembled <- function(epi_data,
                                      outcome,
                                      extra_sources = "",
                                      ahead = 7,
                                      trainer = parsnip::linear_reg(),
                                      quantile_levels = covidhub_probs(),
+                                     model_used = "climate_linear",
                                      filter_source = "",
                                      filter_agg_level = "",
                                      scale_method = c("quantile", "std", "none"),
                                      center_method = c("median", "mean", "none"),
                                      nonlin_method = c("quart_root", "none"),
-                                     drop_non_seasons = FALSE,
+                                     quantiles_by_geo = TRUE,
+                                     drop_non_season = FALSE,
                                      residual_tail = 0.99,
                                      residual_center = 0.35,
                                      ...) {
@@ -45,31 +49,101 @@ climate_linear_ensembled <- function(epi_data,
         by = c("epiweek", "epiyear")
       )
   }
-  if (drop_non_seasons) {
-    season_data <- epi_data %>% drop_non_seasons()
+  if (drop_non_season) {
+    season_data <- epi_data %>%
+      drop_non_seasons() %>%
+      filter(season != "2021/22")
   } else {
     season_data <- epi_data
   }
   learned_params <- calculate_whitening_params(season_data, outcome, scale_method, center_method, nonlin_method)
-  epi_data %<>% data_whitening(outcome, learned_params, nonlin_method)
-  epi_data <- epi_data %>%
+  season_data %<>% data_whitening(outcome, learned_params, nonlin_method)
+  # epi_data %>% drop_non_seasons() %>% ggplot(aes(x = time_value, y = hhs, color = source)) + geom_line() + facet_wrap(~geo_value)
+  season_data <- season_data %>%
     select(geo_value, source, time_value, season, value = !!outcome) %>%
     mutate(epiweek = epiweek(time_value))
-  pred_climate <- climatological_model(epi_data, ahead) %>% mutate(forecaster = "climate")
-  pred_geo_climate <- climatological_model(epi_data, ahead, geo_agg = FALSE) %>% mutate(forecaster = "climate_geo")
-  pred_linear <- forecaster_baseline_linear(epi_data, ahead, residual_tail = residual_tail, residual_center = residual_center) %>% mutate(forecaster = "linear")
-  pred <- bind_rows(pred_climate, pred_linear, pred_geo_climate) %>%
-    ensemble_climate_linear((args_list$aheads[[1]]) / 7) %>%
-    ungroup()
+
+  # either climate or climate linear needs the climate prediction
+  if (model_used == "climate" || model_used == "climate_linear") {
+    pred_climate <- climatological_model(season_data, ahead, geo_agg = quantiles_by_geo, floor_value = min(season_data$value, na.rm = TRUE), pop_scale = FALSE) %>% mutate(forecaster = "climate")
+    pred <- pred_climate %>% select(-forecaster)
+  }
+
+  # either linear or climate linear needs the linear prediction
+  if (model_used == "linear" || model_used == "climate_linear") {
+    pred_linear <- forecaster_baseline_linear(
+      season_data %>% filter(source %in% c("nhsn", "none")),
+      ahead,
+      residual_tail = residual_tail,
+      residual_center = residual_center,
+      no_intercept = TRUE,
+      floor_value = min(season_data$value, na.rm = TRUE, population_scale = FALSE)
+    ) %>%
+      mutate(forecaster = "linear")
+    pred <- pred_linear %>% select(-forecaster)
+  }
+
+  if (model_used == "climate_linear") {
+    pred <- bind_rows(pred_climate, pred_linear) %>%
+      ensemble_climate_linear((args_list$aheads[[1]]) / 7) %>%
+      ungroup()
+  } else if (model_used == "climatological_forecaster") {
+    # forecast all aheads at the same time
+    if (ahead == args_list$aheads[[1]][[1]] / 7) {
+      if (quantiles_by_geo) {
+        quantile_key <- "geo_value"
+      } else {
+        quantile_key <- character(0)
+      }
+      clim_res <- climatological_forecaster(
+        season_data,
+        "value",
+        args_list = climate_args_list(
+          nonneg = (scale_method == "none"),
+          time_type = "epiweek",
+          quantile_levels = quantile_levels,
+          forecast_horizon = args_list$aheads[[1]] / 7,
+          quantile_by_key = quantile_key
+        )
+      )
+      ## clim_res$predictions
+      pred <- clim_res$predictions %>%
+        filter(source %in% c("nhsn", "none")) %>%
+        pivot_quantiles_longer(.pred_distn) %>%
+        select(geo_value, forecast_date, target_end_date = target_date, value = .pred_distn_value, quantile = .pred_distn_quantile_level) %>%
+        mutate(target_end_date = ceiling_date(target_end_date, unit = "weeks", week_start = 6))
+    } else {
+      # we're fitting everything all at once in the first ahead for the
+      # climatological_forecaster, so just return a null result for the other
+      # aheads
+      null_result <- tibble(
+        geo_value = character(),
+        forecast_date = lubridate::Date(),
+        target_end_date = lubridate::Date(),
+        quantile = numeric(),
+        value = numeric()
+      )
+      return(null_result)
+    }
+  }
   # undo whitening
+  if (adding_source) {
+    pred %<>%
+      rename({{ outcome }} := value) %>%
+      mutate(source = "none")
+  } else {
+    pred %<>%
+      rename({{ outcome }} := value) %>%
+      mutate(source = "nhsn")
+  }
   pred_final <- pred %>%
-    rename({{ outcome }} := value) %>%
-    mutate(source = "nhsn") %>%
-    data_coloring(outcome, learned_params, join_cols = key_colnames(epi_data, exclude = "time_value"), nonlin_method = nonlin_method) %>%
+    data_coloring(outcome, learned_params, join_cols = key_colnames(season_data, exclude = "time_value"), nonlin_method = nonlin_method) %>%
     rename(value = {{ outcome }}) %>%
     mutate(value = pmax(0, value)) %>%
     select(-source)
   # move dates to appropriate markers
-  pred_final <- pred_final %>% mutate(target_end_date = target_end_date - 3)
+  pred_final <- pred_final %>%
+    mutate(target_end_date = target_end_date - 3) %>%
+    sort_by_quantile()
   return(pred_final)
 }
diff --git a/R/scoring.R b/R/scoring.R
@@ -1,6 +1,13 @@
 # Scoring and Evaluation Functions
 
 evaluate_predictions <- function(forecasts, truth_data) {
+  # make sure the quantiles are in ascending order
+  forecasts <- forecasts %>%
+    arrange(model, geo_value, target_end_date, forecast_date, quantile) %>%
+    group_by(model, geo_value, target_end_date, forecast_date) %>%
+    mutate(prediction = sort(prediction)) %>%
+    ungroup()
+
   checkmate::assert_data_frame(forecasts)
   checkmate::assert_data_frame(truth_data)
   checkmate::assert_names(
diff --git a/R/targets/covid_forecaster_config.R b/R/targets/covid_forecaster_config.R
@@ -113,6 +113,45 @@ get_covid_forecaster_params <- function() {
         c("climatological"),
         c("climatological", "window")
       )
+    ),
+    climate_linear = bind_rows(
+      expand_grid(
+        forecaster = "climate_linear_ensembled",
+        scale_method = "quantile",
+        center_method = "median",
+        nonlin_method = c("quart_root", "none"),
+        model_used = c("climate_linear", "climate", "climatological_forecaster"),
+        filter_agg_level = "state",
+        drop_non_seasons = c(TRUE, FALSE),
+        quantiles_by_geo = c(TRUE, FALSE),
+        aheads = list(g_aheads),
+        residual_tail = 0.70,
+        residual_center = 0.127
+      ),
+      expand_grid(
+        forecaster = "climate_linear_ensembled",
+        scale_method = "none",
+        center_method = "none",
+        nonlin_method = c("quart_root", "none"),
+        model_used = c("climate_linear", "climate", "climatological_forecaster"),
+        filter_agg_level = "state",
+        drop_non_seasons = c(TRUE, FALSE),
+        quantiles_by_geo = c(TRUE, FALSE),
+        aheads = list(g_aheads),
+        residual_tail = 0.97,
+        residual_center = 0.097
+      ),
+      expand_grid(
+        forecaster = "climate_linear_ensembled",
+        scale_method = "none",
+        center_method = "none",
+        nonlin_method = "none",
+        model_used = "linear",
+        filter_agg_level = "state",
+        aheads = list(g_aheads),
+        residual_tail = 0.97,
+        residual_center = 0.097
+      ),
     )
   ) %>%
     map(function(x) {
diff --git a/R/targets/flu_forecaster_config.R b/R/targets/flu_forecaster_config.R
diff --git a/scripts/reports/overall-comparison-notebook.Rmd b/scripts/reports/overall-comparison-notebook.Rmd

Original file line number	Diff line number	Diff line change
`@@ -304,7 +304,7 @@ drop_non_seasons <- function(epi_data, min_window = 12) {`
`304`	`304`	`(forecast_date - time_value < as.difftime(min_window, units = "weeks")),`
`305`	`305`	`season != "2020/21",`
`306`	`306`	`# season != "2021/22", # keeping this because whitening otherwise gets really bad with the single season of data`
`307`		`- (season != "2019/20") \| (time_value < "2020-03-01"),`
	`307`	`+ (season != "2019/20"),`
`308`	`308`	`season != "2008/09"`
`309`	`309`	`)`
`310`	`310`	`}`