Skip to content

Commit c89c932

Browse files
committed
fix: archiver, deduplicate by etag
1 parent 450b5ba commit c89c932

File tree

3 files changed

+20
-8
lines changed

3 files changed

+20
-8
lines changed

R/utils.R

+7-6
Original file line number | Diff line number | Diff line change
@@ -149,9 +149,10 @@ make_ensemble_grid <- function(tib) {
149149
#'
150150
#' @export
151151
get_exclusions <- function(
152-
date,
153-
forecaster,
154-
exclusions_json = here::here("scripts", "geo_exclusions.json")) {
152+
date,
153+
forecaster,
154+
exclusions_json = here::here("scripts", "geo_exclusions.json")
155+
) {
155156
if (!file.exists(exclusions_json)) {
156157
return("")
157158
}
@@ -569,7 +570,7 @@ get_targets_errors <- function(project = tar_path_store(), top_n = 10) {
569570
#' wait_seconds = 1,
570571
#' fn = pub_covidcast,
571572
#' source = "nssp",
572-
#' signals= "pct_ed_visits_covid",
573+
#' signals = "pct_ed_visits_covid",
573574
#' geo_type = "state",
574575
#' geo_values = "*",
575576
#' time_type = "week"
@@ -603,6 +604,6 @@ validate_epi_data <- function(epi_data) {
603604
}
604605

605606
#' Convenience wrapper for working with Delphi S3 bucket.
606-
get_bucket_df_delphi <- function(prefix = "", bucket = "forecasting-team-data") {
607-
aws.s3::get_bucket_df(prefix = prefix, bucket = bucket) %>% tibble()
607+
get_bucket_df_delphi <- function(prefix = "", bucket = "forecasting-team-data", ...) {
608+
aws.s3::get_bucket_df(prefix = prefix, bucket = bucket, ...) %>% tibble()
608609
}

reports/template.md

+1
Original file line number | Diff line number | Diff line change
@@ -10,6 +10,7 @@
1010

1111
## Exploration Reports
1212

13+
- [An Analysis of Decreasing Behavior in Forecasters](decreasing_forecasters.html)
1314
- [NHSN 2024-2025 Data Analysis](new_data.html)
1415

1516
### Flu

scripts/build_nhsn_archive.R

+12-2
Original file line number | Diff line number | Diff line change
@@ -87,7 +87,10 @@ get_last_raw_update_at <- function(type = c("raw", "prelim"), missing_value = MI
8787
#'
8888
#' @param verbose Whether to print verbose output.
8989
update_nhsn_data_raw <- function() {
90+
# If this request fails (which occurs surprisingly often, eyeroll), we
91+
# will just return a future date (2040-01-01) and download anyway.
9092
raw_update_at <- get_socrata_updated_at(config$raw_metadata_url)
93+
# Same here.
9194
prelim_update_at <- get_socrata_updated_at(config$prelim_metadata_url)
9295
last_raw_file_update_at <- get_last_raw_update_at("raw")
9396
last_prelim_file_update_at <- get_last_raw_update_at("prelim")
@@ -109,6 +112,11 @@ update_nhsn_data_raw <- function() {
109112
cli_inform("Downloading the prelim data... {prelim_file}")
110113
read_csv(config$prelim_query_url) %>% s3write_using(write_parquet, object = prelim_file, bucket = config$s3_bucket)
111114
}
115+
116+
# Since we may have downloaded a duplicate file above, filter out the ones
117+
# that have the same ETag. (I don't feel like rederiving AWS S3's ETag field
118+
# and computing ahead of time.)
119+
delete_duplicates_from_s3_by_etag(config$s3_bucket, config$raw_file_name_prefix, dry_run = FALSE)
112120
}
113121

114122
#' Process Raw NHSN Data File
@@ -182,7 +190,9 @@ update_nhsn_data_archive <- function() {
182190
return(invisible(NULL))
183191
}
184192

185-
cli_inform("New datasets available at, adding {nrow(new_data_files_latest_per_day)} new NHSN datasets to the archive.")
193+
cli_inform(
194+
"New datasets available at, adding {nrow(new_data_files_latest_per_day)} new NHSN datasets to the archive."
195+
)
186196

187197
# Process each new dataset snapshot
188198
new_data <- new_data_files_latest_per_day$Key %>%
@@ -211,4 +221,4 @@ update_nhsn_data <- function(verbose = FALSE) {
211221
update_nhsn_data_archive()
212222
}
213223

214-
update_nhsn_data(verbose = TRUE)
224+
update_nhsn_data(verbose = TRUE)

0 commit comments

Comments (0)