Skip to content

Commit c89c932

Browse files
committed
fix: archiver, deduplicate by etag
1 parent 450b5ba commit c89c932

File tree

3 files changed

+20
-8
lines changed

3 files changed

+20
-8
lines changed

R/utils.R

+7-6
Original file line number | Diff line number | Diff line change
@@ -149,9 +149,10 @@ make_ensemble_grid <- function(tib) {
149149
#'
150150
#' @export
151151
get_exclusions <- function(
152-
date,
153-
forecaster,
154-
exclusions_json = here::here("scripts", "geo_exclusions.json")) {
152+
date,
153+
forecaster,
154+
exclusions_json = here::here("scripts", "geo_exclusions.json")
155+
) {
155156
if (!file.exists(exclusions_json)) {
156157
return("")
157158
}
@@ -569,7 +570,7 @@ get_targets_errors <- function(project = tar_path_store(), top_n = 10) {
569570
#' wait_seconds = 1,
570571
#' fn = pub_covidcast,
571572
#' source = "nssp",
572-
#' signals= "pct_ed_visits_covid",
573+
#' signals = "pct_ed_visits_covid",
573574
#' geo_type = "state",
574575
#' geo_values = "*",
575576
#' time_type = "week"
@@ -603,6 +604,6 @@ validate_epi_data <- function(epi_data) {
603604
}
604605

605606
#' Convenience wrapper for working with Delphi S3 bucket.
606-
get_bucket_df_delphi <- function(prefix = "", bucket = "forecasting-team-data") {
607-
aws.s3::get_bucket_df(prefix = prefix, bucket = bucket) %>% tibble()
607+
get_bucket_df_delphi <- function(prefix = "", bucket = "forecasting-team-data", ...) {
608+
aws.s3::get_bucket_df(prefix = prefix, bucket = bucket, ...) %>% tibble()
608609
}

reports/template.md

+1
Original file line number | Diff line number | Diff line change
@@ -10,6 +10,7 @@
1010

1111
## Exploration Reports
1212

13+
- [An Analysis of Decreasing Behavior in Forecasters](decreasing_forecasters.html)
1314
- [NHSN 2024-2025 Data Analysis](new_data.html)
1415

1516
### Flu

scripts/build_nhsn_archive.R

+12-2
Original file line number | Diff line number | Diff line change
@@ -87,7 +87,10 @@ get_last_raw_update_at <- function(type = c("raw", "prelim"), missing_value = MI
8787
#'
8888
#' @param verbose Whether to print verbose output.
8989
update_nhsn_data_raw <- function() {
90+
# If this request fails (which occurs surprisingly often, eyeroll), we
91+
# will just return a future date (2040-01-01) and download anyway.
9092
raw_update_at <- get_socrata_updated_at(config$raw_metadata_url)
93+
# Same here.
9194
prelim_update_at <- get_socrata_updated_at(config$prelim_metadata_url)
9295
last_raw_file_update_at <- get_last_raw_update_at("raw")
9396
last_prelim_file_update_at <- get_last_raw_update_at("prelim")
@@ -109,6 +112,11 @@ update_nhsn_data_raw <- function() {
109112
cli_inform("Downloading the prelim data... {prelim_file}")
110113
read_csv(config$prelim_query_url) %>% s3write_using(write_parquet, object = prelim_file, bucket = config$s3_bucket)
111114
}
115+
116+
# Since we may have downloaded a duplicate file above, filter out the ones
117+
# that have the same ETag. (I don't feel like rederiving AWS S3's ETag field
118+
# and computing ahead of time.)
119+
delete_duplicates_from_s3_by_etag(config$s3_bucket, config$raw_file_name_prefix, dry_run = FALSE)
112120
}
113121

114122
#' Process Raw NHSN Data File
@@ -182,7 +190,9 @@ update_nhsn_data_archive <- function() {
182190
return(invisible(NULL))
183191
}
184192

185-
cli_inform("New datasets available at, adding {nrow(new_data_files_latest_per_day)} new NHSN datasets to the archive.")
193+
cli_inform(
194+
"New datasets available at, adding {nrow(new_data_files_latest_per_day)} new NHSN datasets to the archive."
195+
)
186196

187197
# Process each new dataset snapshot
188198
new_data <- new_data_files_latest_per_day$Key %>%
@@ -211,4 +221,4 @@ update_nhsn_data <- function(verbose = FALSE) {
211221
update_nhsn_data_archive()
212222
}
213223

214-
update_nhsn_data(verbose = TRUE)
224+
update_nhsn_data(verbose = TRUE)

0 commit comments

Comments (0)