diff --git a/DESCRIPTION b/DESCRIPTION index 1af74463..9cefb6b5 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -20,7 +20,7 @@ License: Apache License (>= 2) Encoding: UTF-8 Remotes: github::epiforecasts/EpiNow2@9b8cd4fcceca41ac34545a38989f6f295ddeeaf7, - github::stan-dev/cmdstanr + github::stan-dev/cmdstanr@da99e2ba954658bdad63bffb738c4444c33a4e0e Roxygen: list(markdown = TRUE) RoxygenNote: 7.3.2 Suggests: @@ -51,7 +51,9 @@ Imports: tidyr, tidybayes, optparse, - Microsoft365R + Microsoft365R, + stringr, + glue Additional_repositories: https://stan-dev.r-universe.dev URL: https://cdcgov.github.io/cfa-epinow2-pipeline/ diff --git a/Makefile b/Makefile index 7252cd40..9b183768 100644 --- a/Makefile +++ b/Makefile @@ -3,12 +3,21 @@ IMAGE_NAME=cfa-epinow2-pipeline BRANCH=$(shell git branch --show-current) CONFIG_CONTAINER=rt-epinow2-config CNTR_MGR=docker +DATA_API=v1 ifeq ($(BRANCH), main) TAG=latest else TAG=$(BRANCH) endif +ifeq ($(DATA_API),v1) +API_CONTAINER := nssp-etl +else ifeq ($(DATA_API),v2) +API_CONTAINER := nssp-etl-api-v2 +else +$(error Unknown DATA_API '$(DATA_API)'. Expected v1 or v2) +endif + CONFIG=test.json POOL="cfa-epinow2-$(TAG)" TIMESTAMP:=$(shell date -u +"%Y%m%d_%H%M%S") @@ -38,12 +47,14 @@ config: ## Generates a configuration file for running the model --disease="COVID-19,Influenza,RSV" \ --state=all \ --output-container=nssp-rt-v2 \ + --input-container=$(API_CONTAINER) \ --job-id=$(JOB) \ --report-date-str=$(REPORT_DATE) rerun-config: ## Generate a configuration file to rerun a previous model uv run azure/generate_rerun_configs.py \ --output-container=nssp-rt-v2 \ + --input-container=$(API_CONTAINER) \ --job-id=$(JOB) \ --report-date-str=$(REPORT_DATE) @@ -84,6 +95,7 @@ test-batch: ## Run GitHub Actions workflow and then job.py for testing on Azure --disease="COVID-19,Influenza,RSV" \ --state=NY \ --output-container=nssp-rt-testing \ + --input-container=$(API_CONTAINER) \ --job-id=$(JOB) \ --report-date-str=$(REPORT_DATE) uv run --env-file .env \ diff --git a/NEWS.md b/NEWS.md index 87e3c43d..beef39a1 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,6 +1,8 @@ # CFAEpiNow2Pipeline v0.2.0 ## Features + +* Add the ability to read data from API v2 as well as v1 * dependabot updating package docker/login-action from version 3 to version 4 new * Adding image tag validation when dependabot PRs are opened and automatic update to NEWs md * Add a utility script for reading, preparing, and uploading Rt review decisions diff --git a/R/config.R b/R/config.R index ac55276e..07ee4bdb 100644 --- a/R/config.R +++ b/R/config.R @@ -158,6 +158,12 @@ Data <- S7::new_class( #' "YYYY-MM-DD". #' @param output_container An optional string specifying the output blob storage #' container. +#' @param facility_active_proportion A numeric value between 0 and 1 specifying +#' the proportion of days during the modeling period that facilities must have +#' reported at least one informative discharge diagnosis (DDI) to be included in +#' the analysis. Default is 0.94 (require +#' active reporting for >=53 of 56 days in the training period). +#' Lower values allow inclusion of facilities with fewer active days. #' @family config #' @export Config <- S7::new_class( @@ -190,7 +196,26 @@ Config <- S7::new_class( # Would add default values, but Roxygen isn't happy about them yet. sampler_opts = S7::class_list, exclusions = S7::S7_class(Exclusions()), - output_container = character_or_null + output_container = character_or_null, + facility_active_proportion = S7::new_property( + S7::class_double, + default = quote(0.94), + validator = \(value) { + if ( + rlang::is_bare_numeric(value) && + length(value) == 1 && + value >= 0 && + value <= 1 + ) { + NULL + } else { + paste0( + "Invalid value for facility_active_proportion. ", + "It must be a single numeric value between 0 and 1." + ) + } + } + ) ) ) diff --git a/R/pipeline.R b/R/pipeline.R index 34d649f9..053e67ec 100644 --- a/R/pipeline.R +++ b/R/pipeline.R @@ -85,7 +85,7 @@ orchestrate_pipeline <- function( ) read_json_into_config( config_path, - c("exclusions", "output_container") + c("exclusions", "output_container", "facility_active_proportion") ) }, error = function(con) { @@ -94,6 +94,7 @@ orchestrate_pipeline <- function( } ) if (typeof(config) == "logical") { + cli::cli_warn("Failed to read config", class = "Bad_config") return(invisible(FALSE)) } @@ -113,6 +114,7 @@ orchestrate_pipeline <- function( logfile_connection <- file(file.path(logfile_path, "logs.txt"), open = "wt") sink( logfile_connection, + # "output" means general (non-error) output type = "output", append = TRUE, # Send output to logs and to console @@ -120,8 +122,11 @@ orchestrate_pipeline <- function( ) sink( logfile_connection, + # "message" means error messages type = "message", append = TRUE + # Unfortunately, we can't split messages to both console and log file. + # It will error out if we try ) on.exit(sink(file = NULL)) cli::cli_alert_info("Starting run at {Sys.time()}") @@ -189,7 +194,8 @@ execute_model_logic <- function(config, input_dir, output_dir) { geo_value = config@geo_value, report_date = config@report_date, max_reference_date = config@max_reference_date, - min_reference_date = config@min_reference_date + min_reference_date = config@min_reference_date, + facility_active_proportion = config@facility_active_proportion ) # rlang::is_empty() checks for empty and NULL values @@ -289,7 +295,8 @@ execute_model_logic <- function(config, input_dir, output_dir) { config@exclusions@blob_storage_container ), # Add the config container here when refactoring out to outer func - run_at = format(Sys.time(), "%Y-%m-%dT%H:%M:%S%z") + run_at = format(Sys.time(), "%Y-%m-%dT%H:%M:%S%z"), + facility_active_proportion = config@facility_active_proportion ) write_model_outputs( diff --git a/R/read_data.R b/R/read_data.R index 0c18d28d..28ba55d2 100644 --- a/R/read_data.R +++ b/R/read_data.R @@ -1,8 +1,13 @@ #' Read in the dataset of incident case counts #' -#' Each row of the table corresponds to a single facilities' cases for a -#' reference-date/report-date/disease tuple. We want to aggregate these counts -#' to the level of geographic aggregate/report-date/reference-date/disease. +#' Reads in data from either data API v1 or v2. Data API version is +#' intuited by read_data by the presence of the `any_visits_this_day` column +#' in the underlying data. Each row of the table corresponds to a single +#' facilities' cases for a reference-date/report-date/disease tuple. +#' We want to aggregate these counts to the level of geographic +#' aggregate/report-date/reference-date/disease. The +#' _facility_active_proportion_ field is used to filter facilities with data +#' outages from the data API v2 (this field is not used for data API v1). #' #' We handle two distinct cases for geographic aggregates: #' @@ -29,102 +34,171 @@ read_data <- function( geo_value, report_date, max_reference_date, - min_reference_date + min_reference_date, + facility_active_proportion = 0.94 ) { rlang::arg_match(disease) - # NOTE: this is temporary workaround until we switch to the new API. I'm not - # sure if there's a better way to do this without a whole bunch of special - # casing -- which is its own code smell. I think this should really be handled - # upstream in the ETL job and standardize on "COVID-19", but that's beyond - # scope here and we need to do _something_ in the meantime so this runs. - disease_map <- c( - "COVID-19" = "COVID-19/Omicron", - "Influenza" = "Influenza", - "RSV" = "RSV", - "test" = "test" + check_file_exists(data_path) + + con <- DBI::dbConnect(duckdb::duckdb()) + on.exit(DBI::dbDisconnect(con), add = TRUE) + + # any_visits_this_day is calculated for API v2 + # True if a given facility, on a given reference_date had a DDI count > 0 + # Same across all diseases and metrics for (facility, reference_date) + is_api_v2 <- rlang::try_fetch( + { + cols <- DBI::dbGetQuery( + con, + "SELECT * FROM read_parquet(?) LIMIT 0;", + params = list(data_path) + ) |> + names() + + "any_visits_this_day" %in% cols + }, + error = function(con) { + cli::cli_abort( + c( + "Error reading schema from {.path {data_path}}", + "Original error: {con}" + ), + class = "wrapped_schema_read_error" + ) + } ) - mapped_disease <- disease_map[[disease]] - check_file_exists(data_path) + is_us <- identical(geo_value, "US") + + cli::cli_inform( + c( + "Using {if (is_api_v2) 'API v2 (facility filtered)' else 'API v1'}", + "query for {.val {geo_value}}" + ) + ) + + disease_param <- if (disease == "COVID-19") paste0(disease, "%") else disease - parameters <- list( + base_params <- list( data_path = data_path, - disease = mapped_disease, + disease = disease_param, min_ref_date = stringify_date(min_reference_date), max_ref_date = stringify_date(max_reference_date), report_date = stringify_date(report_date) ) - # We need different queries for the states and the US overall. For US overall - # we need to aggregate over all the facilities in all the states. For the - # states, we need to aggregate over all the facilities in that one state - if (geo_value == "US") { - query <- " - SELECT - report_date, - reference_date, - CASE - WHEN disease = 'COVID-19/Omicron' THEN 'COVID-19' - ELSE disease - END AS disease, - -- We want to inject the 'US' as our abbrevation here bc data is not agg'd - 'US' AS geo_value, - sum(value) AS confirm - FROM read_parquet(?) - WHERE 1=1 - AND disease = ? - AND metric = 'count_ed_visits' - AND reference_date >= ? :: DATE - AND reference_date <= ? :: DATE - AND report_date = ? :: DATE - GROUP BY reference_date, report_date, disease - ORDER BY reference_date - " + geo_select <- if (is_us) { + "'US' AS geo_value" } else { - # We want just one state so aggregate over facilites in that one state only - query <- " - SELECT - report_date, - reference_date, - CASE - WHEN disease = 'COVID-19/Omicron' THEN 'COVID-19' - ELSE disease - END AS disease, - geo_value AS geo_value, - sum(value) AS confirm, - FROM read_parquet(?) - WHERE 1=1 - AND disease = ? - AND metric = 'count_ed_visits' - AND reference_date >= ? :: DATE - AND reference_date <= ? :: DATE - AND report_date = ? :: DATE - AND geo_value = ? - GROUP BY geo_value, reference_date, report_date, disease - ORDER BY reference_date - " - # Append `geo_value` to the query - parameters <- c(parameters, list(geo_value = geo_value)) + "geo_value" + } + + geo_filter <- if (is_us) { + "" + } else { + "AND geo_value = ?" + } + + group_by <- if (is_us) { + "GROUP BY reference_date, report_date, disease" + } else { + "GROUP BY geo_value, reference_date, report_date, disease" + } + + if (!is_api_v2) { + query <- glue::glue( + " + SELECT + report_date, + reference_date, + CASE + WHEN disease = 'COVID-19/Omicron' THEN 'COVID-19' + ELSE disease + END AS disease, + {geo_select}, + SUM(value) AS confirm + FROM read_parquet(?) + WHERE disease LIKE ? + AND metric = 'count_ed_visits' + AND reference_date >= ?::DATE + AND reference_date <= ?::DATE + AND report_date = ?::DATE + {geo_filter} + {group_by} + ORDER BY reference_date + " + ) + + params <- base_params + if (!is_us) { + params <- c(params, list(geo_value = geo_value)) + } + } else { + query <- glue::glue( + " + WITH facility_checks AS ( + SELECT + *, + AVG(IF(any_visits_this_day, 1, 0)) OVER ( + PARTITION BY facility + ) AS proportion_true + FROM read_parquet(?) + WHERE disease LIKE ? + AND metric = 'count_ed_visits' + AND reference_date >= ?::DATE + AND reference_date <= ?::DATE + AND report_date = ?::DATE + {geo_filter} + ) + SELECT + report_date, + reference_date, + CASE + WHEN disease = 'COVID-19/Omicron' THEN 'COVID-19' + ELSE disease + END AS disease, + {geo_select}, + SUM(value) AS confirm + FROM facility_checks + WHERE proportion_true >= ? + -- `WHERE` filters before the GROUP BY, so this filter excludes + -- from the agg all facilities with insufficient reporting + {group_by} + ORDER BY reference_date + " + ) + + params <- base_params + if (!is_us) { + params <- c(params, list(geo_value = geo_value)) + } + params <- c( + params, + list(facility_active_proportion = facility_active_proportion) + ) } - con <- DBI::dbConnect(duckdb::duckdb()) - on.exit(expr = DBI::dbDisconnect(con)) df <- rlang::try_fetch( DBI::dbGetQuery( con, statement = query, - params = unname(parameters) + params = unname(params) ), error = function(con) { cli::cli_abort( c( "Error fetching data from {.path {data_path}}", "Using parameters:", - "*" = "data_path: {.path {parameters[['data_path']]}}", - "*" = "mapped_disease: {.val {parameters[['disease']]}}", - "*" = "min_reference_date: {.val {parameters[['min_ref_date']]}}", - "*" = "max_reference_date: {.val {parameters[['max_ref_date']]}}", - "*" = "report_date: {.val {parameters[['report_date']]}}", + "*" = "data_path: {.path {base_params[['data_path']]}}", + "*" = "disease: {.val {base_params[['disease']]}}", + "*" = "min_reference_date: {.val {base_params[['min_ref_date']]}}", + "*" = "max_reference_date: {.val {base_params[['max_ref_date']]}}", + "*" = "report_date: {.val {base_params[['report_date']]}}", + "*" = "geo_value: {.val {geo_value}}", + "*" = paste0( + "facility_active_proportion: ", + "{.val {facility_active_proportion}}" + ), "Original error: {con}" ), class = "wrapped_invalid_query" @@ -132,20 +206,20 @@ read_data <- function( } ) - # Guard against empty return if (nrow(df) == 0) { cli::cli_abort( c( "No data matching returned from {.path {data_path}}", - "Using parameters {parameters}" + "Using parameters {base_params}" ), class = "empty_return" ) } - # Warn for incomplete return + n_rows_expected <- as.Date(max_reference_date) - as.Date(min_reference_date) + 1 + if (nrow(df) != n_rows_expected) { expected_dates <- seq.Date( from = as.Date(min_reference_date), @@ -153,12 +227,9 @@ read_data <- function( by = "day" ) missing_dates <- stringify_date( - # Setdiff strips the date attribute from the objects; re-add it so that we - # can pretty-format the date for printing - as.Date( - setdiff(expected_dates, df[["reference_date"]]) - ) + as.Date(setdiff(expected_dates, df[["reference_date"]])) ) + cli::cli_warn( c( "Incomplete number of rows returned", @@ -171,5 +242,5 @@ read_data <- function( } cli::cli_alert_success("Read {nrow(df)} rows from {.path {data_path}}") - return(df) + df } diff --git a/azure/run_container_app_job.py b/azure/run_container_app_job.py index 25ec2d5e..92118926 100644 --- a/azure/run_container_app_job.py +++ b/azure/run_container_app_job.py @@ -98,7 +98,7 @@ def main(image_name: str, config_container: str, job_id: str): state_config = config_path.split("/").pop() job_execution_id = job_execution.id.split("/").pop() print( - f"Started Container App Job #{i+1}/{len(task_configs)} for {state_config} with execution ID: {job_execution_id}" + f"Started Container App Job #{i + 1}/{len(task_configs)} for {state_config} with execution ID: {job_execution_id}" ) @@ -118,7 +118,7 @@ def main(image_name: str, config_container: str, job_id: str): "--config_container", type=str, help="The name of the storage container where config files are located", - default="rt-epinow2-config" + default="rt-epinow2-config", ) parser.add_argument( "--job_id", diff --git a/man/Config.Rd b/man/Config.Rd index 4c451e2a..b53b18c0 100644 --- a/man/Config.Rd +++ b/man/Config.Rd @@ -25,7 +25,8 @@ Config( parameters = class_missing, sampler_opts = class_missing, exclusions = class_missing, - output_container = class_missing + output_container = class_missing, + facility_active_proportion = class_missing ) } \arguments{ @@ -85,6 +86,13 @@ criteria.} \item{output_container}{An optional string specifying the output blob storage container.} + +\item{facility_active_proportion}{A numeric value between 0 and 1 specifying +the proportion of days during the modeling period that facilities must have +reported at least one informative discharge diagnosis (DDI) to be included in +the analysis. Default is 0.94 (require +active reporting for >=53 of 56 days in the training period). +Lower values allow inclusion of facilities with fewer active days.} } \description{ Represents the complete configuration for the pipeline. diff --git a/man/read_data.Rd b/man/read_data.Rd index 8e9875a2..15fa0bd0 100644 --- a/man/read_data.Rd +++ b/man/read_data.Rd @@ -10,7 +10,8 @@ read_data( geo_value, report_date, max_reference_date, - min_reference_date + min_reference_date, + facility_active_proportion = 0.94 ) } \arguments{ @@ -31,15 +32,27 @@ date. Formatted as "YYYY-MM-DD".} \item{min_reference_date}{A string representing the minimum reference date. Formatted as "YYYY-MM-DD".} + +\item{facility_active_proportion}{A numeric value between 0 and 1 specifying +the proportion of days during the modeling period that facilities must have +reported at least one informative discharge diagnosis (DDI) to be included in +the analysis. Default is 0.94 (require +active reporting for >=53 of 56 days in the training period). +Lower values allow inclusion of facilities with fewer active days.} } \value{ A dataframe with one or more rows and columns \code{report_date}, \code{reference_date}, \code{geo_value}, \code{confirm} } \description{ -Each row of the table corresponds to a single facilities' cases for a -reference-date/report-date/disease tuple. We want to aggregate these counts -to the level of geographic aggregate/report-date/reference-date/disease. +Reads in data from either data API v1 or v2. Data API version is +intuited by read_data by the presence of the \code{any_visits_this_day} column +in the underlying data. Each row of the table corresponds to a single +facilities' cases for a reference-date/report-date/disease tuple. +We want to aggregate these counts to the level of geographic +aggregate/report-date/reference-date/disease. The +\emph{facility_active_proportion} field is used to filter facilities with data +outages from the data API v2 (this field is not used for data API v1). } \details{ We handle two distinct cases for geographic aggregates: diff --git a/tests/testthat/data/CA_COVID-19.json b/tests/testthat/data/CA_COVID-19.json index ae4c2bc2..7c3c2122 100644 --- a/tests/testthat/data/CA_COVID-19.json +++ b/tests/testthat/data/CA_COVID-19.json @@ -13,6 +13,7 @@ "geo_type": "state", "report_date": "2024-11-26", "production_date": "2024-11-26", + "output_container": "testing", "parameters": { "as_of_date": "2024-11-26", "generation_interval": { @@ -59,5 +60,6 @@ 0.5, 0.95 ], - "model": "EpiNow2" + "model": "EpiNow2", + "facility_active_proportion": 0.94 } diff --git a/tests/testthat/data/CA_apiv2_test.parquet b/tests/testthat/data/CA_apiv2_test.parquet new file mode 100644 index 00000000..f38e712d Binary files /dev/null and b/tests/testthat/data/CA_apiv2_test.parquet differ diff --git a/tests/testthat/data/README.md b/tests/testthat/data/README.md new file mode 100644 index 00000000..9d8905ca --- /dev/null +++ b/tests/testthat/data/README.md @@ -0,0 +1,17 @@ + +- **Parquet test fixtures** + - `CA_test.parquet`: Synthetic test data containing metric 'count_ed_visits' and 'COVID-19/Omicron' as disease (data in format of API v1). `read_data` converts 'COVID-19/Omicron' disease to 'COVID-19'. Does _not_ contain column `any_visits_this_day`. This data was created in Fall 2024 by taking the format of a production DATA API v1 .parquet file and adding synthetic data to it. + - `CA_apiv2_test.parquet`: Synthetic test data containing metric 'count_ed_visits' and 'COVID-19' as disease (data in format of API v2). Contains column `any_visits_this_day`. This data was created in Fall 2024 by taking the format of a production DATA API v2 .parquet file and adding synthetic data to it. + - `test_data.parquet`/`us_overall_test_data.parquet`: Package data. See `?gostic_toy_rt` and `data-raw/convert_gostic_toy_rt_to_test_dataset.R` + - `test_parameters.parquet`: Package data. See `?sir_gt_pmf` and `data-raw/sir_gt_pmf.R` + +- **JSON test configs** + - `CA_COVID-19.json`: EpiNow2 task config for a CA/COVID-19 run (dates, model/sampler settings) pointing to `CA_test.parquet`, with no exclusions (`exclusions.path: null`). + - `bad_config.json`: Intentionally invalid config value to test validation/error handling + - `sample_config_no_exclusion.json`: Example valid config for no exclusions. + - `sample_config_with_exclusion.json`: Example config that includes exclusions + - `v_bad_config.json`: Intentionally incomplete/invalid config (only `job_id` and `task_id`) to test required fields are enforced. + +- **CSV exclusions fixtures** + - `test_exclusions.csv`: Minimal exclusions fixture (columns `reference_date, report_date, state, disease`) with one row for `state=test, disease=test`. + - `test_big_exclusions.csv`: Larger exclusions fixture (same columns) to test impact of many exclusions diff --git a/tests/testthat/test-pipeline.R b/tests/testthat/test-pipeline.R index d5c8146b..b3f63a39 100644 --- a/tests/testthat/test-pipeline.R +++ b/tests/testthat/test-pipeline.R @@ -77,7 +77,7 @@ test_that("Process pipeline produces expected outputs and returns success", { config_path <- file.path(input_dir, "sample_config_with_exclusion.json") config <- read_json_into_config( config_path, - c("exclusions", "output_container") + c("exclusions", "output_container", "facility_active_proportion") ) # Read from locally output_dir <- "pipeline_test" @@ -110,7 +110,7 @@ test_that("Runs on config from generator as of 2024-11-26", { input_dir <- test_path("data") config <- read_json_into_config( file.path(input_dir, config_path), - c("exclusions", "output_container") + c("exclusions", "output_container", "facility_active_proportion") ) # Read from locally output_dir <- test_path("pipeline_test") diff --git a/tests/testthat/test-read_data.R b/tests/testthat/test-read_data.R index b54e4b04..c5d9b108 100644 --- a/tests/testthat/test-read_data.R +++ b/tests/testthat/test-read_data.R @@ -100,7 +100,7 @@ test_that("An invalid query throws a wrapped error", { min_reference_date = "2023-01-02", max_reference_date = "2023-01-22" ), - class = "wrapped_invalid_query" + class = "wrapped_schema_read_error" ) }) @@ -121,7 +121,7 @@ test_that("Incomplete return throws warning", { ) }) -test_that("Replace COVID-19/Omicron with COVID-19, one state", { +test_that("Replace COVID-19/Omicron with COVID-19, one state (API v1)", { data_path <- test_path("data/CA_test.parquet") actual <- read_data( @@ -140,7 +140,7 @@ test_that("Replace COVID-19/Omicron with COVID-19, one state", { }) -test_that("Replace COVID-19/Omicron with COVID-19, US", { +test_that("Replace COVID-19/Omicron with COVID-19, US (API v1)", { data_path <- test_path("data/CA_test.parquet") actual <- read_data( @@ -157,3 +157,95 @@ test_that("Replace COVID-19/Omicron with COVID-19, US", { expect_false("COVID-19/Omicron" %in% actual$disease) expect_true(all(actual$disease == "COVID-19")) }) + +test_that("API v2 with COVID-19, one state", { + data_path <- test_path("data/CA_apiv2_test.parquet") + + actual <- read_data( + data_path, + disease = "COVID-19", + geo_value = "CA", + report_date = "2024-11-26", + min_reference_date = as.Date("2024-06-01"), + max_reference_date = "2024-11-25", + facility_active_proportion = 1.0 + ) + + # Expect that there should be no "COVID-19/Omicron" in the data, + # only "COVID-19" + expect_false("COVID-19/Omicron" %in% actual$disease) + expect_true(all(actual$disease == "COVID-19")) +}) + +test_that("API v2 with COVID-19, US (default facility_active_proportion)", { + data_path <- test_path("data/CA_apiv2_test.parquet") + + actual <- read_data( + data_path, + disease = "COVID-19", + geo_value = "US", + report_date = "2024-11-26", + min_reference_date = as.Date("2024-06-01"), + max_reference_date = "2024-11-25" + ) + + # Expect that there should be no "COVID-19/Omicron" in the data, + # only "COVID-19" + expect_false("COVID-19/Omicron" %in% actual$disease) + expect_true(all(actual$disease == "COVID-19")) +}) + +test_that("facility_active_proportion affects counts (API v2)", { + data_path <- test_path("data/CA_apiv2_test.parquet") + + # Read data with facility_active_proportion = 1.0 + # (stricter - only facilities active all days) + data_strict <- read_data( + data_path, + disease = "COVID-19", + geo_value = "CA", + report_date = "2024-11-26", + min_reference_date = as.Date("2024-06-01"), + max_reference_date = "2024-11-25", + facility_active_proportion = 1.0 + ) + + # Read data with facility_active_proportion = 0.5 + # (less strict - facilities active >=50% of days) + data_lenient <- read_data( + data_path, + disease = "COVID-19", + geo_value = "CA", + report_date = "2024-11-26", + min_reference_date = as.Date("2024-06-01"), + max_reference_date = "2024-11-25", + facility_active_proportion = 0.5 + ) + + # Both should have the same structure + expect_equal(names(data_strict), names(data_lenient)) + expect_equal(nrow(data_strict), nrow(data_lenient)) + + expect_true( + all(data_lenient$confirm >= data_strict$confirm), + info = "Lenient data should have equal or more counts than strict data" + ) +}) + +test_that("facility_active_proportion doesn't affects counts (API v1)", { + # facility_active_proportion is ignored for API v1 + data_path <- test_path("data/CA_test.parquet") + + actual <- read_data( + data_path, + disease = "COVID-19", + geo_value = "US", + report_date = "2024-11-26", + min_reference_date = as.Date("2024-06-01"), + max_reference_date = "2024-11-25" + ) + + expected_rows <- 178 + actual_rows <- actual |> nrow() + expect_equal(expected_rows, actual_rows) +})