CDCgov
diff --git a/‎.github/workflows/mkdocs.yaml‎
Lines changed: 1 addition & 1 deletion b/‎.github/workflows/mkdocs.yaml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎Makefile‎
Lines changed: 22 additions & 27 deletions b/‎Makefile‎
Lines changed: 22 additions & 27 deletions
diff --git a/‎README.md‎
Lines changed: 55 additions & 74 deletions b/‎README.md‎
Lines changed: 55 additions & 74 deletions
diff --git a/‎data/.placeholder‎ b/‎data/.placeholder‎
diff --git a/‎data/get_nis.py‎
Lines changed: 35 additions & 0 deletions b/‎data/get_nis.py‎
Lines changed: 35 additions & 0 deletions
@@ -27,7 +27,7 @@ jobs:
       - uses: actions/setup-python@v6
         with:
           python-version-file: ".python-version"
-      - run: uv sync --locked --only-group mkdocs
+      - run: uv sync --frozen --only-group mkdocs
       - run: uv run mkdocs build --strict
       - uses: actions/upload-pages-artifact@v4
         with:
 
@@ -1,52 +1,47 @@
 RUN_ID = test
-TOKEN_PATH = scripts/socrata_app_token.txt
-TOKEN = $(shell cat $(TOKEN_PATH))
 CONFIG = scripts/config.yaml
 SETTINGS = output/settings/$(RUN_ID)/
-RAW_DATA = output/data/$(RUN_ID)/
-MODEL_FITS = output/fits/$(RUN_ID)/
+RAW_DATA = data/raw.parquet
+DATA = output/data/$(RUN_ID)/nis.parquet
+FITS = output/fits/$(RUN_ID)/
 DIAGNOSTICS = output/diagnostics/$(RUN_ID)/
-PREDICTIONS = output/forecasts/$(RUN_ID)/
+FORECASTS = output/forecasts/$(RUN_ID)/
 SCORES = output/scores/$(RUN_ID)/
+DATA_PLOT = output/diagnostics/$(RUN_ID)/data_national.png
 
 
-.PHONY: clean nis delete_nis viz
+.PHONY: all clean viz  # goals that name commands rather than files on disk
 
-all: $(SETTINGS) $(RAW_DATA) $(MODEL_FITS) $(DIAGNOSTICS) $(PREDICTIONS) $(SCORES)
+all: $(SETTINGS) $(DATA) $(FITS) $(DIAGNOSTICS) $(FORECASTS) $(SCORES) $(DATA_PLOT)
 
 viz:
 	streamlit run scripts/viz.py -- \
-	--obs=$(RAW_DATA) --pred=$(PREDICTIONS) --score=$(SCORES) --config=$(CONFIG)
+	--obs=$(DATA) --pred=$(FORECASTS) --score=$(SCORES) --config=$(CONFIG)
 
-$(SCORES): scripts/eval.py $(PREDICTIONS) $(RAW_DATA)
+$(SCORES): scripts/eval.py $(FORECASTS) $(DATA)
 	python $< \
-		--pred=$(PREDICTIONS) --obs=$(RAW_DATA) --config=$(CONFIG) \
+		--pred=$(FORECASTS) --obs=$(DATA) --config=$(CONFIG) \
 		--output=$@
 
-$(PREDICTIONS): scripts/forecast.py $(RAW_DATA) $(MODEL_FITS) $(CONFIG)
-	python $< --input=$(RAW_DATA) --models=$(MODEL_FITS) --config=$(CONFIG) \
+$(FORECASTS): scripts/forecast.py $(DATA) $(FITS) $(CONFIG)
+	python $< --data=$(DATA) --models=$(FITS) --config=$(CONFIG) \
 	--output=$@
 
-$(DIAGNOSTICS): scripts/diagnostics.py $(MODEL_FITS) $(CONFIG)
-	python $< --input=$(MODEL_FITS) --config=$(CONFIG) --output=$@
+$(DIAGNOSTICS): scripts/diagnostics.py $(FITS) $(CONFIG)
+	python $< --input=$(FITS) --config=$(CONFIG) --output=$@
 
-$(MODEL_FITS): scripts/fit.py $(RAW_DATA) $(CONFIG)
-	python $< --input=$(RAW_DATA) --config=$(CONFIG) --output=$@
+$(FITS): scripts/fit.py $(DATA) $(CONFIG)
+	python $< --data=$(DATA) --config=$(CONFIG) --output=$@
 
-$(RAW_DATA): scripts/preprocess.py $(CONFIG)
-	python $< --config=$(CONFIG) --output=$@
+$(DATA_PLOT): scripts/describe_data.py $(DATA)  # recipe passes the target's own directory as --output_dir
+	python $< --input=$(DATA) --output_dir=$(dir $@)
+
+$(DATA): scripts/preprocess.py $(RAW_DATA) $(CONFIG)
+	python $< --config=$(CONFIG) --input=$(RAW_DATA) --output=$@
 
 $(SETTINGS): $(CONFIG)
 	mkdir -p $(SETTINGS)
 	cp $(CONFIG) $(SETTINGS)
 
 clean:
-	rm -r $(SETTINGS) $(RAW_DATA) $(MODEL_FITS) $(DIAGNOSTICS) $(PREDICTIONS) $(SCORES)
-
-nis:
-	python -c "import nisapi"
-	python -m nisapi cache --app-token=$(TOKEN)
-
-delete_nis:
-	python -c "import nisapi"
-	python -m nisapi delete
+	rm -rf $(SETTINGS) $(DATA) $(FITS) $(DIAGNOSTICS) $(FORECASTS) $(SCORES)
@@ -1,44 +1,50 @@
 # Immunization uptake projections
 
-This repo contains statistical tools to predict the uptake of immunizations (primarily vaccines and boosters). The three primary steps are:
+This repo contains statistical tools to predict the uptake of immunizations (primarily vaccines and boosters).
 
-1. Import data sets on past uptake and cast them into a standardize format
-2. Fit a variety of models that both capture past uptake as well as project future uptake, and
-3. Evaluate model projections against realized uptake.
+## Getting started
 
-All three steps are currently under development.
+1. Read the docs at <https://cdcgov.github.io/cfa-immunization-uptake-projection>, or build them locally with `mkdocs serve`
+1. This project uses [`uv`](https://docs.astral.sh/uv/) for environment and dependency management. Ensure you can `uv sync`. Use the uv-managed virtual environment (e.g., by prepending `uv run`).
+1. Run the [vignette](#vignette).
 
-This approach is applicable to seasonal adult immunizations. Each year, the uptake process starts afresh, and individuals' transitions across age groups are not relevant.
+## Vignette
 
-## Data sources
+The vignette demonstrates a workflow using this package:
 
-Use <https://github.com/CDCgov/nis-py-api> for access to the NIS data.
+1. Fit a model to uptake data from past seasons
+1. Use it to forecast future uptake data in the latest season
+1. Evaluate forecasts against observed values
 
-## Getting started
+### Data source
+
+For convenience, the raw data are tracked in this repo under `data/`, which includes the script `get_nis.py`, used to collect that data with [`nis-py-api`](https://github.com/CDCgov/nis-py-api). These are estimates of seasonal flu vaccine coverage, tracked monthly from the 2009/2010 to 2022/2023 seasons, from the [National Immunization Survey](https://www.cdc.gov/nis/about/index.html).
+
+### Running the vignette
+
+1. Copy `scripts/config_template.yaml` to `scripts/config.yaml`. This config can be modified; see the [file structure](#config-file-structure) below.
+1. Run `make` to run the model fitting and forecasting pipeline. Each run of the pipeline is assigned a `RUN_ID`. When a new `RUN_ID` is given, a new subfolder will be created inside each of the six `output/` folders listed in the next step to store the corresponding outputs. When an existing `RUN_ID` is given, the contents of that `RUN_ID`'s existing subfolders will be overwritten, assuming the pipeline inputs have changed since the last run. `RUN_ID` can be assigned in line 1 of the Makefile or directly on the command line with `make RUN_ID=name_of_run`.
+1. Inspect the `output/` subfolders:
+   - `settings`: a copy of the config.
+   - `data`: the pre-processed data.
+   - `fits`: the fit model object(s).
+   - `diagnostics`: diagnostic plots and tables for the desired model(s) and forecast date(s).
+   - `forecasts`: posterior predictions and forecasts.
+   - `scores`: evaluation scores comparing model structures and/or forecast dates.
+1. Run `make viz` to open a streamlit app in a web browser, which shows the individual forecast trajectories, credible intervals, and evaluation scores, with options of dimensions and filters to customize the visualization.
+1. Optionally, run `make clean` to remove all outputs for a particular `RUN_ID`.
+
+### Config file structure
 
-1. Either set up a virtual environment and install all dependencies with `uv sync` and then enter the virtual environment (with `.venv/Scripts/activate`, `.venv/bin/activate`, or similar), or else remember to prepend each of your command-line entries with `uv run` (e.g. `uv run make nis`).
-2. Get a [Socrata app token](https://github.com/CDCgov/nis-py-api?tab=readme-ov-file#getting-started) and save it in `scripts/socrata_app_token.txt`.
-3. Cache NIS data with `make nis`.
-4. Copy the config template in `scripts/config_template.yaml` to `scripts/config.yaml` and fill in the necessary fields.
-    - data: specify the vaccination uptake data to use, including a de facto annual start of the disease season, filters for rows and columns to keep, and grouping factors by which to partition forecasts.
-    - forecast_timeframe: specify the start and the end of the forecast period and the interval between reference dates in the forecast (using the [polars string language](https://docs.pola.rs/api/python/dev/reference/expressions/api/polars.date_range.html), e.g., `7d`).
-    - evaluation_timeframe: specify the interval between forecast dates if multiple forecasts are desired (sharing the same end of the forecast period). This will create different forecast horizons, which can be compared with evaluation scores. If blank, no evaluation score will not be computed.
-    - models: specify the name of the model (refer to `iup.models`), random seed, initial values of parameters, and parameters to use NUTS kernel in MCMC run.
-    - scores: specify the quantile of the posterior forecasts to use for evaluation, the date(s) on which to compute absolute difference, and any additional evaluation metrics (e.g. mean squared prediction error as `mspe`).
-    - forecast_plots: specify the credible interval (in fractional terms) and number of randomly chosen trajectories to show on forecast plots.
-    - diagnostics: specify the model (refer to `iup.models`) and the range of forecast dates (i.e. a list of earliest and latest) on which to perform diagnostics, as well as the types of plots and tables to create (refer to `iup.diagnostics`).
-5. Run `make all` to run the model fitting and forecasting pipeline. This will create six `output/` subfolders:
-    - `settings`: a copy of the config.
-    - `data`: the pre-processed data.
-    - `fits`: the fit model object(s).
-    - `diagnostics`: diagnostic plots and tables for the desired model(s) and forecast date(s).
-    - `forecasts`: posterior predictions and forecasts.
-    - `scores`: evaluation scores comparing model structures and/or forecast dates.
-    Each run of the pipeline is assigned a `RUN_ID`. When a new `RUN_ID` is given, a new subfolder will be created inside each of the above six folders to store the corresponding outputs. When an existing `RUN_ID` is given, the contents of that `RUN_ID`'s existing subfolders will be overwritten, assuming the pipeline inputs have changed since the last run. `RUN_ID` can be assigned in line 1 of the Makefile or directly in the command line `make all RUN_ID=name_of_run`.
-6. Run `make viz` to open a streamlit app in web browser, which shows the individual forecast trajectories, credible intervals, and evaluation scores, with options of dimensions and filters to customize the visualization.
-7. Run `make clean` to remove all outputs for a particular `RUN_ID` and `make delete_nis` to delete the NIS data from the cache.
-
-#### Package workflow:
+- data: specify the vaccination uptake data to use, including a de facto annual start of the disease season, filters for rows and columns to keep, and grouping factors by which to partition forecasts.
+- forecast_timeframe: specify the start and the end of the forecast period and the interval between reference dates in the forecast (using the [polars string language](https://docs.pola.rs/api/python/dev/reference/expressions/api/polars.date_range.html), e.g., `7d`).
+- evaluation_timeframe: specify the interval between forecast dates if multiple forecasts are desired (sharing the same end of the forecast period). This will create different forecast horizons, which can be compared with evaluation scores. If blank, no evaluation scores will be computed.
+- models: specify the name of the model (refer to `iup.models`), random seed, initial values of parameters, and parameters for the NUTS kernel used in the MCMC run.
+- scores: specify the quantile of the posterior forecasts to use for evaluation, the date(s) on which to compute absolute difference, and any additional evaluation metrics (e.g. mean squared prediction error as `mspe`).
+- forecast_plots: specify the credible interval (in fractional terms) and number of randomly chosen trajectories to show on forecast plots.
+- diagnostics: specify the model (refer to `iup.models`) and the range of forecast dates (i.e. a list of earliest and latest) on which to perform diagnostics, as well as the types of plots and tables to create (refer to `iup.diagnostics`).
+
+### Vignette workflow
 
 ```mermaid
 
@@ -103,7 +109,6 @@ config --> diagnostics.py
 config --> forecast.py
 config --> eval.py
 
-
 style nis_data fill: #8451b5
 style forecast fill: #8451b5
 style scores fill: #8451b5
@@ -118,68 +123,44 @@ style diagnostic_plot fill: #b46060
 style proj_plot fill: #b46060
 style pred_summary fill: #b46060
 style score_plot fill: #b46060
-
-
 ```
 
 ## Project admins
 
-- Edward Schrom (CDC/CFA/Predict) <tec0@cdc.gov>
+- Scott Olesen (CDC/CFA/Predict) <ulp7@cdc.gov>
+
+## Disclaimers
 
-## General Disclaimer
+### General Disclaimer
 
 This repository was created for use by CDC programs to collaborate on public health related projects in support of the [CDC mission](https://www.cdc.gov/about/organization/mission.htm). GitHub is not hosted by the CDC, but is a third party website used by CDC and its partners to share information and collaborate on software. CDC use of GitHub does not imply an endorsement of any one particular service, product, or enterprise.
 
-## Public Domain Standard Notice
+### Public Domain Standard Notice
 
-This repository constitutes a work of the United States Government and is not
-subject to domestic copyright protection under 17 USC § 105. This repository is in
-the public domain within the United States, and copyright and related rights in
-the work worldwide are waived through the [CC0 1.0 Universal public domain dedication](https://creativecommons.org/publicdomain/zero/1.0/).
-All contributions to this repository will be released under the CC0 dedication. By
-submitting a pull request you are agreeing to comply with this waiver of
-copyright interest.
+This repository constitutes a work of the United States Government and is not subject to domestic copyright protection under 17 USC § 105. This repository is in the public domain within the United States, and copyright and related rights in the work worldwide are waived through the [CC0 1.0 Universal public domain dedication](https://creativecommons.org/publicdomain/zero/1.0/). All contributions to this repository will be released under the CC0 dedication. By submitting a pull request you are agreeing to comply with this waiver of copyright interest.
 
-## License Standard Notice
+### License Standard Notice
 
 This repository is licensed under ASL v2 or later.
 
-This source code in this repository is free: you can redistribute it and/or modify it under
-the terms of the Apache Software License version 2, or (at your option) any
-later version.
+This source code in this repository is free: you can redistribute it and/or modify it under the terms of the Apache Software License version 2, or (at your option) any later version.
 
-This source code in this repository is distributed in the hope that it will be useful, but WITHOUT ANY
-WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
-PARTICULAR PURPOSE. See the Apache Software License for more details.
+This source code in this repository is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the Apache Software License for more details.
 
-You should have received a copy of the Apache Software License along with this
-program. If not, see http://www.apache.org/licenses/LICENSE-2.0.html
+You should have received a copy of the Apache Software License along with this program. If not, see http://www.apache.org/licenses/LICENSE-2.0.html
 
 The source code forked from other open source projects will inherit its license.
 
-## Privacy Standard Notice
+### Privacy Standard Notice
 
-This repository contains only non-sensitive, publicly available data and
-information. All material and community participation is covered by the
-[Disclaimer](https://github.com/CDCgov/template/blob/master/DISCLAIMER.md)
-and [Code of Conduct](https://github.com/CDCgov/template/blob/master/code-of-conduct.md).
-For more information about CDC's privacy policy, please visit [http://www.cdc.gov/other/privacy.html](https://www.cdc.gov/other/privacy.html).
+This repository contains only non-sensitive, publicly available data and information. All material and community participation is covered by the [Disclaimer](https://github.com/CDCgov/template/blob/master/DISCLAIMER.md) and [Code of Conduct](https://github.com/CDCgov/template/blob/master/code-of-conduct.md). For more information about CDC's privacy policy, please visit [http://www.cdc.gov/other/privacy.html](https://www.cdc.gov/other/privacy.html).
 
-## Contributing Standard Notice
+### Contributing Standard Notice
 
-Anyone is encouraged to contribute to the repository by [forking](https://help.github.com/articles/fork-a-repo)
-and submitting a pull request. (If you are new to GitHub, you might start with a
-[basic tutorial](https://help.github.com/articles/set-up-git).) By contributing
-to this project, you grant a world-wide, royalty-free, perpetual, irrevocable,
-non-exclusive, transferable license to all users under the terms of the
-[Apache Software License v2](http://www.apache.org/licenses/LICENSE-2.0.html) or
-later.
+Anyone is encouraged to contribute to the repository by [forking](https://help.github.com/articles/fork-a-repo) and submitting a pull request. (If you are new to GitHub, you might start with a [basic tutorial](https://help.github.com/articles/set-up-git).) By contributing to this project, you grant a world-wide, royalty-free, perpetual, irrevocable, non-exclusive, transferable license to all users under the terms of the [Apache Software License v2](http://www.apache.org/licenses/LICENSE-2.0.html) or later.
 
-All comments, messages, pull requests, and other submissions received through
-CDC including this GitHub page may be subject to applicable federal law, including but not limited to the Federal Records Act, and may be archived. Learn more at [http://www.cdc.gov/other/privacy.html](http://www.cdc.gov/other/privacy.html).
+All comments, messages, pull requests, and other submissions received through CDC including this GitHub page may be subject to applicable federal law, including but not limited to the Federal Records Act, and may be archived. Learn more at [http://www.cdc.gov/other/privacy.html](http://www.cdc.gov/other/privacy.html).
 
-## Records Management Standard Notice
+### Records Management Standard Notice
 
-This repository is not a source of government records but is a copy to increase
-collaboration and collaborative potential. All government records will be
-published through the [CDC web site](http://www.cdc.gov).
+This repository is not a source of government records but is a copy to increase collaboration and collaborative potential. All government records will be published through the [CDC web site](http://www.cdc.gov).
@@ -0,0 +1,35 @@
+"""
+Download data from https://data.cdc.gov/Flu-Vaccinations/Influenza-Vaccination-Coverage-for-All-Ages-6-Mont/vh55-3he6/about_data
+"""
+
+import nisapi
+import polars as pl
+
+data = (  # flu rows for adults (>=18 years), monthly, at nation and admin1 level
+    nisapi.get_nis()
+    .filter(  # all predicates passed to one filter call must hold (combined with AND)
+        pl.col("vaccine") == pl.lit("flu"),
+        pl.col("geography_type").is_in(["nation", "admin1"]),
+        pl.col("domain_type") == pl.lit("age & possible risk"),
+        pl.col("domain") == pl.lit(">=18 years"),
+        pl.col("time_type") == pl.lit("month"),
+        pl.col("indicator_type") == pl.lit("received a vaccination"),
+        pl.col("indicator") == pl.lit("yes"),
+        pl.col("id") == pl.lit("vh55-3he6"),  # same dataset id as in the URL in the module docstring
+    )
+    .select(  # keep only these columns in the saved file
+        [
+            "geography_type",
+            "geography",
+            "time_end",
+            "estimate",
+            "lci",  # presumably the lower confidence bound of the estimate - TODO confirm
+            "uci",  # presumably the upper confidence bound of the estimate - TODO confirm
+            "sample_size",
+        ]
+    )
+    .sort(["geography_type", "geography", "time_end"])
+    .collect()  # NOTE(review): implies get_nis() returns a polars LazyFrame - confirm against nisapi
+)
+
+data.write_parquet("data/raw.parquet")  # relative path: run from the repo root so this matches the Makefile's RAW_DATA