Skip to content

Commit f604ae0

Browse files
authored
Merge pull request #27 from ihmeuw/feature/draw-generation
Feature/draw generation
2 parents 4f67610 + 1d13df2 commit f604ae0

16 files changed

+598
-218
lines changed

scripts/gen_data_pages.py

Lines changed: 107 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,32 @@
11
"""Generates a comprehensive report of the data stored in the Climate Database."""
22

3+
from collections.abc import Collection
34
from pathlib import Path
5+
from typing import Any
46

57
import mkdocs_gen_files
68
import pandas as pd
79

10+
from climate_data import constants as cdc
811
from climate_data.data import ClimateData
12+
from climate_data.generate.scenario_annual import TRANSFORM_MAP as ANNUAL_TRANSFORM_MAP
13+
from climate_data.generate.scenario_daily import TRANSFORM_MAP as DAILY_TRANSFORM_MAP
914

1015
nav = mkdocs_gen_files.Nav() # type: ignore[attr-defined, no-untyped-call]
1116
doc_root = Path()
1217

13-
cdata = ClimateData()
18+
cdata = ClimateData(create_root=False)
19+
20+
21+
def list_to_str(lst: Collection[Any]) -> str:
    """Render a collection as a comma-separated list of Markdown code spans.

    String items are used verbatim; every other item contributes its
    ``name`` attribute instead.

    Args:
        lst: The items to render. Iterating a mapping yields its keys, so a
            dict of string keys renders as the list of those keys.

    Returns:
        Each item wrapped in backticks and joined with ``", "``. An empty
        collection produces an empty string.

    Raises:
        AttributeError: If a non-string item has no ``name`` attribute.
    """
    # A single generator feeds ``join`` directly; no intermediate list needed.
    return ", ".join(
        f"`{item if isinstance(item, str) else item.name}`" for item in lst
    )
29+
1430

1531
# Extracted data
1632

@@ -38,13 +54,32 @@
3854
│ └── _Other Data Sources_/
3955
└── {cdata.results.stem}/
4056
├── {cdata.annual_results.stem}/
57+
│ ├── archive/
58+
│ │ ├── historical/
59+
│ │ │ └── {{ANNUAL_VARIABLE}}/
60+
│ │ │ └── {{YEAR}}.nc
61+
│ │ └── {{SCENARIO}}/
62+
│ │ └── {{ANNUAL_VARIABLE}}/
63+
│ │ └── {{YEAR}}.nc
64+
│ ├── {cdata.raw_annual_results.stem}/
65+
│ │ ├── {cdata.compiled_annual_results.stem}/
66+
│ │ │ └── {{SCENARIO}}/
67+
│ │ │ └── {{ANNUAL_VARIABLE}}/
68+
│ │ │ └── {{GCM_MEMBER}}.nc
69+
│ │ ├── historical/
70+
│ │ │ └── {{ANNUAL_VARIABLE}}/
71+
│ │ │ └── {{YEAR}}_era5.nc
72+
│ │ └── {{SCENARIO}}/
73+
│ │ └── {{ANNUAL_VARIABLE}}/
74+
│ │ └── {{YEAR}}_{{GCM_MEMBER}}.nc
4175
│ └── {{SCENARIO}}/
4276
│ └── {{ANNUAL_VARIABLE}}/
43-
│ └── {{YEAR}}_{{DRAW}}.nc
77+
│ └── {{DRAW}}.nc
4478
├── {cdata.daily_results.stem}/
4579
│ └── {{SCENARIO}}/
4680
│ └── {{DAILY_VARIABLE}}/
47-
│ └── {{YEAR}}.nc
81+
│ ├── {{YEAR}}.nc
82+
│ └── reference.nc
4883
└── {cdata.results_metadata.stem}/
4984
5085
```
@@ -79,13 +114,14 @@
79114
it significantly smaller and faster to work with. This is the dataset we use to supplement the ERA5-Land data over regions where the land data is
80115
missing or incomplete. We also use this dataset for variables that are not available in the ERA5-Land dataset.
81116
82-
#### Storage and Naming Conventions
117+
!!! note "Storage and Naming Conventions"
118+
**File Pattern**: `{cdata.extracted_era5}/{{ERA5_DATASET}}_{{ERA5_VARIABLE}}_{{YEAR}}_{{MONTH}}.nc`
83119
84-
- Storage Root: `{cdata.extracted_era5}`
85-
- Naming Convention: `{{ERA5_DATASET}}_{{ERA5_VARIABLE}}_{{YEAR}}_{{MONTH}}.nc`
86-
* `{{ERA5_DATASET}}`: One of `reanalysis-era5-land`, or `reanalysis-era5-single-levels`.
87-
* `{{ERA5_VARIABLE}}`: The variable being extracted (variable names can be found on the pages linked above).
88-
* `{{YEAR}}` and `{{MONTH}}`: The year and month of the data being extracted.
120+
**Naming Conventions**
121+
122+
- `{{ERA5_DATASET}}`: One of `reanalysis-era5-land`, or `reanalysis-era5-single-levels`.
123+
- `{{ERA5_VARIABLE}}`: The variable being extracted (one of {list_to_str(cdc.ERA5_VARIABLES)}).
124+
- `{{YEAR}}` and `{{MONTH}}`: The year and month of the data being extracted. `{{YEAR}}` ranges from `{cdc.HISTORY_YEARS[0]}` to `{cdc.HISTORY_YEARS[-1]}`.
89125
90126
"""
91127

@@ -109,6 +145,16 @@
109145
is a collaborative effort to compare climate models across the globe. The data is organized into
110146
different variables, scenarios, and sources.
111147
148+
!!! note "Storage and Naming Conventions"
149+
**File Pattern:** `{cdata.extracted_cmip6}/{{CMIP6_VARIABLE}}_{{CMIP6_EXPERIMENT}}_{{CMIP6_SOURCE}}_{{VARIANT}}.nc`
150+
151+
**Naming Conventions**
152+
153+
- `{{CMIP6_VARIABLE}}`: The variable being extracted (one of {list_to_str(cdc.CMIP6_VARIABLES)}).
154+
- `{{CMIP6_EXPERIMENT}}`: The scenario being extracted (one of {list_to_str(cdc.CMIP6_EXPERIMENTS)}).
155+
- `{{CMIP6_SOURCE}}`: The source model for the data. A source model is a particular model from a particular institution, e.g. `BCC-CSM2-MR`.
156+
- `{{VARIANT}}`: The variant of the model, which is a particular run of the model with specific initial and boundary conditions and forcing scenarios.
157+
112158
#### Model Inclusion
113159
114160
We use a subset of the CMIP6 data in our analysis following a [model evaluation published in Nature](https://www.nature.com/articles/s41597-023-02549-6)
@@ -137,14 +183,6 @@
137183
138184
{"\n\t".join(source_table.to_markdown().split("\n"))}
139185
140-
#### Storage and Naming Conventions
141-
142-
- Storage Root: `{cdata.extracted_cmip6}`
143-
- Naming Convention: `{{CMIP6_VARIABLE}}_{{CMIP6_EXPERIMENT}}_{{CMIP6_SOURCE}}_{{VARIANT}}.nc`
144-
* `{{CMIP6_VARIABLE}}`: The variable being extracted (variable names can be found in the [CMIP6 database](https://airtable.com/appYNLuWqAgzLbhSq/shrKcLEdssxb8Yvcp/tblL7dJkC3vl5zQLb)).
145-
* `{{CMIP6_EXPERIMENT}}`: The scenario being extracted (one of `ssp126`, `ssp245`, or `ssp585`).
146-
* `{{CMIP6_SOURCE}}`: The source model for the data. A source model is a particular model from a particular institution, e.g. `BCC-CSM2-MR`.
147-
* `{{VARIANT}}`: The variant of the model, which is a particular run of the model with specific initial and boundary conditions and forcing scenarios.
148186
149187
??? example "Variant Labels"
150188
@@ -178,25 +216,64 @@
178216

179217
processed_data_content = f"""## Processed Data
180218
181-
The processed data is stored in the `{cdata.results}` directory, organized by scenario, variable, and year.
182-
We generally only generate annual results, as storing daily results for all models and all variables would be
183-
prohibitively expensive.
219+
The processed data is stored in the `{cdata.results}` directory, organized by scenario and variable.
220+
221+
There are two types of processed data: daily and annual. Daily data is stored for historical data only (and for the
222+
`mean_temperature` variable for CMIP6 data). We generally only generate annual results, as storing daily results
223+
for all models and all variables would be prohibitively expensive. Daily data is stored in the `{cdata.daily_results}` directory.
224+
225+
!!! note "Daily Data Storage and Naming Conventions"
226+
**File Patterns:**
227+
228+
- `{cdata.daily_results}/historical/{{DAILY_VARIABLE}}/{{YEAR}}.nc` - Daily data for historical variables.
229+
- `{cdata.daily_results}/historical/{{DAILY_VARIABLE}}/reference.nc` - Reference climatology data for historical variables.
230+
- `{cdata.daily_results}/{{SCENARIO}}/mean_temperature/{{YEAR}}.nc` - Daily data for the `mean_temperature` variable for CMIP6 scenarios.
231+
232+
**Naming Conventions**
233+
234+
- `{{SCENARIO}}`: The CMIP6 scenario being stored (one of {list_to_str(cdc.CMIP6_EXPERIMENTS)}).
235+
- `{{DAILY_VARIABLE}}`: The name of the variable being stored (one of {list_to_str(DAILY_TRANSFORM_MAP)}).
236+
- `{{YEAR}}`: The year of the data being stored. In `historical` subdirectories, this runs from `{cdc.HISTORY_YEARS[0]}` to `{cdc.HISTORY_YEARS[-1]}`.
237+
In scenario subdirectories, this runs from `{cdc.FORECAST_YEARS[0]}` to `{cdc.FORECAST_YEARS[-1]}`.
238+
239+
The annual data is stored in the `{cdata.annual_results}` directory. Annual data is stored by draw number, with each draw
240+
representing a random sample of a Global Climate Model (GCM) and variant from CMIP6. Each draw is a full annual time
241+
series from 1950 to 2100 and collates the historical ERA5 data with the CMIP6 scenario data.
242+
243+
!!! note "Annual Data Storage and Naming Conventions"
244+
**Archive File Patterns**
245+
246+
These store the prior results for the climate database to ease transition to the new draw-level outputs.
247+
They use an older version of the CMIP6 ensemble and represent an ensemble mean. They should be transitioned
248+
to the new draw-level outputs as soon as possible.
249+
250+
- `{cdata.annual_results}/archive/historical/{{ANNUAL_VARIABLE}}/{{YEAR}}.nc` - Archived historical annual data using the ERA5 dataset.
251+
- `{cdata.annual_results}/archive/{{SCENARIO}}/{{ANNUAL_VARIABLE}}/{{YEAR}}.nc` - Archived scenario annual data using the CMIP6 dataset and the original point estimate ensemble.
252+
253+
**Raw and Compiled File Patterns**
254+
255+
- `{cdata.raw_annual_results}/historical/{{ANNUAL_VARIABLE}}/{{YEAR}}_era5.nc` - Raw historical annual data using the ERA5 dataset.
256+
- `{cdata.raw_annual_results}/{{SCENARIO}}/{{ANNUAL_VARIABLE}}/{{YEAR}}_{{GCM_MEMBER}}.nc` - Raw scenario annual data using the CMIP6 dataset. Each dataset is a bias-corrected and downscaled GCM-member.
257+
- `{cdata.compiled_annual_results}/{{SCENARIO}}/{{ANNUAL_VARIABLE}}/{{GCM_MEMBER}}.nc` - Annual compilations of the raw scenario data for each GCM-member.
258+
259+
**Draw File Pattern:** `{cdata.results}/{{SCENARIO}}/{{ANNUAL_VARIABLE}}/{{DRAW}}.nc`
260+
261+
**Naming Conventions**
262+
263+
- `{{ANNUAL_VARIABLE}}`: The name of the variable being stored (one of {list_to_str(ANNUAL_TRANSFORM_MAP)}).
264+
- `{{SCENARIO}}`: The scenario being stored (one of {list_to_str(cdc.CMIP6_EXPERIMENTS)}).
265+
- `{{YEAR}}`: The year of the data being stored. In `historical` subdirectories, this runs from `{cdc.HISTORY_YEARS[0]}` to `{cdc.HISTORY_YEARS[-1]}`.
266+
In scenario subdirectories, this runs from `{cdc.FORECAST_YEARS[0]}` to `{cdc.FORECAST_YEARS[-1]}`.
267+
- `{{GCM_MEMBER}}`: The GCM member being stored. This is a unique identifier for each GCM member combining the source model and variant.
268+
- `{{DRAW}}`: The draw number of the data being stored as a three digit string (e.g. `027`).
184269
185-
### Storage and Naming Conventions
186270
187-
- Daily Storage Root: `{cdata.daily_results}`
188-
- Naming Convention: `{{SCENARIO}}/{{DAILY_VARIABLE}}/{{YEAR}}.nc` (historical data only)
189-
- `{{SCENARIO}}`: Generally, only historical data is available at the daily level, so this will be `historical`.
190-
- `{{DAILY_VARIABLE}}`: The name of the variable being stored.
191-
- `{{YEAR}}`: The year of the data being stored.
192-
- Annual Storage Root: `{cdata.results}`
193-
- Naming Convention: `{{SCENARIO}}/{{ANNUAL_VARIABLE}}/{{YEAR}}.nc` or `{{SCENARIO}}/{{ANNUAL_VARIABLE}}/{{YEAR}}_{{DRAW}}.nc`
194271
195272
### Pipeline Stages
196273
197274
The processing pipelines turn the extracted [ERA5](#era5-data) and [CMIP6](#cmip6-data) data into a coherent set of
198275
climate variables with a consistent resolution, time scale, and data storage format. The pipeline is run
199-
with the `cdrun` command (see [Installation](../installation.md) for installation instructions). The pipeline
276+
with the `cdrun` command (see [Installation](./installation.md) for installation instructions). The pipeline
200277
has the following steps:
201278
202279
1. **Historical Daily** (`cdrun generate historical_daily`): This processes the hourly ERA5-Land and ERA5-Single-Level
@@ -210,7 +287,7 @@
210287
the scenario data by serving as a seasonally-aware reference point we can intercept shift to.
211288
3. **Scenario Inclusion** (`cdrun generate scenario_inclusion`): This produces a set of metadata that determines
212289
which CMIP sources and variants are used to generate scenario draws. This is the second stage scenario determination.
213-
When we [extract CMIP6 data](####model-inclusion), we cannot determine the year range of the data until it is extracted.
290+
When we [extract CMIP6 data](#model-inclusion), we cannot determine the year range of the data until it is extracted.
214291
This stage determines which models are included based on the year range of the data and writes this information to a file
215292
in {cdata.results_metadata}.
216293
4. **Scenario Daily** (`cdrun generate scenario_daily`): This produces scenario projections from the CMIP6 data by dynamical
-10 Bytes
Binary file not shown.

src/climate_data/cli_options.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
from collections.abc import Collection
1111
from typing import ParamSpec, TypeVar
1212

13+
import click
1314
from rra_tools.cli_tools import (
1415
RUN_ALL,
1516
ClickOption,
@@ -171,6 +172,15 @@ def with_scenario(
171172
)
172173

173174

175+
def with_gcm_member() -> ClickOption[_P, _T]:
    """Build the click option decorator for selecting a GCM member.

    The option is exposed as ``--gcm-member`` (short form ``-g``) and its
    value is parsed as a plain string.

    Returns:
        The decorator produced by ``click.option``.
    """
    flag_names = ("--gcm-member", "-g")
    return click.option(
        *flag_names,
        help="GCM member to process.",
        type=click.STRING,
    )
182+
183+
174184
__all__ = [
175185
"RUN_ALL",
176186
"ClickOption",
@@ -181,6 +191,7 @@ def with_scenario(
181191
"with_draw",
182192
"with_era5_dataset",
183193
"with_era5_variable",
194+
"with_gcm_member",
184195
"with_input_directory",
185196
"with_month",
186197
"with_num_cores",

src/climate_data/constants.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@
33

44
import numpy as np
55
import xarray as xr
6-
from pydantic import BaseModel
76

87
##############
98
# File roots #
@@ -17,8 +16,7 @@
1716

1817
# Time
1918

20-
FULL_HISTORY_YEARS = [str(y) for y in range(1950, 2024)]
21-
HISTORY_YEARS = [str(y) for y in range(1990, 2024)]
19+
HISTORY_YEARS = [str(y) for y in range(1950, 2024)]
2220
REFERENCE_YEARS = HISTORY_YEARS[-5:]
2321
REFERENCE_PERIOD = slice(
2422
f"{REFERENCE_YEARS[0]}-01-01",
@@ -96,7 +94,7 @@ class _CMIP6Experiments(NamedTuple):
9694
CMIP6_EXPERIMENTS = _CMIP6Experiments()
9795

9896

99-
class CMIP6Variable(BaseModel):
97+
class CMIP6Variable(NamedTuple):
10098
name: str
10199
description: str
102100
encoding_offset: float

0 commit comments

Comments
 (0)