|
1 | 1 | """Generates a comprehensive report of the data stored in the Climate Database.""" |
2 | 2 |
|
| 3 | +from collections.abc import Collection |
3 | 4 | from pathlib import Path |
| 5 | +from typing import Any |
4 | 6 |
|
5 | 7 | import mkdocs_gen_files |
6 | 8 | import pandas as pd |
7 | 9 |
|
| 10 | +from climate_data import constants as cdc |
8 | 11 | from climate_data.data import ClimateData |
| 12 | +from climate_data.generate.scenario_annual import TRANSFORM_MAP as ANNUAL_TRANSFORM_MAP |
| 13 | +from climate_data.generate.scenario_daily import TRANSFORM_MAP as DAILY_TRANSFORM_MAP |
9 | 14 |
|
10 | 15 | nav = mkdocs_gen_files.Nav() # type: ignore[attr-defined, no-untyped-call] |
11 | 16 | doc_root = Path() |
12 | 17 |
|
13 | | -cdata = ClimateData() |
| 18 | +cdata = ClimateData(create_root=False) |
| 19 | + |
| 20 | + |
def list_to_str(lst: Collection[Any]) -> str:
    """Render a collection as a comma-separated list of inline-code spans.

    Each item is wrapped in backticks so it renders as inline code in the
    generated Markdown. Strings are used verbatim. Any other item is
    labelled by its ``name`` attribute when it has one, and by the item
    itself otherwise (so plain values such as integers no longer raise
    ``AttributeError``).

    Iterating a mapping yields its keys, so passing a dict lists its keys.

    Args:
        lst: The items to render.

    Returns:
        The backtick-wrapped labels joined by ``", "``; an empty string when
        ``lst`` is empty.
    """
    labels = (
        item if isinstance(item, str) else getattr(item, "name", item)
        for item in lst
    )
    return ", ".join(f"`{label}`" for label in labels)
| 29 | + |
14 | 30 |
|
15 | 31 | # Extracted data |
16 | 32 |
|
|
38 | 54 | │ └── _Other Data Sources_/ |
39 | 55 | └── {cdata.results.stem}/ |
40 | 56 | ├── {cdata.annual_results.stem}/ |
| 57 | + │ ├── archive/ |
| 58 | + │ │ ├── historical/ |
| 59 | + │ │ │ └── {{ANNUAL_VARIABLE}}/ |
| 60 | + │ │ │ └── {{YEAR}}.nc |
| 61 | + │ │ └── {{SCENARIO}}/ |
| 62 | + │ │ └── {{ANNUAL_VARIABLE}}/ |
| 63 | + │ │ └── {{YEAR}}.nc |
| 64 | + │ ├── {cdata.raw_annual_results.stem}/ |
| 65 | + │ │ ├── {cdata.compiled_annual_results.stem}/ |
| 66 | + │ │ │ └── {{SCENARIO}}/ |
| 67 | + │ │ │ └── {{ANNUAL_VARIABLE}}/ |
| 68 | + │ │ │ └── {{GCM_MEMBER}}.nc |
| 69 | + │ │ ├── historical/ |
| 70 | + │ │ │ └── {{ANNUAL_VARIABLE}}/ |
| 71 | + │ │ │ └── {{YEAR}}_era5.nc |
| 72 | + │ │ └── {{SCENARIO}}/ |
| 73 | + │ │ └── {{ANNUAL_VARIABLE}}/ |
| 74 | + │ │ └── {{YEAR}}_{{GCM_MEMBER}}.nc |
41 | 75 | │ └── {{SCENARIO}}/ |
42 | 76 | │ └── {{ANNUAL_VARIABLE}}/ |
43 | | - │ └── {{YEAR}}_{{DRAW}}.nc |
| 77 | + │ └── {{DRAW}}.nc |
44 | 78 | ├── {cdata.daily_results.stem}/ |
45 | 79 | │ └── {{SCENARIO}}/ |
46 | 80 | │ └── {{DAILY_VARIABLE}}/ |
47 | | - │ └── {{YEAR}}.nc |
| 81 | + │ ├── {{YEAR}}.nc |
| 82 | + │ └── reference.nc |
48 | 83 | └── {cdata.results_metadata.stem}/ |
49 | 84 |
|
50 | 85 | ``` |
|
79 | 114 | it significantly smaller and faster to work with. This is the dataset we use to supplement the ERA5-Land data over regions where the land data is |
80 | 115 | missing or incomplete. We also use this dataset for variables that are not available in the ERA5-Land dataset. |
81 | 116 |
|
82 | | -#### Storage and Naming Conventions |
| 117 | +!!! note "Storage and Naming Conventions" |
| 118 | + **File Pattern:** `{cdata.extracted_era5}/{{ERA5_DATASET}}_{{ERA5_VARIABLE}}_{{YEAR}}_{{MONTH}}.nc` |
83 | 119 |
|
84 | | - - Storage Root: `{cdata.extracted_era5}` |
85 | | - - Naming Convention: `{{ERA5_DATASET}}_{{ERA5_VARIABLE}}_{{YEAR}}_{{MONTH}}.nc` |
86 | | - * `{{ERA5_DATASET}}`: One of `reanalysis-era5-land`, or `reanalysis-era5-single-levels`. |
87 | | - * `{{ERA5_VARIABLE}}`: The variable being extracted (variable names can be found on the pages linked above). |
88 | | - * `{{YEAR}}` and `{{MONTH}}`: The year and month of the data being extracted. |
| 120 | + **Naming Conventions** |
| 121 | +
|
| 122 | + - `{{ERA5_DATASET}}`: One of `reanalysis-era5-land` or `reanalysis-era5-single-levels`. |
| 123 | + - `{{ERA5_VARIABLE}}`: The variable being extracted (one of {list_to_str(cdc.ERA5_VARIABLES)}). |
| 124 | + - `{{YEAR}}` and `{{MONTH}}`: The year and month of the data being extracted. `{{YEAR}}` ranges from `{cdc.HISTORY_YEARS[0]}` to `{cdc.HISTORY_YEARS[-1]}`. |
89 | 125 |
|
90 | 126 | """ |
91 | 127 |
|
|
109 | 145 | is a collaborative effort to compare climate models across the globe. The data is organized into |
110 | 146 | different variables, scenarios, and sources. |
111 | 147 |
|
| 148 | +!!! note "Storage and Naming Conventions" |
| 149 | + **File Pattern:** `{cdata.extracted_cmip6}/{{CMIP6_VARIABLE}}_{{CMIP6_EXPERIMENT}}_{{CMIP6_SOURCE}}_{{VARIANT}}.nc` |
| 150 | +
|
| 151 | + **Naming Conventions** |
| 152 | +
|
| 153 | + - `{{CMIP6_VARIABLE}}`: The variable being extracted (one of {list_to_str(cdc.CMIP6_VARIABLES)}). |
| 154 | + - `{{CMIP6_EXPERIMENT}}`: The scenario being extracted (one of {list_to_str(cdc.CMIP6_EXPERIMENTS)}). |
| 155 | + - `{{CMIP6_SOURCE}}`: The source model for the data. A source model is a particular model from a particular institution, e.g. `BCC-CSM2-MR`. |
| 156 | + - `{{VARIANT}}`: The variant of the model, which is a particular run of the model with specific initial and boundary conditions and forcing scenarios. |
| 157 | +
|
112 | 158 | #### Model Inclusion |
113 | 159 |
|
114 | 160 | We use a subset of the CMIP6 data in our analysis following a [model evaluation published in Nature](https://www.nature.com/articles/s41597-023-02549-6) |
|
137 | 183 |
|
138 | 184 | {"\n\t".join(source_table.to_markdown().split("\n"))} |
139 | 185 |
|
140 | | -#### Storage and Naming Conventions |
141 | | -
|
142 | | - - Storage Root: `{cdata.extracted_cmip6}` |
143 | | - - Naming Convention: `{{CMIP6_VARIABLE}}_{{CMIP6_EXPERIMENT}}_{{CMIP6_SOURCE}}_{{VARIANT}}.nc` |
144 | | - * `{{CMIP6_VARIABLE}}`: The variable being extracted (variable names can be found in the [CMIP6 database](https://airtable.com/appYNLuWqAgzLbhSq/shrKcLEdssxb8Yvcp/tblL7dJkC3vl5zQLb)). |
145 | | - * `{{CMIP6_EXPERIMENT}}`: The scenario being extracted (one of `ssp126`, `ssp245`, or `ssp585`). |
146 | | - * `{{CMIP6_SOURCE}}`: The source model for the data. A source model is a particular model from a particular institution, e.g. `BCC-CSM2-MR`. |
147 | | - * `{{VARIANT}}`: The variant of the model, which is a particular run of the model with specific initial and boundary conditions and forcing scenarios. |
148 | 186 |
|
149 | 187 | ??? example "Variant Labels" |
150 | 188 |
|
|
178 | 216 |
|
179 | 217 | processed_data_content = f"""## Processed Data |
180 | 218 |
|
181 | | -The processed data is stored in the `{cdata.results}` directory, organized by scenario, variable, and year. |
182 | | -We generally only generate annual results, as storing daily results for all models and all variables would be |
183 | | -prohibitively expensive. |
| 219 | +The processed data is stored in the `{cdata.results}` directory, organized by scenario and variable. |
| 220 | +
|
| 221 | +There are two types of processed data: daily and annual. Daily data is stored for historical data only (and for the |
| 222 | +`mean_temperature` variable for CMIP6 data). We generally only generate annual results, as storing daily results |
| 223 | +for all models and all variables would be prohibitively expensive. Daily data is stored in the `{cdata.daily_results}` directory. |
| 224 | +
|
| 225 | +!!! note "Daily Data Storage and Naming Conventions" |
| 226 | + **File Patterns:** |
| 227 | +
|
| 228 | + - `{cdata.daily_results}/historical/{{DAILY_VARIABLE}}/{{YEAR}}.nc` - Daily data for historical variables. |
| 229 | + - `{cdata.daily_results}/historical/{{DAILY_VARIABLE}}/reference.nc` - Reference climatology data for historical variables. |
| 230 | + - `{cdata.daily_results}/{{SCENARIO}}/mean_temperature/{{YEAR}}.nc` - Daily data for the `mean_temperature` variable for CMIP6 scenarios. |
| 231 | +
|
| 232 | + **Naming Conventions** |
| 233 | +
|
| 234 | + - `{{SCENARIO}}`: The CMIP6 scenario being stored (one of {list_to_str(cdc.CMIP6_EXPERIMENTS)}). |
| 235 | + - `{{DAILY_VARIABLE}}`: The name of the variable being stored (one of {list_to_str(DAILY_TRANSFORM_MAP)}). |
| 236 | + - `{{YEAR}}`: The year of the data being stored. In `historical` subdirectories, this runs from `{cdc.HISTORY_YEARS[0]}` to `{cdc.HISTORY_YEARS[-1]}`. |
| 237 | + In scenario subdirectories, this runs from `{cdc.FORECAST_YEARS[0]}` to `{cdc.FORECAST_YEARS[-1]}`. |
| 238 | +
|
| 239 | +The annual data is stored in the `{cdata.annual_results}` directory. Annual data is stored by draw number, with each draw |
| 240 | +representing a random sample of a Global Climate Model (GCM) and variant from CMIP6. Each draw is a full annual time |
| 241 | +series from 1950 to 2100 and collates the historical ERA5 data with the CMIP6 scenario data. |
| 242 | +
|
| 243 | +!!! note "Annual Data Storage and Naming Conventions" |
| 244 | + **Archive File Patterns** |
| 245 | +
|
| 246 | + These store the prior results for the climate database to ease transition to the new draw-level outputs. |
| 247 | + They use an older version of the CMIP6 ensemble and represent an ensemble mean. They should be transitioned |
| 248 | + to the new draw-level outputs as soon as possible. |
| 249 | +
|
| 250 | + - `{cdata.annual_results}/archive/historical/{{ANNUAL_VARIABLE}}/{{YEAR}}.nc` - Archived historical annual data using the ERA5 dataset. |
| 251 | + - `{cdata.annual_results}/archive/{{SCENARIO}}/{{ANNUAL_VARIABLE}}/{{YEAR}}.nc` - Archived scenario annual data using the CMIP6 dataset and the original point estimate ensemble. |
| 252 | +
|
| 253 | + **Raw and Compiled File Patterns** |
| 254 | +
|
| 255 | + - `{cdata.raw_annual_results}/historical/{{ANNUAL_VARIABLE}}/{{YEAR}}_era5.nc` - Raw historical annual data using the ERA5 dataset. |
| 256 | + - `{cdata.raw_annual_results}/{{SCENARIO}}/{{ANNUAL_VARIABLE}}/{{YEAR}}_{{GCM_MEMBER}}.nc` - Raw scenario annual data using the CMIP6 dataset. Each dataset is a bias-corrected and downscaled GCM-member. |
| 257 | + - `{cdata.compiled_annual_results}/{{SCENARIO}}/{{ANNUAL_VARIABLE}}/{{GCM_MEMBER}}.nc` - Annual compilations of the raw scenario data for each GCM-member. |
| 258 | +
|
| 259 | + **Draw File Pattern:** `{cdata.annual_results}/{{SCENARIO}}/{{ANNUAL_VARIABLE}}/{{DRAW}}.nc` |
| 260 | +
|
| 261 | + **Naming Conventions** |
| 262 | +
|
| 263 | + - `{{ANNUAL_VARIABLE}}`: The name of the variable being stored (one of {list_to_str(ANNUAL_TRANSFORM_MAP)}). |
| 264 | + - `{{SCENARIO}}`: The scenario being stored (one of {list_to_str(cdc.CMIP6_EXPERIMENTS)}). |
| 265 | + - `{{YEAR}}`: The year of the data being stored. In `historical` subdirectories, this runs from `{cdc.HISTORY_YEARS[0]}` to `{cdc.HISTORY_YEARS[-1]}`. |
| 266 | + In scenario subdirectories, this runs from `{cdc.FORECAST_YEARS[0]}` to `{cdc.FORECAST_YEARS[-1]}`. |
| 267 | + - `{{GCM_MEMBER}}`: The GCM member being stored. This is a unique identifier for each GCM member combining the source model and variant. |
| 268 | + - `{{DRAW}}`: The draw number of the data being stored as a three digit string (e.g. `027`). |
184 | 269 |
|
185 | | -### Storage and Naming Conventions |
186 | 270 |
|
187 | | - - Daily Storage Root: `{cdata.daily_results}` |
188 | | - - Naming Convention: `{{SCENARIO}}/{{DAILY_VARIABLE}}/{{YEAR}}.nc` (historical data only) |
189 | | - - `{{SCENARIO}}`: Generally, only historical data is available at the daily level, so this will be `historical`. |
190 | | - - `{{DAILY_VARIABLE}}`: The name of the variable being stored. |
191 | | - - `{{YEAR}}`: The year of the data being stored. |
192 | | - - Annual Storage Root: `{cdata.results}` |
193 | | - - Naming Convention: `{{SCENARIO}}/{{ANNUAL_VARIABLE}}/{{YEAR}}.nc` or `{{SCENARIO}}/{{ANNUAL_VARIABLE}}/{{YEAR}}_{{DRAW}}.nc` |
194 | 271 |
|
195 | 272 | ### Pipeline Stages |
196 | 273 |
|
197 | 274 | The processing pipelines turn the extracted [ERA5](#era5-data) and [CMIP6](#cmip6-data) data into a coherent set of |
198 | 275 | climate variables with a consistent resolution, time scale, and data storage format. The pipeline is run |
199 | | -with the `cdrun` command (see [Installation](../installation.md) for installation instructions). The pipeline |
| 276 | +with the `cdrun` command (see [Installation](./installation.md) for installation instructions). The pipeline |
200 | 277 | has the following steps: |
201 | 278 |
|
202 | 279 | 1. **Historical Daily** (`cdrun generate historical_daily`): This processes the hourly ERA5-Land and ERA5-Single-Level |
|
210 | 287 | the scenario data by serving as a seasonally-aware reference point we can intercept shift to. |
211 | 288 | 3. **Scenario Inclusion** (`cdrun generate scenario_inclusion`): This produces a set of metadata that determines |
212 | 289 | which CMIP sources and variants are used to generate scenario draws. This is the second stage scenario determination. |
213 | | - When we [extract CMIP6 data](####model-inclusion), we cannot determine the year range of the data until it is extracted. |
| 290 | + When we [extract CMIP6 data](#model-inclusion), we cannot determine the year range of the data until it is extracted. |
214 | 291 | This stage determines which models are included based on the year range of the data and writes this information to a file |
215 | 292 | in `{cdata.results_metadata}`. |
216 | 293 | 4. **Scenario Daily** (`cdrun generate scenario_daily`): This produces scenario projections from the CMIP6 data by dynamical |
|
0 commit comments