diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md
index b829d8b..51db7ed 100644
--- a/.github/pull_request_template.md
+++ b/.github/pull_request_template.md
@@ -1,15 +1,18 @@
 ## Thank you for contributing to ILAMB-Data. Use this template when submitting a pull request.
 
-### 🛠 Issue reference
-✨**Include a reference to the issue # here.**
-Summarize the changes you are making to the repo. Are you fixing a bug? Adding a dataset? Describe what you did in a few sentences or bullet points.
+### 🛠 Summary of changes
+✨**Include a reference to the issue # in this section**
+Summarize the changes you are making to the repo. Are you fixing a bug? Adding a dataset? Describe what you did in a few sentences or bullet points:
 
-### 🧪 Testing
+### 🧪 NetCDF Validation
 ✨**Add an `x` between the brackets to indicate basic testing was completed**
-- [ ] I inspected the outputted NetCDF and checked for obvious errors
-- [ ] I visualized the outputted NetCDF at a timestamp to check for obvious visual errors
-- [ ] I compared my `convert` script to recent existing ones
-- [ ] I attempted to create/encode the NetCDF according to CF Compliance guidelines
+- [ ] Created a new working directory (e.g., `ILAMB-DATA/GFED5/`)
+- [ ] Created a standalone python script that downloads and processes the new dataset (e.g., `ILAMB-DATA/GFED5/convert.py`)
+  - [ ] The python script outputs a netcdf (e.g., `ILAMB-DATA/GFED5/burntFractionAll_mon_GFED5_199701-202012.nc`)
+- [ ] Ran `python3 validate_dataset.py ILAMB-DATA/GFED5/burntFractionAll_mon_GFED5_199701-202012.nc` in command line and resolved any errors
+- [ ] Visually inspected the outputted netCDF for obvious issues
+  - [ ] `ncdump -h ILAMB-DATA/GFED5/burntFractionAll_mon_GFED5_199701-202012.nc` to visually inspect the netCDF header information
+  - [ ] `ncview ILAMB-DATA/GFED5/burntFractionAll_mon_GFED5_199701-202012.nc` to visually inspect the map (where applicable)
 
 ### 🧪 (Optional) Preview
 ✨**Attach an image of your dataset here**
@@ -21,51 +24,4 @@ Summarize the changes you are making to the repo. Are you fixing a bug? Adding a
 - [ ] There are no erroneous console logs, debuggers, or leftover testing code
 - [ ] There are no hard-coded paths to your local machine in the script
 - [ ] Useful headings describe sections of your code
-- [ ] Variable names are generalizable and easy to read
-
-### 📝 (Optional) CF Compliance In-Depth Checklist
-✨**Add an `x` between the brackets to ensure CF compliance**
-
-#### Dimensions
-- [ ] Dimensions include `time` with attributes/encoding:
-  - [ ] `axis` attribute is `T`
-  - [ ] `units` attribute/encoding is `days since YYYY-MM-DD HH:MM:SS`
-  - [ ] `long_name` attribute is `time`
-  - [ ] `calendar` encoding is `noleap`
-  - [ ] `bounds` encoding is `time_bounds`
-- [ ] Dimensions include `lon` with attributes:
-  - [ ] `axis` attribute is `X`
-  - [ ] `units` attribute is `degrees_east`
-  - [ ] `long_name` attribute is `longitude`
-- [ ] Dimensions include `lat` with attributes:
-  - [ ] `axis` attribute is `Y`
-  - [ ] `units` attribute is `degrees_north`
-  - [ ] `long_name` attribute is `latitude`
-- [ ] Dimensions include `nv`, which is an array of length `2` that contains the start date and end date bounding the dataset
-
-#### Data Variables and their Attributes
-- [ ] **The variable(s) for model comparison are present**
-  - [ ] the variables are linked to the `time`,`lat`, and `lon` dimensions
-  - [ ] `long_name` attribute is specified
-  - [ ] `units` attribute is specified
-  - [ ] (If necessary) `ancillary_variables` attribute is specified if an uncertainty value is provided
-  - [ ] (Optional) Float32 data type
-  - [ ] (Optional) No-data values masked as NaN
-- [ ] **If applicable, a data uncertainty variable is present** (e.g., standard_deviation or standard_error)
-  - [ ] the variable is linked to the `time`, `lat`, and `lon` dimensions
-  - [ ] `long_name` attribute is specified (e.g., cSoil standard_deviation)
-  - [ ] `units` attribute is specified; it is unitless, so it should be `1`
-- [ ] **A time_bounds variable is present**
-  - [ ] the variable is linked to the `time` and `nv` dimensions
-  - [ ] `long_name` attribute is specified as `time_bounds`
-  - [ ] `units` is encoded as `days since YYYY-MM-DD HH:MM:SS`
-
-#### Global Attributes
-- [ ] 'title'
-- [ ] 'institution'
-- [ ] 'source'
-- [ ] 'history'
-- [ ] 'references'
-  - [ ] @reference-type{ref-name, author={Last, First and}, title={}, journal={}, year={}, volume={}, pages={num--num}, doi={no-https}}
-- [ ] 'comment'
-- [ ] 'Conventions'
\ No newline at end of file
+- [ ] Variable names are generalizable and easy to read
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
index a0ff66d..21c24b7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -11,4 +11,6 @@
 /GIMMS_LAI4g/GIMMS_LAI4g*
 ILAMB_sample/
 HWSD2_RASTER/
-wise_30sec_v1/
\ No newline at end of file
+wise_30sec_v1/
+*_temp*
+*__pycache__*
\ No newline at end of file
diff --git a/GFED5/convert.py b/GFED5/convert.py
new file mode 100644
index 0000000..d6ba355
--- /dev/null
+++ b/GFED5/convert.py
@@ -0,0 +1,218 @@
+import glob
+import os
+import sys
+import zipfile
+from pathlib import Path
+
+import numpy as np
+import requests
+import xarray as xr
+
+# Determine the parent directory (ILAMB-DATA)
+project_dir = os.path.abspath(os.path.join(os.getcwd(), ".."))
+if project_dir not in sys.path:
+    sys.path.insert(0, project_dir)
+
+# Now you can import the helper_funcs module from the scripts package.
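+# NOTE: the sys.path setup above assumes this script is launched from inside GFED5/,
+# so that ".." resolves to the ILAMB-DATA repository root that contains scripts/.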
+from scripts import biblatex_builder as bb +from scripts import helper_funcs as hf + +###################################################################### +# Set Parameters +###################################################################### + +VAR = "burntFractionAll" + +###################################################################### +# Download Data +###################################################################### + +# Specify the dataset title you are looking for +dataset_title = "Global Fire Emissions Database (GFED5) Burned Area" + +# Build the query string to search by title +params = {"q": f'title:"{dataset_title}"'} + +# Define the Zenodo API endpoint +base_url = "https://zenodo.org/api/records" + +# Send the GET request +response = requests.get(base_url, params=params) +if response.status_code != 200: + print("Error during search:", response.status_code) + exit(1) + +# Parse the JSON response +data = response.json() + +# Select the first record; there should only be one +records = data["hits"]["hits"] +i = 0 +for record in records: + i += 1 + title = record["metadata"].get("title") + print(f"\n{i}. Dataset title: {title}") +record = data["hits"]["hits"][0] + +# Download and get timestamps +hf.download_from_zenodo(record) +path_to_zip_file = Path("_temp/BA.zip") +download_stamp = hf.gen_utc_timestamp(path_to_zip_file.stat().st_mtime) +generate_stamp = hf.gen_utc_timestamp() + +# Unzip downloaded data file (BA.zip) +full_path = os.path.abspath(path_to_zip_file) +full_path_without_zip, _ = os.path.splitext(full_path) +with zipfile.ZipFile(path_to_zip_file, "r") as zip_ref: + zip_ref.extractall(full_path_without_zip) + +###################################################################### +# Open netcdfs +###################################################################### + +# Get a list of all netCDF files in the unzipped folder +data_dir = "_temp/BA" +all_files = glob.glob(os.path.join(data_dir, "*.nc")) + +# Get separate lists for coarse data and fine data +coarse_files = [] # For 1997-2000 (1 degree) +fine_files = [] # For 2001-2020 (0.25 degree) +for f in all_files: + basename = os.path.basename(f) # e.g., "BA200012.nc" + # Extract the year from the filename; here characters at positions 2:6. 
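+    # e.g., int("BA200012.nc"[2:6]) == 2000, so that file falls in the coarse (1 degree) list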
+ year = int(basename[2:6]) + if year < 2001: + coarse_files.append(f) + else: + fine_files.append(f) + +# Load the coarse and fine datasets separately +ds_coarse = xr.open_mfdataset(coarse_files, combine="by_coords") +ds_fine = xr.open_mfdataset(fine_files, combine="by_coords") + +# Load burnable area (and mask) datasets +da_coarse_mask = xr.open_dataset("_temp/BurnableArea_preMOD.nc")["BurableArea"] +da_fine_mask = xr.open_dataset("_temp/BurnableArea.nc")["BurableArea"] + +###################################################################### +# Process netcdfs +###################################################################### + +# Calculate burned fraction of burnable area as a percent +percent_burned_coarse = (ds_coarse["Total"] / da_coarse_mask) * 100 +ds_coarse = ds_coarse.assign({VAR: percent_burned_coarse}) +percent_burned_fine = (ds_fine["Total"] / da_fine_mask) * 100 +ds_fine = ds_fine.assign({VAR: percent_burned_fine}) + +# Mask the datasets +percent_burned_coarse_masked = ds_coarse.where(da_coarse_mask > 0) +percent_burned_fine_masked = ds_fine.where(da_fine_mask > 0) + +# Interpolate coarse 1 degree data to 0.25 degrees +res = 0.25 +newlon = np.arange(-179.875, 180, res) +newlat = np.arange(-89.875, 90, res) + +# Interpolate 1 degree data to 0.25 degrees +percent_burned_coarse_masked_interp = percent_burned_coarse_masked.interp( + lat=newlat, lon=newlon +) + +# Combine coarse-interpolated and fine data into one dataset at 0.25 degree resolution +ds = xr.concat( + [percent_burned_fine_masked, percent_burned_coarse_masked_interp], dim="time" +) +ds = ds.sortby("time") +ds = ds[VAR].to_dataset() + +###################################################################### +# Set CF compliant netcdf attributes +###################################################################### + +# Set dimension attributes and encoding +ds = hf.set_time_attrs(ds) +ds = hf.set_lat_attrs(ds) +ds = hf.set_lon_attrs(ds) + +# Get variable attribute info via ESGF CMIP variable information +info = hf.get_cmip6_variable_info(VAR) + +# Set variable attributes +ds = hf.set_var_attrs( + ds, + var=VAR, + units=info["variable_units"], + standard_name=info["cf_standard_name"], + long_name=info["variable_long_name"], +) + +# Add time bounds +ds = hf.add_time_bounds_monthly(ds) +time_range = f"{ds['time'].min().dt.year:d}{ds['time'].min().dt.month:02d}" +time_range += f"-{ds['time'].max().dt.year:d}{ds['time'].max().dt.month:02d}" + +# Define global attribute citation information +data_citation = bb.generate_biblatex_dataset( + cite_key="Chen2023", + author=[ + "Chen, Yang", + "Hall, Joanne", + "van Wees, Dave", + "Andela, Niels", + "Hantson, Stijn", + "Giglio, Louis", + "van der Werf, Guido R.", + "Morton, Douglas C.", + "Randerson, James T.", + ], + title="Global Fire Emissions Database (GFED5) Burned Area (0.1)", + year=2023, + url="https://zenodo.org/records/7668424", + doi="10.5281/zenodo.7668424", +) +article_citation = bb.generate_biblatex_article( + cite_key="Chen2023", + author=[ + "Chen, Yang", + "Hall, Joanne", + "van Wees, Dave", + "Andela, Niels", + "Hantson, Stijn", + "Giglio, Louis", + "van der Werf, Guido R.", + "Morton, Douglas C.", + "Randerson, James T.", + ], + title="Multi-decadal trends and variability in burned area from the fifth version of the Global Fire Emissions Database (GFED5)", + journal="Earth Syst. Sci. 
Data", + year=2023, + volume=15, + number=11, + pages=[5227, 5259], + doi="https://doi.org/10.5194/essd-15-5227-2023", +) + +citations = data_citation + "\n\n" + article_citation + +# Set global netcdf attributes +ds = hf.set_cf_global_attributes( + ds, + title="Global Fire Emissions Database (GFED5) Burned Area", + institution="Global Fire Emissions Database", + source="Combine MODIS, Landsat, and Sentinel-2 to create a 24-year record of global burned area", + history=f"""downloaded: {download_stamp}\nformatted: {generate_stamp}""", + references=citations, + comment="", + conventions="CF 1.12", +) + +###################################################################### +# Export +###################################################################### + +ds.to_netcdf( + "{variable}_{frequency}_{source_id}_{time_mark}.nc".format( + variable=VAR, frequency="mon", source_id="GFED5", time_mark=time_range + ), + encoding={VAR: {"zlib": True}}, +) diff --git a/scripts/__init__.py b/scripts/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/scripts/biblatex_builder.py b/scripts/biblatex_builder.py new file mode 100644 index 0000000..e393220 --- /dev/null +++ b/scripts/biblatex_builder.py @@ -0,0 +1,265 @@ +""" +biblatex_builder.py + +A clean utility for generating properly formatted BibLaTeX entries from structured input. +This module supports tech reports, journal articles, datasets, and books. It enforces formatting rules +for citation keys, author names, and DOIs. +""" + +import re +import textwrap +from typing import Any + +__all__ = [ + "generate_biblatex_article", + "generate_biblatex_techreport", + "generate_biblatex_dataset", + "generate_biblatex_book", +] + + +def _format_biblatex_entry( + entry_type: str, cite_key: str, fields: dict[str, Any] +) -> str: + """ + Format a BibLaTeX entry using consistent indentation and field alignment. + + Args: + entry_type (str): Type of BibLaTeX entry (e.g., "article", "techreport"). + cite_key (str): Unique citation key without spaces. + fields (dict[str, str]): Dictionary of BibLaTeX fields and their values. + + Returns: + str: A formatted BibLaTeX entry as a triple-quoted Python string. + + Raises: + ValueError: If the cite_key contains spaces or invalid characters. + """ + if " " in cite_key: + raise ValueError("Citation key must not contain spaces.") + if not re.fullmatch(r"[\w\-:.]+", cite_key): + raise ValueError("Citation key contains invalid characters.") + + # left justify, align the '=' vertically based on max key length, and remove trailing comma + max_key_len = max(map(len, fields)) + entry_lines = [f"@{entry_type}{{{cite_key},"] + entry_lines += [ + f"{key.ljust(max_key_len)} = {{{value}}}," for key, value in fields.items() + ] + entry_lines[-1] = entry_lines[-1].rstrip(",") # remove trailing comma + entry_lines.append("}") + + return f'"""\n{textwrap.indent("\n".join(entry_lines), " ")}\n"""' + + +def _normalize_doi(doi: str) -> str: + """ + Normalize a DOI string to full URL form. + + Args: + doi (str): The DOI, possibly with or without https://doi.org/ prefix. + + Returns: + str: DOI in full URL form starting with 'https://doi.org/'. + """ + doi = doi.strip() + if doi.startswith("http"): + doi = doi.split("doi.org/")[-1] + elif doi.startswith("doi.org/"): + doi = doi[8:] + return f"https://doi.org/{doi}" + + +def _validate_and_format_authors(authors: list[str]) -> str: + """ + Validate and format a list of authors for BibLaTeX. + + Each author must be in 'Last, First' format. 
The function ensures a space after + the comma and removes spaces between initials (e.g., 'G. M.' โ†’ 'G.M.'). + + Args: + authors (list[str]): List of author names in 'Last, First' format. + + Returns: + str: BibLaTeX-compatible string with authors joined by ' and '. + + Raises: + ValueError: If an author is not in the expected format. + """ + formatted = [] + for name in authors: + if "," not in name: + raise ValueError( + f"Author '{name}' must be in 'Last, First' format (with a comma)." + ) + + last, first = (part.strip() for part in name.split(",", 1)) + first_clean = re.sub(r"(?<=\.)\s+(?=[A-Z]\.)", "", first) + formatted.append(f"{last}, {first_clean}") + + return " and ".join(formatted) + + +def generate_biblatex_techreport( + cite_key: str, + author: str, + title: str, + institution: str, + year: str, + number: str, +) -> str: + """ + Generate a BibLaTeX @techreport entry. + + Args: + cite_key (str): Citation key (no spaces). + author (str): Author string. + title (str): Title of the report. + institution (str): Publishing institution. + year (str): Publication year. + number (str): Report number or version. + + Returns: + str: Formatted BibLaTeX @techreport entry as a multiline string. + """ + fields = { + "author": author, + "title": title, + "institution": institution, + "year": year, + "number": number, + } + return _format_biblatex_entry("techreport", cite_key, fields) + + +def generate_biblatex_article( + cite_key: str, + author: list[str], + title: str, + journal: str, + year: str, + volume: str, + number: str, + pages: list[int] | str, + doi: str | None = None, +) -> str: + """ + Generate a BibLaTeX @article entry. + + Args: + cite_key (str): Citation key (no spaces). + author (list[str]): List of authors in 'Last, First' format. + title (str): Title of the article. + journal (str): Journal name. + year (str): Publication year. + volume (str): Volume number. + number (str): Issue number. + pages (list[int]): List of [start, end] page numbers. + doi (str, optional): DOI identifier (with or without prefix). + + Returns: + str: Formatted BibLaTeX @article entry as a multiline string. + """ + author_str = _validate_and_format_authors(author) + + fields = { + "author": author_str, + "title": title, + "journal": journal, + "year": year, + "volume": volume, + "number": number, + } + + # Format the page numbers + if isinstance(pages, list): + if len(pages) != 2: + raise ValueError( + "If 'pages' is a list, it must contain exactly two integers: [start, end]." + ) + fields["pages"] = f"{pages[0]}โ€“{pages[1]}" # en dash between pages + elif isinstance(pages, str): + fields["pages"] = pages + elif pages != " ": + raise TypeError( + "'pages' must be a list of two integers, a string, or an empty string." + ) + + if doi: + fields["doi"] = _normalize_doi(doi) + + return _format_biblatex_entry("article", cite_key, fields) + + +def generate_biblatex_dataset( + cite_key: str, + author: list[str], + title: str, + year: str, + url: str, + note: str | None = None, + doi: str | None = None, +) -> str: + """ + Generate a BibLaTeX @misc entry for a dataset. + + Args: + cite_key (str): Citation key (no spaces). + author (list[str]): List of authors in 'Last, First' format. + title (str): Title of the dataset. + year (str): Publication or release year. + url (str): Direct URL to the dataset. + note (str, optional): Any additional notes. + doi (str, optional): DOI identifier (with or without prefix). + + Returns: + str: Formatted BibLaTeX @misc entry as a multiline string. 
+ """ + author_str = _validate_and_format_authors(author) + fields = { + "author": author_str, + "title": title, + "year": year, + "url": url, + } + if note: + fields["note"] = note + if doi: + fields["doi"] = _normalize_doi(doi) + + return _format_biblatex_entry("misc", cite_key, fields) + + +def generate_biblatex_book( + cite_key: str, + author: list[str], + title: str, + publisher: str, + year: str, + edition: str | None = None, +) -> str: + """ + Generate a BibLaTeX @book entry. + + Args: + cite_key (str): Citation key (no spaces). + author (list[str]): List of authors in 'Last, First' format. + title (str): Title of the book. + publisher (str): Name of the publisher. + year (str): Year of publication. + edition (str, optional): Edition information, e.g., '2nd'. + + Returns: + str: Formatted BibLaTeX @book entry as a multiline string. + """ + author_str = _validate_and_format_authors(author) + fields = { + "author": author_str, + "title": title, + "publisher": publisher, + "year": year, + } + if edition: + fields["edition"] = edition + + return _format_biblatex_entry("book", cite_key, fields) diff --git a/compare_datasets.py b/scripts/compare_datasets.py similarity index 100% rename from compare_datasets.py rename to scripts/compare_datasets.py diff --git a/scripts/helper_funcs.py b/scripts/helper_funcs.py new file mode 100644 index 0000000..98b6bbf --- /dev/null +++ b/scripts/helper_funcs.py @@ -0,0 +1,317 @@ +import datetime +import os + +import cftime as cf +import numpy as np +import requests +import xarray as xr +from intake_esgf import ESGFCatalog +from tqdm import tqdm + + +def download_from_html(remote_source: str, local_source: str | None = None) -> str: + """ + Download a file from a remote URL to a local path. + If the "content-length" header is missing, it falls back to a simple download. + """ + if local_source is None: + local_source = os.path.basename(remote_source) + if os.path.isfile(local_source): + return local_source + + resp = requests.get(remote_source, stream=True) + try: + total_size = int(resp.headers.get("content-length")) + except (TypeError, ValueError): + total_size = 0 + + with open(local_source, "wb") as fdl: + if total_size: + with tqdm( + total=total_size, unit="B", unit_scale=True, desc=local_source + ) as pbar: + for chunk in resp.iter_content(chunk_size=1024): + if chunk: + fdl.write(chunk) + pbar.update(len(chunk)) + else: + for chunk in resp.iter_content(chunk_size=1024): + if chunk: + fdl.write(chunk) + return local_source + + +def download_from_zenodo(record: dict): + """ + Download all files from a Zenodo record dict into a '_temp' directory. 
+ Example for getting a Zenodo record: + + # Specify the dataset title you are looking for + dataset_title = "Global Fire Emissions Database (GFED5) Burned Area" + + # Build the query string to search by title + params = { + "q": f'title:"{dataset_title}"' + } + + # Define the Zenodo API endpoint + base_url = "https://zenodo.org/api/records" + + # Send the GET request + response = requests.get(base_url, params=params) + if response.status_code != 200: + print("Error during search:", response.status_code) + exit(1) + + # Parse the JSON response + data = response.json() + + # Get record dictionary + records = data['hits']['hits'] + record = data['hits']['hits'][0] + """ + download_dir = "_temp" + os.makedirs(download_dir, exist_ok=True) + + title = record.get("metadata", {}).get("title", "No Title") + pub_date = record.get("metadata", {}).get("publication_date", "No publication date") + print(f"Found record:\n Title: {title}\n Publication Date: {pub_date}") + + for file_info in record.get("files", []): + file_name = file_info.get("key") + file_url = file_info.get("links", {}).get("self") + local_file = os.path.join(download_dir, file_name) + + if file_url: + print(f"Downloading {file_name} from {file_url} into {download_dir}...") + download_from_html(file_url, local_source=local_file) + else: + print(f"File URL not found for file: {file_name}") + + +# I think this can be useful to make sure people export the netcdfs the same way every time +def get_filename(attrs: dict, time_range: str) -> str: + """ + Generate a NetCDF filename using required attributes and a time range. + + Args: + attrs (dict): Dictionary of global attributes. + time_range (str): Time range string to embed in the filename. + + Returns: + str: Formatted filename. + + Raises: + ValueError: If any required attributes are missing from `attrs`. + """ + required_keys = [ + "variable_id", + "frequency", + "source_id", + "variant_label", + "grid_label", + ] + + missing = [key for key in required_keys if key not in attrs] + if missing: + raise ValueError( + f"Missing required attributes: {', '.join(missing)}. " + f"Expected keys: {', '.join(required_keys)}" + ) + + filename = "{variable_id}_{frequency}_{source_id}_{variant_label}_{grid_label}_{time_mark}.nc".format( + **attrs, time_mark=time_range + ) + return filename + + +def get_cmip6_variable_info(variable_id: str) -> dict[str, str]: + """ """ + df = ESGFCatalog().variable_info(variable_id) + return df.iloc[0].to_dict() + + +def set_time_attrs(ds: xr.Dataset) -> xr.Dataset: + """ + Ensure the xarray dataset's time attributes are formatted according to CF-Conventions. + """ + assert "time" in ds + da = ds["time"] + + # Ensure time is an accepted xarray time dtype + if np.issubdtype(da.dtype, np.datetime64): + ref_date = np.datetime_as_string(da.min().values, unit="s") + elif isinstance(da.values[0], cf.datetime): + ref_date = da.values[0].strftime("%Y-%m-%d %H:%M:%S") + else: + raise TypeError( + f"Unsupported xarray time format: {type(da.values[0])}. Accepted types are np.datetime64 or cftime.datetime." + ) + + da.encoding = { + "units": f"days since {ref_date}", + "calendar": da.encoding.get("calendar"), + } + da.attrs = { + "axis": "T", + "standard_name": "time", + "long_name": "time", + } + ds["time"] = da + return ds + + +def set_lat_attrs(ds: xr.Dataset) -> xr.Dataset: + """ + Ensure the xarray dataset's latitude attributes are formatted according to CF-Conventions. 
+ """ + assert "lat" in ds + da = ds["lat"] + da.attrs = { + "axis": "Y", + "units": "degrees_north", + "standard_name": "latitude", + "long_name": "latitude", + } + ds["lat"] = da + return ds + + +def set_lon_attrs(ds: xr.Dataset) -> xr.Dataset: + """ + Ensure the xarray dataset's longitude attributes are formatted according to CF-Conventions. + """ + assert "lon" in ds + da = ds["lon"] + da.attrs = { + "axis": "X", + "units": "degrees_east", + "standard_name": "longitude", + "long_name": "longitude", + } + ds["lon"] = da + return ds + + +def set_var_attrs( + ds: xr.Dataset, var: str, units: str, standard_name: str, long_name: str +) -> xr.Dataset: + """ + Ensure the xarray dataset's variable attributes are formatted according to CF-Conventions. + """ + assert var in ds + da = ds[var] + da.attrs = {"units": units, "standard_name": standard_name, "long_name": long_name} + ds[var] = da + return ds + + +def gen_utc_timestamp(time: float | None = None) -> str: + if time is None: + time = datetime.datetime.now(datetime.UTC) + else: + time = datetime.datetime.fromtimestamp(time) + return time.strftime("%Y-%m-%dT%H:%M:%SZ") + + +def add_time_bounds_monthly(ds: xr.Dataset) -> xr.Dataset: + """ + Add monthly time bounds to an xarray Dataset. + + For each timestamp in the dataset's 'time' coordinate, this function adds a new + coordinate called 'time_bounds' with the first day of the month and the first + day of the next month. These bounds follow CF conventions. + + Args: + ds (xr.Dataset): Dataset with a 'time' coordinate of monthly timestamps. + + Returns: + xr.Dataset: Modified dataset with a 'time_bounds' coordinate and updated + attributes on the 'time' coordinate. + """ + + def _ymd_tuple(da: xr.DataArray) -> tuple[int, int, int]: + """Extract (year, month, day) from a single-element datetime DataArray.""" + if da.size != 1: + raise ValueError("Expected a single-element datetime for conversion.") + return int(da.dt.year), int(da.dt.month), int(da.dt.day) + + def _make_timestamp(t: xr.DataArray, ymd: tuple[int, int, int]) -> np.datetime64: + """Construct a timestamp matching the type of the input time value.""" + try: + return type(t.item())(*ymd) # try using the same class as the input + except Exception: + # fallback to datetime64 if direct construction fails + return np.datetime64(f"{ymd[0]:04d}-{ymd[1]:02d}-{ymd[2]:02d}") + + lower_bounds = [] + upper_bounds = [] + + for t in ds["time"]: + year, month, _ = _ymd_tuple(t) + lower_bounds.append(_make_timestamp(t, (year, month, 1))) + + # First day of the next month (verbose-ified for easier readability) + if month == 12: + next_month = (year + 1, 1, 1) + else: + next_month = (year, month + 1, 1) + upper_bounds.append(_make_timestamp(t, next_month)) + + bounds_array = np.array([lower_bounds, upper_bounds]).T + ds = ds.assign_coords(time_bounds=(("time", "bounds"), bounds_array)) + ds["time_bounds"].attrs["long_name"] = "time_bounds" + ds["time"].attrs["bounds"] = "time_bounds" + + return ds + + +def set_cf_global_attributes( + ds: xr.Dataset, + *, # keyword only for the following args + title: str, + institution: str, + source: str, + history: str, + references: str, + comment: str, + conventions: str, +) -> xr.Dataset: + """ + Set required NetCDF global attributes according to CF-Conventions 1.12. + + Args: + ds (xr.Dataset): The xarray dataset to which global attributes will be added. + title (str): Short description of the file contents. + institution (str): Where the original data was produced. 
+ source (str): Method of production of the original data. + history (str): List of applications that have modified the original data. + references (str): References describing the data or methods used to produce it. + comment (str): Miscellaneous information about the data or methods used. + conventions (str): The name of the conventions followed by the dataset. + + Returns: + xr.Dataset: The dataset with updated global attributes. + + Raises: + ValueError: If a required global attribute is missing. + """ + + # Build and validate attributes + attrs = { + "title": title, + "institution": institution, + "source": source, + "history": history, + "references": references, + "comment": comment, + "Conventions": conventions, + } + + # Ensure all values are explicitly set (None not allowed) + missing = [k for k, v in attrs.items() if v is None] + if missing: + raise ValueError(f"Missing required global attributes: {', '.join(missing)}") + + ds.attrs.update(attrs) + return ds diff --git a/scripts/validate_dataset.py b/scripts/validate_dataset.py new file mode 100644 index 0000000..dbbaca4 --- /dev/null +++ b/scripts/validate_dataset.py @@ -0,0 +1,347 @@ +""" +A script that checks an input dataset (netCDF file) for adherence to ILAMB standards. +The netCDF can contain site data or gridded data. +""" + +import sys +from typing import Literal + +import cftime +import numpy as np +import xarray as xr +from pydantic import BaseModel, ConfigDict, field_validator + + +def get_dim_name( + dset: xr.Dataset | xr.DataArray, + dim: Literal["time", "lat", "lon", "depth", "site"], +) -> str: + dim_names = { + "time": ["time"], + "lat": ["lat", "latitude", "Latitude", "y", "lat_"], + "lon": ["lon", "longitude", "Longitude", "x", "lon_"], + "depth": ["depth"], + } + # Assumption: the 'site' dimension is what is left over after all others are removed + if dim == "site": + try: + get_dim_name(dset, "lat") + get_dim_name(dset, "lon") + # raise NoSiteDimension("Dataset/dataarray is spatial") + except KeyError: + pass + possible_names = list( + set(dset.dims) - set([d for _, dims in dim_names.items() for d in dims]) + ) + if len(possible_names) == 1: + return possible_names[0] + msg = f"Ambiguity in locating a site dimension, found: {possible_names}" + # raise NoSiteDimension(msg) + possible_names = dim_names[dim] + dim_name = set(dset.dims).intersection(possible_names) + if len(dim_name) != 1: + msg = f"{dim} dimension not found: {dset.dims} " + msg += f"not in [{','.join(possible_names)}]" + raise KeyError(msg) + return str(dim_name.pop()) + + +def is_spatial(da: xr.DataArray) -> bool: + try: + get_dim_name(da, "lat") + get_dim_name(da, "lon") + return True + except KeyError: + pass + return False + + +# spatial validator +class ILAMBDataset(BaseModel): + model_config = ConfigDict(arbitrary_types_allowed=True) + ds: xr.Dataset + + @field_validator("ds") + @classmethod + def check_vars(cls, ds: xr.Dataset) -> xr.Dataset: + # Check that there are data variables + if not ds.data_vars: + raise ValueError( + "Dataset does not have any data variables. An example data variable is 'cSoil'." + ) + # Check that the dataset has at least one variable but not more than 2 + if len(ds.data_vars) >= 3: + raise ValueError( + f"Dataset has too many data variables {ds.data_vars}. The measurement and the uncertainty are the only expected data variables. There should be one netCDF file per data variable if a dataset has multiple data variables." 
+ ) + return ds + + @field_validator("ds") + @classmethod + def global_attrs(cls, ds: xr.Dataset) -> xr.Dataset: + # Check that the dataset has the required global attribute keys + missing = set( + [ + "title", + "institution", + "source", + "history", + "references", + "comment", + "Conventions", + ] + ) - set(ds.attrs.keys()) + if missing: + raise ValueError( + f"Dataset does not properly encode global attributes, {missing=}" + ) + return ds + + @field_validator("ds") + @classmethod + def time_dim(cls, ds: xr.Dataset) -> xr.Dataset: + # Check that the dataset has a properly set-up time dimension + dimensions = ds.dims + time_dim_present = "time" in dimensions + time_var = ds["time"] + time_attrs = time_var.attrs + + # Check if the time dimension is present + if not time_dim_present: + raise ValueError( + f"Dataset does not have a time dimension, {dimensions=}. Expected a dimension called 'time'." + ) + + # Check if time values are decoded as datetime objects + time_dtype = type(time_var.values[0]) + if not ( + np.issubdtype(time_var.values.dtype, np.datetime64) + or isinstance(time_var.values[0], cftime.datetime) + ): + raise TypeError( + f"Time values are not properly decoded as datetime objects: {time_dtype=}" + ) + + # Check time attributes: axis, long_name, standard_name + missing = set(["axis", "long_name", "standard_name"]) - set(time_attrs) + if missing: + raise ValueError( + f"Dataset is missing time-specific attributes, {missing=}." + ) + else: + # Check the axis and standard_name to ensure they're correct; long_name can vary. + correct_axis_name = time_attrs["axis"] == "T" + if not correct_axis_name: + raise TypeError( + f"The time dimension's axis attribute is {time_attrs['axis']}. Expected 'T'." + ) + correct_std_name = time_attrs["standard_name"] == "time" + if not correct_std_name: + raise TypeError( + f"The time dimension's standard_name attribute is {time_attrs['standard_name']}. Expected 'time'." + ) + + # Check time units encoding and formatting + time_encoding = time_var.encoding + if "units" not in time_encoding or "since" not in time_encoding["units"]: + raise ValueError( + f"Time encoding is missing or incorrect, {time_encoding=}. Expected 'days since YYYY:MM:DD HH:MM'" + ) + + # Check time calendar encoding + if "calendar" in time_encoding: + valid_calendars = [ + "standard", + "gregorian", + "proleptic_gregorian", + "noleap", + "all_leap", + "360_day", + "julian", + ] + + if time_encoding["calendar"] not in valid_calendars: + # Check for explicitly defined calendar attributes + if "month_lengths" in time_attrs: + # Validate month_lengths + month_lengths = time_attrs["month_lengths"] + if len(month_lengths) != 12 or not all( + isinstance(m, (int, np.integer)) for m in month_lengths + ): + raise ValueError( + "month_lengths must be a list of 12 integer values." + ) + + # Validate leap year settings if present + if "leap_year" in time_attrs: + leap_year = time_attrs["leap_year"] + if not isinstance(leap_year, (int, np.integer)): + raise ValueError("leap_year must be an integer.") + + if "leap_month" in time_attrs: + leap_month = time_attrs["leap_month"] + if not (1 <= leap_month <= 12): + raise ValueError("leap_month must be between 1 and 12.") + else: + raise ValueError( + f"Unrecognized calendar '{time_encoding['calendar']}' and no explicit month_lengths provided." 
+ ) + else: + raise ValueError("Calendar attribute is missing from the time encoding.") + + # Check bounds encoding + time_bounds_name = time_attrs["bounds"] + if time_bounds_name not in ds: + raise ValueError( + f"Time bounds variable '{time_bounds_name=}' is missing from dataset. Expected 'time_bounds'" + ) + + # Check time_bounds structure + time_bounds = ds[time_bounds_name] + if len(time_bounds.dims) != 2 or time_bounds.dims[0] != "time": + raise ValueError( + f"Time bounds, '{time_bounds_name=}', has incorrect dimensions, {time_bounds.dims}." + "Expected two dimensions: ('time', )." + ) + + # Check that the second dimension length is 2 (indicating time bounds) + if time_bounds.shape[1] != 2: + raise ValueError( + f"Time bounds '{time_bounds_name}' has incorrect shape {time_bounds.shape}. " + "The second dimension should have length 2 to represent time bounds." + ) + + # Check for the correct 'long_name' attribute for time_bounds + if ( + "long_name" not in time_bounds.attrs + or time_bounds.attrs["long_name"] != "time_bounds" + ): + raise ValueError( + f"Time bounds '{time_bounds_name}' is missing its 'long_name':'time_bounds' attribute." + ) + + return ds + + @field_validator("ds") + @classmethod + def lat_dim(cls, ds: xr.Dataset) -> xr.Dataset: + # Check that the dataset has a properly set-up latitude dimension + lat_names = {"lat", "latitude", "y"} + dims = ds.dims + dims_lower = { + dim.lower(): dim for dim in dims + } # Map lowercased dims to original names + lat_names_found = [ + dims_lower[name.lower()] for name in lat_names if name.lower() in dims_lower + ] + + # Ensure there is only one latitude dimension + if len(lat_names_found) != 1: + raise ValueError( + f"Dataset has {len(lat_names_found)} latitude dimensions, expected exactly one. Found: {lat_names_found}" + ) + + lat_name = lat_names_found[0] + + # Check that one of the accepted latitude long_names exists + lat_var = ds[lat_name] + lat_attrs = lat_var.attrs + + # Check for missing latitude attributes + missing = set(["axis", "long_name", "standard_name", "units"]) - set(lat_attrs) + if missing: + raise ValueError( + f"Dataset is missing latitude-specific attributes, {missing=}" + ) + else: + # Check axis + if lat_attrs["axis"] != "Y": + raise ValueError( + f"Incorrect latitude axis attribute: {lat_attrs['axis']}. Expected 'Y' (case sensitive)." + ) + # Check standard_name + if lat_attrs["standard_name"] != "latitude": + raise ValueError( + f"Incorrect latitude standard_name attribute: {lat_attrs['standard_name']}. Expected 'latitude' (case sensitive)." + ) + # Check units + valid_lat_units = { + "degrees_north", + "degree_north", + "degree_N", + "degrees_N", + "degreeN", + "degreesN", + } + lat_units = lat_attrs.get("units") + if lat_units not in valid_lat_units: + raise ValueError( + f"Invalid 'units' attribute for latitude dimension. Found: {lat_units}. Expected one of {valid_lat_units}." + ) + + return ds + + @field_validator("ds") + @classmethod + def lon_dim(cls, ds: xr.Dataset) -> xr.Dataset: + # Check that the dataset has a properly set-up longitude dimension + lon_names = {"lon", "longitude", "x"} + dims = ds.dims + dims_lower = { + dim.lower(): dim for dim in dims + } # Map lowercased dims to original names + lon_names_found = [ + dims_lower[name.lower()] for name in lon_names if name.lower() in dims_lower + ] + + # Ensure there is only one longitude dimension + if len(lon_names_found) != 1: + raise ValueError( + f"Dataset has {len(lon_names_found)} longitude dimensions, expected exactly one. 
Found: {lon_names_found}"
+            )
+
+        lon_name = lon_names_found[0]
+
+        # Check that one of the accepted longitude long_names exists
+        lon_var = ds[lon_name]
+        lon_attrs = lon_var.attrs
+
+        # Check for missing longitude attributes
+        missing = set(["axis", "long_name", "standard_name", "units"]) - set(lon_attrs)
+        if missing:
+            raise ValueError(
+                f"Dataset is missing longitude-specific attributes, {missing=}"
+            )
+        else:
+            # Check axis
+            if lon_attrs["axis"] != "X":
+                raise ValueError(
+                    f"Incorrect longitude axis attribute: {lon_attrs['axis']}. Expected 'X' (case sensitive)"
+                )
+            # Check standard_name
+            if lon_attrs["standard_name"] != "longitude":
+                raise ValueError(
+                    f"Incorrect longitude standard_name attribute: {lon_attrs['standard_name']}. Expected 'longitude' (case sensitive)"
+                )
+
+        # Check units
+        valid_lon_units = {
+            "degrees_east",
+            "degree_east",
+            "degree_E",
+            "degrees_E",
+            "degreeE",
+            "degreesE",
+        }
+        lon_units = lon_attrs.get("units")
+        if lon_units not in valid_lon_units:
+            raise ValueError(
+                f"Invalid 'units' attribute for longitude dimension. Found: {lon_units}. Expected one of {valid_lon_units}."
+            )
+
+        return ds
+
+
+if __name__ == "__main__":
+    dset = xr.open_dataset(sys.argv[1])
+    test = ILAMBDataset(ds=dset)
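+    # Hypothetical usage, mirroring the PR checklist (the file name is illustrative):
+    #   python3 scripts/validate_dataset.py GFED5/burntFractionAll_mon_GFED5_199701-202012.nc
+    # A compliant file produces no output; a non-compliant file raises a pydantic
+    # ValidationError pointing at the failing check.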