diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md
index b829d8b..51db7ed 100644
--- a/.github/pull_request_template.md
+++ b/.github/pull_request_template.md
@@ -1,15 +1,18 @@
 ## Thank you for contributing to ILAMB-Data. Use this template when submitting a pull request.
 
-### 🛠 Issue reference
-✨**Include a reference to the issue # here.**
-Summarize the changes you are making to the repo. Are you fixing a bug? Adding a dataset? Describe what you did in a few sentences or bullet points.
+### 🛠 Summary of changes
+✨**Include a reference to the issue # in this section**
+Summarize the changes you are making to the repo. Are you fixing a bug? Adding a dataset? Describe what you did in a few sentences or bullet points:
 
-### 🧪 Testing
+### 🧪 NetCDF Validation
 ✨**Add an `x` between the brackets to indicate basic testing was completed**
-- [ ] I inspected the outputted NetCDF and checked for obvious errors
-- [ ] I visualized the outputted NetCDF at a timestamp to check for obvious visual errors
-- [ ] I compared my `convert` script to recent existing ones
-- [ ] I attempted to create/encode the NetCDF according to CF Compliance guidelines
+- [ ] Created a new working directory (e.g., `ILAMB-DATA/GFED5/`)
+- [ ] Created a standalone python script that downloads and processes the new dataset (e.g., `ILAMB-DATA/GFED5/convert.py`)
+  - [ ] The python script outputs a netcdf (e.g., `ILAMB-DATA/GFED5/burntFractionAll_mon_GFED5_199701-202012.nc`)
+- [ ] Ran `python3 validate_dataset.py ILAMB-DATA/GFED5/burntFractionAll_mon_GFED5_199701-202012.nc` in command line and resolved any errors
+- [ ] Visually inspected the outputted netCDF for obvious issues
+  - [ ] `ncdump -h ILAMB-DATA/GFED5/burntFractionAll_mon_GFED5_199701-202012.nc` to visually inspect the netCDF header information
+  - [ ] `ncview ILAMB-DATA/GFED5/burntFractionAll_mon_GFED5_199701-202012.nc` to visually inspect the map (where applicable)
 
 ### 🧪 (Optional) Preview
 ✨**Attach an image of your dataset here**
@@ -21,51 +24,4 @@ Summarize the changes you are making to the repo. Are you fixing a bug? Adding a
 - [ ] There are no erroneous console logs, debuggers, or leftover testing code
 - [ ] There are no hard-coded paths to your local machine in the script
 - [ ] Useful headings describe sections of your code
-- [ ] Variable names are generalizable and easy to read
-
-### 📝 (Optional) CF Compliance In-Depth Checklist
-✨**Add an `x` between the brackets to ensure CF compliance**
-
-#### Dimensions
-- [ ] Dimensions include `time` with attributes/encoding:
-  - [ ] `axis` attribute is `T`
-  - [ ] `units` attribute/encoding is `days since YYYY-MM-DD HH:MM:SS`
-  - [ ] `long_name` attribute is `time`
-  - [ ] `calendar` encoding is `noleap`
-  - [ ] `bounds` encoding is `time_bounds`
-- [ ] Dimensions include `lon` with attributes:
-  - [ ] `axis` attribute is `X`
-  - [ ] `units` attribute is `degrees_east`
-  - [ ] `long_name` attribute is `longitude`
-- [ ] Dimensions include `lat` with attributes:
-  - [ ] `axis` attribute is `Y`
-  - [ ] `units` attribute is `degrees_north`
-  - [ ] `long_name` attribute is `latitude`
-- [ ] Dimensions include `nv`, which is an array of length `2` that contains the start date and end date bounding the dataset
-
-#### Data Variables and their Attributes
-- [ ] **The variable(s) for model comparison are present**
-  - [ ] the variables are linked to the `time`,`lat`, and `lon` dimensions
-  - [ ] `long_name` attribute is specified
-  - [ ] `units` attribute is specified
-  - [ ] (If necessary) `ancillary_variables` attribute is specified if an uncertainty value is provided
-  - [ ] (Optional) Float32 data type
-  - [ ] (Optional) No-data values masked as NaN
-- [ ] **If applicable, a data uncertainty variable is present** (e.g., standard_deviation or standard_error)
-  - [ ] the variable is linked to the `time`, `lat`, and `lon` dimensions
-  - [ ] `long_name` attribute is specified (e.g., cSoil standard_deviation)
-  - [ ] `units` attribute is specified; it is unitless, so it should be `1`
-- [ ] **A time_bounds variable is present**
-  - [ ] the variable is linked to the `time` and `nv` dimensions
-  - [ ] `long_name` attribute is specified as `time_bounds`
-  - [ ] `units` is encoded as `days since YYYY-MM-DD HH:MM:SS`
-
-#### Global Attributes
-- [ ] 'title'
-- [ ] 'institution'
-- [ ] 'source'
-- [ ] 'history'
-- [ ] 'references'
-  - [ ] @reference-type{ref-name, author={Last, First and}, title={}, journal={}, year={}, volume={}, pages={num--num}, doi={no-https}}
-- [ ] 'comment'
-- [ ] 'Conventions'
\ No newline at end of file
+- [ ] Variable names are generalizable and easy to read
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
index a0ff66d..21c24b7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -11,4 +11,6 @@
 /GIMMS_LAI4g/GIMMS_LAI4g*
 ILAMB_sample/
 HWSD2_RASTER/
-wise_30sec_v1/
\ No newline at end of file
+wise_30sec_v1/
+*_temp*
+*__pycache__*
\ No newline at end of file
diff --git a/GFED5/convert.py b/GFED5/convert.py
new file mode 100644
index 0000000..d6ba355
--- /dev/null
+++ b/GFED5/convert.py
@@ -0,0 +1,218 @@
+import glob
+import os
+import sys
+import zipfile
+from pathlib import Path
+
+import numpy as np
+import requests
+import xarray as xr
+
+# Determine the parent directory (ILAMB-DATA)
+project_dir = os.path.abspath(os.path.join(os.getcwd(), ".."))
+if project_dir not in sys.path:
+    sys.path.insert(0, project_dir)
+
+# Now you can import the helper_funcs module from the scripts package.
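+# NOTE: the sys.path setup above assumes this script is launched from inside GFED5/,
+# so that ".." resolves to the ILAMB-DATA repository root that contains scripts/.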
+from scripts import biblatex_builder as bb +from scripts import helper_funcs as hf + +###################################################################### +# Set Parameters +###################################################################### + +VAR = "burntFractionAll" + +###################################################################### +# Download Data +###################################################################### + +# Specify the dataset title you are looking for +dataset_title = "Global Fire Emissions Database (GFED5) Burned Area" + +# Build the query string to search by title +params = {"q": f'title:"{dataset_title}"'} + +# Define the Zenodo API endpoint +base_url = "https://zenodo.org/api/records" + +# Send the GET request +response = requests.get(base_url, params=params) +if response.status_code != 200: + print("Error during search:", response.status_code) + exit(1) + +# Parse the JSON response +data = response.json() + +# Select the first record; there should only be one +records = data["hits"]["hits"] +i = 0 +for record in records: + i += 1 + title = record["metadata"].get("title") + print(f"\n{i}. Dataset title: {title}") +record = data["hits"]["hits"][0] + +# Download and get timestamps +hf.download_from_zenodo(record) +path_to_zip_file = Path("_temp/BA.zip") +download_stamp = hf.gen_utc_timestamp(path_to_zip_file.stat().st_mtime) +generate_stamp = hf.gen_utc_timestamp() + +# Unzip downloaded data file (BA.zip) +full_path = os.path.abspath(path_to_zip_file) +full_path_without_zip, _ = os.path.splitext(full_path) +with zipfile.ZipFile(path_to_zip_file, "r") as zip_ref: + zip_ref.extractall(full_path_without_zip) + +###################################################################### +# Open netcdfs +###################################################################### + +# Get a list of all netCDF files in the unzipped folder +data_dir = "_temp/BA" +all_files = glob.glob(os.path.join(data_dir, "*.nc")) + +# Get separate lists for coarse data and fine data +coarse_files = [] # For 1997-2000 (1 degree) +fine_files = [] # For 2001-2020 (0.25 degree) +for f in all_files: + basename = os.path.basename(f) # e.g., "BA200012.nc" + # Extract the year from the filename; here characters at positions 2:6. 
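+    # e.g., int("BA200012.nc"[2:6]) == 2000, so that file falls in the coarse (1 degree) list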
+ year = int(basename[2:6]) + if year < 2001: + coarse_files.append(f) + else: + fine_files.append(f) + +# Load the coarse and fine datasets separately +ds_coarse = xr.open_mfdataset(coarse_files, combine="by_coords") +ds_fine = xr.open_mfdataset(fine_files, combine="by_coords") + +# Load burnable area (and mask) datasets +da_coarse_mask = xr.open_dataset("_temp/BurnableArea_preMOD.nc")["BurableArea"] +da_fine_mask = xr.open_dataset("_temp/BurnableArea.nc")["BurableArea"] + +###################################################################### +# Process netcdfs +###################################################################### + +# Calculate burned fraction of burnable area as a percent +percent_burned_coarse = (ds_coarse["Total"] / da_coarse_mask) * 100 +ds_coarse = ds_coarse.assign({VAR: percent_burned_coarse}) +percent_burned_fine = (ds_fine["Total"] / da_fine_mask) * 100 +ds_fine = ds_fine.assign({VAR: percent_burned_fine}) + +# Mask the datasets +percent_burned_coarse_masked = ds_coarse.where(da_coarse_mask > 0) +percent_burned_fine_masked = ds_fine.where(da_fine_mask > 0) + +# Interpolate coarse 1 degree data to 0.25 degrees +res = 0.25 +newlon = np.arange(-179.875, 180, res) +newlat = np.arange(-89.875, 90, res) + +# Interpolate 1 degree data to 0.25 degrees +percent_burned_coarse_masked_interp = percent_burned_coarse_masked.interp( + lat=newlat, lon=newlon +) + +# Combine coarse-interpolated and fine data into one dataset at 0.25 degree resolution +ds = xr.concat( + [percent_burned_fine_masked, percent_burned_coarse_masked_interp], dim="time" +) +ds = ds.sortby("time") +ds = ds[VAR].to_dataset() + +###################################################################### +# Set CF compliant netcdf attributes +###################################################################### + +# Set dimension attributes and encoding +ds = hf.set_time_attrs(ds) +ds = hf.set_lat_attrs(ds) +ds = hf.set_lon_attrs(ds) + +# Get variable attribute info via ESGF CMIP variable information +info = hf.get_cmip6_variable_info(VAR) + +# Set variable attributes +ds = hf.set_var_attrs( + ds, + var=VAR, + units=info["variable_units"], + standard_name=info["cf_standard_name"], + long_name=info["variable_long_name"], +) + +# Add time bounds +ds = hf.add_time_bounds_monthly(ds) +time_range = f"{ds['time'].min().dt.year:d}{ds['time'].min().dt.month:02d}" +time_range += f"-{ds['time'].max().dt.year:d}{ds['time'].max().dt.month:02d}" + +# Define global attribute citation information +data_citation = bb.generate_biblatex_dataset( + cite_key="Chen2023", + author=[ + "Chen, Yang", + "Hall, Joanne", + "van Wees, Dave", + "Andela, Niels", + "Hantson, Stijn", + "Giglio, Louis", + "van der Werf, Guido R.", + "Morton, Douglas C.", + "Randerson, James T.", + ], + title="Global Fire Emissions Database (GFED5) Burned Area (0.1)", + year=2023, + url="https://zenodo.org/records/7668424", + doi="10.5281/zenodo.7668424", +) +article_citation = bb.generate_biblatex_article( + cite_key="Chen2023", + author=[ + "Chen, Yang", + "Hall, Joanne", + "van Wees, Dave", + "Andela, Niels", + "Hantson, Stijn", + "Giglio, Louis", + "van der Werf, Guido R.", + "Morton, Douglas C.", + "Randerson, James T.", + ], + title="Multi-decadal trends and variability in burned area from the fifth version of the Global Fire Emissions Database (GFED5)", + journal="Earth Syst. Sci. 
Data", + year=2023, + volume=15, + number=11, + pages=[5227, 5259], + doi="https://doi.org/10.5194/essd-15-5227-2023", +) + +citations = data_citation + "\n\n" + article_citation + +# Set global netcdf attributes +ds = hf.set_cf_global_attributes( + ds, + title="Global Fire Emissions Database (GFED5) Burned Area", + institution="Global Fire Emissions Database", + source="Combine MODIS, Landsat, and Sentinel-2 to create a 24-year record of global burned area", + history=f"""downloaded: {download_stamp}\nformatted: {generate_stamp}""", + references=citations, + comment="", + conventions="CF 1.12", +) + +###################################################################### +# Export +###################################################################### + +ds.to_netcdf( + "{variable}_{frequency}_{source_id}_{time_mark}.nc".format( + variable=VAR, frequency="mon", source_id="GFED5", time_mark=time_range + ), + encoding={VAR: {"zlib": True}}, +) diff --git a/scripts/__init__.py b/scripts/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/scripts/biblatex_builder.py b/scripts/biblatex_builder.py new file mode 100644 index 0000000..e393220 --- /dev/null +++ b/scripts/biblatex_builder.py @@ -0,0 +1,265 @@ +""" +biblatex_builder.py + +A clean utility for generating properly formatted BibLaTeX entries from structured input. +This module supports tech reports, journal articles, datasets, and books. It enforces formatting rules +for citation keys, author names, and DOIs. +""" + +import re +import textwrap +from typing import Any + +__all__ = [ + "generate_biblatex_article", + "generate_biblatex_techreport", + "generate_biblatex_dataset", + "generate_biblatex_book", +] + + +def _format_biblatex_entry( + entry_type: str, cite_key: str, fields: dict[str, Any] +) -> str: + """ + Format a BibLaTeX entry using consistent indentation and field alignment. + + Args: + entry_type (str): Type of BibLaTeX entry (e.g., "article", "techreport"). + cite_key (str): Unique citation key without spaces. + fields (dict[str, str]): Dictionary of BibLaTeX fields and their values. + + Returns: + str: A formatted BibLaTeX entry as a triple-quoted Python string. + + Raises: + ValueError: If the cite_key contains spaces or invalid characters. + """ + if " " in cite_key: + raise ValueError("Citation key must not contain spaces.") + if not re.fullmatch(r"[\w\-:.]+", cite_key): + raise ValueError("Citation key contains invalid characters.") + + # left justify, align the '=' vertically based on max key length, and remove trailing comma + max_key_len = max(map(len, fields)) + entry_lines = [f"@{entry_type}{{{cite_key},"] + entry_lines += [ + f"{key.ljust(max_key_len)} = {{{value}}}," for key, value in fields.items() + ] + entry_lines[-1] = entry_lines[-1].rstrip(",") # remove trailing comma + entry_lines.append("}") + + return f'"""\n{textwrap.indent("\n".join(entry_lines), " ")}\n"""' + + +def _normalize_doi(doi: str) -> str: + """ + Normalize a DOI string to full URL form. + + Args: + doi (str): The DOI, possibly with or without https://doi.org/ prefix. + + Returns: + str: DOI in full URL form starting with 'https://doi.org/'. + """ + doi = doi.strip() + if doi.startswith("http"): + doi = doi.split("doi.org/")[-1] + elif doi.startswith("doi.org/"): + doi = doi[8:] + return f"https://doi.org/{doi}" + + +def _validate_and_format_authors(authors: list[str]) -> str: + """ + Validate and format a list of authors for BibLaTeX. + + Each author must be in 'Last, First' format. 
The function ensures a space after + the comma and removes spaces between initials (e.g., 'G. M.' โ†’ 'G.M.'). + + Args: + authors (list[str]): List of author names in 'Last, First' format. + + Returns: + str: BibLaTeX-compatible string with authors joined by ' and '. + + Raises: + ValueError: If an author is not in the expected format. + """ + formatted = [] + for name in authors: + if "," not in name: + raise ValueError( + f"Author '{name}' must be in 'Last, First' format (with a comma)." + ) + + last, first = (part.strip() for part in name.split(",", 1)) + first_clean = re.sub(r"(?<=\.)\s+(?=[A-Z]\.)", "", first) + formatted.append(f"{last}, {first_clean}") + + return " and ".join(formatted) + + +def generate_biblatex_techreport( + cite_key: str, + author: str, + title: str, + institution: str, + year: str, + number: str, +) -> str: + """ + Generate a BibLaTeX @techreport entry. + + Args: + cite_key (str): Citation key (no spaces). + author (str): Author string. + title (str): Title of the report. + institution (str): Publishing institution. + year (str): Publication year. + number (str): Report number or version. + + Returns: + str: Formatted BibLaTeX @techreport entry as a multiline string. + """ + fields = { + "author": author, + "title": title, + "institution": institution, + "year": year, + "number": number, + } + return _format_biblatex_entry("techreport", cite_key, fields) + + +def generate_biblatex_article( + cite_key: str, + author: list[str], + title: str, + journal: str, + year: str, + volume: str, + number: str, + pages: list[int] | str, + doi: str | None = None, +) -> str: + """ + Generate a BibLaTeX @article entry. + + Args: + cite_key (str): Citation key (no spaces). + author (list[str]): List of authors in 'Last, First' format. + title (str): Title of the article. + journal (str): Journal name. + year (str): Publication year. + volume (str): Volume number. + number (str): Issue number. + pages (list[int]): List of [start, end] page numbers. + doi (str, optional): DOI identifier (with or without prefix). + + Returns: + str: Formatted BibLaTeX @article entry as a multiline string. + """ + author_str = _validate_and_format_authors(author) + + fields = { + "author": author_str, + "title": title, + "journal": journal, + "year": year, + "volume": volume, + "number": number, + } + + # Format the page numbers + if isinstance(pages, list): + if len(pages) != 2: + raise ValueError( + "If 'pages' is a list, it must contain exactly two integers: [start, end]." + ) + fields["pages"] = f"{pages[0]}โ€“{pages[1]}" # en dash between pages + elif isinstance(pages, str): + fields["pages"] = pages + elif pages != " ": + raise TypeError( + "'pages' must be a list of two integers, a string, or an empty string." + ) + + if doi: + fields["doi"] = _normalize_doi(doi) + + return _format_biblatex_entry("article", cite_key, fields) + + +def generate_biblatex_dataset( + cite_key: str, + author: list[str], + title: str, + year: str, + url: str, + note: str | None = None, + doi: str | None = None, +) -> str: + """ + Generate a BibLaTeX @misc entry for a dataset. + + Args: + cite_key (str): Citation key (no spaces). + author (list[str]): List of authors in 'Last, First' format. + title (str): Title of the dataset. + year (str): Publication or release year. + url (str): Direct URL to the dataset. + note (str, optional): Any additional notes. + doi (str, optional): DOI identifier (with or without prefix). + + Returns: + str: Formatted BibLaTeX @misc entry as a multiline string. 
+ """ + author_str = _validate_and_format_authors(author) + fields = { + "author": author_str, + "title": title, + "year": year, + "url": url, + } + if note: + fields["note"] = note + if doi: + fields["doi"] = _normalize_doi(doi) + + return _format_biblatex_entry("misc", cite_key, fields) + + +def generate_biblatex_book( + cite_key: str, + author: list[str], + title: str, + publisher: str, + year: str, + edition: str | None = None, +) -> str: + """ + Generate a BibLaTeX @book entry. + + Args: + cite_key (str): Citation key (no spaces). + author (list[str]): List of authors in 'Last, First' format. + title (str): Title of the book. + publisher (str): Name of the publisher. + year (str): Year of publication. + edition (str, optional): Edition information, e.g., '2nd'. + + Returns: + str: Formatted BibLaTeX @book entry as a multiline string. + """ + author_str = _validate_and_format_authors(author) + fields = { + "author": author_str, + "title": title, + "publisher": publisher, + "year": year, + } + if edition: + fields["edition"] = edition + + return _format_biblatex_entry("book", cite_key, fields) diff --git a/compare_datasets.py b/scripts/compare_datasets.py similarity index 100% rename from compare_datasets.py rename to scripts/compare_datasets.py diff --git a/scripts/helper_funcs.py b/scripts/helper_funcs.py new file mode 100644 index 0000000..98b6bbf --- /dev/null +++ b/scripts/helper_funcs.py @@ -0,0 +1,317 @@ +import datetime +import os + +import cftime as cf +import numpy as np +import requests +import xarray as xr +from intake_esgf import ESGFCatalog +from tqdm import tqdm + + +def download_from_html(remote_source: str, local_source: str | None = None) -> str: + """ + Download a file from a remote URL to a local path. + If the "content-length" header is missing, it falls back to a simple download. + """ + if local_source is None: + local_source = os.path.basename(remote_source) + if os.path.isfile(local_source): + return local_source + + resp = requests.get(remote_source, stream=True) + try: + total_size = int(resp.headers.get("content-length")) + except (TypeError, ValueError): + total_size = 0 + + with open(local_source, "wb") as fdl: + if total_size: + with tqdm( + total=total_size, unit="B", unit_scale=True, desc=local_source + ) as pbar: + for chunk in resp.iter_content(chunk_size=1024): + if chunk: + fdl.write(chunk) + pbar.update(len(chunk)) + else: + for chunk in resp.iter_content(chunk_size=1024): + if chunk: + fdl.write(chunk) + return local_source + + +def download_from_zenodo(record: dict): + """ + Download all files from a Zenodo record dict into a '_temp' directory. 
+ Example for getting a Zenodo record: + + # Specify the dataset title you are looking for + dataset_title = "Global Fire Emissions Database (GFED5) Burned Area" + + # Build the query string to search by title + params = { + "q": f'title:"{dataset_title}"' + } + + # Define the Zenodo API endpoint + base_url = "https://zenodo.org/api/records" + + # Send the GET request + response = requests.get(base_url, params=params) + if response.status_code != 200: + print("Error during search:", response.status_code) + exit(1) + + # Parse the JSON response + data = response.json() + + # Get record dictionary + records = data['hits']['hits'] + record = data['hits']['hits'][0] + """ + download_dir = "_temp" + os.makedirs(download_dir, exist_ok=True) + + title = record.get("metadata", {}).get("title", "No Title") + pub_date = record.get("metadata", {}).get("publication_date", "No publication date") + print(f"Found record:\n Title: {title}\n Publication Date: {pub_date}") + + for file_info in record.get("files", []): + file_name = file_info.get("key") + file_url = file_info.get("links", {}).get("self") + local_file = os.path.join(download_dir, file_name) + + if file_url: + print(f"Downloading {file_name} from {file_url} into {download_dir}...") + download_from_html(file_url, local_source=local_file) + else: + print(f"File URL not found for file: {file_name}") + + +# I think this can be useful to make sure people export the netcdfs the same way every time +def get_filename(attrs: dict, time_range: str) -> str: + """ + Generate a NetCDF filename using required attributes and a time range. + + Args: + attrs (dict): Dictionary of global attributes. + time_range (str): Time range string to embed in the filename. + + Returns: + str: Formatted filename. + + Raises: + ValueError: If any required attributes are missing from `attrs`. + """ + required_keys = [ + "variable_id", + "frequency", + "source_id", + "variant_label", + "grid_label", + ] + + missing = [key for key in required_keys if key not in attrs] + if missing: + raise ValueError( + f"Missing required attributes: {', '.join(missing)}. " + f"Expected keys: {', '.join(required_keys)}" + ) + + filename = "{variable_id}_{frequency}_{source_id}_{variant_label}_{grid_label}_{time_mark}.nc".format( + **attrs, time_mark=time_range + ) + return filename + + +def get_cmip6_variable_info(variable_id: str) -> dict[str, str]: + """ """ + df = ESGFCatalog().variable_info(variable_id) + return df.iloc[0].to_dict() + + +def set_time_attrs(ds: xr.Dataset) -> xr.Dataset: + """ + Ensure the xarray dataset's time attributes are formatted according to CF-Conventions. + """ + assert "time" in ds + da = ds["time"] + + # Ensure time is an accepted xarray time dtype + if np.issubdtype(da.dtype, np.datetime64): + ref_date = np.datetime_as_string(da.min().values, unit="s") + elif isinstance(da.values[0], cf.datetime): + ref_date = da.values[0].strftime("%Y-%m-%d %H:%M:%S") + else: + raise TypeError( + f"Unsupported xarray time format: {type(da.values[0])}. Accepted types are np.datetime64 or cftime.datetime." + ) + + da.encoding = { + "units": f"days since {ref_date}", + "calendar": da.encoding.get("calendar"), + } + da.attrs = { + "axis": "T", + "standard_name": "time", + "long_name": "time", + } + ds["time"] = da + return ds + + +def set_lat_attrs(ds: xr.Dataset) -> xr.Dataset: + """ + Ensure the xarray dataset's latitude attributes are formatted according to CF-Conventions. 
+ """ + assert "lat" in ds + da = ds["lat"] + da.attrs = { + "axis": "Y", + "units": "degrees_north", + "standard_name": "latitude", + "long_name": "latitude", + } + ds["lat"] = da + return ds + + +def set_lon_attrs(ds: xr.Dataset) -> xr.Dataset: + """ + Ensure the xarray dataset's longitude attributes are formatted according to CF-Conventions. + """ + assert "lon" in ds + da = ds["lon"] + da.attrs = { + "axis": "X", + "units": "degrees_east", + "standard_name": "longitude", + "long_name": "longitude", + } + ds["lon"] = da + return ds + + +def set_var_attrs( + ds: xr.Dataset, var: str, units: str, standard_name: str, long_name: str +) -> xr.Dataset: + """ + Ensure the xarray dataset's variable attributes are formatted according to CF-Conventions. + """ + assert var in ds + da = ds[var] + da.attrs = {"units": units, "standard_name": standard_name, "long_name": long_name} + ds[var] = da + return ds + + +def gen_utc_timestamp(time: float | None = None) -> str: + if time is None: + time = datetime.datetime.now(datetime.UTC) + else: + time = datetime.datetime.fromtimestamp(time) + return time.strftime("%Y-%m-%dT%H:%M:%SZ") + + +def add_time_bounds_monthly(ds: xr.Dataset) -> xr.Dataset: + """ + Add monthly time bounds to an xarray Dataset. + + For each timestamp in the dataset's 'time' coordinate, this function adds a new + coordinate called 'time_bounds' with the first day of the month and the first + day of the next month. These bounds follow CF conventions. + + Args: + ds (xr.Dataset): Dataset with a 'time' coordinate of monthly timestamps. + + Returns: + xr.Dataset: Modified dataset with a 'time_bounds' coordinate and updated + attributes on the 'time' coordinate. + """ + + def _ymd_tuple(da: xr.DataArray) -> tuple[int, int, int]: + """Extract (year, month, day) from a single-element datetime DataArray.""" + if da.size != 1: + raise ValueError("Expected a single-element datetime for conversion.") + return int(da.dt.year), int(da.dt.month), int(da.dt.day) + + def _make_timestamp(t: xr.DataArray, ymd: tuple[int, int, int]) -> np.datetime64: + """Construct a timestamp matching the type of the input time value.""" + try: + return type(t.item())(*ymd) # try using the same class as the input + except Exception: + # fallback to datetime64 if direct construction fails + return np.datetime64(f"{ymd[0]:04d}-{ymd[1]:02d}-{ymd[2]:02d}") + + lower_bounds = [] + upper_bounds = [] + + for t in ds["time"]: + year, month, _ = _ymd_tuple(t) + lower_bounds.append(_make_timestamp(t, (year, month, 1))) + + # First day of the next month (verbose-ified for easier readability) + if month == 12: + next_month = (year + 1, 1, 1) + else: + next_month = (year, month + 1, 1) + upper_bounds.append(_make_timestamp(t, next_month)) + + bounds_array = np.array([lower_bounds, upper_bounds]).T + ds = ds.assign_coords(time_bounds=(("time", "bounds"), bounds_array)) + ds["time_bounds"].attrs["long_name"] = "time_bounds" + ds["time"].attrs["bounds"] = "time_bounds" + + return ds + + +def set_cf_global_attributes( + ds: xr.Dataset, + *, # keyword only for the following args + title: str, + institution: str, + source: str, + history: str, + references: str, + comment: str, + conventions: str, +) -> xr.Dataset: + """ + Set required NetCDF global attributes according to CF-Conventions 1.12. + + Args: + ds (xr.Dataset): The xarray dataset to which global attributes will be added. + title (str): Short description of the file contents. + institution (str): Where the original data was produced. 
+ source (str): Method of production of the original data. + history (str): List of applications that have modified the original data. + references (str): References describing the data or methods used to produce it. + comment (str): Miscellaneous information about the data or methods used. + conventions (str): The name of the conventions followed by the dataset. + + Returns: + xr.Dataset: The dataset with updated global attributes. + + Raises: + ValueError: If a required global attribute is missing. + """ + + # Build and validate attributes + attrs = { + "title": title, + "institution": institution, + "source": source, + "history": history, + "references": references, + "comment": comment, + "Conventions": conventions, + } + + # Ensure all values are explicitly set (None not allowed) + missing = [k for k, v in attrs.items() if v is None] + if missing: + raise ValueError(f"Missing required global attributes: {', '.join(missing)}") + + ds.attrs.update(attrs) + return ds diff --git a/scripts/validate_dataset.py b/scripts/validate_dataset.py new file mode 100644 index 0000000..dbbaca4 --- /dev/null +++ b/scripts/validate_dataset.py @@ -0,0 +1,347 @@ +""" +A script that checks an input dataset (netCDF file) for adherence to ILAMB standards. +The netCDF can contain site data or gridded data. +""" + +import sys +from typing import Literal + +import cftime +import numpy as np +import xarray as xr +from pydantic import BaseModel, ConfigDict, field_validator + + +def get_dim_name( + dset: xr.Dataset | xr.DataArray, + dim: Literal["time", "lat", "lon", "depth", "site"], +) -> str: + dim_names = { + "time": ["time"], + "lat": ["lat", "latitude", "Latitude", "y", "lat_"], + "lon": ["lon", "longitude", "Longitude", "x", "lon_"], + "depth": ["depth"], + } + # Assumption: the 'site' dimension is what is left over after all others are removed + if dim == "site": + try: + get_dim_name(dset, "lat") + get_dim_name(dset, "lon") + # raise NoSiteDimension("Dataset/dataarray is spatial") + except KeyError: + pass + possible_names = list( + set(dset.dims) - set([d for _, dims in dim_names.items() for d in dims]) + ) + if len(possible_names) == 1: + return possible_names[0] + msg = f"Ambiguity in locating a site dimension, found: {possible_names}" + # raise NoSiteDimension(msg) + possible_names = dim_names[dim] + dim_name = set(dset.dims).intersection(possible_names) + if len(dim_name) != 1: + msg = f"{dim} dimension not found: {dset.dims} " + msg += f"not in [{','.join(possible_names)}]" + raise KeyError(msg) + return str(dim_name.pop()) + + +def is_spatial(da: xr.DataArray) -> bool: + try: + get_dim_name(da, "lat") + get_dim_name(da, "lon") + return True + except KeyError: + pass + return False + + +# spatial validator +class ILAMBDataset(BaseModel): + model_config = ConfigDict(arbitrary_types_allowed=True) + ds: xr.Dataset + + @field_validator("ds") + @classmethod + def check_vars(cls, ds: xr.Dataset) -> xr.Dataset: + # Check that there are data variables + if not ds.data_vars: + raise ValueError( + "Dataset does not have any data variables. An example data variable is 'cSoil'." + ) + # Check that the dataset has at least one variable but not more than 2 + if len(ds.data_vars) >= 3: + raise ValueError( + f"Dataset has too many data variables {ds.data_vars}. The measurement and the uncertainty are the only expected data variables. There should be one netCDF file per data variable if a dataset has multiple data variables." 
+ ) + return ds + + @field_validator("ds") + @classmethod + def global_attrs(cls, ds: xr.Dataset) -> xr.Dataset: + # Check that the dataset has the required global attribute keys + missing = set( + [ + "title", + "institution", + "source", + "history", + "references", + "comment", + "Conventions", + ] + ) - set(ds.attrs.keys()) + if missing: + raise ValueError( + f"Dataset does not properly encode global attributes, {missing=}" + ) + return ds + + @field_validator("ds") + @classmethod + def time_dim(cls, ds: xr.Dataset) -> xr.Dataset: + # Check that the dataset has a properly set-up time dimension + dimensions = ds.dims + time_dim_present = "time" in dimensions + time_var = ds["time"] + time_attrs = time_var.attrs + + # Check if the time dimension is present + if not time_dim_present: + raise ValueError( + f"Dataset does not have a time dimension, {dimensions=}. Expected a dimension called 'time'." + ) + + # Check if time values are decoded as datetime objects + time_dtype = type(time_var.values[0]) + if not ( + np.issubdtype(time_var.values.dtype, np.datetime64) + or isinstance(time_var.values[0], cftime.datetime) + ): + raise TypeError( + f"Time values are not properly decoded as datetime objects: {time_dtype=}" + ) + + # Check time attributes: axis, long_name, standard_name + missing = set(["axis", "long_name", "standard_name"]) - set(time_attrs) + if missing: + raise ValueError( + f"Dataset is missing time-specific attributes, {missing=}." + ) + else: + # Check the axis and standard_name to ensure they're correct; long_name can vary. + correct_axis_name = time_attrs["axis"] == "T" + if not correct_axis_name: + raise TypeError( + f"The time dimension's axis attribute is {time_attrs['axis']}. Expected 'T'." + ) + correct_std_name = time_attrs["standard_name"] == "time" + if not correct_std_name: + raise TypeError( + f"The time dimension's standard_name attribute is {time_attrs['standard_name']}. Expected 'time'." + ) + + # Check time units encoding and formatting + time_encoding = time_var.encoding + if "units" not in time_encoding or "since" not in time_encoding["units"]: + raise ValueError( + f"Time encoding is missing or incorrect, {time_encoding=}. Expected 'days since YYYY:MM:DD HH:MM'" + ) + + # Check time calendar encoding + if "calendar" in time_encoding: + valid_calendars = [ + "standard", + "gregorian", + "proleptic_gregorian", + "noleap", + "all_leap", + "360_day", + "julian", + ] + + if time_encoding["calendar"] not in valid_calendars: + # Check for explicitly defined calendar attributes + if "month_lengths" in time_attrs: + # Validate month_lengths + month_lengths = time_attrs["month_lengths"] + if len(month_lengths) != 12 or not all( + isinstance(m, (int, np.integer)) for m in month_lengths + ): + raise ValueError( + "month_lengths must be a list of 12 integer values." + ) + + # Validate leap year settings if present + if "leap_year" in time_attrs: + leap_year = time_attrs["leap_year"] + if not isinstance(leap_year, (int, np.integer)): + raise ValueError("leap_year must be an integer.") + + if "leap_month" in time_attrs: + leap_month = time_attrs["leap_month"] + if not (1 <= leap_month <= 12): + raise ValueError("leap_month must be between 1 and 12.") + else: + raise ValueError( + f"Unrecognized calendar '{time_encoding['calendar']}' and no explicit month_lengths provided." 
+ ) + else: + raise ValueError("Calendar attribute is missing from the time encoding.") + + # Check bounds encoding + time_bounds_name = time_attrs["bounds"] + if time_bounds_name not in ds: + raise ValueError( + f"Time bounds variable '{time_bounds_name=}' is missing from dataset. Expected 'time_bounds'" + ) + + # Check time_bounds structure + time_bounds = ds[time_bounds_name] + if len(time_bounds.dims) != 2 or time_bounds.dims[0] != "time": + raise ValueError( + f"Time bounds, '{time_bounds_name=}', has incorrect dimensions, {time_bounds.dims}." + "Expected two dimensions: ('time', )." + ) + + # Check that the second dimension length is 2 (indicating time bounds) + if time_bounds.shape[1] != 2: + raise ValueError( + f"Time bounds '{time_bounds_name}' has incorrect shape {time_bounds.shape}. " + "The second dimension should have length 2 to represent time bounds." + ) + + # Check for the correct 'long_name' attribute for time_bounds + if ( + "long_name" not in time_bounds.attrs + or time_bounds.attrs["long_name"] != "time_bounds" + ): + raise ValueError( + f"Time bounds '{time_bounds_name}' is missing its 'long_name':'time_bounds' attribute." + ) + + return ds + + @field_validator("ds") + @classmethod + def lat_dim(cls, ds: xr.Dataset) -> xr.Dataset: + # Check that the dataset has a properly set-up latitude dimension + lat_names = {"lat", "latitude", "y"} + dims = ds.dims + dims_lower = { + dim.lower(): dim for dim in dims + } # Map lowercased dims to original names + lat_names_found = [ + dims_lower[name.lower()] for name in lat_names if name.lower() in dims_lower + ] + + # Ensure there is only one latitude dimension + if len(lat_names_found) != 1: + raise ValueError( + f"Dataset has {len(lat_names_found)} latitude dimensions, expected exactly one. Found: {lat_names_found}" + ) + + lat_name = lat_names_found[0] + + # Check that one of the accepted latitude long_names exists + lat_var = ds[lat_name] + lat_attrs = lat_var.attrs + + # Check for missing latitude attributes + missing = set(["axis", "long_name", "standard_name", "units"]) - set(lat_attrs) + if missing: + raise ValueError( + f"Dataset is missing latitude-specific attributes, {missing=}" + ) + else: + # Check axis + if lat_attrs["axis"] != "Y": + raise ValueError( + f"Incorrect latitude axis attribute: {lat_attrs['axis']}. Expected 'Y' (case sensitive)." + ) + # Check standard_name + if lat_attrs["standard_name"] != "latitude": + raise ValueError( + f"Incorrect latitude standard_name attribute: {lat_attrs['standard_name']}. Expected 'latitude' (case sensitive)." + ) + # Check units + valid_lat_units = { + "degrees_north", + "degree_north", + "degree_N", + "degrees_N", + "degreeN", + "degreesN", + } + lat_units = lat_attrs.get("units") + if lat_units not in valid_lat_units: + raise ValueError( + f"Invalid 'units' attribute for latitude dimension. Found: {lat_units}. Expected one of {valid_lat_units}." + ) + + return ds + + @field_validator("ds") + @classmethod + def lon_dim(cls, ds: xr.Dataset) -> xr.Dataset: + # Check that the dataset has a properly set-up longitude dimension + lon_names = {"lon", "longitude", "x"} + dims = ds.dims + dims_lower = { + dim.lower(): dim for dim in dims + } # Map lowercased dims to original names + lon_names_found = [ + dims_lower[name.lower()] for name in lon_names if name.lower() in dims_lower + ] + + # Ensure there is only one longitude dimension + if len(lon_names_found) != 1: + raise ValueError( + f"Dataset has {len(lon_names_found)} longitude dimensions, expected exactly one. 
Found: {lon_names_found}"
+            )
+
+        lon_name = lon_names_found[0]
+
+        # Check that one of the accepted longitude long_names exists
+        lon_var = ds[lon_name]
+        lon_attrs = lon_var.attrs
+
+        # Check for missing longitude attributes
+        missing = set(["axis", "long_name", "standard_name", "units"]) - set(lon_attrs)
+        if missing:
+            raise ValueError(
+                f"Dataset is missing longitude-specific attributes, {missing=}"
+            )
+        else:
+            # Check axis
+            if lon_attrs["axis"] != "X":
+                raise ValueError(
+                    f"Incorrect longitude axis attribute: {lon_attrs['axis']}. Expected 'X' (case sensitive)"
+                )
+            # Check standard_name
+            if lon_attrs["standard_name"] != "longitude":
+                raise ValueError(
+                    f"Incorrect longitude standard_name attribute: {lon_attrs['standard_name']}. Expected 'longitude' (case sensitive)"
+                )
+
+        # Check units
+        valid_lon_units = {
+            "degrees_east",
+            "degree_east",
+            "degree_E",
+            "degrees_E",
+            "degreeE",
+            "degreesE",
+        }
+        lon_units = lon_attrs.get("units")
+        if lon_units not in valid_lon_units:
+            raise ValueError(
+                f"Invalid 'units' attribute for longitude dimension. Found: {lon_units}. Expected one of {valid_lon_units}."
+            )
+
+        return ds
+
+
+if __name__ == "__main__":
+    dset = xr.open_dataset(sys.argv[1])
+    test = ILAMBDataset(ds=dset)
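+    # Hypothetical usage, mirroring the PR checklist (the file name is illustrative):
+    #   python3 scripts/validate_dataset.py GFED5/burntFractionAll_mon_GFED5_199701-202012.nc
+    # A compliant file produces no output; a non-compliant file raises a pydantic
+    # ValidationError pointing at the failing check.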