rename granularity + add doc

juliettelavoie · juliettelavoie · commit dae5ab67af45 · 2025-11-06T10:50:34.000-05:00
diff --git a/changelog/6.feature.md b/changelog/6.feature.md
diff --git a/changelog/7.breaking.md b/changelog/7.breaking.md
@@ -1,2 +1,2 @@
-Changed default citations to be 'model-level' citations rather than 'model-experiment-level' citations.
+Changed default citations to be 'model' granularity citations rather than 'model-experiment' granularity citations.
 In practice, this means that you will get fewer citations and they will be the ones that apply to all submissions for a given model, rather than one citation for each model-experiment combination that is found.
diff --git a/changelog/7.docs.md b/changelog/7.docs.md
diff --git a/changelog/7.feature.md b/changelog/7.feature.md
@@ -0,0 +1,2 @@
+- Added support for taking paths to CMIP netCDF files as input
+- Added `doi_granularity` option so users can specify whether they want citations at the model or experiment granularity
diff --git a/docs/how-to-guides/get-citations-advanced.py b/docs/how-to-guides/get-citations-advanced.py
@@ -30,7 +30,7 @@
 
 from cmipcite.citations import (
     AuthorListStyle,
-    DOILevel,
+    DOIGranularity,
     get_bibtex_citation,
     get_citations,
     get_text_citation,
@@ -52,7 +52,7 @@
 # %%
 bibtex_citations = get_citations(
     ["hdl:21.14100/f2f502c9-9626-31c6-b016-3f7c0534803b"],
-    doi_level=DOILevel.MODEL,
+    doi_granularity=DOIGranularity.MODEL,
     get_citation=get_bibtex_citation,
 )
 print(f"{len(bibtex_citations)=}")
@@ -70,7 +70,7 @@
 # %%
 plaintex_citations = get_citations(
     ["hdl:21.14100/f2f502c9-9626-31c6-b016-3f7c0534803b"],
-    doi_level=DOILevel.EXPERIMENT,
+    doi_granularity=DOIGranularity.EXPERIMENT,
     get_citation=partial(get_text_citation, author_list_style=AuthorListStyle.LONG),
 )
 
@@ -84,7 +84,7 @@
 # %%
 plaintex_citations = get_citations(
     ["hdl:21.14100/f2f502c9-9626-31c6-b016-3f7c0534803b"],
-    doi_level=DOILevel.MODEL,
+    doi_granularity=DOIGranularity.MODEL,
     get_citation=partial(get_text_citation, author_list_style=AuthorListStyle.SHORT),
 )
 
@@ -120,7 +120,7 @@ def get_my_citation(doi: str, version: str) -> str:
 # %%
 custom_citations = get_citations(
     ["hdl:21.14100/f2f502c9-9626-31c6-b016-3f7c0534803b"],
-    doi_level=DOILevel.MODEL,
+    doi_granularity=DOIGranularity.MODEL,
     get_citation=get_my_citation,
 )
 
diff --git a/docs/how-to-guides/get-citations-basic.py b/docs/how-to-guides/get-citations-basic.py
@@ -15,16 +15,17 @@
 # %% [markdown] editable=true slideshow={"slide_type": ""}
 # # How to get citations ? (Basic version)
 #
-# Here, we show how you can get citations for CMIP data.
-# Citation can be retrieved with the help of the Persistent IDentifiers (PIDs).
+# Citations can be retrieved with the help of the Persistent IDentifiers (PIDs).
 # In the CMIP world, there are two types of PIDs:
-#   * file PID (also called tracking_id)
-#   * dataset PID (often referred to as just PID).
-#
-# A dataset is a collection of files from a single variable sampled at a single
-# frequency from a single model running a single experiment.
-# All the datasets from a single model or a single experiment (and model) are grouped
-# under a DOI.
+
+#     * file PID (normally referred to as a tracking ID)
+#     * dataset PID (normally simply referred to as PID).
+
+# A dataset is a collection of files
+# (for CMIP, this collection of files
+# is for a single variable sampled at a single frequency and spatial sampling
+# from a single model running a single experiment).
+# Both PID types can be passed to `ids_or_paths`.
 
 
 # %% [markdown]
@@ -78,12 +79,20 @@
 
 
 # %% [markdown]
-# You can specify the level of the DOI (model or experiment)
+# There are multiple possibilities for the retrieved DOI.
+# These vary based on the granularity of the DOI.
+# At the moment, as far as we know, there are two granularities:
+#     * model (capturing all submissions to a given MIP by a given model)
+#        * DRS: `<mip_era>/<activity_id>/<institution_id>/<source_id>`
+#     * experiment (capturing all submissions to a given MIP by a given model for a
+#     given experiment)
+#        * DRS: `<mip_era>/<activity_id>/<institution_id>/<source_id>/<experiment_id>`
+# This is controlled by `doi_granularity`.
 
 # %%
 citations = get(
     ["hdl:21.14100/90f93a05-357c-4ea2-b61f-bf2418700791"],
-    doi_level="experiment",
+    doi_granularity="experiment",
 )
 print(citations[0])
 
diff --git a/src/cmipcite/citations.py b/src/cmipcite/citations.py
@@ -42,32 +42,42 @@ class AuthorListStyle(StrEnum):
     """
 
 
-# TODO: change to DOIGranularity throughout
-class DOILevel(StrEnum):
+class DOIGranularity(StrEnum):
     """
-    DOI level
-
-    DOIs can be minted at different levels of granularity
-    i.e. they can capture different groups of datasets.
-    For example, DOIs minted at the 'model' level
-    apply to all submissions from that model for a given MIP.
-    DOIs minted at the 'experiment' level
-    apply to all outputs from a given experiment
-    run by a given model in a given MIP.
+    DOI granularity
+
+    CMIP data can be aggregated at different granularities.
+    DOIs for data citations are minted either for everything a model
+    contributed to a MIP (activity_id) or for everything a specific model
+    contributed for one experiment:
+    model: <mip_era>/<activity_id>/<institution_id>/<source_id>
+    experiment: <mip_era>/<activity_id>/<institution_id>/<source_id>/<experiment_id>.
     """
 
-    # TODO: update notes.
-    # We use the 'lowest-level' from the DRS as a short-hand.
-    # experiment is short for mip-model-experiment.
-    # model is short for mip-model.
     EXPERIMENT = "experiment"
     """
-    Experiment level DOI.
+    mip-model-experiment granularity of DOI.
     """
 
     MODEL = "model"
     """
-    Model level DOI
+    mip-model granularity of DOI.
+    """
+
+
+class FormatOption(StrEnum):
+    """
+    Citation format options
+    """
+
+    BIBTEX = "bibtex"
+    """
+    Bibtex format
+    """
+
+    TEXT = "text"
+    """
+    Plain text format
     """
 
 
@@ -183,7 +193,7 @@ def get_tracking_id_from_cmip_netcdf(nc_path: Path) -> str:
 
 def get_doi_and_version(  # type: ignore
     in_value: str,
-    doi_level: DOILevel,
+    doi_granularity: DOIGranularity,
     client: RESTHandleClient | None = None,
     get_tracking_id_from_path: Callable[[Path], str] = get_tracking_id_from_cmip_netcdf,
     multi_dataset_handling: MultiDatasetHandlingStrategy | None = None,
@@ -196,11 +206,14 @@ def get_doi_and_version(  # type: ignore
     in_value
         Input ID or path to a netCDF file
 
-    doi_level
-        TODO: rename and update
-        Level of DOI to retrieve.
+    doi_granularity
+        Granularity of DOI to retrieve.
 
-        See [DOILevel][(m).] for details.
+        We use the 'lowest-level' from the DRS as a short-hand.
+        "experiment" is short for mip-model-experiment.
+        "model" is short for mip-model.
+
+        See [DOIGranularity][(m).] for details.
 
     client
         Client to use for interacting with pyhandle's REST API
@@ -260,7 +273,7 @@ def get_doi_and_version(  # type: ignore
     doi_raw = client.get_value_from_handle(pid, "IS_PART_OF")
     doi = doi_raw.replace("doi:", "")
 
-    if doi_level == DOILevel.MODEL:
+    if doi_granularity == DOIGranularity.MODEL:
         # get model doi
         r = httpx.get(
             f"https://api.datacite.org/dois/{doi}",
@@ -270,12 +283,12 @@ def get_doi_and_version(  # type: ignore
             "identifier"
         ]
 
-    elif doi_level == DOILevel.EXPERIMENT:
+    elif doi_granularity == DOIGranularity.EXPERIMENT:
         # doi is already in the desired form
         pass
 
     else:  # pragma: no cover
-        raise NotImplementedError(doi_level)
+        raise NotImplementedError(doi_granularity)
 
     version = client.get_value_from_handle(pid, "VERSION_NUMBER")
 
@@ -285,7 +298,7 @@ def get_doi_and_version(  # type: ignore
 def get_citations(  # type: ignore
     ids_or_paths: list[str],
     get_citation: Callable[[str, str], str],
-    doi_level: DOILevel,
+    doi_granularity: DOIGranularity,
     client: RESTHandleClient | None = None,
     multi_dataset_handling: MultiDatasetHandlingStrategy | None = None,
 ) -> list[str]:
@@ -315,11 +328,10 @@ def get_citations(  # type: ignore
 
         For example, [get_bibtex_citation][(m).].
 
-    doi_level
-        TODO: rename and update
-        Level of DOI to retrieve.
+    doi_granularity
+        Granularity of DOI to retrieve.
 
-        See [DOILevel][(m).] for details.
+        See [DOIGranularity][(m).] for details.
 
     client
         Client to use for interacting with pyhandle's REST API
@@ -350,19 +362,22 @@ def get_citations(  # type: ignore
     (for CMIP, this collection of files
     is for a single variable sampled at a single frequency and spatial sampling
     from a single model running a single experiment).
-    For a given PID, we can retrieve the associated DOI.
+    Both PID types can be passed to `ids_or_paths`.
+
+    For a given PID, we can retrieve an associated DOI.
     However, there are multiple possibilities for the retrieved DOI.
     These vary based on the granularity of the DOI.
     At the moment, as far as we know, there are two granularities:
-    a) capturing all submissions to a given MIP by a given model
-    b) capturing all submissions to a given MIP by a given model for a given experiment.
-    The `doi_granularity` controls which DOI grouping level you get.
+        * model (capturing all submissions to a given MIP by a given model)
+        * experiment (capturing all submissions to a given MIP by a given model for a
+        given experiment)
+    This is controlled by `doi_granularity`.
 
     Examples
     --------
     >>> citations = get_citations(
     ...     ["hdl:21.14100/f2f502c9-9626-31c6-b016-3f7c0534803b"],
-    ...     doi_level=DOILevel.MODEL,
+    ...     doi_granularity=DOIGranularity.MODEL,
     ...     get_citation=get_bibtex_citation,
     ... )
     >>> print(citations[0])
@@ -386,7 +401,7 @@ def get_citations(  # type: ignore
             v,
             client=client,
             multi_dataset_handling=multi_dataset_handling,
-            doi_level=doi_level,
+            doi_granularity=doi_granularity,
         )
         for v in ids_or_paths
     ]
@@ -398,22 +413,6 @@ def get_citations(  # type: ignore
     return res
 
 
-class FormatOption(StrEnum):
-    """
-    Citation format options
-    """
-
-    BIBTEX = "bibtex"
-    """
-    Bibtex format
-    """
-
-    TEXT = "text"
-    """
-    Plain text file
-    """
-
-
 def translate_get_args_to_get_citations_kwargs(
     format: FormatOption,
     author_list_style: AuthorListStyle,
@@ -467,7 +466,7 @@ def get(  # noqa: PLR0913
     in_values: list[str],
     format: FormatOption = FormatOption.TEXT,
     author_list_style: AuthorListStyle = AuthorListStyle.LONG,
-    doi_level: DOILevel = DOILevel.MODEL,
+    doi_granularity: DOIGranularity = DOIGranularity.MODEL,
     multi_dataset_handling: MultiDatasetHandlingStrategy | None = None,
     handle_server_url: str = "http://hdl.handle.net/",
 ) -> list[str]:
@@ -489,10 +488,10 @@ def get(  # noqa: PLR0913
         Whether, if the format is text,
         the author list should be long (all names) or short (et al.)
 
-    doi_level
-        Level of DOI to retrieve.
+    doi_granularity
+        Granularity of DOI to retrieve.
 
-        See [DOILevel][(m).] for details.
+        See [DOIGranularity][(m).] for details.
 
     multi_dataset_handling
         Strategy to use when a given ID or file belongs to multiple datasets
@@ -519,13 +518,16 @@ def get(  # noqa: PLR0913
     (for CMIP, this collection of files
     is for a single variable sampled at a single frequency and spatial sampling
     from a single model running a single experiment).
-    For a given PID, we can retrieve the associated DOI.
+    Both PID types can be passed to `in_values`.
+
+    For a given PID, we can retrieve an associated DOI.
     However, there are multiple possibilities for the retrieved DOI.
     These vary based on the granularity of the DOI.
     At the moment, as far as we know, there are two granularities:
-    a) capturing all submissions to a given MIP by a given model
-    b) capturing all submissions to a given MIP by a given model for a given experiment.
-    The `doi_granularity` controls which DOI grouping level you get.
+        * model (capturing all submissions to a given MIP by a given model)
+        * experiment (capturing all submissions to a given MIP by a given model for a
+        given experiment)
+    This is controlled by `doi_granularity`.
     """
     get_citations_kwargs = translate_get_args_to_get_citations_kwargs(
         format=format,
@@ -537,7 +539,7 @@ def get(  # noqa: PLR0913
         citations = get_citations(
             ids_or_paths=in_values,
             multi_dataset_handling=multi_dataset_handling,
-            doi_level=doi_level,
+            doi_granularity=doi_granularity,
             **get_citations_kwargs,
         )
     except MultipleDatasetMemberError as exc:
diff --git a/src/cmipcite/cli/__init__.py b/src/cmipcite/cli/__init__.py
@@ -12,7 +12,7 @@
 import cmipcite
 from cmipcite.citations import (
     AuthorListStyle,
-    DOILevel,
+    DOIGranularity,
     FormatOption,
     get_citations,
     translate_get_args_to_get_citations_kwargs,
@@ -75,11 +75,10 @@ def get(  # noqa: PLR0913
             help="Whether the author list should be long (all names) or short (et al.)."
         ),
     ] = AuthorListStyle.LONG,
-    # TODO: rename to doi_granularity here and throughout
-    doi_level: Annotated[
-        DOILevel,
+    doi_granularity: Annotated[
+        DOIGranularity,
         typer.Option(help="Desired granularity of the retrieved DOIs."),
-    ] = DOILevel.MODEL,
+    ] = DOIGranularity.MODEL,
     multi_dataset_handling: Annotated[
         Optional[MultiDatasetHandlingStrategy],
         typer.Option(
@@ -105,7 +104,7 @@ def get(  # noqa: PLR0913
     try:
         citations = get_citations(
             ids_or_paths=in_values,
-            doi_level=doi_level,
+            doi_granularity=doi_granularity,
             multi_dataset_handling=multi_dataset_handling,
             **get_citations_kwargs,
         )
diff --git a/tests/regression/test_cli_get_regression.py b/tests/regression/test_cli_get_regression.py

Original file line number	Diff line number	Diff line change
`@@ -1,2 +1,2 @@`
`1`		`-Changed default citations to be 'model-level' citations rather than 'model-experiment-level' citations.`
	`1`	`+Changed default citations to be 'model' granularity citations rather than 'model-experiment' granularity citations.`
`2`	`2`	`In practice, this means that you will get fewer citations and they will be the ones that apply to all submissions for a given model, rather than one citation for each model-experiment combination that is found.`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+- Added support for taking paths to CMIP netCDF files as input`
	`2`	+- Added `doi_granularity` option so users can specify whether they want citations at the model or experiment granularity