biopragmatics · cthoyt · Apr 6, 2025 · Apr 6, 2025 · Apr 6, 2025 · Apr 6, 2025
diff --git a/src/curies/metamodel.py b/src/curies/metamodel.py
@@ -0,0 +1,214 @@
+"""Quick metadata model."""
+
+from __future__ import annotations
+
+import csv
+import types
+import typing
+from collections import Counter
+from collections.abc import Iterable
+from pathlib import Path
+from typing import Any, TypeVar
+
+from pydantic import BaseModel
+
+from curies import NamableReference
+
+__all__ = [
+    "from_tsv",
+    "iter_records",
+]
+
+Model = TypeVar("Model", bound=BaseModel)
+
+
+def from_tsv(
+    path: str | Path, cls: type[Model], names: dict[str, str] | None = None
+) -> Iterable[Model]:
+    """Load models from a TSV.
+
+    :param path: The path to a TSV file
+    :param cls: The model class to parse into
+    :param names:
+        A mapping from column names corresponding to reference fields to column names representing the labels
+    :yields: Validated models
+
+    Let's use a similar table, now with the prefix and identifier combine into CURIEs.
+
+    =========== ======== ======
+    curie       name     smiles
+    =========== ======== ======
+    CHEBI:16236 ethanol  CCO
+    CHEBI:28831 propanol CCCO
+    CHEBI:44884 pentanol CCCCCO
+    =========== ======== ======
+
+    In the following code, we simulate reading that file and show where the error shows up:
+
+    .. code-block:: python
+
+        from pydantic import BaseModel
+
+        from curies import NamedReference
+        from curies.metamodel import iter_records
+
+
+        class Row(BaseModel):
+            curie: NamedReference
+            smiles: str
+
+
+        records = [
+            {"curie": "CHEBI:16236", "name": "ethanol", "smiles": "CCO"},
+            {"curie": "CHEBI:28831", "name": "propanol", "smiles": "CCCO"},
+            {"curie": "CHEBI:44884", "name": "pentanol", "smiles": "CCCCCO"},
+        ]
+
+        models = list(iter_records(records, Row, names={"curie": "name"}))
+        print(models)
+
+    In the following example, we encode SSSOM in a Pydantic model:
+
+    .. code-block:: python
+
+        import datetime
+        from typing import Literal
+
+        from pydantic import BaseModel, ConfigDict, Field
+        from curies import NamableReference, Reference
+        from curies.metamodel import iter_records
+
+
+        class SemanticMapping(BaseModel):
+            # required when using aliases
+            model_config = ConfigDict(
+                populate_by_name=True,
+            )
+
+            subject: NamableReference = Field(..., alias="subject_id")
+            predicate: NamableReference = Field(..., alias="predicate_id")
+            predicate_modifier: Literal["Not"] | None = Field(None)
+            object: NamableReference = Field(..., alias="object_id")
+            mapping_justification: Reference = Field(...)
+            license: Reference | None = Field(None)
+            creator: NamableReference | None = Field(None, alias="creator_id")
+            author: NamableReference | None = Field(None, alias="author_id")
+            reviewer: NamableReference | None = Field(None, alias="reviewer_id")
+            publication_date: datetime.date | None = Field(None)
+            issue_tracker_item: str | None = Field(None)
+            comment: str | None = Field(None)
+
+
+        records = [
+            {
+                "subject_id": "CHEBI:16236",
+                "subject_label": "ethanol",
+                "predicate_id": "skos:exactMatch",
+                "object_id": "pubchem.compound:702",
+                "mapping_justification": "semapv:ManualMappingCuration",
+            },
+            {
+                "subject_id": "CHEBI:28831",
+                "subject_label": "propanol",
+                "predicate_id": "skos:exactMatch",
+                "object_id": "pubchem.compound:1031",
+                "mapping_justification": "semapv:ManualMappingCuration",
+            },
+            {
+                "subject_id": "CHEBI:44884",
+                "subject_label": "pentanol",
+                "predicate_id": "skos:exactMatch",
+                "object_id": "pubchem.compound:6276",
+                "mapping_justification": "semapv:ManualMappingCuration",
+            },
+        ]
+
+        models = list(
+            iter_records(
+                records,
+                SemanticMapping,
+                names={
+                    "subject_id": "subject_label",
+                    "predicate_id": "predicate_label",
+                    "object_id": "object_label",
+                    "author_id": "author_label",
+                    "reviewer_id": "reviewer_label",
+                    "creator_id": "creator_label",
+                },
+            )
+        )
+        print(models)
+
+    """
+    path = Path(path).expanduser().resolve()
+    with path.open() as file:
+        reader = csv.DictReader(file, delimiter="\t")
+        yield from iter_records(reader, cls, names=names)
+
+
+def iter_records(
+    records: Iterable[dict[str, Any]], cls: type[Model], names: dict[str, str] | None = None
+) -> Iterable[Model]:
+    """Get records."""
+    if names is None:
+        names = {}
+
+    # maps from aliases back to the names of the fields in the Pydantic model class
+    # e.g., in SSSOM, we map `subject_id` as an alias to `subject` as the field name
+    # in the Pydantic model class
+    alias_to_field: dict[str, str] = {}
+    field_to_alias: dict[str, str] = {}
+    for key, model_field in cls.model_fields.items():
+        if model_field.alias:
+            field_to_alias[key] = model_field.alias
+            alias_to_field[model_field.alias] = key
+
+    # Check that all keys in the names dictionary
+    # are actually in the model
+    for curie_key, name_key in names.items():
+        norm_curie_key = alias_to_field.get(curie_key)
+        if norm_curie_key is None:
+            raise ValueError(
+                f"Incorrectly specified name reconciliation key - {curie_key} is not a model field"
+            )
+        if name_key in alias_to_field:
+            raise ValueError(f"name key {name_key} should not appear as a model field nor alias")
+
+    # check that no values are used for multiple columns
+    counter = Counter(names.values())
+    bad_keys = {key for key, count in counter.items() if count > 1}
+    if bad_keys:
+        raise ValueError(f"duplicate usage of name columns: {bad_keys}")
+
+    # Look into the model to get the type for each field
+    # that appears in the names dictionary
+    alias_to_type: dict[str, type[NamableReference]] = {}
+    for curie_key, field_info in cls.model_fields.items():
+        if not field_info or not field_info.annotation:
+            raise ValueError
+        norm_curie_key = field_to_alias.get(curie_key, curie_key)
+        alias_to_type[norm_curie_key] = _strip_optional(field_info.annotation)
+
+    for record in records:
+        if names:
+            for curie_key, name_key in names.items():
+                if curie_key not in record:
+                    continue
+                reference_cls: type[NamableReference] = alias_to_type[curie_key]
+                record[curie_key] = reference_cls.from_curie(
+                    record.pop(curie_key), name=record.pop(name_key, None)
+                )
+
+        model = cls.model_validate(record)
+        yield model
+
+
+def _strip_optional(x: Any) -> Any:
+    if typing.get_origin(x) != types.UnionType:
+        return x
+    else:
+        args = [arg for arg in typing.get_args(x) if arg is not type(None)]
+        if len(args) == 1:
+            return args[0]
+        else:
+            raise NotImplementedError
diff --git a/tests/test_metamodel.py b/tests/test_metamodel.py
@@ -0,0 +1,124 @@
+"""Test the lightweight metadata model."""
+
+import datetime
+import unittest
+from typing import Literal
+
+from pydantic import BaseModel, ConfigDict, Field
+
+from curies import NamableReference, Reference, vocabulary
+from curies.metamodel import iter_records
+
+
+class TestModel(unittest.TestCase):
+    """Test parsing models."""
+
+    def test_model(self) -> None:
+        """Test parsing into a namable reference."""
+
+        class MM(BaseModel):
+            """Test model."""
+
+            curie: NamableReference
+
+        names = {"curie": "curie_label"}
+        records = [
+            {"curie": "GO:0000001", "curie_label": "Test 1"},
+            {"curie": "GO:0000002", "curie_label": "Test 2"},
+        ]
+
+        models = list(iter_records(records, MM, names=names))
+        self.assertEqual(
+            [
+                MM(curie=NamableReference(prefix="GO", identifier="0000001", name="Test 1")),
+                MM(curie=NamableReference(prefix="GO", identifier="0000002", name="Test 2")),
+            ],
+            models,
+        )
+
+    def test_model_with_aliases(self) -> None:
+        """Test metamodel that has aliases."""
+
+        class SemanticMapping(BaseModel):
+            """A model for SSSOM semantic mapping."""
+
+            model_config = ConfigDict(
+                populate_by_name=True,
+            )
+
+            subject: NamableReference = Field(..., alias="subject_id")
+            predicate: NamableReference = Field(..., alias="predicate_id")
+            predicate_modifier: Literal["Not"] | None = Field(None)
+            object: NamableReference = Field(..., alias="object_id")
+            mapping_justification: Reference = Field(...)
+            license: Reference | None = Field(None)
+            creator: NamableReference | None = Field(None, alias="creator_id")
+            author: NamableReference | None = Field(None, alias="author_id")
+            reviewer: NamableReference | None = Field(None, alias="reviewer_id")
+            publication_date: datetime.date | None = Field(None)
+            issue_tracker_item: str | None = Field(None)
+            comment: str | None = Field(None)
+
+        records = [
+            {
+                "subject_id": "CHEBI:16236",
+                "subject_label": "ethanol",
+                "predicate_id": "skos:exactMatch",
+                "object_id": "pubchem.compound:702",
+                "mapping_justification": "semapv:ManualMappingCuration",
+            },
+            {
+                "subject_id": "CHEBI:28831",
+                "subject_label": "propanol",
+                "predicate_id": "skos:exactMatch",
+                "object_id": "pubchem.compound:1031",
+                "mapping_justification": "semapv:ManualMappingCuration",
+            },
+            {
+                "subject_id": "CHEBI:44884",
+                "subject_label": "pentanol",
+                "predicate_id": "skos:exactMatch",
+                "object_id": "pubchem.compound:6276",
+                "mapping_justification": "semapv:ManualMappingCuration",
+            },
+        ]
+
+        models = list(
+            iter_records(
+                records,
+                SemanticMapping,
+                names={
+                    "subject_id": "subject_label",
+                    "predicate_id": "predicate_label",
+                    "object_id": "object_label",
+                    "author_id": "author_label",
+                    "reviewer_id": "reviewer_label",
+                    "creator_id": "creator_label",
+                },
+            )
+        )
+
+        exact_match = NamableReference.from_reference(vocabulary.exact_match)
+        self.assertEqual(
+            [
+                SemanticMapping(
+                    subject=NamableReference.from_curie("CHEBI:16236", name="ethanol"),
+                    predicate=exact_match,
+                    object=NamableReference.from_curie("pubchem.compound:702"),
+                    mapping_justification=vocabulary.manual_mapping_curation,
+                ),
+                SemanticMapping(
+                    subject=NamableReference.from_curie("CHEBI:28831", name="propanol"),
+                    predicate=exact_match,
+                    object=NamableReference.from_curie("pubchem.compound:1031"),
+                    mapping_justification=vocabulary.manual_mapping_curation,
+                ),
+                SemanticMapping(
+                    subject=NamableReference.from_curie("CHEBI:44884", name="pentanol"),
+                    predicate=exact_match,
+                    object=NamableReference.from_curie("pubchem.compound:6276"),
+                    mapping_justification=vocabulary.manual_mapping_curation,
+                ),
+            ],
+            models,
+        )