diff --git a/docs/source/w3c.rst b/docs/source/w3c.rst index c68afa99..80979a02 100644 --- a/docs/source/w3c.rst +++ b/docs/source/w3c.rst @@ -5,3 +5,123 @@ W3C Validation :no-inheritance-diagram: :no-heading: :include-all-objects: + +Opting in to W3C Validation with a :class:`curies.Converter` +------------------------------------------------------------ + +In practice, some usages do not conform to these standards, often due to encoding things +that aren't *really* supposed to be CURIEs, such as `SMILES strings `_ for molecules, +`UCUM codes `_ for units, or other language-like "identifiers". + +Therefore, it's on the roadmap for the ``curies`` package to support operations for +validating against the W3C standards and mapping between "loose" (i.e., un-URL-encoded) +and strict (i.e., URL-encoded) CURIEs and IRIs. In practice, this will often solve +issues with special characters like square brackets (``[`` and ``]``). + +.. code-block:: + + looseCURIE <-> strictCURIE + ^. \./. ^ + | X | + v / \. v + looseURI <-> strictURI + +A first step towards accomplishing this was implemented in +https://github.com/biopragmatics/curies/pull/104 by adding a ``w3c_mode`` argument to +both the initialization of a :class:`curies.Converter` and the +:meth:`curies.Converter.expand` method. + +Here's an example of using W3C validation during expansion: + +.. 
code-block:: + + import curies + + converter = curies.Converter.from_prefix_map({ + "smiles": "https://bioregistry.io/smiles:", + }) + + >>> converter.expand("smiles:CC(=O)NC([H])(C)C(=O)O") + https://bioregistry.io/smiles:CC(=O)NC([H])(C)C(=O)O + + >>> converter.expand("smiles:CC(=O)NC([H])(C)C(=O)O", w3c_mode="strict") + Traceback (most recent call last): + File "<stdin>", line 1, in <module> + File "/Users/cthoyt/dev/curies/src/curies/api.py", line 1362, in expand + raise W3CValidationError(f"CURIE is not valid under W3C spec: {curie}") + W3CValidationError: CURIE is not valid under W3C spec: smiles:CC(=O)NC([H])(C)C(=O)O + + This can also be used to extend + Traceback (most recent call last): + File "<stdin>", line 1, in <module> + File "/Users/cthoyt/dev/curies/src/curies/api.py", line 1362, in expand + raise W3CValidationError(f"CURIE is not valid under W3C spec: {curie}") + W3CValidationError: CURIE is not valid under W3C spec: smiles:CC(=O)NC([H])(C)C(=O)O + +This can also be used to extend :meth:`curies.Converter.is_curie`: + +.. 
code-block:: + + import curies + + converter = curies.Converter.from_prefix_map({ + "smiles": "https://bioregistry.io/smiles:", + }) + + >>> converter.is_curie("smiles:CC(=O)NC([H])(C)C(=O)O") + True + >>> converter.is_curie("smiles:CC(=O)NC([H])(C)C(=O)O", w3c_mode="strict") + False + + Finally, this can be used during instantiation of a converter: + + converter = curies.Converter.from_prefix_map({ + "smiles": "https://bioregistry.io/smiles:", + }) + + >>> converter.is_curie("smiles:CC(=O)NC([H])(C)C(=O)O") + True + >>> converter.is_curie("smiles:CC(=O)NC([H])(C)C(=O)O", w3c_mode="strict") + False + + Finally, this can be used during instantiation of a converter: + + converter = curies.Converter.from_prefix_map({ + "smiles": "https://bioregistry.io/smiles:", + }) + + >>> converter.is_curie("smiles:CC(=O)NC([H])(C)C(=O)O") + True + >>> converter.is_curie("smiles:CC(=O)NC([H])(C)C(=O)O", w3c_mode="strict") + False + +Finally, this can be used during instantiation of a converter: + +.. code-block:: + + import curies + + >>> curies.Converter.from_prefix_map( + ... {"4dn.biosource": "https://data.4dnucleome.org/biosources/"}, + ... w3c_mode="strict", + ... ) + Traceback (most recent call last): + File "<stdin>", line 1, in <module> + File "/Users/cthoyt/dev/curies/src/curies/api.py", line 816, in from_prefix_map + return cls( + ^^^^ + File "/Users/cthoyt/dev/curies/src/curies/api.py", line 527, in __init__ + raise W3CValidationError(f"Records not conforming to W3C:\n\n{msg}") + curies.api.W3CValidationError: Records not conforming to W3C: + + - Record(prefix='4dn.biosource', uri_prefix='https://data.4dnucleome.org/biosources/', prefix_synonyms=[], uri_prefix_synonyms=[], pattern=None) + +.. seealso:: + + 1. Discussion on the ``curies`` issue tracker about handling CURIEs that include + e.g. square brackets and therefore don't conform to the W3C specification: + https://github.com/biopragmatics/curies/issues/103 + 2. 
Discussion on languages that shouldn't really get encoded in CURIEs, but still + do: https://github.com/biopragmatics/bioregistry/issues/460 + 3. Related to (2) - discussion on how to properly encode UCUM in CURIEs: + https://github.com/biopragmatics/bioregistry/issues/648 diff --git a/src/curies/__init__.py b/src/curies/__init__.py index a31cb545..33734d16 100644 --- a/src/curies/__init__.py +++ b/src/curies/__init__.py @@ -13,6 +13,7 @@ Records, Reference, ReferenceTuple, + W3CValidationError, chain, load_extended_prefix_map, load_jsonld_context, @@ -62,6 +63,7 @@ "Reference", "ReferenceTuple", "Triple", + "W3CValidationError", "chain", "discover", "discover_from_rdf", diff --git a/src/curies/api.py b/src/curies/api.py index 5a7d7d3f..c7b74a4b 100644 --- a/src/curies/api.py +++ b/src/curies/api.py @@ -6,6 +6,7 @@ import itertools as itt import json import logging +import urllib.parse import warnings from collections import defaultdict from collections.abc import Collection, Iterable, Iterator, Mapping, Sequence @@ -35,7 +36,9 @@ ) from pydantic_core import core_schema from pytrie import StringTrie -from typing_extensions import Self +from typing_extensions import Self, TypeAlias + +from .w3c import is_w3c_curie, is_w3c_prefix if TYPE_CHECKING: # pragma: no cover import pandas @@ -54,6 +57,7 @@ "Records", "Reference", "ReferenceTuple", + "W3CValidationError", "chain", "load_extended_prefix_map", "load_jsonld_context", @@ -147,6 +151,14 @@ def curie(self) -> str: """ return f"{self.prefix}:{self.identifier}" + def quote(self) -> Self: + """Get a new tuple with the identifier URL-quoted.""" + return self.__class__(prefix=self.prefix, identifier=urllib.parse.quote(self.identifier)) + + def unquote(self) -> Self: + """Get a new tuple with the identifier URL-unquoted.""" + return self.__class__(prefix=self.prefix, identifier=urllib.parse.unquote(self.identifier)) + @classmethod def from_curie(cls, curie: str, *, sep: str = ":") -> Self: """Parse a CURIE string and 
populate a reference tuple. @@ -445,6 +457,14 @@ def curie(self) -> str: """ return f"{self.prefix}:{self.identifier}" + def quote(self) -> Self: + """Get a new model with the identifier URL-quoted.""" + return self.model_copy(update={"identifier": urllib.parse.quote(self.identifier)}) + + def unquote(self) -> Self: + """Get a new model with the identifier URL-unquoted.""" + return self.model_copy(update={"identifier": urllib.parse.unquote(self.identifier)}) + @property def pair(self) -> ReferenceTuple: """Get the reference as a 2-tuple of prefix and identifier.""" @@ -650,6 +670,12 @@ def _key(self) -> RecordKey: ",".join(sorted(self.uri_prefix_synonyms)), ) + def is_w3c_valid(self) -> bool: + """Check if all prefixes in this record are W3C compliant, based on :func:`is_w3c_prefix`.""" + all_curie_prefixes_valid = all(is_w3c_prefix(prefix) for prefix in self._all_prefixes) + # TODO extend to check URI prefixes? + return all_curie_prefixes_valid + # An explanation of RootModels in Pydantic V2 can be found on # https://docs.pydantic.dev/latest/concepts/models/#rootmodel-and-custom-root-types @@ -740,6 +766,10 @@ class URIStandardizationError(StandardizationError): """An error raise when a URI can't be standardized.""" +class W3CValidationError(ValueError): + """An error when W3C validation fails.""" + + def _get_duplicate_uri_prefixes(records: list[Record]) -> list[DuplicateSummary]: return [ DuplicateSummary(record_1, record_2, uri_prefix) @@ -810,6 +840,9 @@ def _get_remote_json(url: str, timeout: int = 15) -> Any: return json.loads(json_str_data) +W3CMode: TypeAlias = Literal["loose", "strict", "autocodec"] + + class Converter: """A cached prefix map data structure. @@ -836,6 +869,20 @@ class Converter: # Example with missing prefix: >>> converter.expand("missing:0000000") + + + If you want to ensure that all prefixes conform to the W3C specifications, + use ``w3c_mode`` like in: + + >>> converter = Converter.from_prefix_map( + ... { + ... 
"4dn": "http://purl.obolibrary.org/obo/CHEBI_", + ... }, + ... w3c_mode="strict", + ... ) + + This will raise a :class:`W3CValidationError`. See more information at + :mod:`curies.w3c`. """ #: The expansion dictionary with prefixes as keys and priority URI prefixes as values @@ -850,7 +897,12 @@ class Converter: pattern_map: dict[str, str] def __init__( - self, records: Iterable[Record], *, delimiter: str = ":", strict: bool = True + self, + records: Iterable[Record], + *, + delimiter: str = ":", + strict: bool = True, + w3c_mode: W3CMode = "loose", ) -> None: """Instantiate a converter. @@ -860,8 +912,17 @@ def __init__( If true, raises issues on duplicate URI prefixes :param delimiter: The delimiter used for CURIEs. Defaults to a colon. + :param w3c_mode: + If "strict", validate all records against the + `W3C CURIE Syntax 1.0 `_. + This includes the following: + + 1. Checking CURIE prefixes and CURIE prefix synonyms against the + W3C definition for `NCName `_ + :raises DuplicatePrefixes: if any records share any synonyms :raises DuplicateURIPrefixes: if any records share any URI prefixes + :raises W3CValidationError: If w3c validation is on and there are non-conformant records """ records = sorted(records, key=lambda r: r.prefix) if strict: @@ -872,6 +933,12 @@ def __init__( if duplicate_prefixes: raise DuplicatePrefixes(duplicate_prefixes) + if w3c_mode in {"strict", "autocodec"}: + broken = [record for record in records if not record.is_w3c_valid()] + if broken: + msg = "\n".join(f" - {record!r}" for record in records) + raise W3CValidationError(f"Records not conforming to W3C:\n\n{msg}") + self.delimiter = delimiter self.records = records self.prefix_map = _get_prefix_map(records) @@ -1656,10 +1723,12 @@ def parse_uri( else: return ReferenceTuple(prefix, uri[len(value) :]) - def is_curie(self, s: str) -> bool: + def is_curie(self, s: str, *, w3c_mode: W3CMode = "loose") -> bool: """Check if the string can be parsed as a CURIE by this converter. 
:param s: A string that might be a CURIE + :param w3c_mode: If "strict", requires CURIEs to be valid against the + `W3C CURIE specification `_. :returns: If the string can be parsed as a CURIE by this converter. Note that some valid CURIEs, when passed to this function, will result in False if their prefixes are not registered with this @@ -1680,7 +1749,7 @@ def is_curie(self, s: str) -> bool: False """ try: - return self.expand(s) is not None + return self.expand(s, w3c_mode=w3c_mode) is not None except ValueError: return False @@ -1767,34 +1836,60 @@ def expand_strict(self, curie: str) -> str: # docstr-coverage:excused `overload` @overload def expand( - self, curie: str, *, strict: Literal[True] = True, passthrough: bool = ... + self, + curie: str, + *, + strict: Literal[True] = True, + passthrough: bool = ..., + w3c_mode: W3CMode = ..., ) -> str: ... # docstr-coverage:excused `overload` @overload def expand( - self, curie: str, *, strict: Literal[False] = False, passthrough: Literal[True] = True + self, + curie: str, + *, + strict: Literal[False] = False, + passthrough: Literal[True] = True, + w3c_mode: W3CMode = ..., ) -> str: ... # docstr-coverage:excused `overload` @overload def expand( - self, curie: str, *, strict: Literal[False] = False, passthrough: Literal[False] = False + self, + curie: str, + *, + strict: Literal[False] = False, + passthrough: Literal[False] = False, + w3c_mode: W3CMode = ..., ) -> str | None: ... - def expand(self, curie: str, *, strict: bool = False, passthrough: bool = False) -> str | None: + def expand( + self, + curie: str, + *, + strict: bool = False, + passthrough: bool = False, + w3c_mode: W3CMode = "loose", + ) -> str | None: """Expand a CURIE to a URI, if possible. :param curie: A string representing a compact URI (CURIE) :param strict: If true and the CURIE can't be expanded, returns an error. Defaults to false. :param passthrough: If true, strict is false, and the CURIE can't be expanded, return the input. 
- Defaults to false. If your strings can either be a CURIE _or_ a URI, consider using + Defaults to false. If your strings can either be a CURIE or a URI, consider using :meth:`Converter.expand_or_standardize` instead. + :param w3c_mode: If "strict", requires CURIEs to be valid against the + `W3C CURIE specification `_. :returns: A URI if this converter contains a URI prefix for the prefix in this CURIE :raises ExpansionError: If strict is true and the CURIE can't be expanded + :raises W3CValidationError: + If ``w3c_mode`` is "strict" and the CURIE is not valid under the CURIE specification >>> from curies import Converter >>> converter = Converter.from_prefix_map( @@ -1808,8 +1903,12 @@ def expand(self, curie: str, *, strict: bool = False, passthrough: bool = False) 'http://purl.obolibrary.org/obo/CHEBI_138488' >>> converter.expand("missing:0000000") """ + if w3c_mode == "strict" and not is_w3c_curie(curie): + raise W3CValidationError(f"CURIE is not valid under W3C spec: {curie}") reference = self.parse_curie(curie, strict=False) if reference is not None: + if w3c_mode == "autocodec": + reference = reference.quote() return self.expand_reference(reference, strict=strict, passthrough=passthrough) # type:ignore[no-any-return,call-overload] if strict: raise ExpansionError(curie) diff --git a/tests/test_api.py b/tests/test_api.py index 19322562..3e06756c 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -5,6 +5,7 @@ import json import tempfile import unittest +import urllib.parse from pathlib import Path from tempfile import TemporaryDirectory @@ -29,6 +30,7 @@ Reference, ReferenceTuple, URIStandardizationError, + W3CValidationError, chain, upgrade_prefix_map, ) @@ -45,6 +47,39 @@ CHEBI_URI_PREFIX = "http://purl.obolibrary.org/obo/CHEBI_" GO_URI_PREFIX = "http://purl.obolibrary.org/obo/GO_" +SMILES_EX = "CC(=O)NC([H])(C)C(=O)O" +SMILES_ENCODED_EX = urllib.parse.quote(SMILES_EX) + + +class TestRecord(unittest.TestCase): + """Tests for the record data 
structure.""" + + def test_w3c_prefix(self) -> None: + """Test CURIE prefix correctness.""" + valid_prefixes = [ + "go", + "GO", + "NCBITaxon", + "ncbi.taxon", + "ncbi_taxon", + "_", + "_secret", + "secret_", + "_secret", + ] + invalid_prefixes = ["", "4dn", "GO:GO:", "GO:"] + examples = [ + *((prefix, True) for prefix in valid_prefixes), + *((prefix, False) for prefix in invalid_prefixes), + ] + for prefix, value in examples: + uri_prefix = f"https://example.com/{prefix}" + r1 = Record(prefix=prefix, uri_prefix=uri_prefix) + r2 = Record(prefix="prefix", prefix_synonyms=[prefix], uri_prefix=uri_prefix) + with self.subTest(prefix=prefix): + self.assertEqual(value, r1.is_w3c_valid()) + self.assertEqual(value, r2.is_w3c_valid()) + class TestStruct(unittest.TestCase): """Test the data structures.""" @@ -173,6 +208,15 @@ def test_reference_constructor(self) -> None: NamedReference.from_reference(r4), ) + def test_quoting(self) -> None: + """Test quoting and unquoting.""" + reference = Reference(prefix="smiles", identifier=SMILES_EX) + self.assertEqual(SMILES_ENCODED_EX, reference.quote().identifier) + self.assertEqual(SMILES_EX, reference.quote().unquote().identifier) + + self.assertEqual(SMILES_ENCODED_EX, reference.pair.quote().identifier) + self.assertEqual(SMILES_EX, reference.pair.quote().unquote().identifier) + class TestAddRecord(unittest.TestCase): """Test adding records.""" @@ -913,6 +957,44 @@ def test_rdflib(self) -> None: converter_2 = Converter.from_rdflib(graph.namespace_manager) self._assert_convert(converter_2) + def test_expand_w3c_invalid(self) -> None: + """Test that expanding a non-w3c-conformant CURIE can lead to errors.""" + value = "CC(=O)NC([H])(C)C(=O)O" + encoded_value = urllib.parse.quote(value) + converter = Converter.from_prefix_map( + { + "smiles": "https://bioregistry.io/smiles:", + } + ) + valids = [ + ("smiles:CCO", "https://bioregistry.io/smiles:CCO"), + (f"smiles:{encoded_value}", f"https://bioregistry.io/smiles:{encoded_value}"), + ] 
+ invalids = [ + (f"smiles:{value}", f"https://bioregistry.io/smiles:{encoded_value}"), + ] + + for valid_curie, valid_expected_uri in valids: + with self.subTest(curie=valid_curie): + self.assertEqual( + valid_expected_uri, converter.expand(valid_curie, w3c_mode="loose") + ) + self.assertEqual( + valid_expected_uri, converter.expand(valid_curie, w3c_mode="strict") + ) + self.assertEqual( + valid_expected_uri, converter.expand(valid_curie, w3c_mode="autocodec") + ) + + for invalid, expected in invalids: + with self.subTest(curie=invalid): + self.assertIsNotNone(converter.expand(invalid, w3c_mode="loose")) + with self.assertRaises(W3CValidationError): + converter.expand(invalid, w3c_mode="strict") + self.assertEqual(expected, converter.expand(invalid, w3c_mode="autocodec")) + + # TODO add autocodec for compression too + def test_parse_curie(self) -> None: """Tests for parse CURIE.""" converter = Converter(