biopragmatics · cthoyt · Feb 28, 2024 · Feb 28, 2024 · Mar 9, 2024 · Mar 11, 2024
diff --git a/docs/source/w3c.rst b/docs/source/w3c.rst
@@ -5,3 +5,123 @@ W3C Validation
     :no-inheritance-diagram:
     :no-heading:
     :include-all-objects:
+
+Opting in to W3C Validation with a :class:`curies.Converter`
+------------------------------------------------------------
+
+In practice, some usages do not conform to these standards, often due to encoding things
+that aren't *really* supposed to be CURIEs, such as `SMILES strings <https://en.wikipedia.org/wiki/Simplified_Molecular_Input_Line_Entry_System>`_ for molecules,
+`UCUM codes <https://ucum.org/>`_ for units, or other language-like "identifiers".
+
+Therefore, it's on the roadmap for the ``curies`` package to support operations for
+validating against the W3C standards and mapping between "loose" (i.e., un-URL-encoded)
+and strict (i.e., URL-encoded) CURIEs and IRIs. In practice, this will often solve
+issues with special characters like square brackets (``[`` and ``]``).
+
+.. code-block::
+
+    looseCURIE <-> strictCURIE
+         ^.    \./.    ^
+         |      X      |
+         v     / \.    v
+     looseURI  <->  strictURI
+
+A first step towards accomplishing this was implemented in
+https://github.com/biopragmatics/curies/pull/104 by adding a ``w3c_mode`` argument
+(one of ``"loose"``, ``"strict"``, or ``"autocodec"``) to both the initialization of a
+:class:`curies.Converter` and the :meth:`curies.Converter.expand` function.
+
+Here's an example of using W3C validation during expansion:
+
+.. code-block::
+
+    import curies
+
+    converter = curies.Converter.from_prefix_map({
+        "smiles": "https://bioregistry.io/smiles:",
+    })
+
+    >>> converter.expand("smiles:CC(=O)NC([H])(C)C(=O)O")
+    'https://bioregistry.io/smiles:CC(=O)NC([H])(C)C(=O)O'
+
+    >>> converter.expand("smiles:CC(=O)NC([H])(C)C(=O)O", w3c_mode="strict")
+    Traceback (most recent call last):
+      File "<stdin>", line 1, in <module>
+      File "/Users/cthoyt/dev/curies/src/curies/api.py", line 1362, in expand
+        raise W3CValidationError(f"CURIE is not valid under W3C spec: {curie}")
+    curies.api.W3CValidationError: CURIE is not valid under W3C spec: smiles:CC(=O)NC([H])(C)C(=O)O
+
+This can also be used to extend :meth:`curies.Converter.is_curie`:
+
+.. code-block::
+
+    import curies
+
+    converter = curies.Converter.from_prefix_map({
+        "smiles": "https://bioregistry.io/smiles:",
+    })
+
+    >>> converter.is_curie("smiles:CC(=O)NC([H])(C)C(=O)O")
+    True
+    >>> converter.is_curie("smiles:CC(=O)NC([H])(C)C(=O)O", w3c_mode="strict")
+    False
+
+Finally, this can be used during instantiation of a converter:
+
+.. code-block::
+
+    import curies
+
+    >>> curies.Converter.from_prefix_map(
+    ...     {"4dn.biosource": "https://data.4dnucleome.org/biosources/"},
+    ...     w3c_mode="strict",
+    ... )
+    Traceback (most recent call last):
+      File "<stdin>", line 1, in <module>
+      File "/Users/cthoyt/dev/curies/src/curies/api.py", line 816, in from_prefix_map
+        return cls(
+               ^^^^
+      File "/Users/cthoyt/dev/curies/src/curies/api.py", line 527, in __init__
+        raise W3CValidationError(f"Records not conforming to W3C:\n\n{msg}")
+    curies.api.W3CValidationError: Records not conforming to W3C:
+
+      - Record(prefix='4dn.biosource', uri_prefix='https://data.4dnucleome.org/biosources/', prefix_synonyms=[], uri_prefix_synonyms=[], pattern=None)
+
+.. seealso::
+
+    1. Discussion on the ``curies`` issue tracker about handling CURIEs that include
+       e.g. square brackets and therefore don't conform to the W3C specification:
+       https://github.com/biopragmatics/curies/issues/103
+    2. Discussion on languages that shouldn't really get encoded in CURIEs, but still
+       do: https://github.com/biopragmatics/bioregistry/issues/460
+    3. Related to (2) - discussion on how to properly encode UCUM in CURIEs:
+       https://github.com/biopragmatics/bioregistry/issues/648
diff --git a/src/curies/__init__.py b/src/curies/__init__.py
@@ -13,6 +13,7 @@
     Records,
     Reference,
     ReferenceTuple,
+    W3CValidationError,
     chain,
     load_extended_prefix_map,
     load_jsonld_context,
@@ -62,6 +63,7 @@
     "Reference",
     "ReferenceTuple",
     "Triple",
+    "W3CValidationError",
     "chain",
     "discover",
     "discover_from_rdf",

diff --git a/src/curies/api.py b/src/curies/api.py
@@ -6,6 +6,7 @@
 import itertools as itt
 import json
 import logging
+import urllib.parse
 import warnings
 from collections import defaultdict
 from collections.abc import Collection, Iterable, Iterator, Mapping, Sequence
@@ -35,7 +36,9 @@
 )
 from pydantic_core import core_schema
 from pytrie import StringTrie
-from typing_extensions import Self
+from typing_extensions import Self, TypeAlias
+
+from .w3c import is_w3c_curie, is_w3c_prefix
 
 if TYPE_CHECKING:  # pragma: no cover
     import pandas
@@ -54,6 +57,7 @@
     "Records",
     "Reference",
     "ReferenceTuple",
+    "W3CValidationError",
     "chain",
     "load_extended_prefix_map",
     "load_jsonld_context",
@@ -147,6 +151,14 @@ def curie(self) -> str:
         """
         return f"{self.prefix}:{self.identifier}"
 
+    def quote(self) -> Self:
+        """Get a new tuple with the identifier URL-quoted.
+
+        The prefix is carried over unchanged; only the identifier is passed
+        through :func:`urllib.parse.quote`.
+
+        :returns: A new instance of the same class with the quoted identifier.
+        """
+        return self.__class__(prefix=self.prefix, identifier=urllib.parse.quote(self.identifier))
+
+    def unquote(self) -> Self:
+        """Get a new tuple with the identifier URL-unquoted.
+
+        The prefix is carried over unchanged; only the identifier is passed
+        through :func:`urllib.parse.unquote`.
+
+        :returns: A new instance of the same class with the unquoted identifier.
+        """
+        return self.__class__(prefix=self.prefix, identifier=urllib.parse.unquote(self.identifier))
+
     @classmethod
     def from_curie(cls, curie: str, *, sep: str = ":") -> Self:
         """Parse a CURIE string and populate a reference tuple.
@@ -445,6 +457,14 @@ def curie(self) -> str:
         """
         return f"{self.prefix}:{self.identifier}"
 
+    def quote(self) -> Self:
+        """Get a new model with the identifier URL-quoted.
+
+        The copy is made with ``model_copy``, replacing only the ``identifier``
+        field with the result of :func:`urllib.parse.quote`.
+
+        :returns: A copy of this model with the quoted identifier.
+        """
+        return self.model_copy(update={"identifier": urllib.parse.quote(self.identifier)})
+
+    def unquote(self) -> Self:
+        """Get a new model with the identifier URL-unquoted.
+
+        The copy is made with ``model_copy``, replacing only the ``identifier``
+        field with the result of :func:`urllib.parse.unquote`.
+
+        :returns: A copy of this model with the unquoted identifier.
+        """
+        return self.model_copy(update={"identifier": urllib.parse.unquote(self.identifier)})
+
     @property
     def pair(self) -> ReferenceTuple:
         """Get the reference as a 2-tuple of prefix and identifier."""
@@ -650,6 +670,12 @@ def _key(self) -> RecordKey:
             ",".join(sorted(self.uri_prefix_synonyms)),
         )
 
+    def is_w3c_valid(self) -> bool:
+        """Check if all prefixes in this record are W3C compliant.
+
+        Each prefix is tested with :func:`curies.w3c.is_w3c_prefix`.
+
+        :returns: True if every CURIE prefix in ``self._all_prefixes`` passes the check.
+            URI prefixes are not checked.
+        """
+        all_curie_prefixes_valid = all(is_w3c_prefix(prefix) for prefix in self._all_prefixes)
+        # TODO extend to check URI prefixes?
+        return all_curie_prefixes_valid
+
 
 # An explanation of RootModels in Pydantic V2 can be found on
 # https://docs.pydantic.dev/latest/concepts/models/#rootmodel-and-custom-root-types
@@ -740,6 +766,10 @@ class URIStandardizationError(StandardizationError):
     """An error raise when a URI can't be standardized."""
 
 
+class W3CValidationError(ValueError):
+    """An error raised when W3C validation fails.
+
+    This subclasses :class:`ValueError`, so handlers that catch ``ValueError``
+    (such as the one in :meth:`Converter.is_curie`) also catch this error.
+    """
+
+
 def _get_duplicate_uri_prefixes(records: list[Record]) -> list[DuplicateSummary]:
     return [
         DuplicateSummary(record_1, record_2, uri_prefix)
@@ -810,6 +840,9 @@ def _get_remote_json(url: str, timeout: int = 15) -> Any:
     return json.loads(json_str_data)
 
 
+#: The mode for W3C validation. In :class:`Converter`, "strict" validates records on
+#: instantiation and CURIEs on expansion, "autocodec" validates records on instantiation
+#: and URL-quotes identifiers on expansion, and "loose" (the default) does neither.
+W3CMode: TypeAlias = Literal["loose", "strict", "autocodec"]
+
+
 class Converter:
     """A cached prefix map data structure.
 
@@ -836,6 +869,20 @@ class Converter:
 
         # Example with missing prefix:
         >>> converter.expand("missing:0000000")
+
+
+    If you want to ensure that all prefixes conform to the W3C specifications,
+    pass ``w3c_mode="strict"`` like in:
+
+    >>> converter = Converter.from_prefix_map(
+    ...     {
+    ...         "4dn": "http://purl.obolibrary.org/obo/CHEBI_",
+    ...     },
+    ...     w3c_mode="strict",
+    ... )
+
+    This will raise a :class:`W3CValidationError` because the prefix ``4dn`` starts
+    with a digit and is therefore not a valid NCName. See more information at
+    :mod:`curies.w3c`.
     """
 
     #: The expansion dictionary with prefixes as keys and priority URI prefixes as values
@@ -850,7 +897,12 @@ class Converter:
     pattern_map: dict[str, str]
 
     def __init__(
-        self, records: Iterable[Record], *, delimiter: str = ":", strict: bool = True
+        self,
+        records: Iterable[Record],
+        *,
+        delimiter: str = ":",
+        strict: bool = True,
+        w3c_mode: W3CMode = "loose",
     ) -> None:
         """Instantiate a converter.
 
@@ -860,8 +912,17 @@ def __init__(
             If true, raises issues on duplicate URI prefixes
         :param delimiter:
             The delimiter used for CURIEs. Defaults to a colon.
+        :param w3c_mode:
+            If "strict", validate all records against the
+            `W3C CURIE Syntax 1.0 <https://www.w3.org/TR/2010/NOTE-curie-20101216/>`_.
+            This includes the following:
+
+            1. Checking CURIE prefixes and CURIE prefix synonyms against the
+               W3C definition for `NCName <https://www.w3.org/TR/1999/REC-xml-names-19990114/#NT-NCName>`_
+
         :raises DuplicatePrefixes: if any records share any synonyms
         :raises DuplicateURIPrefixes: if any records share any URI prefixes
+        :raises W3CValidationError: If w3c validation is on and there are non-conformant records
         """
         records = sorted(records, key=lambda r: r.prefix)
         if strict:
@@ -872,6 +933,12 @@ def __init__(
             if duplicate_prefixes:
                 raise DuplicatePrefixes(duplicate_prefixes)
 
+        if w3c_mode in {"strict", "autocodec"}:
+            broken = [record for record in records if not record.is_w3c_valid()]
+            if broken:
+                msg = "\n".join(f"  - {record!r}" for record in records)
+                raise W3CValidationError(f"Records not conforming to W3C:\n\n{msg}")
+
         self.delimiter = delimiter
         self.records = records
         self.prefix_map = _get_prefix_map(records)
@@ -1656,10 +1723,12 @@ def parse_uri(
         else:
             return ReferenceTuple(prefix, uri[len(value) :])
 
-    def is_curie(self, s: str) -> bool:
+    def is_curie(self, s: str, *, w3c_mode: W3CMode = "loose") -> bool:
         """Check if the string can be parsed as a CURIE by this converter.
 
         :param s: A string that might be a CURIE
+        :param w3c_mode: If "strict", requires CURIEs to be valid against the
+            `W3C CURIE specification <https://www.w3.org/TR/2010/NOTE-curie-20101216/>`_.
         :returns: If the string can be parsed as a CURIE by this converter.
             Note that some valid CURIEs, when passed to this function, will
             result in False if their prefixes are not registered with this
@@ -1680,7 +1749,7 @@ def is_curie(self, s: str) -> bool:
         False
         """
         try:
-            return self.expand(s) is not None
+            return self.expand(s, w3c_mode=w3c_mode) is not None
         except ValueError:
             return False
 
@@ -1767,34 +1836,60 @@ def expand_strict(self, curie: str) -> str:
     # docstr-coverage:excused `overload`
     @overload
     def expand(
-        self, curie: str, *, strict: Literal[True] = True, passthrough: bool = ...
+        self,
+        curie: str,
+        *,
+        strict: Literal[True] = True,
+        passthrough: bool = ...,
+        w3c_mode: W3CMode = ...,
     ) -> str: ...
 
     # docstr-coverage:excused `overload`
     @overload
     def expand(
-        self, curie: str, *, strict: Literal[False] = False, passthrough: Literal[True] = True
+        self,
+        curie: str,
+        *,
+        strict: Literal[False] = False,
+        passthrough: Literal[True] = True,
+        w3c_mode: W3CMode = ...,
     ) -> str: ...
 
     # docstr-coverage:excused `overload`
     @overload
     def expand(
-        self, curie: str, *, strict: Literal[False] = False, passthrough: Literal[False] = False
+        self,
+        curie: str,
+        *,
+        strict: Literal[False] = False,
+        passthrough: Literal[False] = False,
+        w3c_mode: W3CMode = ...,
     ) -> str | None: ...
 
-    def expand(self, curie: str, *, strict: bool = False, passthrough: bool = False) -> str | None:
+    def expand(
+        self,
+        curie: str,
+        *,
+        strict: bool = False,
+        passthrough: bool = False,
+        w3c_mode: W3CMode = "loose",
+    ) -> str | None:
         """Expand a CURIE to a URI, if possible.
 
         :param curie:
             A string representing a compact URI (CURIE)
         :param strict: If true and the CURIE can't be expanded, returns an error. Defaults to false.
         :param passthrough: If true, strict is false, and the CURIE can't be expanded, return the input.
-            Defaults to false. If your strings can either be a CURIE _or_ a URI, consider using
+            Defaults to false. If your strings can either be a CURIE or a URI, consider using
             :meth:`Converter.expand_or_standardize` instead.
+        :param w3c_mode: If "strict", requires CURIEs to be valid against the
+            `W3C CURIE specification <https://www.w3.org/TR/2010/NOTE-curie-20101216/>`_.
+            If "autocodec", the parsed identifier is URL-quoted before expansion.
+            Defaults to "loose", which does neither.
         :returns:
             A URI if this converter contains a URI prefix for the prefix in this CURIE
         :raises ExpansionError:
             If strict is true and the CURIE can't be expanded
+        :raises W3CValidationError:
+            If W3C validation is turned on and the CURIE is not valid under the CURIE specification
 
         >>> from curies import Converter
         >>> converter = Converter.from_prefix_map(
@@ -1808,8 +1903,12 @@ def expand(self, curie: str, *, strict: bool = False, passthrough: bool = False)
         'http://purl.obolibrary.org/obo/CHEBI_138488'
         >>> converter.expand("missing:0000000")
         """
+        if w3c_mode == "strict" and not is_w3c_curie(curie):
+            raise W3CValidationError(f"CURIE is not valid under W3C spec: {curie}")
         reference = self.parse_curie(curie, strict=False)
         if reference is not None:
+            if w3c_mode == "autocodec":
+                reference = reference.quote()
             return self.expand_reference(reference, strict=strict, passthrough=passthrough)  # type:ignore[no-any-return,call-overload]
         if strict:
             raise ExpansionError(curie)