biopragmatics · cthoyt · Sep 5, 2025 · Sep 5, 2025 · Sep 5, 2025 · Sep 5, 2025
diff --git a/docs/source/api.rst b/docs/source/api.rst
@@ -3,3 +3,6 @@ API Reference
 
 .. automodapi:: curies
     :no-heading:
+
+.. automodapi:: curies.dataframe
+    :no-heading:
diff --git a/docs/source/dataframe.rst b/docs/source/dataframe.rst
@@ -0,0 +1,51 @@
+Working with Dataframes
+=======================
+
+Filtering
+---------
+
+In the following examples, we'll use a dataframe representing semantic mappings between
+disease ontologies in the SSSOM format:
+
+============ =============== =============== ============================
+subject_id   predicate_id    object_id       mapping_justification
+============ =============== =============== ============================
+DOID:0080795 skos:exactMatch EFO:0003029     semapv:ManualMappingCuration
+DOID:0080795 skos:exactMatch mesh:D015471    semapv:ManualMappingCuration
+DOID:0080799 skos:exactMatch EFO:1000527     semapv:ManualMappingCuration
+DOID:0080808 skos:exactMatch mesh:D000069295 semapv:ManualMappingCuration
+============ =============== =============== ============================
+
+First, to filter to objects that use EFO, use
+:func:`curies.dataframe.filter_df_by_prefixes`:
+
+.. code-block:: python
+
+    from curies.dataframe import filter_df_by_prefixes
+
+    df = ...
+    df = filter_df_by_prefixes(df, column="object_id", prefix=["EFO"])
+
+============ =============== =========== ============================
+subject_id   predicate_id    object_id   mapping_justification
+============ =============== =========== ============================
+DOID:0080795 skos:exactMatch EFO:0003029 semapv:ManualMappingCuration
+DOID:0080799 skos:exactMatch EFO:1000527 semapv:ManualMappingCuration
+============ =============== =========== ============================
+
+Second, to filter to rows that have the subject ``DOID:0080795``, use
+:func:`curies.dataframe.filter_df_by_curies`:
+
+.. code-block:: python
+
+    from curies.dataframe import filter_df_by_curies
+
+    df = ...
+    df = filter_df_by_curies(df, column="subject_id", curie=["DOID:0080795"])
+
+============ =============== ============ ============================
+subject_id   predicate_id    object_id    mapping_justification
+============ =============== ============ ============================
+DOID:0080795 skos:exactMatch EFO:0003029  semapv:ManualMappingCuration
+DOID:0080795 skos:exactMatch mesh:D015471 semapv:ManualMappingCuration
+============ =============== ============ ============================
diff --git a/docs/source/index.rst b/docs/source/index.rst
@@ -70,3 +70,4 @@ The most recent code and data can be installed directly from GitHub with:
     w3c
     preprocessing
     database
+    dataframe
diff --git a/src/curies/__init__.py b/src/curies/__init__.py
@@ -24,6 +24,7 @@
     write_shacl,
     write_tsv,
 )
+from .dataframe import filter_df_by_curies, filter_df_by_prefixes
 from .discovery import discover, discover_from_rdf
 from .preprocessing import (
     PostprocessingRewrites,
@@ -65,6 +66,8 @@
     "chain",
     "discover",
     "discover_from_rdf",
+    "filter_df_by_curies",
+    "filter_df_by_prefixes",
     "get_bioregistry_converter",
     "get_go_converter",
     "get_monarch_converter",

diff --git a/src/curies/api.py b/src/curies/api.py
@@ -37,6 +37,8 @@
 from pytrie import StringTrie
 from typing_extensions import Self
 
+from .utils import _split
+
 if TYPE_CHECKING:  # pragma: no cover
     import pandas
     import rdflib
@@ -77,13 +79,6 @@ def _get_field_validator_values(values, key: str):  # type:ignore
     return values.data[key]
 
 
-def _split(curie: str, *, sep: str = ":") -> tuple[str, str]:
-    prefix, delimiter, identifier = curie.partition(sep)
-    if not delimiter:
-        raise NoCURIEDelimiterError(curie)
-    return prefix, identifier
-
-
 class ReferenceTuple(NamedTuple):
     """A pair of a prefix (corresponding to a semantic space) and a local unique identifier in that semantic space.
 
@@ -669,17 +664,6 @@ class DuplicateSummary(NamedTuple):
     prefix: str
 
 
-class NoCURIEDelimiterError(ValueError):
-    """An error thrown on a string with no delimiter."""
-
-    def __init__(self, curie: str):
-        """Initialize the error."""
-        self.curie = curie
-
-    def __str__(self) -> str:
-        return f"{self.curie} does not appear to be a CURIE - missing a delimiter"
-
-
 class DuplicateValueError(ValueError):
     """An error raised with constructing a converter with data containing duplicate values."""
 

diff --git a/src/curies/dataframe.py b/src/curies/dataframe.py
@@ -0,0 +1,177 @@
+"""Dataframe operations."""
+
+from __future__ import annotations
+
+from collections import defaultdict
+from collections.abc import Collection
+from typing import TYPE_CHECKING, Callable, Literal
+
+from typing_extensions import TypeAlias
+
+from .utils import _prefix_from_curie
+
+if TYPE_CHECKING:
+    import pandas as pd
+
+    from .api import Converter
+
+__all__ = [
+    "PrefixIndexMethod",
+    "filter_df_by_curies",
+    "filter_df_by_prefixes",
+    "get_df_curies_index",
+    "get_df_prefixes_index",
+    "get_filter_df_by_curies_index",
+    "get_filter_df_by_prefixes_index",
+]
+
+
+def _get_prefix_checker(prefix: str | Collection[str]) -> Callable[[str], bool]:
+    """Get a function that checks if a CURIE starts with a prefix."""
+    if isinstance(prefix, str):
+        prefix_with_colon = prefix + ":"
+
+        def _func(curie: str) -> bool:
+            return curie.startswith(prefix_with_colon)
+
+    else:
+        prefixes_with_colons = {p + ":" for p in prefix}
+
+        def _func(curie: str) -> bool:
+            return any(
+                curie.startswith(prefix_with_colon) for prefix_with_colon in prefixes_with_colons
+            )
+
+    return _func
+
+
+def _get_prefixes_from_curie_column(
+    df: pd.DataFrame, column: int | str, converter: Converter, validate: bool
+) -> pd.Series[str]:
+    return df[column].map(_get_curie_parser(converter=converter, validate=validate))
+
+
+def _get_curie_parser(
+    *, converter: Converter | None = None, validate: bool = False
+) -> Callable[[str], str]:
+    # TODO what if it can't parse?
+    # TODO handle None?
+    # TODO handle invalid CURIEs?
+
+    if not validate:
+        return _prefix_from_curie
+    elif converter is None:
+        raise ValueError("converter is required for validation")
+    else:
+
+        def _func(curie: str) -> str:
+            reference = converter.parse_curie(curie, strict=True)
+            return reference.prefix
+
+    return _func
+
+
+#: The method for filtering on prefixes
+PrefixIndexMethod: TypeAlias = Literal["iterative", "precalculated"]
+
+
+def get_filter_df_by_prefixes_index(
+    df: pd.DataFrame,
+    column: str | int,
+    prefix: str | Collection[str],
+    *,
+    method: PrefixIndexMethod | None = None,
+    converter: Converter | None = None,
+    validate: bool = False,
+) -> pd.Series[bool]:
+    """Get an index of CURIEs in the given column that start with the prefix(es)."""
+    if method == "iterative" or method is None:
+        return df[column].map(_get_prefix_checker(prefix))
+    elif method == "precalculated":
+        if converter is None:  # pragma: no cover
+            raise ValueError("a converter is required for method B")
+        prefix_series = _get_prefixes_from_curie_column(df, column, converter, validate=validate)
+        if isinstance(prefix, str):
+            return prefix_series == prefix
+        else:
+            return prefix_series.isin(prefix)
+    else:  # pragma: no cover
+        raise ValueError(f"invalid method given: {method}")
+
+
+def filter_df_by_prefixes(
+    df: pd.DataFrame,
+    column: str | int,
+    prefix: str | Collection[str],
+    *,
+    method: PrefixIndexMethod | None = None,
+    converter: Converter | None = None,
+) -> pd.DataFrame:
+    """Filter a dataframe based on CURIEs in a given column having a given prefix or set of prefixes.
+
+    :param df: A dataframe
+    :param column: The integer index or column name of a column containing CURIEs
+    :param prefix: The prefix (given as a string) or collection of prefixes (given as a
+        list, set, etc.) to keep
+    :param method: The implementation for getting the prefix index
+    :param converter: A converter
+
+    :returns: A dataframe containing only the rows whose CURIE has a matching prefix.
+    """
+    idx = get_filter_df_by_prefixes_index(
+        df=df, column=column, prefix=prefix, method=method, converter=converter
+    )
+    return df[idx]
+
+
+def get_filter_df_by_curies_index(
+    df: pd.DataFrame,
+    column: str | int,
+    curie: str | Collection[str],
+) -> pd.Series[bool]:
+    """Get an index of CURIEs in the given column that are the given CURIE(s)."""
+    if isinstance(curie, str):
+        return df[column] == curie
+    else:
+        return df[column].isin(set(curie))
+
+
+def get_df_curies_index(df: pd.DataFrame, column: str | int) -> dict[str, list[int]]:
+    """Get a dictionary from CURIEs that appear in the column to the row indexes where they appear."""
+    dd: defaultdict[str, list[int]] = defaultdict(list)
+    for i, curie in enumerate(df[column]):
+        dd[curie].append(i)
+    return dict(dd)
+
+
+def filter_df_by_curies(
+    df: pd.DataFrame,
+    column: str | int,
+    curie: str | Collection[str],
+) -> pd.DataFrame:
+    """Filter a dataframe to the rows whose CURIE in a given column is one of the given CURIE(s).
+
+    :param df: A dataframe
+    :param column: The integer index or column name of a column containing CURIEs
+    :param curie: The CURIE (given as a string) or collection of CURIEs (given as a
+        list, set, etc.) to keep
+
+    :returns: A dataframe containing only the rows whose CURIE matches.
+    """
+    idx = get_filter_df_by_curies_index(df=df, column=column, curie=curie)
+    return df[idx]
+
+
+def get_df_prefixes_index(
+    df: pd.DataFrame,
+    column: str | int,
+    *,
+    converter: Converter | None = None,
+    validate: bool = False,
+) -> dict[str, list[int]]:
+    """Get a dictionary from prefixes that appear in the column to the row indexes where they appear."""
+    dd: defaultdict[str, list[int]] = defaultdict(list)
+    f = _get_curie_parser(converter=converter, validate=validate)
+    for i, prefix in enumerate(df[column].map(f)):
+        dd[prefix].append(i)
+    return dict(dd)
diff --git a/src/curies/utils.py b/src/curies/utils.py
@@ -0,0 +1,32 @@
+"""Utilities for working with strings."""
+
+from __future__ import annotations
+
+__all__ = [
+    "NoCURIEDelimiterError",
+    "_split",
+]
+
+
+class NoCURIEDelimiterError(ValueError):
+    """An error thrown on a string with no delimiter."""
+
+    def __init__(self, curie: str):
+        """Initialize the error."""
+        self.curie = curie
+
+    def __str__(self) -> str:
+        return f"{self.curie} does not appear to be a CURIE - missing a delimiter"
+
+
+def _split(curie: str, *, sep: str = ":") -> tuple[str, str]:
+    """Split a CURIE string using string operations."""
+    prefix, delimiter, identifier = curie.partition(sep)
+    if not delimiter:
+        raise NoCURIEDelimiterError(curie)
+    return prefix, identifier
+
+
+def _prefix_from_curie(curie: str, *, sep: str = ":") -> str:
+    """Split a CURIE string using string operations and return the prefix."""
+    return _split(curie, sep=sep)[0]
diff --git a/tests/test_api.py b/tests/test_api.py
@@ -22,7 +22,6 @@
     ExpansionError,
     NamableReference,
     NamedReference,
-    NoCURIEDelimiterError,
     PrefixStandardizationError,
     Record,
     Records,
@@ -39,6 +38,7 @@
     get_monarch_converter,
     get_obo_converter,
 )
+from curies.utils import NoCURIEDelimiterError
 from curies.version import get_version
 from tests.constants import SLOW