Skip to content
Merged
3 changes: 3 additions & 0 deletions docs/source/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,6 @@ API Reference

.. automodapi:: curies
:no-heading:

.. automodapi:: curies.dataframe
:no-heading:
51 changes: 51 additions & 0 deletions docs/source/dataframe.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
Working with Dataframes
=======================

Filtering
---------

In the following examples, we'll use a dataframe representing semantic mappings between
disease ontologies in the SSSOM format:

============ =============== =============== ============================
subject_id predicate_id object_id mapping_justification
============ =============== =============== ============================
DOID:0080795 skos:exactMatch EFO:0003029 semapv:ManualMappingCuration
DOID:0080795 skos:exactMatch mesh:D015471 semapv:ManualMappingCuration
DOID:0080799 skos:exactMatch EFO:1000527 semapv:ManualMappingCuration
DOID:0080808 skos:exactMatch mesh:D000069295 semapv:ManualMappingCuration
============ =============== =============== ============================

First, to filter to objects that use EFO, use
:func:`curies.dataframe.filter_df_by_prefixes`:

.. code-block:: python

from curies.dataframe import filter_df_by_prefixes

df = ...
df = filter_df_by_prefixes(df, column="object_id", prefix=["EFO"])

============ =============== =========== ============================
subject_id predicate_id object_id mapping_justification
============ =============== =========== ============================
DOID:0080795 skos:exactMatch EFO:0003029 semapv:ManualMappingCuration
DOID:0080799 skos:exactMatch EFO:1000527 semapv:ManualMappingCuration
============ =============== =========== ============================

Second, to filter to rows that have the subject ``DOID:0080795``, use
:func:`curies.dataframe.filter_df_by_curies`:

.. code-block:: python

from curies.dataframe import filter_df_by_curies

df = ...
df = filter_df_by_curies(df, column="subject_id", curie=["DOID:0080795"])

============ =============== ============ ============================
subject_id predicate_id object_id mapping_justification
============ =============== ============ ============================
DOID:0080795 skos:exactMatch EFO:0003029 semapv:ManualMappingCuration
DOID:0080795 skos:exactMatch mesh:D015471 semapv:ManualMappingCuration
============ =============== ============ ============================
1 change: 1 addition & 0 deletions docs/source/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -70,3 +70,4 @@ The most recent code and data can be installed directly from GitHub with:
w3c
preprocessing
database
dataframe
3 changes: 3 additions & 0 deletions src/curies/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
write_shacl,
write_tsv,
)
from .dataframe import filter_df_by_curies, filter_df_by_prefixes
from .discovery import discover, discover_from_rdf
from .preprocessing import (
PostprocessingRewrites,
Expand Down Expand Up @@ -65,6 +66,8 @@
"chain",
"discover",
"discover_from_rdf",
"filter_df_by_curies",
"filter_df_by_prefixes",
"get_bioregistry_converter",
"get_go_converter",
"get_monarch_converter",
Expand Down
20 changes: 2 additions & 18 deletions src/curies/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@
from pytrie import StringTrie
from typing_extensions import Self

from .utils import _split

if TYPE_CHECKING: # pragma: no cover
import pandas
import rdflib
Expand Down Expand Up @@ -77,13 +79,6 @@ def _get_field_validator_values(values, key: str): # type:ignore
return values.data[key]


def _split(curie: str, *, sep: str = ":") -> tuple[str, str]:
    """Split a CURIE string into a prefix and an identifier using string operations.

    :param curie: The string to split
    :param sep: The delimiter between the prefix and the identifier

    :returns: A pair of the text before the first delimiter and the text after it

    :raises NoCURIEDelimiterError: If the delimiter does not appear in the string
    """
    # ``str.partition`` splits on the first occurrence only, so any later
    # delimiter stays inside the identifier
    prefix, delimiter, identifier = curie.partition(sep)
    # the middle element is the empty string when the delimiter was not found
    if not delimiter:
        raise NoCURIEDelimiterError(curie)
    return prefix, identifier


class ReferenceTuple(NamedTuple):
"""A pair of a prefix (corresponding to a semantic space) and a local unique identifier in that semantic space.

Expand Down Expand Up @@ -669,17 +664,6 @@ class DuplicateSummary(NamedTuple):
prefix: str


class NoCURIEDelimiterError(ValueError):
    """An error thrown on a string with no delimiter."""

    def __init__(self, curie: str):
        """Initialize the error.

        :param curie: The string in which no delimiter was found
        """
        # kept on the instance so the message can be built in ``__str__``
        self.curie = curie

    def __str__(self) -> str:
        """Get a message naming the string that is missing a delimiter."""
        return f"{self.curie} does not appear to be a CURIE - missing a delimiter"


class DuplicateValueError(ValueError):
"""An error raised with constructing a converter with data containing duplicate values."""

Expand Down
177 changes: 177 additions & 0 deletions src/curies/dataframe.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,177 @@
"""Dataframe operations."""

from __future__ import annotations

from collections import defaultdict
from collections.abc import Collection
from typing import TYPE_CHECKING, Callable, Literal

from typing_extensions import TypeAlias

from .utils import _prefix_from_curie

if TYPE_CHECKING:
import pandas as pd

from .api import Converter

__all__ = [
"PrefixIndexMethod",
"filter_df_by_curies",
"filter_df_by_prefixes",
"get_df_curies_index",
"get_df_prefixes_index",
"get_filter_df_by_curies_index",
"get_filter_df_by_prefixes_index",
]


def _get_prefix_checker(prefix: str | Collection[str]) -> Callable[[str], bool]:
    """Get a function that checks if a CURIE starts with a prefix.

    :param prefix: A single prefix (given as a string) or a collection of prefixes.
        A colon is appended to each one, so ``EFO`` matches ``EFO:0003029`` but not
        ``EFOX:1``.

    :returns: A predicate that takes a CURIE string and returns true if it starts with
        the given prefix (or any of the given prefixes) followed by a colon. The
        comparison is case-sensitive, and an empty collection of prefixes gives a
        predicate that is always false.
    """
    # ``str.startswith`` accepts either one string or a tuple of strings, so both
    # input shapes reduce to a single call instead of a Python-level ``any`` loop
    needle: str | tuple[str, ...]
    if isinstance(prefix, str):
        needle = prefix + ":"
    else:
        # go through a set first to drop duplicate prefixes
        needle = tuple({p + ":" for p in prefix})

    def _func(curie: str) -> bool:
        return curie.startswith(needle)

    return _func


def _get_prefixes_from_curie_column(
    df: pd.DataFrame, column: int | str, converter: Converter, validate: bool
) -> pd.Series[str]:
    """Get a series with the prefix of each CURIE in the given column.

    :param df: A dataframe
    :param column: The integer index or column name of a column containing CURIEs
    :param converter: A converter, passed on to the CURIE parser
    :param validate: Passed on to the CURIE parser to choose how prefixes are extracted

    :returns: The result of mapping the CURIE parser over the column
    """
    curies = df[column]
    parse_prefix = _get_curie_parser(converter=converter, validate=validate)
    return curies.map(parse_prefix)


def _get_curie_parser(
    *, converter: Converter | None = None, validate: bool = False
) -> Callable[[str], str]:
    """Get a function that takes a CURIE string and returns its prefix.

    :param converter: A converter, only used when ``validate`` is true
    :param validate: If false, the prefix is extracted with plain string operations.
        If true, each CURIE is parsed strictly with the converter.

    :returns: A function from a CURIE string to its prefix

    :raises ValueError: If ``validate`` is true and no converter was given
    """
    # TODO what if it can't parse?
    # TODO handle None?
    # TODO handle invalid CURIEs?

    if not validate:
        # no validation requested, so the string-based splitter is enough
        return _prefix_from_curie

    if converter is None:
        raise ValueError("converter is required for validation")

    def _func(curie: str) -> str:
        return converter.parse_curie(curie, strict=True).prefix

    return _func


#: The method for filtering on prefixes. With ``iterative``, each CURIE in the column
#: is checked for whether it starts with (one of) the prefix(es). With
#: ``precalculated``, the prefix of each CURIE in the column is extracted first, then
#: compared against the given prefix(es).
PrefixIndexMethod: TypeAlias = Literal["iterative", "precalculated"]


def get_filter_df_by_prefixes_index(
    df: pd.DataFrame,
    column: str | int,
    prefix: str | Collection[str],
    *,
    method: PrefixIndexMethod | None = None,
    converter: Converter | None = None,
    validate: bool = False,
) -> pd.Series[bool]:
    """Get an index of CURIEs in the given column that start with the prefix(es).

    :param df: A dataframe
    :param column: The integer index or column name of a column containing CURIEs
    :param prefix: The prefix (given as a string) or collection of prefixes (given as a
        list, set, etc.) to look for
    :param method: The implementation for getting the prefix index. If not given,
        ``iterative`` is used.
    :param converter: A converter, required when ``method`` is ``precalculated``
    :param validate: Only used when ``method`` is ``precalculated``. If true, each CURIE
        is parsed strictly with the converter instead of being split as a string.

    :returns: A boolean series with one entry per row that is true where the CURIE in
        the column has (one of) the given prefix(es)

    :raises ValueError: If ``method`` is ``precalculated`` and no converter is given, or
        if ``method`` is not a recognized value
    """
    if method is None or method == "iterative":
        # ``map`` over an empty column does not infer a boolean dtype, so cast
        # explicitly to make the result always usable as a boolean mask
        return df[column].map(_get_prefix_checker(prefix)).astype(bool)

    if method == "precalculated":
        if converter is None:  # pragma: no cover
            raise ValueError("a converter is required for the 'precalculated' method")
        prefix_series = _get_prefixes_from_curie_column(df, column, converter, validate=validate)
        if isinstance(prefix, str):
            return prefix_series == prefix
        return prefix_series.isin(prefix)

    raise ValueError(f"invalid method given: {method}")  # pragma: no cover


def filter_df_by_prefixes(
    df: pd.DataFrame,
    column: str | int,
    prefix: str | Collection[str],
    *,
    method: PrefixIndexMethod | None = None,
    converter: Converter | None = None,
    validate: bool = False,
) -> pd.DataFrame:
    """Filter a dataframe based on CURIEs in a given column having a given prefix or set of prefixes.

    :param df: A dataframe
    :param column: The integer index or column name of a column containing CURIEs
    :param prefix: The prefix (given as a string) or collection of prefixes (given as a
        list, set, etc.) to keep
    :param method: The implementation for getting the prefix index
    :param converter: A converter
    :param validate: Passed through to :func:`get_filter_df_by_prefixes_index` together
        with ``method`` and ``converter``

    :returns: The result of indexing the dataframe with the boolean prefix index, i.e.,
        the rows whose CURIE in the column has (one of) the given prefix(es). There is
        no in-place mode.
    """
    idx = get_filter_df_by_prefixes_index(
        df=df,
        column=column,
        prefix=prefix,
        method=method,
        converter=converter,
        validate=validate,
    )
    return df[idx]


def get_filter_df_by_curies_index(
    df: pd.DataFrame,
    column: str | int,
    curie: str | Collection[str],
) -> pd.Series[bool]:
    """Get an index of CURIEs in the given column that are the given CURIE(s).

    :param df: A dataframe
    :param column: The integer index or column name of a column containing CURIEs
    :param curie: The CURIE (given as a string) or collection of CURIEs (given as a
        list, set, etc.) to look for

    :returns: A boolean series with one entry per row that is true where the value in
        the column equals the given CURIE or is one of the given CURIEs
    """
    values = df[column]
    if not isinstance(curie, str):
        return values.isin(set(curie))
    return values == curie


def get_df_curies_index(df: pd.DataFrame, column: str | int) -> dict[str, list[int]]:
    """Get a dictionary from CURIEs that appear in the column to the rows where they appear.

    :param df: A dataframe
    :param column: The integer index or column name of a column containing CURIEs

    :returns: A dictionary whose keys are the values of the column and whose values are
        lists of the zero-based positions (counted from the top of the column, not the
        dataframe's index labels) at which each one appears
    """
    positions: dict[str, list[int]] = {}
    for position, value in enumerate(df[column]):
        positions.setdefault(value, []).append(position)
    return positions


def filter_df_by_curies(
    df: pd.DataFrame,
    column: str | int,
    curie: str | Collection[str],
) -> pd.DataFrame:
    """Filter a dataframe based on the CURIEs in a given column being a given CURIE or set of CURIEs.

    :param df: A dataframe
    :param column: The integer index or column name of a column containing CURIEs
    :param curie: The CURIE (given as a string) or collection of CURIEs (given as a
        list, set, etc.) to keep

    :returns: The result of indexing the dataframe with the boolean CURIE index, i.e.,
        the rows whose value in the column is (one of) the given CURIE(s)
    """
    mask = get_filter_df_by_curies_index(df, column, curie)
    return df[mask]


def get_df_prefixes_index(
    df: pd.DataFrame,
    column: str | int,
    *,
    converter: Converter | None = None,
    validate: bool = False,
) -> dict[str, list[int]]:
    """Get a dictionary from prefixes that appear in the column to the rows where they appear.

    :param df: A dataframe
    :param column: The integer index or column name of a column containing CURIEs
    :param converter: A converter, passed on to the CURIE parser
    :param validate: Passed on to the CURIE parser to choose how prefixes are extracted

    :returns: A dictionary whose keys are prefixes and whose values are lists of the
        zero-based positions (counted from the top of the column) of the CURIEs
        having that prefix
    """
    parse_prefix = _get_curie_parser(converter=converter, validate=validate)
    positions: dict[str, list[int]] = {}
    for position, prefix in enumerate(df[column].map(parse_prefix)):
        positions.setdefault(prefix, []).append(position)
    return positions
32 changes: 32 additions & 0 deletions src/curies/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
"""Utilities for working with strings."""

from __future__ import annotations

__all__ = [
"NoCURIEDelimiterError",
"_split",
]


class NoCURIEDelimiterError(ValueError):
    """An error raised for a string that cannot be a CURIE because it has no delimiter."""

    def __init__(self, curie: str) -> None:
        """Store the offending string.

        :param curie: The string in which no delimiter was found
        """
        self.curie = curie

    def __str__(self) -> str:
        """Get a message naming the string that is missing a delimiter."""
        return f"{self.curie} does not appear to be a CURIE - missing a delimiter"


def _split(curie: str, *, sep: str = ":") -> tuple[str, str]:
    """Split a CURIE string into a prefix and an identifier using string operations.

    :param curie: The string to split
    :param sep: The delimiter between the prefix and the identifier

    :returns: A pair of the text before the first delimiter and the text after it

    :raises NoCURIEDelimiterError: If the delimiter does not appear in the string
    """
    # only the first delimiter counts; later ones stay inside the identifier
    head, found, tail = curie.partition(sep)
    if found:
        return head, tail
    raise NoCURIEDelimiterError(curie)


def _prefix_from_curie(curie: str, *, sep: str = ":") -> str:
    """Get the prefix of a CURIE string using string operations.

    :param curie: The string to split
    :param sep: The delimiter between the prefix and the identifier

    :returns: The first element of the pair returned by ``_split``
    """
    prefix, _identifier = _split(curie, sep=sep)
    return prefix
2 changes: 1 addition & 1 deletion tests/test_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@
ExpansionError,
NamableReference,
NamedReference,
NoCURIEDelimiterError,
PrefixStandardizationError,
Record,
Records,
Expand All @@ -39,6 +38,7 @@
get_monarch_converter,
get_obo_converter,
)
from curies.utils import NoCURIEDelimiterError
from curies.version import get_version
from tests.constants import SLOW

Expand Down
Loading
Loading