From 1e2b59ec9389839f0aaf0e7773f7693f6487b386 Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Fri, 5 Sep 2025 11:31:24 +0200 Subject: [PATCH 01/19] Add dataframe utilities --- src/curies/dataframe.py | 132 ++++++++++++++++++++++++++++++++++++++++ tests/test_dataframe.py | 38 ++++++++++++ 2 files changed, 170 insertions(+) create mode 100644 src/curies/dataframe.py create mode 100644 tests/test_dataframe.py diff --git a/src/curies/dataframe.py b/src/curies/dataframe.py new file mode 100644 index 00000000..57f2661e --- /dev/null +++ b/src/curies/dataframe.py @@ -0,0 +1,132 @@ +"""Dataframe operations.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, overload, Literal, Callable, Collection, Iterable +import itertools as itt +from curies import Converter + +if TYPE_CHECKING: + import pandas as pd + import sssom + +__all__ = [ + "get_prefix_index", + "filter_df_prefix", +] + + +def _get_prefix_checker(prefix: str | Collection[str]) -> Callable[[str], bool]: + """Get a function that checks if a CURIE starts with a prefix.""" + if isinstance(prefix, str): + pp = prefix + ":" + + def func(x: str) -> bool: + return x.startswith(pp) + + else: + strings = {p + ":" for p in prefix} + + def func(x: str) -> bool: + return any(x.startswith(pp) for pp in strings) + + return func + + +def _get_prefixes_from_curie_column(df: pd.DataFrame, column: int | str, converter: Converter) -> pd.Series: + raise NotImplementedError + + +def get_prefix_index( + df: pd.DataFrame, column: str | int, prefix: str | Collection[str], *, method: Literal['a', 'b'] = 'a', + converter: Converter | None = None, +) -> pd.Series: + if method == "a": + return df[column].map(_get_prefix_checker(prefix)) + elif method == "b": + if converter is None: + raise ValueError("a converter is required for method B") + prefix_series = _get_prefixes_from_curie_column(df, column, converter) + if isinstance(prefix, str): + return prefix_series == prefix + else: + return 
prefix_series.isin(prefix) + else: + raise ValueError + + +@overload +def filter_df_prefix(df: pd.DataFrame, prefix: str, *, inplace: Literal[True] = ...) -> None: ... + + +@overload +def filter_df_prefix(df: pd.DataFrame, prefix: str, *, inplace: Literal[False] = ...) -> pd.DataFrame: ... + + +def filter_df_prefix(df: pd.DataFrame, prefix: str, *, inplace: bool = False) -> pd.DataFrame | None: + """""" + if inplace: + return _filter_df_prefix_inplace(df, prefix) + else: + return _filter_df_prefix(df, prefix) + + +def _filter_df_prefix_inplace(df: pd.DataFrame, prefix: str) -> None: + raise NotImplementedError + + +def _filter_df_prefix(df: pd.DataFrame, column: str | int, prefix: str) -> pd.DataFrame: + pp = prefix + ":" + idx = df[column].map(lambda value: value.startswith(pp)) + + +# this is split out from SSSOM +def split_dataframe_by_prefix( + df: pd.DataFrame, + subject_prefixes: Collection[str], + predicates: Collection[str], + object_prefixes: Collection[str], +) -> dict[tuple[str, str, str], pd.DataFrame]: + rv = {} + s_indexes = { + subject_prefix: get_prefix_index(df, column="subject_id", prefix=subject_prefix) + for subject_prefix in subject_prefixes + } + p_indexes = { + predicate: df["predicate_id"] == predicate + for predicate in predicates + } + o_indexes = { + object_prefix: get_prefix_index(df, column="object_id", prefix=object_prefix) + for object_prefix in object_prefixes + } + for subject_prefix, predicate, object_prefix in itt.product( + subject_prefixes, predicates, object_prefixes + ): + idx = s_indexes[subject_prefix] & p_indexes[predicate] & o_indexes[object_prefix] + if not idx.any(): + continue + rv[subject_prefix, predicate, object_prefix] = df[idx] + + return rv + + +def split_msdf_by_prefix( + msdf: sssom.MappingSetDataFrame, + subject_prefixes: Collection[str], + predicates: Collection[str], + object_prefixes: Collection[str], +) -> dict[str, sssom.MappingSetDataFrame]: + from sssom.io import from_sssom_dataframe + rr = 
split_dataframe_by_prefix(msdf.df, subject_prefixes, predicates, object_prefixes) + rv = {} + for (subject_prefix, predicate, object_prefix), df in rr.items(): + c = msdf.converter.parse_curie(predicate, strict=True) + subconverter = msdf.converter.get_subconverter( + [subject_prefix, c.prefix, object_prefix] + ) + split = f"{subject_prefix.lower()}_{predicate.lower()}_{object_prefix.lower()}" + rv[split] = from_sssom_dataframe( + df, prefix_map=dict(subconverter.bimap), meta=msdf.meta + ) + return rv diff --git a/tests/test_dataframe.py b/tests/test_dataframe.py new file mode 100644 index 00000000..b8480fbe --- /dev/null +++ b/tests/test_dataframe.py @@ -0,0 +1,38 @@ +"""Test dataframe utilities.""" + +import unittest + +from curies import Converter +from curies.dataframe import get_prefix_index +import pandas as pd + +CONVERTER = Converter.from_prefix_map({ + "a": "https://example.org/a/", + "b": "https://example.org/b/", +}) + + +class TestDataframe(unittest.TestCase): + """A test case for dataframe utilities.""" + + def test_get_prefix_index(self) -> None: + """Test getting a prefix index.""" + curies = [ + *(f"a:{i}" for i in range(5)), + *(f"b:{i}" for i in range(5)), + *(f"c:{i}" for i in range(5)), + ] + rows = [(curie,) for curie in curies] + df = pd.DataFrame(rows, columns=["curie"]) + + for method in ["a", "b"]: + with self.subTest(method=method): + idx = get_prefix_index(df, "curie", "a", method=method, converter=CONVERTER) + self.assertEqual([0, 1, 2, 3, 4], _rr(idx)) + + idx = get_prefix_index(df, "curie", ["a", "b"], method=method, converter=CONVERTER) + self.assertEqual([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], _rr(idx)) + + +def _rr(series: pd.Series) -> list[int]: + return [index for index, value in enumerate(series) if value] From b61fde01feb497708fb7c2800148868fabbfbce2 Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Fri, 5 Sep 2025 11:47:47 +0200 Subject: [PATCH 02/19] Pass rc --- src/curies/dataframe.py | 137 
++++++++++++++++++++++------------------ tests/test_dataframe.py | 19 ++++-- 2 files changed, 86 insertions(+), 70 deletions(-) diff --git a/src/curies/dataframe.py b/src/curies/dataframe.py index 57f2661e..e2105ba0 100644 --- a/src/curies/dataframe.py +++ b/src/curies/dataframe.py @@ -2,45 +2,66 @@ from __future__ import annotations -from typing import TYPE_CHECKING, overload, Literal, Callable, Collection, Iterable import itertools as itt -from curies import Converter +from collections.abc import Collection, Iterable +from typing import TYPE_CHECKING, Callable, Literal, TypeAlias + +from curies.api import Converter, _split if TYPE_CHECKING: import pandas as pd import sssom __all__ = [ - "get_prefix_index", "filter_df_prefix", + "get_prefix_index", ] def _get_prefix_checker(prefix: str | Collection[str]) -> Callable[[str], bool]: """Get a function that checks if a CURIE starts with a prefix.""" if isinstance(prefix, str): - pp = prefix + ":" + prefix_with_colon = prefix + ":" - def func(x: str) -> bool: - return x.startswith(pp) + def _func(curie: str) -> bool: + return curie.startswith(prefix_with_colon) else: - strings = {p + ":" for p in prefix} + prefixes_with_colons = {p + ":" for p in prefix} + + def _func(curie: str) -> bool: + return any( + curie.startswith(prefix_with_colon) for prefix_with_colon in prefixes_with_colons + ) + + return _func + - def func(x: str) -> bool: - return any(x.startswith(pp) for pp in strings) +def _get_prefixes_from_curie_column( + df: pd.DataFrame, column: int | str, converter: Converter, validate: bool = True +) -> pd.Series: + # TODO what if it can't parse? + # TODO handle None? + # TODO handle invalid CURIEs? 
- return func + if validate: + return df[column].map(lambda curie: converter.parse_curie(curie, strict=True).prefix) + else: + return df[column].map(lambda curie: _split(curie)[0]) -def _get_prefixes_from_curie_column(df: pd.DataFrame, column: int | str, converter: Converter) -> pd.Series: - raise NotImplementedError +Method: TypeAlias = Literal["a", "b"] def get_prefix_index( - df: pd.DataFrame, column: str | int, prefix: str | Collection[str], *, method: Literal['a', 'b'] = 'a', + df: pd.DataFrame, + column: str | int, + prefix: str | Collection[str], + *, + method: Method | None = None, converter: Converter | None = None, ) -> pd.Series: + """Get an index of CURIEs in the given column that start with the prefix(es).""" if method == "a": return df[column].map(_get_prefix_checker(prefix)) elif method == "b": @@ -55,47 +76,60 @@ def get_prefix_index( raise ValueError -@overload -def filter_df_prefix(df: pd.DataFrame, prefix: str, *, inplace: Literal[True] = ...) -> None: ... - - -@overload -def filter_df_prefix(df: pd.DataFrame, prefix: str, *, inplace: Literal[False] = ...) -> pd.DataFrame: ... - - -def filter_df_prefix(df: pd.DataFrame, prefix: str, *, inplace: bool = False) -> pd.DataFrame | None: - """""" - if inplace: - return _filter_df_prefix_inplace(df, prefix) - else: - return _filter_df_prefix(df, prefix) +def filter_df_prefix( + df: pd.DataFrame, + column: str | int, + prefix: str, + *, + method: Method | None = None, + converter: Converter | None = None, +) -> pd.DataFrame: + """Filter a dataframe based on CURIEs in a given column having a given prefix or set of prefixes. + :param df: A dataframe + :param column: + The integer index or column name of a column containing CURIEs + :param prefix: + The prefix (given as a string) or collection of prefixes (given as a list, set, etc.) to keep + :returns: If not in place, return a new dataframe. 
+ """ + idx = get_prefix_index(df=df, column=column, prefix=prefix, method=method, converter=converter) + return df[idx] -def _filter_df_prefix_inplace(df: pd.DataFrame, prefix: str) -> None: - raise NotImplementedError +def split_msdf_by_prefix( + msdf: sssom.MappingSetDataFrame, + subject_prefixes: Collection[str], + predicates: Collection[str], + object_prefixes: Collection[str], +) -> dict[str, sssom.MappingSetDataFrame]: + """Split a MSDF, a drop-in replacement for :func:`sssom.parsers.split_dataframe_by_prefix`.""" + from sssom.io import from_sssom_dataframe -def _filter_df_prefix(df: pd.DataFrame, column: str | int, prefix: str) -> pd.DataFrame: - pp = prefix + ":" - idx = df[column].map(lambda value: value.startswith(pp)) + rr = _split_dataframe_by_prefix(msdf.df, subject_prefixes, predicates, object_prefixes) + rv = {} + for (subject_prefix, predicate, object_prefix), df in rr: + predicate_reference = msdf.converter.parse_curie(predicate, strict=True) + subconverter = msdf.converter.get_subconverter( + [subject_prefix, predicate_reference.prefix, object_prefix] + ) + split = f"{subject_prefix.lower()}_{predicate.lower()}_{object_prefix.lower()}" + rv[split] = from_sssom_dataframe(df, prefix_map=dict(subconverter.bimap), meta=msdf.meta) + return rv # this is split out from SSSOM -def split_dataframe_by_prefix( +def _split_dataframe_by_prefix( df: pd.DataFrame, subject_prefixes: Collection[str], predicates: Collection[str], object_prefixes: Collection[str], -) -> dict[tuple[str, str, str], pd.DataFrame]: - rv = {} +) -> Iterable[tuple[tuple[str, str, str], pd.DataFrame]]: s_indexes = { subject_prefix: get_prefix_index(df, column="subject_id", prefix=subject_prefix) for subject_prefix in subject_prefixes } - p_indexes = { - predicate: df["predicate_id"] == predicate - for predicate in predicates - } + p_indexes = {predicate: df["predicate_id"] == predicate for predicate in predicates} o_indexes = { object_prefix: get_prefix_index(df, column="object_id", 
prefix=object_prefix) for object_prefix in object_prefixes @@ -106,27 +140,4 @@ def split_dataframe_by_prefix( idx = s_indexes[subject_prefix] & p_indexes[predicate] & o_indexes[object_prefix] if not idx.any(): continue - rv[subject_prefix, predicate, object_prefix] = df[idx] - - return rv - - -def split_msdf_by_prefix( - msdf: sssom.MappingSetDataFrame, - subject_prefixes: Collection[str], - predicates: Collection[str], - object_prefixes: Collection[str], -) -> dict[str, sssom.MappingSetDataFrame]: - from sssom.io import from_sssom_dataframe - rr = split_dataframe_by_prefix(msdf.df, subject_prefixes, predicates, object_prefixes) - rv = {} - for (subject_prefix, predicate, object_prefix), df in rr.items(): - c = msdf.converter.parse_curie(predicate, strict=True) - subconverter = msdf.converter.get_subconverter( - [subject_prefix, c.prefix, object_prefix] - ) - split = f"{subject_prefix.lower()}_{predicate.lower()}_{object_prefix.lower()}" - rv[split] = from_sssom_dataframe( - df, prefix_map=dict(subconverter.bimap), meta=msdf.meta - ) - return rv + yield (subject_prefix, predicate, object_prefix), df[idx] diff --git a/tests/test_dataframe.py b/tests/test_dataframe.py index b8480fbe..7cbf3a35 100644 --- a/tests/test_dataframe.py +++ b/tests/test_dataframe.py @@ -1,15 +1,20 @@ """Test dataframe utilities.""" +import typing import unittest -from curies import Converter -from curies.dataframe import get_prefix_index import pandas as pd -CONVERTER = Converter.from_prefix_map({ - "a": "https://example.org/a/", - "b": "https://example.org/b/", -}) +from curies import Converter +from curies.dataframe import Method, get_prefix_index + +CONVERTER = Converter.from_prefix_map( + { + "a": "https://example.org/a/", + "b": "https://example.org/b/", + "c": "https://example.org/c/", + } +) class TestDataframe(unittest.TestCase): @@ -25,7 +30,7 @@ def test_get_prefix_index(self) -> None: rows = [(curie,) for curie in curies] df = pd.DataFrame(rows, columns=["curie"]) - for method in 
["a", "b"]: + for method in typing.get_args(Method): with self.subTest(method=method): idx = get_prefix_index(df, "curie", "a", method=method, converter=CONVERTER) self.assertEqual([0, 1, 2, 3, 4], _rr(idx)) From 4f55f3067b2b72f28162ad122b88051f4cae99e2 Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Fri, 5 Sep 2025 13:13:57 +0200 Subject: [PATCH 03/19] Improve implementations --- src/curies/dataframe.py | 146 +++++++++++++++++++++++++++++++++------- tests/test_dataframe.py | 55 ++++++++++++--- 2 files changed, 168 insertions(+), 33 deletions(-) diff --git a/src/curies/dataframe.py b/src/curies/dataframe.py index e2105ba0..578e3961 100644 --- a/src/curies/dataframe.py +++ b/src/curies/dataframe.py @@ -3,6 +3,7 @@ from __future__ import annotations import itertools as itt +from collections import defaultdict from collections.abc import Collection, Iterable from typing import TYPE_CHECKING, Callable, Literal, TypeAlias @@ -13,7 +14,11 @@ import sssom __all__ = [ + "filter_df_curie", "filter_df_prefix", + "get_curie_index", + "get_dense_curie", + "get_dense_prefix", "get_prefix_index", ] @@ -40,14 +45,29 @@ def _func(curie: str) -> bool: def _get_prefixes_from_curie_column( df: pd.DataFrame, column: int | str, converter: Converter, validate: bool = True ) -> pd.Series: + return df[column].map(_get_parse_curie(converter=converter, validate=validate)) + + +def _get_parse_curie( + *, converter: Converter | None = None, validate: bool = False +) -> Callable[[str], str]: # TODO what if it can't parse? # TODO handle None? # TODO handle invalid CURIEs? 
- if validate: - return df[column].map(lambda curie: converter.parse_curie(curie, strict=True).prefix) + if not validate: + + def _func(curie: str) -> str: + return _split(curie)[0] + elif converter is None: + raise ValueError("converter is required for validation") else: - return df[column].map(lambda curie: _split(curie)[0]) + + def _func(curie: str) -> str: + reference = converter.parse_curie(curie, strict=True) + return reference.prefix + + return _func Method: TypeAlias = Literal["a", "b"] @@ -62,24 +82,24 @@ def get_prefix_index( converter: Converter | None = None, ) -> pd.Series: """Get an index of CURIEs in the given column that start with the prefix(es).""" - if method == "a": + if method == "a" or method is None: return df[column].map(_get_prefix_checker(prefix)) elif method == "b": - if converter is None: + if converter is None: # pragma: no cover raise ValueError("a converter is required for method B") prefix_series = _get_prefixes_from_curie_column(df, column, converter) if isinstance(prefix, str): return prefix_series == prefix else: return prefix_series.isin(prefix) - else: - raise ValueError + else: # pragma: no cover + raise ValueError(f"invalid method given: {method}") def filter_df_prefix( df: pd.DataFrame, column: str | int, - prefix: str, + prefix: str | Collection[str], *, method: Method | None = None, converter: Converter | None = None, @@ -97,6 +117,59 @@ def filter_df_prefix( return df[idx] +def get_curie_index( + df: pd.DataFrame, + column: str | int, + curie: str | Collection[str], +) -> pd.Series: + """Get an index of CURIEs in the given column that are the given CURIE(s).""" + if isinstance(curie, str): + return df[column] == curie + else: + return df[column].isin(set(curie)) + + +def filter_df_curie( + df: pd.DataFrame, + column: str | int, + curie: str | Collection[str], +) -> pd.DataFrame: + """Filter a dataframe based on CURIEs in a given column having a given prefix or set of prefixes. 
+ + :param df: A dataframe + :param column: + The integer index or column name of a column containing CURIEs + :param curie: + The CURIE (given as a string) or collection of CURIEs (given as a list, set, etc.) to keep + :returns: If not in place, return a new dataframe. + """ + idx = get_curie_index(df=df, column=column, curie=curie) + return df[idx] + + +def get_dense_prefix( + df: pd.DataFrame, + column: str | int, + *, + converter: Converter | None = None, + validate: bool = False, +) -> dict[str, list[int]]: + """Get a dictionary from prefixes that appear in the column to the row indexes where they appear.""" + dd: defaultdict[str, list[int]] = defaultdict(list) + f = _get_parse_curie(converter=converter, validate=validate) + for i, prefix in enumerate(df[column].map(f)): + dd[prefix].append(i) + return dict(dd) + + +def get_dense_curie(df: pd.DataFrame, column: str | int) -> dict[str, list[int]]: + """Get a dictionary from CURIEs that appear in the column to the row indexes where they appear.""" + dd: defaultdict[str, list[int]] = defaultdict(list) + for i, curie in enumerate(df[column]): + dd[curie].append(i) + return dict(dd) + + def split_msdf_by_prefix( msdf: sssom.MappingSetDataFrame, subject_prefixes: Collection[str], @@ -124,20 +197,45 @@ def _split_dataframe_by_prefix( subject_prefixes: Collection[str], predicates: Collection[str], object_prefixes: Collection[str], + *, + method: Literal[1, 2] = 1, ) -> Iterable[tuple[tuple[str, str, str], pd.DataFrame]]: - s_indexes = { - subject_prefix: get_prefix_index(df, column="subject_id", prefix=subject_prefix) - for subject_prefix in subject_prefixes - } - p_indexes = {predicate: df["predicate_id"] == predicate for predicate in predicates} - o_indexes = { - object_prefix: get_prefix_index(df, column="object_id", prefix=object_prefix) - for object_prefix in object_prefixes - } - for subject_prefix, predicate, object_prefix in itt.product( - subject_prefixes, predicates, object_prefixes - ): - idx = 
s_indexes[subject_prefix] & p_indexes[predicate] & o_indexes[object_prefix] - if not idx.any(): - continue - yield (subject_prefix, predicate, object_prefix), df[idx] + if method == 1: + s_indexes = { + subject_prefix: get_prefix_index(df, column="subject_id", prefix=subject_prefix) + for subject_prefix in subject_prefixes + } + p_indexes = { + predicate: get_curie_index(df, column="predicate_id", curie=predicate) + for predicate in predicates + } + o_indexes = { + object_prefix: get_prefix_index(df, column="object_id", prefix=object_prefix) + for object_prefix in object_prefixes + } + for subject_prefix, predicate, object_prefix in itt.product( + subject_prefixes, predicates, object_prefixes + ): + idx = s_indexes[subject_prefix] & p_indexes[predicate] & o_indexes[object_prefix] + if not idx.any(): + continue + yield (subject_prefix, predicate, object_prefix), df[idx] + + elif method == 2: + s_index = get_dense_prefix(df, "subject_id") + p_index = get_dense_curie(df, "predicate_id") + o_index = get_dense_prefix(df, "object_id") + for subject_prefix, predicate, object_prefix in itt.product( + subject_prefixes, predicates, object_prefixes + ): + method_2_idx: list[int] = sorted( + set(s_index.get(subject_prefix, [])) + .intersection(p_index.get(predicate, [])) + .intersection(o_index.get(object_prefix, [])) + ) + if not method_2_idx: + continue + yield (subject_prefix, predicate, object_prefix), df.iloc[method_2_idx] + + else: + raise ValueError diff --git a/tests/test_dataframe.py b/tests/test_dataframe.py index 7cbf3a35..7b60a692 100644 --- a/tests/test_dataframe.py +++ b/tests/test_dataframe.py @@ -6,7 +6,14 @@ import pandas as pd from curies import Converter -from curies.dataframe import Method, get_prefix_index +from curies.dataframe import ( + Method, + filter_df_curie, + filter_df_prefix, + get_dense_curie, + get_dense_prefix, + get_prefix_index, +) CONVERTER = Converter.from_prefix_map( { @@ -22,22 +29,52 @@ class TestDataframe(unittest.TestCase): def 
test_get_prefix_index(self) -> None: """Test getting a prefix index.""" - curies = [ - *(f"a:{i}" for i in range(5)), - *(f"b:{i}" for i in range(5)), - *(f"c:{i}" for i in range(5)), - ] + column = "curie" + a_curies = [f"a:{i}" for i in range(5)] + b_curies = [f"b:{i}" for i in range(5)] + c_curies = [f"c:{i}" for i in range(5)] * 2 + curies = [*a_curies, *b_curies, *c_curies] rows = [(curie,) for curie in curies] - df = pd.DataFrame(rows, columns=["curie"]) + df = pd.DataFrame(rows, columns=[column]) for method in typing.get_args(Method): with self.subTest(method=method): - idx = get_prefix_index(df, "curie", "a", method=method, converter=CONVERTER) + idx = get_prefix_index(df, column, "a", method=method, converter=CONVERTER) self.assertEqual([0, 1, 2, 3, 4], _rr(idx)) - idx = get_prefix_index(df, "curie", ["a", "b"], method=method, converter=CONVERTER) + idx = get_prefix_index(df, column, ["a", "b"], method=method, converter=CONVERTER) self.assertEqual([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], _rr(idx)) + df_a = filter_df_prefix(df, column, "a") + self.assertEqual(set(a_curies), set(df_a[column])) + + df_ab = filter_df_prefix(df, column, ["a", "b"]) + self.assertEqual({*a_curies, *b_curies}, set(df_ab[column])) + + df_a1 = filter_df_curie(df, column, "a:1") + self.assertEqual({"a:1"}, set(df_a1[column])) + + df_a123 = filter_df_curie(df, column, ["a:1", "a:2", "b:1"]) + self.assertEqual({"a:1", "a:2", "b:1"}, set(df_a123[column])) + + dense_prefix_mapping = get_dense_prefix(df, column) + self.assertEqual( + { + "a": [0, 1, 2, 3, 4], + "b": [5, 6, 7, 8, 9], + "c": [10, 11, 12, 13, 14, 15, 16, 17, 18, 19], + }, + dense_prefix_mapping, + ) + + dense_curie_mapping = get_dense_curie(df, column) + self.assertNotIn("a", dense_curie_mapping) + self.assertNotIn("b", dense_curie_mapping) + self.assertNotIn("c", dense_curie_mapping) + self.assertIn("a:0", dense_curie_mapping) + self.assertEqual([0], dense_curie_mapping["a:0"]) + self.assertEqual([10, 15], 
dense_curie_mapping["c:0"]) + def _rr(series: pd.Series) -> list[int]: return [index for index, value in enumerate(series) if value] From 5acb1f69c98f7f8f3adc7f3a2f52c446f7ab2fa0 Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Fri, 5 Sep 2025 13:18:07 +0200 Subject: [PATCH 04/19] Impl --- src/curies/dataframe.py | 15 +++++++++++---- tests/test_dataframe.py | 12 ++++++++++++ 2 files changed, 23 insertions(+), 4 deletions(-) diff --git a/src/curies/dataframe.py b/src/curies/dataframe.py index 578e3961..18d60fe0 100644 --- a/src/curies/dataframe.py +++ b/src/curies/dataframe.py @@ -170,7 +170,7 @@ def get_dense_curie(df: pd.DataFrame, column: str | int) -> dict[str, list[int]] return dict(dd) -def split_msdf_by_prefix( +def _split_msdf_by_prefix( msdf: sssom.MappingSetDataFrame, subject_prefixes: Collection[str], predicates: Collection[str], @@ -194,12 +194,19 @@ def split_msdf_by_prefix( # this is split out from SSSOM def _split_dataframe_by_prefix( df: pd.DataFrame, - subject_prefixes: Collection[str], - predicates: Collection[str], - object_prefixes: Collection[str], + subject_prefixes: str | Collection[str], + predicates: str | Collection[str], + object_prefixes: str | Collection[str], *, method: Literal[1, 2] = 1, ) -> Iterable[tuple[tuple[str, str, str], pd.DataFrame]]: + if isinstance(subject_prefixes, str): + subject_prefixes = [subject_prefixes] + if isinstance(predicates, str): + predicates = [predicates] + if isinstance(object_prefixes, str): + object_prefixes = [object_prefixes] + if method == 1: s_indexes = { subject_prefix: get_prefix_index(df, column="subject_id", prefix=subject_prefix) diff --git a/tests/test_dataframe.py b/tests/test_dataframe.py index 7b60a692..d0587918 100644 --- a/tests/test_dataframe.py +++ b/tests/test_dataframe.py @@ -8,6 +8,7 @@ from curies import Converter from curies.dataframe import ( Method, + _split_dataframe_by_prefix, filter_df_curie, filter_df_prefix, get_dense_curie, @@ -75,6 +76,17 @@ def 
test_get_prefix_index(self) -> None: self.assertEqual([0], dense_curie_mapping["a:0"]) self.assertEqual([10, 15], dense_curie_mapping["c:0"]) + def test_split_df(self) -> None: + """Test the precursor to SSSOM function.""" + rows = [ + ("p1:1", "skos:exactMatch", "p2:1"), + ("p1:2", "skos:exactMatch", "p2:2"), + ("p1:2", "skos:exactMatch", "p3:2"), + ] + df = pd.DataFrame(rows, columns=["subject_id", "predicate_id", "object_id"]) + rv = dict(_split_dataframe_by_prefix(df, ["p1"], ["skos"], ["p2"])) + self.assertIn(("p1", "skos", "p2"), rv) + def _rr(series: pd.Series) -> list[int]: return [index for index, value in enumerate(series) if value] From 5f35f178b480afbc0936acebc0144bab73306a41 Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Fri, 5 Sep 2025 13:22:45 +0200 Subject: [PATCH 05/19] Fix tests --- tests/test_dataframe.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/tests/test_dataframe.py b/tests/test_dataframe.py index d0587918..f9c6fbae 100644 --- a/tests/test_dataframe.py +++ b/tests/test_dataframe.py @@ -82,10 +82,20 @@ def test_split_df(self) -> None: ("p1:1", "skos:exactMatch", "p2:1"), ("p1:2", "skos:exactMatch", "p2:2"), ("p1:2", "skos:exactMatch", "p3:2"), + ("p4:1", "skos:exactMatch", "p1:1"), + ("p5:1", "skos:broaderMatch", "p6:1"), ] df = pd.DataFrame(rows, columns=["subject_id", "predicate_id", "object_id"]) - rv = dict(_split_dataframe_by_prefix(df, ["p1"], ["skos"], ["p2"])) - self.assertIn(("p1", "skos", "p2"), rv) + for method in [1, 2]: + with self.subTest(method=method): + # test that if there's ever an empty list, then it returns an empty dict + self.assertFalse(dict(_split_dataframe_by_prefix(df, [], ["skos:exactMatch"], ["p2"], method=method))) + self.assertFalse(dict(_split_dataframe_by_prefix(df, ["p1"], [], ["p2"], method=method))) + self.assertFalse(dict(_split_dataframe_by_prefix(df, ["p1"], ["skos:exactMatch"], [], method=method))) + + rv = dict(_split_dataframe_by_prefix(df, ["p1"], 
["skos:exactMatch"], ["p2"], method=method)) + self.assertIn(("p1", "skos:exactMatch", "p2"), rv) + self.assertEqual(1, len(rv)) def _rr(series: pd.Series) -> list[int]: From 10ccad1669b1571639bbf7be2e1926a959b6814a Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Fri, 5 Sep 2025 13:23:44 +0200 Subject: [PATCH 06/19] Update testing --- src/curies/dataframe.py | 7 +++++-- tests/test_dataframe.py | 31 +++++++++++++++++++++++++------ 2 files changed, 30 insertions(+), 8 deletions(-) diff --git a/src/curies/dataframe.py b/src/curies/dataframe.py index 18d60fe0..4b05ca57 100644 --- a/src/curies/dataframe.py +++ b/src/curies/dataframe.py @@ -191,6 +191,9 @@ def _split_msdf_by_prefix( return rv +_SplitMethod: TypeAlias = Literal[1, 2] + + # this is split out from SSSOM def _split_dataframe_by_prefix( df: pd.DataFrame, @@ -198,7 +201,7 @@ def _split_dataframe_by_prefix( predicates: str | Collection[str], object_prefixes: str | Collection[str], *, - method: Literal[1, 2] = 1, + method: _SplitMethod | None = None, ) -> Iterable[tuple[tuple[str, str, str], pd.DataFrame]]: if isinstance(subject_prefixes, str): subject_prefixes = [subject_prefixes] @@ -207,7 +210,7 @@ def _split_dataframe_by_prefix( if isinstance(object_prefixes, str): object_prefixes = [object_prefixes] - if method == 1: + if method == 1 or method is None: s_indexes = { subject_prefix: get_prefix_index(df, column="subject_id", prefix=subject_prefix) for subject_prefix in subject_prefixes diff --git a/tests/test_dataframe.py b/tests/test_dataframe.py index f9c6fbae..b7959b2f 100644 --- a/tests/test_dataframe.py +++ b/tests/test_dataframe.py @@ -9,6 +9,7 @@ from curies.dataframe import ( Method, _split_dataframe_by_prefix, + _SplitMethod, filter_df_curie, filter_df_prefix, get_dense_curie, @@ -86,14 +87,32 @@ def test_split_df(self) -> None: ("p5:1", "skos:broaderMatch", "p6:1"), ] df = pd.DataFrame(rows, columns=["subject_id", "predicate_id", "object_id"]) - for method in [1, 2]: + for method in 
typing.get_args(_SplitMethod): with self.subTest(method=method): # test that if there's ever an empty list, then it returns an empty dict - self.assertFalse(dict(_split_dataframe_by_prefix(df, [], ["skos:exactMatch"], ["p2"], method=method))) - self.assertFalse(dict(_split_dataframe_by_prefix(df, ["p1"], [], ["p2"], method=method))) - self.assertFalse(dict(_split_dataframe_by_prefix(df, ["p1"], ["skos:exactMatch"], [], method=method))) - - rv = dict(_split_dataframe_by_prefix(df, ["p1"], ["skos:exactMatch"], ["p2"], method=method)) + self.assertFalse( + dict( + _split_dataframe_by_prefix( + df, [], ["skos:exactMatch"], ["p2"], method=method + ) + ) + ) + self.assertFalse( + dict(_split_dataframe_by_prefix(df, ["p1"], [], ["p2"], method=method)) + ) + self.assertFalse( + dict( + _split_dataframe_by_prefix( + df, ["p1"], ["skos:exactMatch"], [], method=method + ) + ) + ) + + rv = dict( + _split_dataframe_by_prefix( + df, ["p1"], ["skos:exactMatch"], ["p2"], method=method + ) + ) self.assertIn(("p1", "skos:exactMatch", "p2"), rv) self.assertEqual(1, len(rv)) From 7c3984f5552909a382957d361b22c3085d96e051 Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Fri, 5 Sep 2025 13:26:01 +0200 Subject: [PATCH 07/19] Update dataframe.py --- src/curies/dataframe.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/src/curies/dataframe.py b/src/curies/dataframe.py index 4b05ca57..4553e2bb 100644 --- a/src/curies/dataframe.py +++ b/src/curies/dataframe.py @@ -43,12 +43,12 @@ def _func(curie: str) -> bool: def _get_prefixes_from_curie_column( - df: pd.DataFrame, column: int | str, converter: Converter, validate: bool = True + df: pd.DataFrame, column: int | str, converter: Converter, validate: bool ) -> pd.Series: - return df[column].map(_get_parse_curie(converter=converter, validate=validate)) + return df[column].map(_get_curie_parser(converter=converter, validate=validate)) -def _get_parse_curie( +def _get_curie_parser( *, converter: Converter | 
None = None, validate: bool = False ) -> Callable[[str], str]: # TODO what if it can't parse? @@ -80,6 +80,7 @@ def get_prefix_index( *, method: Method | None = None, converter: Converter | None = None, + validate: bool = False, ) -> pd.Series: """Get an index of CURIEs in the given column that start with the prefix(es).""" if method == "a" or method is None: @@ -87,7 +88,7 @@ def get_prefix_index( elif method == "b": if converter is None: # pragma: no cover raise ValueError("a converter is required for method B") - prefix_series = _get_prefixes_from_curie_column(df, column, converter) + prefix_series = _get_prefixes_from_curie_column(df, column, converter, validate=validate) if isinstance(prefix, str): return prefix_series == prefix else: @@ -111,6 +112,8 @@ def filter_df_prefix( The integer index or column name of a column containing CURIEs :param prefix: The prefix (given as a string) or collection of prefixes (given as a list, set, etc.) to keep + :param method: The implementation for getting the prefix index + :param converter: A converter :returns: If not in place, return a new dataframe. 
""" idx = get_prefix_index(df=df, column=column, prefix=prefix, method=method, converter=converter) @@ -156,7 +159,7 @@ def get_dense_prefix( ) -> dict[str, list[int]]: """Get a dictionary from prefixes that appear in the column to the row indexes where they appear.""" dd: defaultdict[str, list[int]] = defaultdict(list) - f = _get_parse_curie(converter=converter, validate=validate) + f = _get_curie_parser(converter=converter, validate=validate) for i, prefix in enumerate(df[column].map(f)): dd[prefix].append(i) return dict(dd) From 31b9edffb1bf136b7565eb7bb6faeaa831b9dbb0 Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Fri, 5 Sep 2025 13:26:27 +0200 Subject: [PATCH 08/19] Update dataframe.py --- src/curies/dataframe.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/curies/dataframe.py b/src/curies/dataframe.py index 4553e2bb..f1d46782 100644 --- a/src/curies/dataframe.py +++ b/src/curies/dataframe.py @@ -5,7 +5,9 @@ import itertools as itt from collections import defaultdict from collections.abc import Collection, Iterable -from typing import TYPE_CHECKING, Callable, Literal, TypeAlias +from typing import TYPE_CHECKING, Callable, Literal + +from typing_extensions import TypeAlias from curies.api import Converter, _split From 9e269515a7de0e5dc7d7f6ada88bbb77de479cc5 Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Fri, 5 Sep 2025 13:34:06 +0200 Subject: [PATCH 09/19] Renames --- src/curies/dataframe.py | 48 +++++++++++++++++++++-------------------- tests/test_dataframe.py | 24 +++++++++++---------- 2 files changed, 38 insertions(+), 34 deletions(-) diff --git a/src/curies/dataframe.py b/src/curies/dataframe.py index f1d46782..5777f08b 100644 --- a/src/curies/dataframe.py +++ b/src/curies/dataframe.py @@ -16,12 +16,12 @@ import sssom __all__ = [ - "filter_df_curie", - "filter_df_prefix", - "get_curie_index", - "get_dense_curie", + "get_curies_index", "get_dense_prefix", - "get_prefix_index", + "get_keep_curies_index", + 
"get_keep_prefixes_index", + "keep_curies", + "keep_prefixes", ] @@ -75,7 +75,7 @@ def _func(curie: str) -> str: Method: TypeAlias = Literal["a", "b"] -def get_prefix_index( +def get_keep_prefixes_index( df: pd.DataFrame, column: str | int, prefix: str | Collection[str], @@ -99,7 +99,7 @@ def get_prefix_index( raise ValueError(f"invalid method given: {method}") -def filter_df_prefix( +def keep_prefixes( df: pd.DataFrame, column: str | int, prefix: str | Collection[str], @@ -118,11 +118,13 @@ def filter_df_prefix( :param converter: A converter :returns: If not in place, return a new dataframe. """ - idx = get_prefix_index(df=df, column=column, prefix=prefix, method=method, converter=converter) + idx = get_keep_prefixes_index( + df=df, column=column, prefix=prefix, method=method, converter=converter + ) return df[idx] -def get_curie_index( +def get_keep_curies_index( df: pd.DataFrame, column: str | int, curie: str | Collection[str], @@ -134,7 +136,15 @@ def get_curie_index( return df[column].isin(set(curie)) -def filter_df_curie( +def get_curies_index(df: pd.DataFrame, column: str | int) -> dict[str, list[int]]: + """Get a dictionary from CURIEs that appear in the column to the row indexes where they appear.""" + dd: defaultdict[str, list[int]] = defaultdict(list) + for i, curie in enumerate(df[column]): + dd[curie].append(i) + return dict(dd) + + +def keep_curies( df: pd.DataFrame, column: str | int, curie: str | Collection[str], @@ -148,7 +158,7 @@ def filter_df_curie( The CURIE (given as a string) or collection of CURIEs (given as a list, set, etc.) to keep :returns: If not in place, return a new dataframe. 
""" - idx = get_curie_index(df=df, column=column, curie=curie) + idx = get_keep_curies_index(df=df, column=column, curie=curie) return df[idx] @@ -167,14 +177,6 @@ def get_dense_prefix( return dict(dd) -def get_dense_curie(df: pd.DataFrame, column: str | int) -> dict[str, list[int]]: - """Get a dictionary from CURIEs that appear in the column to the row indexes where they appear.""" - dd: defaultdict[str, list[int]] = defaultdict(list) - for i, curie in enumerate(df[column]): - dd[curie].append(i) - return dict(dd) - - def _split_msdf_by_prefix( msdf: sssom.MappingSetDataFrame, subject_prefixes: Collection[str], @@ -217,15 +219,15 @@ def _split_dataframe_by_prefix( if method == 1 or method is None: s_indexes = { - subject_prefix: get_prefix_index(df, column="subject_id", prefix=subject_prefix) + subject_prefix: get_keep_prefixes_index(df, column="subject_id", prefix=subject_prefix) for subject_prefix in subject_prefixes } p_indexes = { - predicate: get_curie_index(df, column="predicate_id", curie=predicate) + predicate: get_keep_curies_index(df, column="predicate_id", curie=predicate) for predicate in predicates } o_indexes = { - object_prefix: get_prefix_index(df, column="object_id", prefix=object_prefix) + object_prefix: get_keep_prefixes_index(df, column="object_id", prefix=object_prefix) for object_prefix in object_prefixes } for subject_prefix, predicate, object_prefix in itt.product( @@ -238,7 +240,7 @@ def _split_dataframe_by_prefix( elif method == 2: s_index = get_dense_prefix(df, "subject_id") - p_index = get_dense_curie(df, "predicate_id") + p_index = get_curies_index(df, "predicate_id") o_index = get_dense_prefix(df, "object_id") for subject_prefix, predicate, object_prefix in itt.product( subject_prefixes, predicates, object_prefixes diff --git a/tests/test_dataframe.py b/tests/test_dataframe.py index b7959b2f..f05c4943 100644 --- a/tests/test_dataframe.py +++ b/tests/test_dataframe.py @@ -10,11 +10,11 @@ Method, _split_dataframe_by_prefix, 
_SplitMethod, - filter_df_curie, - filter_df_prefix, - get_dense_curie, + get_curies_index, get_dense_prefix, - get_prefix_index, + get_keep_prefixes_index, + keep_curies, + keep_prefixes, ) CONVERTER = Converter.from_prefix_map( @@ -41,22 +41,24 @@ def test_get_prefix_index(self) -> None: for method in typing.get_args(Method): with self.subTest(method=method): - idx = get_prefix_index(df, column, "a", method=method, converter=CONVERTER) + idx = get_keep_prefixes_index(df, column, "a", method=method, converter=CONVERTER) self.assertEqual([0, 1, 2, 3, 4], _rr(idx)) - idx = get_prefix_index(df, column, ["a", "b"], method=method, converter=CONVERTER) + idx = get_keep_prefixes_index( + df, column, ["a", "b"], method=method, converter=CONVERTER + ) self.assertEqual([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], _rr(idx)) - df_a = filter_df_prefix(df, column, "a") + df_a = keep_prefixes(df, column, "a") self.assertEqual(set(a_curies), set(df_a[column])) - df_ab = filter_df_prefix(df, column, ["a", "b"]) + df_ab = keep_prefixes(df, column, ["a", "b"]) self.assertEqual({*a_curies, *b_curies}, set(df_ab[column])) - df_a1 = filter_df_curie(df, column, "a:1") + df_a1 = keep_curies(df, column, "a:1") self.assertEqual({"a:1"}, set(df_a1[column])) - df_a123 = filter_df_curie(df, column, ["a:1", "a:2", "b:1"]) + df_a123 = keep_curies(df, column, ["a:1", "a:2", "b:1"]) self.assertEqual({"a:1", "a:2", "b:1"}, set(df_a123[column])) dense_prefix_mapping = get_dense_prefix(df, column) @@ -69,7 +71,7 @@ def test_get_prefix_index(self) -> None: dense_prefix_mapping, ) - dense_curie_mapping = get_dense_curie(df, column) + dense_curie_mapping = get_curies_index(df, column) self.assertNotIn("a", dense_curie_mapping) self.assertNotIn("b", dense_curie_mapping) self.assertNotIn("c", dense_curie_mapping) From 3873aa0b0ff6a792af8342ddc538cbb4e5bb694c Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Fri, 5 Sep 2025 13:41:55 +0200 Subject: [PATCH 10/19] Refactor --- src/curies/_sssom_exploration.py | 
107 +++++++++++++++++++++++++++++ src/curies/dataframe.py | 113 ++++--------------------------- tests/test_dataframe.py | 41 +++++------ 3 files changed, 143 insertions(+), 118 deletions(-) create mode 100644 src/curies/_sssom_exploration.py diff --git a/src/curies/_sssom_exploration.py b/src/curies/_sssom_exploration.py new file mode 100644 index 00000000..55527c41 --- /dev/null +++ b/src/curies/_sssom_exploration.py @@ -0,0 +1,107 @@ +"""Utilities for SSSOM.""" + +from __future__ import annotations + +import itertools as itt +from collections.abc import Collection, Iterable +from typing import TYPE_CHECKING, Literal, TypeAlias + +import pandas as pd + +from curies.dataframe import ( + get_df_curies_index, + get_df_prefixes_index, + get_keep_df_curies_index, + get_keep_df_prefixes_index, +) + +if TYPE_CHECKING: + import sssom + +__all__ = [ + "split_dataframe_by_prefix", + "split_msdf_by_prefix", +] + + +def split_msdf_by_prefix( + msdf: sssom.MappingSetDataFrame, + subject_prefixes: Collection[str], + predicates: Collection[str], + object_prefixes: Collection[str], +) -> dict[str, sssom.MappingSetDataFrame]: + """Split a MSDF, a drop-in replacement for :func:`sssom.parsers.split_dataframe_by_prefix`.""" + from sssom.io import from_sssom_dataframe + + rr = split_dataframe_by_prefix(msdf.df, subject_prefixes, predicates, object_prefixes) + rv = {} + for (subject_prefix, predicate, object_prefix), df in rr: + predicate_reference = msdf.converter.parse_curie(predicate, strict=True) + subconverter = msdf.converter.get_subconverter( + [subject_prefix, predicate_reference.prefix, object_prefix] + ) + split = f"{subject_prefix.lower()}_{predicate.lower()}_{object_prefix.lower()}" + rv[split] = from_sssom_dataframe(df, prefix_map=dict(subconverter.bimap), meta=msdf.meta) + return rv + + +SplitMethod: TypeAlias = Literal["disjoint-indexes", "dense-indexes"] + + +def split_dataframe_by_prefix( + df: pd.DataFrame, + subject_prefixes: str | Collection[str], + predicates: str 
| Collection[str], + object_prefixes: str | Collection[str], + *, + method: SplitMethod | None = None, +) -> Iterable[tuple[tuple[str, str, str], pd.DataFrame]]: + """Iterate over splits on a dataframe.""" + if isinstance(subject_prefixes, str): + subject_prefixes = [subject_prefixes] + if isinstance(predicates, str): + predicates = [predicates] + if isinstance(object_prefixes, str): + object_prefixes = [object_prefixes] + + if method == "disjoint-indexes" or method is None: + s_indexes = { + subject_prefix: get_keep_df_prefixes_index( + df, column="subject_id", prefix=subject_prefix + ) + for subject_prefix in subject_prefixes + } + p_indexes = { + predicate: get_keep_df_curies_index(df, column="predicate_id", curie=predicate) + for predicate in predicates + } + o_indexes = { + object_prefix: get_keep_df_prefixes_index(df, column="object_id", prefix=object_prefix) + for object_prefix in object_prefixes + } + for subject_prefix, predicate, object_prefix in itt.product( + subject_prefixes, predicates, object_prefixes + ): + idx = s_indexes[subject_prefix] & p_indexes[predicate] & o_indexes[object_prefix] + if not idx.any(): + continue + yield (subject_prefix, predicate, object_prefix), df[idx] + + elif method == "dense-indexes": + s_index = get_df_prefixes_index(df, "subject_id") + p_index = get_df_curies_index(df, "predicate_id") + o_index = get_df_prefixes_index(df, "object_id") + for subject_prefix, predicate, object_prefix in itt.product( + subject_prefixes, predicates, object_prefixes + ): + method_2_idx: list[int] = sorted( + set(s_index.get(subject_prefix, [])) + .intersection(p_index.get(predicate, [])) + .intersection(o_index.get(object_prefix, [])) + ) + if not method_2_idx: + continue + yield (subject_prefix, predicate, object_prefix), df.iloc[method_2_idx] + + else: + raise ValueError diff --git a/src/curies/dataframe.py b/src/curies/dataframe.py index 5777f08b..363d860e 100644 --- a/src/curies/dataframe.py +++ b/src/curies/dataframe.py @@ -2,9 +2,8 @@ 
from __future__ import annotations -import itertools as itt from collections import defaultdict -from collections.abc import Collection, Iterable +from collections.abc import Collection from typing import TYPE_CHECKING, Callable, Literal from typing_extensions import TypeAlias @@ -13,15 +12,14 @@ if TYPE_CHECKING: import pandas as pd - import sssom __all__ = [ - "get_curies_index", - "get_dense_prefix", - "get_keep_curies_index", - "get_keep_prefixes_index", - "keep_curies", - "keep_prefixes", + "get_df_curies_index", + "get_df_prefixes_index", + "get_keep_df_curies_index", + "get_keep_df_prefixes_index", + "keep_df_curies", + "keep_df_prefixes", ] @@ -75,7 +73,7 @@ def _func(curie: str) -> str: Method: TypeAlias = Literal["a", "b"] -def get_keep_prefixes_index( +def get_keep_df_prefixes_index( df: pd.DataFrame, column: str | int, prefix: str | Collection[str], @@ -99,7 +97,7 @@ def get_keep_prefixes_index( raise ValueError(f"invalid method given: {method}") -def keep_prefixes( +def keep_df_prefixes( df: pd.DataFrame, column: str | int, prefix: str | Collection[str], @@ -118,13 +116,13 @@ def keep_prefixes( :param converter: A converter :returns: If not in place, return a new dataframe. 
""" - idx = get_keep_prefixes_index( + idx = get_keep_df_prefixes_index( df=df, column=column, prefix=prefix, method=method, converter=converter ) return df[idx] -def get_keep_curies_index( +def get_keep_df_curies_index( df: pd.DataFrame, column: str | int, curie: str | Collection[str], @@ -136,7 +134,7 @@ def get_keep_curies_index( return df[column].isin(set(curie)) -def get_curies_index(df: pd.DataFrame, column: str | int) -> dict[str, list[int]]: +def get_df_curies_index(df: pd.DataFrame, column: str | int) -> dict[str, list[int]]: """Get a dictionary from CURIEs that appear in the column to the row indexes where they appear.""" dd: defaultdict[str, list[int]] = defaultdict(list) for i, curie in enumerate(df[column]): @@ -144,7 +142,7 @@ def get_curies_index(df: pd.DataFrame, column: str | int) -> dict[str, list[int] return dict(dd) -def keep_curies( +def keep_df_curies( df: pd.DataFrame, column: str | int, curie: str | Collection[str], @@ -158,11 +156,11 @@ def keep_curies( The CURIE (given as a string) or collection of CURIEs (given as a list, set, etc.) to keep :returns: If not in place, return a new dataframe. 
""" - idx = get_keep_curies_index(df=df, column=column, curie=curie) + idx = get_keep_df_curies_index(df=df, column=column, curie=curie) return df[idx] -def get_dense_prefix( +def get_df_prefixes_index( df: pd.DataFrame, column: str | int, *, @@ -175,84 +173,3 @@ def get_dense_prefix( for i, prefix in enumerate(df[column].map(f)): dd[prefix].append(i) return dict(dd) - - -def _split_msdf_by_prefix( - msdf: sssom.MappingSetDataFrame, - subject_prefixes: Collection[str], - predicates: Collection[str], - object_prefixes: Collection[str], -) -> dict[str, sssom.MappingSetDataFrame]: - """Split a MSDF, a drop-in replacement for :func:`sssom.parsers.split_dataframe_by_prefix`.""" - from sssom.io import from_sssom_dataframe - - rr = _split_dataframe_by_prefix(msdf.df, subject_prefixes, predicates, object_prefixes) - rv = {} - for (subject_prefix, predicate, object_prefix), df in rr: - predicate_reference = msdf.converter.parse_curie(predicate, strict=True) - subconverter = msdf.converter.get_subconverter( - [subject_prefix, predicate_reference.prefix, object_prefix] - ) - split = f"{subject_prefix.lower()}_{predicate.lower()}_{object_prefix.lower()}" - rv[split] = from_sssom_dataframe(df, prefix_map=dict(subconverter.bimap), meta=msdf.meta) - return rv - - -_SplitMethod: TypeAlias = Literal[1, 2] - - -# this is split out from SSSOM -def _split_dataframe_by_prefix( - df: pd.DataFrame, - subject_prefixes: str | Collection[str], - predicates: str | Collection[str], - object_prefixes: str | Collection[str], - *, - method: _SplitMethod | None = None, -) -> Iterable[tuple[tuple[str, str, str], pd.DataFrame]]: - if isinstance(subject_prefixes, str): - subject_prefixes = [subject_prefixes] - if isinstance(predicates, str): - predicates = [predicates] - if isinstance(object_prefixes, str): - object_prefixes = [object_prefixes] - - if method == 1 or method is None: - s_indexes = { - subject_prefix: get_keep_prefixes_index(df, column="subject_id", prefix=subject_prefix) - for 
subject_prefix in subject_prefixes - } - p_indexes = { - predicate: get_keep_curies_index(df, column="predicate_id", curie=predicate) - for predicate in predicates - } - o_indexes = { - object_prefix: get_keep_prefixes_index(df, column="object_id", prefix=object_prefix) - for object_prefix in object_prefixes - } - for subject_prefix, predicate, object_prefix in itt.product( - subject_prefixes, predicates, object_prefixes - ): - idx = s_indexes[subject_prefix] & p_indexes[predicate] & o_indexes[object_prefix] - if not idx.any(): - continue - yield (subject_prefix, predicate, object_prefix), df[idx] - - elif method == 2: - s_index = get_dense_prefix(df, "subject_id") - p_index = get_curies_index(df, "predicate_id") - o_index = get_dense_prefix(df, "object_id") - for subject_prefix, predicate, object_prefix in itt.product( - subject_prefixes, predicates, object_prefixes - ): - method_2_idx: list[int] = sorted( - set(s_index.get(subject_prefix, [])) - .intersection(p_index.get(predicate, [])) - .intersection(o_index.get(object_prefix, [])) - ) - if not method_2_idx: - continue - yield (subject_prefix, predicate, object_prefix), df.iloc[method_2_idx] - - else: - raise ValueError diff --git a/tests/test_dataframe.py b/tests/test_dataframe.py index f05c4943..0027fcf8 100644 --- a/tests/test_dataframe.py +++ b/tests/test_dataframe.py @@ -6,15 +6,14 @@ import pandas as pd from curies import Converter +from curies._sssom_exploration import SplitMethod, split_dataframe_by_prefix from curies.dataframe import ( Method, - _split_dataframe_by_prefix, - _SplitMethod, - get_curies_index, - get_dense_prefix, - get_keep_prefixes_index, - keep_curies, - keep_prefixes, + get_df_curies_index, + get_df_prefixes_index, + get_keep_df_prefixes_index, + keep_df_curies, + keep_df_prefixes, ) CONVERTER = Converter.from_prefix_map( @@ -41,27 +40,29 @@ def test_get_prefix_index(self) -> None: for method in typing.get_args(Method): with self.subTest(method=method): - idx = 
get_keep_prefixes_index(df, column, "a", method=method, converter=CONVERTER) + idx = get_keep_df_prefixes_index( + df, column, "a", method=method, converter=CONVERTER + ) self.assertEqual([0, 1, 2, 3, 4], _rr(idx)) - idx = get_keep_prefixes_index( + idx = get_keep_df_prefixes_index( df, column, ["a", "b"], method=method, converter=CONVERTER ) self.assertEqual([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], _rr(idx)) - df_a = keep_prefixes(df, column, "a") + df_a = keep_df_prefixes(df, column, "a") self.assertEqual(set(a_curies), set(df_a[column])) - df_ab = keep_prefixes(df, column, ["a", "b"]) + df_ab = keep_df_prefixes(df, column, ["a", "b"]) self.assertEqual({*a_curies, *b_curies}, set(df_ab[column])) - df_a1 = keep_curies(df, column, "a:1") + df_a1 = keep_df_curies(df, column, "a:1") self.assertEqual({"a:1"}, set(df_a1[column])) - df_a123 = keep_curies(df, column, ["a:1", "a:2", "b:1"]) + df_a123 = keep_df_curies(df, column, ["a:1", "a:2", "b:1"]) self.assertEqual({"a:1", "a:2", "b:1"}, set(df_a123[column])) - dense_prefix_mapping = get_dense_prefix(df, column) + dense_prefix_mapping = get_df_prefixes_index(df, column) self.assertEqual( { "a": [0, 1, 2, 3, 4], @@ -71,7 +72,7 @@ def test_get_prefix_index(self) -> None: dense_prefix_mapping, ) - dense_curie_mapping = get_curies_index(df, column) + dense_curie_mapping = get_df_curies_index(df, column) self.assertNotIn("a", dense_curie_mapping) self.assertNotIn("b", dense_curie_mapping) self.assertNotIn("c", dense_curie_mapping) @@ -89,29 +90,29 @@ def test_split_df(self) -> None: ("p5:1", "skos:broaderMatch", "p6:1"), ] df = pd.DataFrame(rows, columns=["subject_id", "predicate_id", "object_id"]) - for method in typing.get_args(_SplitMethod): + for method in typing.get_args(SplitMethod): with self.subTest(method=method): # test that if there's ever an empty list, then it returns an empty dict self.assertFalse( dict( - _split_dataframe_by_prefix( + split_dataframe_by_prefix( df, [], ["skos:exactMatch"], ["p2"], method=method ) ) ) 
self.assertFalse( - dict(_split_dataframe_by_prefix(df, ["p1"], [], ["p2"], method=method)) + dict(split_dataframe_by_prefix(df, ["p1"], [], ["p2"], method=method)) ) self.assertFalse( dict( - _split_dataframe_by_prefix( + split_dataframe_by_prefix( df, ["p1"], ["skos:exactMatch"], [], method=method ) ) ) rv = dict( - _split_dataframe_by_prefix( + split_dataframe_by_prefix( df, ["p1"], ["skos:exactMatch"], ["p2"], method=method ) ) From 959c5e8bf46ebfc8026dd8f80f6b9d82f46e1ef9 Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Fri, 5 Sep 2025 13:49:24 +0200 Subject: [PATCH 11/19] more reorg --- src/curies/_sssom_exploration.py | 45 ++++++++++++++++++++++++++++++++ src/curies/dataframe.py | 12 +++++---- tests/test_dataframe.py | 44 ++----------------------------- 3 files changed, 54 insertions(+), 47 deletions(-) diff --git a/src/curies/_sssom_exploration.py b/src/curies/_sssom_exploration.py index 55527c41..287362ae 100644 --- a/src/curies/_sssom_exploration.py +++ b/src/curies/_sssom_exploration.py @@ -3,6 +3,8 @@ from __future__ import annotations import itertools as itt +import typing +import unittest from collections.abc import Collection, Iterable from typing import TYPE_CHECKING, Literal, TypeAlias @@ -105,3 +107,46 @@ def split_dataframe_by_prefix( else: raise ValueError + + +class TestSplit(unittest.TestCase): + """A test case for dataframe utilities.""" + + def test_split_df(self) -> None: + """Test the precursor to SSSOM function.""" + rows = [ + ("p1:1", "skos:exactMatch", "p2:1"), + ("p1:2", "skos:exactMatch", "p2:2"), + ("p1:2", "skos:exactMatch", "p3:2"), + ("p4:1", "skos:exactMatch", "p1:1"), + ("p5:1", "skos:broaderMatch", "p6:1"), + ] + df = pd.DataFrame(rows, columns=["subject_id", "predicate_id", "object_id"]) + for method in typing.get_args(SplitMethod): + with self.subTest(method=method): + # test that if there's ever an empty list, then it returns an empty dict + self.assertFalse( + dict( + split_dataframe_by_prefix( + df, [], 
["skos:exactMatch"], ["p2"], method=method + ) + ) + ) + self.assertFalse( + dict(split_dataframe_by_prefix(df, ["p1"], [], ["p2"], method=method)) + ) + self.assertFalse( + dict( + split_dataframe_by_prefix( + df, ["p1"], ["skos:exactMatch"], [], method=method + ) + ) + ) + + rv = dict( + split_dataframe_by_prefix( + df, ["p1"], ["skos:exactMatch"], ["p2"], method=method + ) + ) + self.assertIn(("p1", "skos:exactMatch", "p2"), rv) + self.assertEqual(1, len(rv)) diff --git a/src/curies/dataframe.py b/src/curies/dataframe.py index 363d860e..741ed2c3 100644 --- a/src/curies/dataframe.py +++ b/src/curies/dataframe.py @@ -14,6 +14,7 @@ import pandas as pd __all__ = [ + "PrefixIndexMethod", "get_df_curies_index", "get_df_prefixes_index", "get_keep_df_curies_index", @@ -70,7 +71,8 @@ def _func(curie: str) -> str: return _func -Method: TypeAlias = Literal["a", "b"] +#: The method for filtering on prefixe +PrefixIndexMethod: TypeAlias = Literal["iterative", "precalculated"] def get_keep_df_prefixes_index( @@ -78,14 +80,14 @@ def get_keep_df_prefixes_index( column: str | int, prefix: str | Collection[str], *, - method: Method | None = None, + method: PrefixIndexMethod | None = None, converter: Converter | None = None, validate: bool = False, ) -> pd.Series: """Get an index of CURIEs in the given column that start with the prefix(es).""" - if method == "a" or method is None: + if method == "iterative" or method is None: return df[column].map(_get_prefix_checker(prefix)) - elif method == "b": + elif method == "precalculated": if converter is None: # pragma: no cover raise ValueError("a converter is required for method B") prefix_series = _get_prefixes_from_curie_column(df, column, converter, validate=validate) @@ -102,7 +104,7 @@ def keep_df_prefixes( column: str | int, prefix: str | Collection[str], *, - method: Method | None = None, + method: PrefixIndexMethod | None = None, converter: Converter | None = None, ) -> pd.DataFrame: """Filter a dataframe based on CURIEs in a 
given column having a given prefix or set of prefixes. diff --git a/tests/test_dataframe.py b/tests/test_dataframe.py index 0027fcf8..51a3a5e9 100644 --- a/tests/test_dataframe.py +++ b/tests/test_dataframe.py @@ -6,9 +6,8 @@ import pandas as pd from curies import Converter -from curies._sssom_exploration import SplitMethod, split_dataframe_by_prefix from curies.dataframe import ( - Method, + PrefixIndexMethod, get_df_curies_index, get_df_prefixes_index, get_keep_df_prefixes_index, @@ -38,7 +37,7 @@ def test_get_prefix_index(self) -> None: rows = [(curie,) for curie in curies] df = pd.DataFrame(rows, columns=[column]) - for method in typing.get_args(Method): + for method in typing.get_args(PrefixIndexMethod): with self.subTest(method=method): idx = get_keep_df_prefixes_index( df, column, "a", method=method, converter=CONVERTER @@ -80,45 +79,6 @@ def test_get_prefix_index(self) -> None: self.assertEqual([0], dense_curie_mapping["a:0"]) self.assertEqual([10, 15], dense_curie_mapping["c:0"]) - def test_split_df(self) -> None: - """Test the precursor to SSSOM function.""" - rows = [ - ("p1:1", "skos:exactMatch", "p2:1"), - ("p1:2", "skos:exactMatch", "p2:2"), - ("p1:2", "skos:exactMatch", "p3:2"), - ("p4:1", "skos:exactMatch", "p1:1"), - ("p5:1", "skos:broaderMatch", "p6:1"), - ] - df = pd.DataFrame(rows, columns=["subject_id", "predicate_id", "object_id"]) - for method in typing.get_args(SplitMethod): - with self.subTest(method=method): - # test that if there's ever an empty list, then it returns an empty dict - self.assertFalse( - dict( - split_dataframe_by_prefix( - df, [], ["skos:exactMatch"], ["p2"], method=method - ) - ) - ) - self.assertFalse( - dict(split_dataframe_by_prefix(df, ["p1"], [], ["p2"], method=method)) - ) - self.assertFalse( - dict( - split_dataframe_by_prefix( - df, ["p1"], ["skos:exactMatch"], [], method=method - ) - ) - ) - - rv = dict( - split_dataframe_by_prefix( - df, ["p1"], ["skos:exactMatch"], ["p2"], method=method - ) - ) - 
self.assertIn(("p1", "skos:exactMatch", "p2"), rv) - self.assertEqual(1, len(rv)) - def _rr(series: pd.Series) -> list[int]: return [index for index, value in enumerate(series) if value] From 747e8113fd9e2a6c717576af1ac22b2ed665463d Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Fri, 5 Sep 2025 14:02:04 +0200 Subject: [PATCH 12/19] More reorg --- src/curies/api.py | 20 ++------------------ src/curies/dataframe.py | 4 +++- src/curies/utils.py | 27 +++++++++++++++++++++++++++ tests/test_api.py | 2 +- 4 files changed, 33 insertions(+), 20 deletions(-) create mode 100644 src/curies/utils.py diff --git a/src/curies/api.py b/src/curies/api.py index 5a7d7d3f..cf7743ac 100644 --- a/src/curies/api.py +++ b/src/curies/api.py @@ -37,6 +37,8 @@ from pytrie import StringTrie from typing_extensions import Self +from .utils import _split + if TYPE_CHECKING: # pragma: no cover import pandas import rdflib @@ -77,13 +79,6 @@ def _get_field_validator_values(values, key: str): # type:ignore return values.data[key] -def _split(curie: str, *, sep: str = ":") -> tuple[str, str]: - prefix, delimiter, identifier = curie.partition(sep) - if not delimiter: - raise NoCURIEDelimiterError(curie) - return prefix, identifier - - class ReferenceTuple(NamedTuple): """A pair of a prefix (corresponding to a semantic space) and a local unique identifier in that semantic space. 
@@ -669,17 +664,6 @@ class DuplicateSummary(NamedTuple): prefix: str -class NoCURIEDelimiterError(ValueError): - """An error thrown on a string with no delimiter.""" - - def __init__(self, curie: str): - """Initialize the error.""" - self.curie = curie - - def __str__(self) -> str: - return f"{self.curie} does not appear to be a CURIE - missing a delimiter" - - class DuplicateValueError(ValueError): """An error raised with constructing a converter with data containing duplicate values.""" diff --git a/src/curies/dataframe.py b/src/curies/dataframe.py index 741ed2c3..020e7193 100644 --- a/src/curies/dataframe.py +++ b/src/curies/dataframe.py @@ -8,11 +8,13 @@ from typing_extensions import TypeAlias -from curies.api import Converter, _split +from .utils import _split if TYPE_CHECKING: import pandas as pd + from .api import Converter + __all__ = [ "PrefixIndexMethod", "get_df_curies_index", diff --git a/src/curies/utils.py b/src/curies/utils.py new file mode 100644 index 00000000..b984b956 --- /dev/null +++ b/src/curies/utils.py @@ -0,0 +1,27 @@ +"""Utilities for working with strings.""" + +from __future__ import annotations + +__all__ = [ + "NoCURIEDelimiterError", + "_split", +] + + +class NoCURIEDelimiterError(ValueError): + """An error thrown on a string with no delimiter.""" + + def __init__(self, curie: str): + """Initialize the error.""" + self.curie = curie + + def __str__(self) -> str: + return f"{self.curie} does not appear to be a CURIE - missing a delimiter" + + +def _split(curie: str, *, sep: str = ":") -> tuple[str, str]: + """Split a CURIE string using string operations.""" + prefix, delimiter, identifier = curie.partition(sep) + if not delimiter: + raise NoCURIEDelimiterError(curie) + return prefix, identifier diff --git a/tests/test_api.py b/tests/test_api.py index 19322562..98531d2b 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -22,7 +22,6 @@ ExpansionError, NamableReference, NamedReference, - NoCURIEDelimiterError, 
PrefixStandardizationError, Record, Records, @@ -39,6 +38,7 @@ get_monarch_converter, get_obo_converter, ) +from curies.utils import NoCURIEDelimiterError from curies.version import get_version from tests.constants import SLOW From 2c4b5b2897fe03d05b51e24c0d0b2e579c7d701b Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Fri, 5 Sep 2025 14:04:05 +0200 Subject: [PATCH 13/19] Add reusable prefix from curie raw function --- src/curies/dataframe.py | 6 ++---- src/curies/utils.py | 5 +++++ 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/src/curies/dataframe.py b/src/curies/dataframe.py index 020e7193..966ec157 100644 --- a/src/curies/dataframe.py +++ b/src/curies/dataframe.py @@ -8,7 +8,7 @@ from typing_extensions import TypeAlias -from .utils import _split +from .utils import _prefix_from_curie if TYPE_CHECKING: import pandas as pd @@ -59,9 +59,7 @@ def _get_curie_parser( # TODO handle invalid CURIEs? if not validate: - - def _func(curie: str) -> str: - return _split(curie)[0] + return _prefix_from_curie elif converter is None: raise ValueError("converter is required for validation") else: diff --git a/src/curies/utils.py b/src/curies/utils.py index b984b956..50188475 100644 --- a/src/curies/utils.py +++ b/src/curies/utils.py @@ -25,3 +25,8 @@ def _split(curie: str, *, sep: str = ":") -> tuple[str, str]: if not delimiter: raise NoCURIEDelimiterError(curie) return prefix, identifier + + +def _prefix_from_curie(curie: str, *, sep: str = ":") -> str: + """Split a CURIE string using string operations and return the prefix.""" + return _split(curie, sep=sep)[0] From ffb305653bb4eb9cf2d56476573e9b669c4aaa46 Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Fri, 5 Sep 2025 14:08:19 +0200 Subject: [PATCH 14/19] One more rename --- src/curies/_sssom_exploration.py | 10 +++++----- src/curies/dataframe.py | 12 ++++++------ tests/test_dataframe.py | 6 +++--- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/src/curies/_sssom_exploration.py 
b/src/curies/_sssom_exploration.py index 287362ae..67b9b633 100644 --- a/src/curies/_sssom_exploration.py +++ b/src/curies/_sssom_exploration.py @@ -12,9 +12,9 @@ from curies.dataframe import ( get_df_curies_index, + get_df_keep_curies_index, + get_df_keep_prefixes_index, get_df_prefixes_index, - get_keep_df_curies_index, - get_keep_df_prefixes_index, ) if TYPE_CHECKING: @@ -68,17 +68,17 @@ def split_dataframe_by_prefix( if method == "disjoint-indexes" or method is None: s_indexes = { - subject_prefix: get_keep_df_prefixes_index( + subject_prefix: get_df_keep_prefixes_index( df, column="subject_id", prefix=subject_prefix ) for subject_prefix in subject_prefixes } p_indexes = { - predicate: get_keep_df_curies_index(df, column="predicate_id", curie=predicate) + predicate: get_df_keep_curies_index(df, column="predicate_id", curie=predicate) for predicate in predicates } o_indexes = { - object_prefix: get_keep_df_prefixes_index(df, column="object_id", prefix=object_prefix) + object_prefix: get_df_keep_prefixes_index(df, column="object_id", prefix=object_prefix) for object_prefix in object_prefixes } for subject_prefix, predicate, object_prefix in itt.product( diff --git a/src/curies/dataframe.py b/src/curies/dataframe.py index 966ec157..d738ec74 100644 --- a/src/curies/dataframe.py +++ b/src/curies/dataframe.py @@ -18,9 +18,9 @@ __all__ = [ "PrefixIndexMethod", "get_df_curies_index", + "get_df_keep_curies_index", + "get_df_keep_prefixes_index", "get_df_prefixes_index", - "get_keep_df_curies_index", - "get_keep_df_prefixes_index", "keep_df_curies", "keep_df_prefixes", ] @@ -75,7 +75,7 @@ def _func(curie: str) -> str: PrefixIndexMethod: TypeAlias = Literal["iterative", "precalculated"] -def get_keep_df_prefixes_index( +def get_df_keep_prefixes_index( df: pd.DataFrame, column: str | int, prefix: str | Collection[str], @@ -118,13 +118,13 @@ def keep_df_prefixes( :param converter: A converter :returns: If not in place, return a new dataframe. 
""" - idx = get_keep_df_prefixes_index( + idx = get_df_keep_prefixes_index( df=df, column=column, prefix=prefix, method=method, converter=converter ) return df[idx] -def get_keep_df_curies_index( +def get_df_keep_curies_index( df: pd.DataFrame, column: str | int, curie: str | Collection[str], @@ -158,7 +158,7 @@ def keep_df_curies( The CURIE (given as a string) or collection of CURIEs (given as a list, set, etc.) to keep :returns: If not in place, return a new dataframe. """ - idx = get_keep_df_curies_index(df=df, column=column, curie=curie) + idx = get_df_keep_curies_index(df=df, column=column, curie=curie) return df[idx] diff --git a/tests/test_dataframe.py b/tests/test_dataframe.py index 51a3a5e9..274f3660 100644 --- a/tests/test_dataframe.py +++ b/tests/test_dataframe.py @@ -9,8 +9,8 @@ from curies.dataframe import ( PrefixIndexMethod, get_df_curies_index, + get_df_keep_prefixes_index, get_df_prefixes_index, - get_keep_df_prefixes_index, keep_df_curies, keep_df_prefixes, ) @@ -39,12 +39,12 @@ def test_get_prefix_index(self) -> None: for method in typing.get_args(PrefixIndexMethod): with self.subTest(method=method): - idx = get_keep_df_prefixes_index( + idx = get_df_keep_prefixes_index( df, column, "a", method=method, converter=CONVERTER ) self.assertEqual([0, 1, 2, 3, 4], _rr(idx)) - idx = get_keep_df_prefixes_index( + idx = get_df_keep_prefixes_index( df, column, ["a", "b"], method=method, converter=CONVERTER ) self.assertEqual([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], _rr(idx)) From 07ff9a84935a72cddea1d257a12f29f581956bc8 Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Fri, 5 Sep 2025 14:14:43 +0200 Subject: [PATCH 15/19] One more one more rename --- src/curies/_sssom_exploration.py | 12 +++++++----- src/curies/dataframe.py | 20 ++++++++++---------- tests/test_dataframe.py | 18 +++++++++--------- 3 files changed, 26 insertions(+), 24 deletions(-) diff --git a/src/curies/_sssom_exploration.py b/src/curies/_sssom_exploration.py index 67b9b633..348e539b 100644 
--- a/src/curies/_sssom_exploration.py +++ b/src/curies/_sssom_exploration.py @@ -12,9 +12,9 @@ from curies.dataframe import ( get_df_curies_index, - get_df_keep_curies_index, - get_df_keep_prefixes_index, get_df_prefixes_index, + get_filter_df_by_curies_index, + get_filter_df_by_prefixes_index, ) if TYPE_CHECKING: @@ -68,17 +68,19 @@ def split_dataframe_by_prefix( if method == "disjoint-indexes" or method is None: s_indexes = { - subject_prefix: get_df_keep_prefixes_index( + subject_prefix: get_filter_df_by_prefixes_index( df, column="subject_id", prefix=subject_prefix ) for subject_prefix in subject_prefixes } p_indexes = { - predicate: get_df_keep_curies_index(df, column="predicate_id", curie=predicate) + predicate: get_filter_df_by_curies_index(df, column="predicate_id", curie=predicate) for predicate in predicates } o_indexes = { - object_prefix: get_df_keep_prefixes_index(df, column="object_id", prefix=object_prefix) + object_prefix: get_filter_df_by_prefixes_index( + df, column="object_id", prefix=object_prefix + ) for object_prefix in object_prefixes } for subject_prefix, predicate, object_prefix in itt.product( diff --git a/src/curies/dataframe.py b/src/curies/dataframe.py index d738ec74..02dce56d 100644 --- a/src/curies/dataframe.py +++ b/src/curies/dataframe.py @@ -17,12 +17,12 @@ __all__ = [ "PrefixIndexMethod", + "filter_df_by_curies", + "filter_df_by_prefixes", "get_df_curies_index", - "get_df_keep_curies_index", - "get_df_keep_prefixes_index", "get_df_prefixes_index", - "keep_df_curies", - "keep_df_prefixes", + "get_filter_df_by_curies_index", + "get_filter_df_by_prefixes_index", ] @@ -75,7 +75,7 @@ def _func(curie: str) -> str: PrefixIndexMethod: TypeAlias = Literal["iterative", "precalculated"] -def get_df_keep_prefixes_index( +def get_filter_df_by_prefixes_index( df: pd.DataFrame, column: str | int, prefix: str | Collection[str], @@ -99,7 +99,7 @@ def get_df_keep_prefixes_index( raise ValueError(f"invalid method given: {method}") -def 
keep_df_prefixes( +def filter_df_by_prefixes( df: pd.DataFrame, column: str | int, prefix: str | Collection[str], @@ -118,13 +118,13 @@ def keep_df_prefixes( :param converter: A converter :returns: If not in place, return a new dataframe. """ - idx = get_df_keep_prefixes_index( + idx = get_filter_df_by_prefixes_index( df=df, column=column, prefix=prefix, method=method, converter=converter ) return df[idx] -def get_df_keep_curies_index( +def get_filter_df_by_curies_index( df: pd.DataFrame, column: str | int, curie: str | Collection[str], @@ -144,7 +144,7 @@ def get_df_curies_index(df: pd.DataFrame, column: str | int) -> dict[str, list[i return dict(dd) -def keep_df_curies( +def filter_df_by_curies( df: pd.DataFrame, column: str | int, curie: str | Collection[str], @@ -158,7 +158,7 @@ def keep_df_curies( The CURIE (given as a string) or collection of CURIEs (given as a list, set, etc.) to keep :returns: If not in place, return a new dataframe. """ - idx = get_df_keep_curies_index(df=df, column=column, curie=curie) + idx = get_filter_df_by_curies_index(df=df, column=column, curie=curie) return df[idx] diff --git a/tests/test_dataframe.py b/tests/test_dataframe.py index 274f3660..9ad27a45 100644 --- a/tests/test_dataframe.py +++ b/tests/test_dataframe.py @@ -8,11 +8,11 @@ from curies import Converter from curies.dataframe import ( PrefixIndexMethod, + filter_df_by_curies, + filter_df_by_prefixes, get_df_curies_index, - get_df_keep_prefixes_index, get_df_prefixes_index, - keep_df_curies, - keep_df_prefixes, + get_filter_df_by_prefixes_index, ) CONVERTER = Converter.from_prefix_map( @@ -39,26 +39,26 @@ def test_get_prefix_index(self) -> None: for method in typing.get_args(PrefixIndexMethod): with self.subTest(method=method): - idx = get_df_keep_prefixes_index( + idx = get_filter_df_by_prefixes_index( df, column, "a", method=method, converter=CONVERTER ) self.assertEqual([0, 1, 2, 3, 4], _rr(idx)) - idx = get_df_keep_prefixes_index( + idx = 
get_filter_df_by_prefixes_index( df, column, ["a", "b"], method=method, converter=CONVERTER ) self.assertEqual([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], _rr(idx)) - df_a = keep_df_prefixes(df, column, "a") + df_a = filter_df_by_prefixes(df, column, "a") self.assertEqual(set(a_curies), set(df_a[column])) - df_ab = keep_df_prefixes(df, column, ["a", "b"]) + df_ab = filter_df_by_prefixes(df, column, ["a", "b"]) self.assertEqual({*a_curies, *b_curies}, set(df_ab[column])) - df_a1 = keep_df_curies(df, column, "a:1") + df_a1 = filter_df_by_curies(df, column, "a:1") self.assertEqual({"a:1"}, set(df_a1[column])) - df_a123 = keep_df_curies(df, column, ["a:1", "a:2", "b:1"]) + df_a123 = filter_df_by_curies(df, column, ["a:1", "a:2", "b:1"]) self.assertEqual({"a:1", "a:2", "b:1"}, set(df_a123[column])) dense_prefix_mapping = get_df_prefixes_index(df, column) From 30d4985e06f4e694dc1d7939af5f185fcfb7114b Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Fri, 5 Sep 2025 14:18:01 +0200 Subject: [PATCH 16/19] Pass mypy --- src/curies/_sssom_exploration.py | 3 ++- src/curies/dataframe.py | 6 +++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/curies/_sssom_exploration.py b/src/curies/_sssom_exploration.py index 348e539b..5c5be185 100644 --- a/src/curies/_sssom_exploration.py +++ b/src/curies/_sssom_exploration.py @@ -6,9 +6,10 @@ import typing import unittest from collections.abc import Collection, Iterable -from typing import TYPE_CHECKING, Literal, TypeAlias +from typing import TYPE_CHECKING, Literal import pandas as pd +from typing_extensions import TypeAlias from curies.dataframe import ( get_df_curies_index, diff --git a/src/curies/dataframe.py b/src/curies/dataframe.py index 02dce56d..e235c16e 100644 --- a/src/curies/dataframe.py +++ b/src/curies/dataframe.py @@ -47,7 +47,7 @@ def _func(curie: str) -> bool: def _get_prefixes_from_curie_column( df: pd.DataFrame, column: int | str, converter: Converter, validate: bool -) -> pd.Series: +) -> pd.Series[str]: 
return df[column].map(_get_curie_parser(converter=converter, validate=validate)) @@ -83,7 +83,7 @@ def get_filter_df_by_prefixes_index( method: PrefixIndexMethod | None = None, converter: Converter | None = None, validate: bool = False, -) -> pd.Series: +) -> pd.Series[bool]: """Get an index of CURIEs in the given column that start with the prefix(es).""" if method == "iterative" or method is None: return df[column].map(_get_prefix_checker(prefix)) @@ -128,7 +128,7 @@ def get_filter_df_by_curies_index( df: pd.DataFrame, column: str | int, curie: str | Collection[str], -) -> pd.Series: +) -> pd.Series[bool]: """Get an index of CURIEs in the given column that are the given CURIE(s).""" if isinstance(curie, str): return df[column] == curie From 3ae780c38ef386cfa7b0a21fe82aac59cb9309d8 Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Fri, 5 Sep 2025 14:18:29 +0200 Subject: [PATCH 17/19] Update test_dataframe.py --- tests/test_dataframe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_dataframe.py b/tests/test_dataframe.py index 9ad27a45..801f747b 100644 --- a/tests/test_dataframe.py +++ b/tests/test_dataframe.py @@ -80,5 +80,5 @@ def test_get_prefix_index(self) -> None: self.assertEqual([10, 15], dense_curie_mapping["c:0"]) -def _rr(series: pd.Series) -> list[int]: +def _rr(series: "pd.Series[bool]") -> list[int]: return [index for index, value in enumerate(series) if value] From bc61a3354530e24212e3e805305162303187bd27 Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Fri, 5 Sep 2025 14:22:47 +0200 Subject: [PATCH 18/19] Add tutorial --- docs/source/dataframe.rst | 2 + docs/source/index.rst | 1 + src/curies/_sssom_exploration.py | 155 ------------------------------- src/curies/dataframe.py | 16 ++-- 4 files changed, 11 insertions(+), 163 deletions(-) create mode 100644 docs/source/dataframe.rst delete mode 100644 src/curies/_sssom_exploration.py diff --git a/docs/source/dataframe.rst b/docs/source/dataframe.rst new file 
mode 100644 index 00000000..6381137a --- /dev/null +++ b/docs/source/dataframe.rst @@ -0,0 +1,2 @@ +Working with Dataframes +======================= diff --git a/docs/source/index.rst b/docs/source/index.rst index 7989263b..450119fb 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -70,3 +70,4 @@ The most recent code and data can be installed directly from GitHub with: w3c preprocessing database + dataframe diff --git a/src/curies/_sssom_exploration.py b/src/curies/_sssom_exploration.py deleted file mode 100644 index 5c5be185..00000000 --- a/src/curies/_sssom_exploration.py +++ /dev/null @@ -1,155 +0,0 @@ -"""Utilities for SSSOM.""" - -from __future__ import annotations - -import itertools as itt -import typing -import unittest -from collections.abc import Collection, Iterable -from typing import TYPE_CHECKING, Literal - -import pandas as pd -from typing_extensions import TypeAlias - -from curies.dataframe import ( - get_df_curies_index, - get_df_prefixes_index, - get_filter_df_by_curies_index, - get_filter_df_by_prefixes_index, -) - -if TYPE_CHECKING: - import sssom - -__all__ = [ - "split_dataframe_by_prefix", - "split_msdf_by_prefix", -] - - -def split_msdf_by_prefix( - msdf: sssom.MappingSetDataFrame, - subject_prefixes: Collection[str], - predicates: Collection[str], - object_prefixes: Collection[str], -) -> dict[str, sssom.MappingSetDataFrame]: - """Split a MSDF, a drop-in replacement for :func:`sssom.parsers.split_dataframe_by_prefix`.""" - from sssom.io import from_sssom_dataframe - - rr = split_dataframe_by_prefix(msdf.df, subject_prefixes, predicates, object_prefixes) - rv = {} - for (subject_prefix, predicate, object_prefix), df in rr: - predicate_reference = msdf.converter.parse_curie(predicate, strict=True) - subconverter = msdf.converter.get_subconverter( - [subject_prefix, predicate_reference.prefix, object_prefix] - ) - split = f"{subject_prefix.lower()}_{predicate.lower()}_{object_prefix.lower()}" - rv[split] = 
from_sssom_dataframe(df, prefix_map=dict(subconverter.bimap), meta=msdf.meta) - return rv - - -SplitMethod: TypeAlias = Literal["disjoint-indexes", "dense-indexes"] - - -def split_dataframe_by_prefix( - df: pd.DataFrame, - subject_prefixes: str | Collection[str], - predicates: str | Collection[str], - object_prefixes: str | Collection[str], - *, - method: SplitMethod | None = None, -) -> Iterable[tuple[tuple[str, str, str], pd.DataFrame]]: - """Iterate over splits on a dataframe.""" - if isinstance(subject_prefixes, str): - subject_prefixes = [subject_prefixes] - if isinstance(predicates, str): - predicates = [predicates] - if isinstance(object_prefixes, str): - object_prefixes = [object_prefixes] - - if method == "disjoint-indexes" or method is None: - s_indexes = { - subject_prefix: get_filter_df_by_prefixes_index( - df, column="subject_id", prefix=subject_prefix - ) - for subject_prefix in subject_prefixes - } - p_indexes = { - predicate: get_filter_df_by_curies_index(df, column="predicate_id", curie=predicate) - for predicate in predicates - } - o_indexes = { - object_prefix: get_filter_df_by_prefixes_index( - df, column="object_id", prefix=object_prefix - ) - for object_prefix in object_prefixes - } - for subject_prefix, predicate, object_prefix in itt.product( - subject_prefixes, predicates, object_prefixes - ): - idx = s_indexes[subject_prefix] & p_indexes[predicate] & o_indexes[object_prefix] - if not idx.any(): - continue - yield (subject_prefix, predicate, object_prefix), df[idx] - - elif method == "dense-indexes": - s_index = get_df_prefixes_index(df, "subject_id") - p_index = get_df_curies_index(df, "predicate_id") - o_index = get_df_prefixes_index(df, "object_id") - for subject_prefix, predicate, object_prefix in itt.product( - subject_prefixes, predicates, object_prefixes - ): - method_2_idx: list[int] = sorted( - set(s_index.get(subject_prefix, [])) - .intersection(p_index.get(predicate, [])) - .intersection(o_index.get(object_prefix, [])) - ) - if 
not method_2_idx: - continue - yield (subject_prefix, predicate, object_prefix), df.iloc[method_2_idx] - - else: - raise ValueError - - -class TestSplit(unittest.TestCase): - """A test case for dataframe utilities.""" - - def test_split_df(self) -> None: - """Test the precursor to SSSOM function.""" - rows = [ - ("p1:1", "skos:exactMatch", "p2:1"), - ("p1:2", "skos:exactMatch", "p2:2"), - ("p1:2", "skos:exactMatch", "p3:2"), - ("p4:1", "skos:exactMatch", "p1:1"), - ("p5:1", "skos:broaderMatch", "p6:1"), - ] - df = pd.DataFrame(rows, columns=["subject_id", "predicate_id", "object_id"]) - for method in typing.get_args(SplitMethod): - with self.subTest(method=method): - # test that if there's ever an empty list, then it returns an empty dict - self.assertFalse( - dict( - split_dataframe_by_prefix( - df, [], ["skos:exactMatch"], ["p2"], method=method - ) - ) - ) - self.assertFalse( - dict(split_dataframe_by_prefix(df, ["p1"], [], ["p2"], method=method)) - ) - self.assertFalse( - dict( - split_dataframe_by_prefix( - df, ["p1"], ["skos:exactMatch"], [], method=method - ) - ) - ) - - rv = dict( - split_dataframe_by_prefix( - df, ["p1"], ["skos:exactMatch"], ["p2"], method=method - ) - ) - self.assertIn(("p1", "skos:exactMatch", "p2"), rv) - self.assertEqual(1, len(rv)) diff --git a/src/curies/dataframe.py b/src/curies/dataframe.py index e235c16e..3a6c4af5 100644 --- a/src/curies/dataframe.py +++ b/src/curies/dataframe.py @@ -110,12 +110,12 @@ def filter_df_by_prefixes( """Filter a dataframe based on CURIEs in a given column having a given prefix or set of prefixes. :param df: A dataframe - :param column: - The integer index or column name of a column containing CURIEs - :param prefix: - The prefix (given as a string) or collection of prefixes (given as a list, set, etc.) to keep + :param column: The integer index or column name of a column containing CURIEs + :param prefix: The prefix (given as a string) or collection of prefixes (given as a + list, set, etc.) 
to keep :param method: The implementation for getting the prefix index :param converter: A converter + :returns: If not in place, return a new dataframe. """ idx = get_filter_df_by_prefixes_index( @@ -152,10 +152,10 @@ def filter_df_by_curies( """Filter a dataframe based on CURIEs in a given column having a given prefix or set of prefixes. :param df: A dataframe - :param column: - The integer index or column name of a column containing CURIEs - :param curie: - The CURIE (given as a string) or collection of CURIEs (given as a list, set, etc.) to keep + :param column: The integer index or column name of a column containing CURIEs + :param curie: The CURIE (given as a string) or collection of CURIEs (given as a + list, set, etc.) to keep + :returns: If not in place, return a new dataframe. """ idx = get_filter_df_by_curies_index(df=df, column=column, curie=curie) From 2f1a8bc6618ba43e0b1893aa2a2f6eb29610af31 Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Fri, 5 Sep 2025 14:34:57 +0200 Subject: [PATCH 19/19] Add tutorial --- docs/source/api.rst | 3 +++ docs/source/dataframe.rst | 49 +++++++++++++++++++++++++++++++++++++++ src/curies/__init__.py | 3 +++ 3 files changed, 55 insertions(+) diff --git a/docs/source/api.rst b/docs/source/api.rst index 10b8bb92..50211703 100644 --- a/docs/source/api.rst +++ b/docs/source/api.rst @@ -3,3 +3,6 @@ API Reference .. automodapi:: curies :no-heading: + +.. 
automodapi:: curies.dataframe + :no-heading: diff --git a/docs/source/dataframe.rst b/docs/source/dataframe.rst index 6381137a..5f28a0b8 100644 --- a/docs/source/dataframe.rst +++ b/docs/source/dataframe.rst @@ -1,2 +1,51 @@ Working with Dataframes ======================= + +Filtering +--------- + +In the following examples, we'll use a dataframe representing semantic mappings between +disease ontologies in the SSSOM format: + +============ =============== =============== ============================ +subject_id predicate_id object_id mapping_justification +============ =============== =============== ============================ +DOID:0080795 skos:exactMatch EFO:0003029 semapv:ManualMappingCuration +DOID:0080795 skos:exactMatch mesh:D015471 semapv:ManualMappingCuration +DOID:0080799 skos:exactMatch EFO:1000527 semapv:ManualMappingCuration +DOID:0080808 skos:exactMatch mesh:D000069295 semapv:ManualMappingCuration +============ =============== =============== ============================ + +First, to filter to objects that use EFO, use +:func:`curies.dataframe.filter_df_by_prefixes`: + +.. code-block:: python + + from curies.dataframe import filter_df_by_prefixes + + df = ... + df = filter_df_by_prefixes(df, column="object_id", prefix=["EFO"]) + +============ =============== =========== ============================ +subject_id predicate_id object_id mapping_justification +============ =============== =========== ============================ +DOID:0080795 skos:exactMatch EFO:0003029 semapv:ManualMappingCuration +DOID:0080799 skos:exactMatch EFO:1000527 semapv:ManualMappingCuration +============ =============== =========== ============================ + +Second, to filter to rows that have the subject ``DOID:0080795``, use +:func:`curies.dataframe.filter_df_by_curies`: + +.. code-block:: python + + from curies.dataframe import filter_df_by_curies + + df = ...
+ df = filter_df_by_curies(df, column="subject_id", curie=["DOID:0080795"]) + +============ =============== ============ ============================ +subject_id predicate_id object_id mapping_justification +============ =============== ============ ============================ +DOID:0080795 skos:exactMatch EFO:0003029 semapv:ManualMappingCuration +DOID:0080795 skos:exactMatch mesh:D015471 semapv:ManualMappingCuration +============ =============== ============ ============================ diff --git a/src/curies/__init__.py b/src/curies/__init__.py index a31cb545..11fa6558 100644 --- a/src/curies/__init__.py +++ b/src/curies/__init__.py @@ -24,6 +24,7 @@ write_shacl, write_tsv, ) +from .dataframe import filter_df_by_curies, filter_df_by_prefixes from .discovery import discover, discover_from_rdf from .preprocessing import ( PostprocessingRewrites, @@ -65,6 +66,8 @@ "chain", "discover", "discover_from_rdf", + "filter_df_by_curies", + "filter_df_by_prefixes", "get_bioregistry_converter", "get_go_converter", "get_monarch_converter",