Skip to content

Commit 671df41

Browse files
authored
Improve label output with SSSOM (#84)
This unifies how labels are added, so that even the streaming implementations can now output labels for the subjects and objects of mappings.
1 parent d883418 commit 671df41

3 files changed

Lines changed: 58 additions & 39 deletions

File tree

src/semra/io/io.py

Lines changed: 47 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222
from .io_utils import (
2323
CONFIDENCE_PRECISION,
2424
get_confidence_str,
25-
get_name_by_curie,
25+
get_name_by_reference,
2626
safe_open,
2727
safe_open_writer,
2828
)
@@ -595,23 +595,13 @@ def get_sssom_df(
595595
"""
596596
fallback_mapping_set_id = _get_fallback_mapping_set_id()
597597
rows = [
598-
_get_sssom_row(mapping, evidence, fallback_mapping_set_id)
598+
_get_sssom_row(mapping, evidence, fallback_mapping_set_id, add_labels=add_labels)
599599
for mapping in tqdm(
600600
mappings, desc="Preparing SSSOM", leave=False, unit="mapping", unit_scale=True
601601
)
602602
for evidence in mapping.evidence
603603
]
604604
df = pd.DataFrame(rows, columns=SSSOM_DEFAULT_COLUMNS)
605-
if add_labels:
606-
with logging_redirect_tqdm():
607-
for label_column, id_column in [
608-
("subject_label", "subject_id"),
609-
("object_label", "object_id"),
610-
]:
611-
df[label_column] = [
612-
name or get_name_by_curie(curie)
613-
for curie, name in df[[id_column, label_column]].values
614-
]
615605

616606
if prune:
617607
# remove empty columns
@@ -626,7 +616,9 @@ def _format_confidence(confidence: float) -> str:
626616
return str(round(confidence, CONFIDENCE_PRECISION))
627617

628618

629-
def _get_sssom_row(mapping: Mapping, e: Evidence, fallback_mapping_set_id: str) -> SSSOMRow:
619+
def _get_sssom_row(
620+
mapping: Mapping, e: Evidence, fallback_mapping_set_id: str, *, add_labels: bool = False
621+
) -> SSSOMRow:
630622
if isinstance(e, SimpleEvidence):
631623
if e.mapping_set.purl:
632624
mapping_set_id = e.mapping_set.purl
@@ -648,12 +640,20 @@ def _get_sssom_row(mapping: Mapping, e: Evidence, fallback_mapping_set_id: str)
648640
else:
649641
raise TypeError
650642

643+
if add_labels:
644+
with logging_redirect_tqdm():
645+
subject_label = mapping.subject.name or get_name_by_reference(mapping.subject) or ""
646+
object_label = mapping.object.name or get_name_by_reference(mapping.object) or ""
647+
else:
648+
subject_label = mapping.subject.name or ""
649+
object_label = mapping.object.name or ""
650+
651651
return SSSOMRow(
652652
subject_id=mapping.subject.curie,
653-
subject_label=mapping.subject.name or "",
653+
subject_label=subject_label,
654654
predicate_id=mapping.predicate.curie,
655655
object_id=mapping.object.curie,
656-
object_label=mapping.object.name or "",
656+
object_label=object_label,
657657
mapping_justification=e.justification.curie,
658658
mapping_set_id=mapping_set_id,
659659
mapping_set_title=mapping_set_title,
@@ -700,13 +700,13 @@ def write_sssom(
700700
stream: bool = False,
701701
) -> None | Generator[Mapping]:
702702
"""Export mappings as an SSSOM file (could be lossy)."""
703-
if not add_labels and not prune:
703+
if not prune:
704704
if stream:
705-
return _write_sssom_stream(mappings, file, stream=stream)
705+
return _write_sssom_stream(mappings, file, stream=stream, add_labels=add_labels)
706706
else:
707-
return _write_sssom_stream(mappings, file, stream=stream)
707+
return _write_sssom_stream(mappings, file, stream=stream, add_labels=add_labels)
708708
elif stream:
709-
raise ValueError
709+
raise ValueError("can not prune and stream at the same time")
710710
else:
711711
df = get_sssom_df(mappings, add_labels=add_labels)
712712
df.to_csv(file, sep="\t", index=False)
@@ -716,41 +716,57 @@ def write_sssom(
716716
# docstr-coverage:excused `overload`
717717
@overload
718718
def _write_sssom_stream(
719-
mappings: Iterable[Mapping], file: str | Path | TextIO, *, stream: Literal[False] = False
719+
mappings: Iterable[Mapping],
720+
file: str | Path | TextIO,
721+
*,
722+
stream: Literal[False] = False,
723+
add_labels: bool = ...,
720724
) -> None: ...
721725

722726

723727
# docstr-coverage:excused `overload`
724728
@overload
725729
def _write_sssom_stream(
726-
mappings: Iterable[Mapping], file: str | Path | TextIO, *, stream: Literal[True] = True
730+
mappings: Iterable[Mapping],
731+
file: str | Path | TextIO,
732+
*,
733+
stream: Literal[True] = True,
734+
add_labels: bool = ...,
727735
) -> Generator[Mapping]: ...
728736

729737

730738
def _write_sssom_stream(
731-
mappings: Iterable[Mapping], file: str | Path | TextIO, *, stream: bool = False
739+
mappings: Iterable[Mapping],
740+
file: str | Path | TextIO,
741+
*,
742+
stream: bool = False,
743+
add_labels: bool = False,
732744
) -> Generator[Mapping] | None:
733745
fallback_mapping_set_id = _get_fallback_mapping_set_id()
734746
it = tqdm(mappings, desc="Writing SSSOM", leave=False, unit="mapping", unit_scale=True)
735747
if stream:
736-
return _stream_write_sssom(file, it, fallback_mapping_set_id)
748+
return _stream_write_sssom(file, it, fallback_mapping_set_id, add_labels=add_labels)
737749
else:
738-
with safe_open_writer(file) as writer:
739-
writer.writerow(SSSOM_DEFAULT_COLUMNS)
740-
for mapping in it:
741-
for evidence in mapping.evidence:
742-
writer.writerow(_get_sssom_row(mapping, evidence, fallback_mapping_set_id))
743-
return None
750+
for _ in _stream_write_sssom(file, it, fallback_mapping_set_id, add_labels=add_labels):
751+
pass
752+
return None
744753

745754

746755
def _stream_write_sssom(
747-
path: str | Path | TextIO, mappings: Iterable[Mapping], fallback_mapping_set_id: str
756+
path: str | Path | TextIO,
757+
mappings: Iterable[Mapping],
758+
fallback_mapping_set_id: str,
759+
add_labels: bool = False,
748760
) -> Generator[Mapping]:
749761
with safe_open_writer(path) as writer:
750762
writer.writerow(SSSOM_DEFAULT_COLUMNS)
751763
for mapping in mappings:
752764
for evidence in mapping.evidence:
753-
writer.writerow(_get_sssom_row(mapping, evidence, fallback_mapping_set_id))
765+
writer.writerow(
766+
_get_sssom_row(
767+
mapping, evidence, fallback_mapping_set_id, add_labels=add_labels
768+
)
769+
)
754770
yield mapping
755771

756772

src/semra/io/io_utils.py

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -13,12 +13,13 @@
1313
import bioregistry
1414
import pyobo
1515
import requests
16+
from curies import Reference
1617

1718
from ..struct import ConfidenceMixin
1819

1920
__all__ = [
2021
"get_confidence_str",
21-
"get_name_by_curie",
22+
"get_name_by_reference",
2223
"get_orcid_name",
2324
"safe_open",
2425
"safe_open_writer",
@@ -35,13 +36,13 @@
3536
SKIP_PREFIXES.update(cast(bioregistry.Collection, bioregistry.get_collection("0000004")).resources)
3637

3738

38-
def get_name_by_curie(curie: str) -> str | None:
39+
def get_name_by_reference(reference: Reference) -> str | None:
3940
"""Get a name from a CURIE."""
40-
if any(curie.startswith(p) for p in SKIP_PREFIXES):
41+
if any(reference.prefix == p for p in SKIP_PREFIXES):
4142
return None
42-
if curie.startswith("orcid:"):
43-
return get_orcid_name(curie)
44-
return pyobo.get_name_by_curie(curie)
43+
if reference.prefix == "orcid":
44+
return get_orcid_name(reference.identifier)
45+
return pyobo.get_name(reference)
4546

4647

4748
@cache

src/semra/io/neo4j_io.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,9 @@
1010
from jinja2 import Environment, FileSystemLoader, select_autoescape
1111
from pyobo import Reference
1212
from tqdm import tqdm
13+
from tqdm.contrib.logging import logging_redirect_tqdm
1314

14-
from .io_utils import get_confidence_str, get_name_by_curie, safe_open_writer
15+
from .io_utils import get_confidence_str, get_name_by_reference, safe_open_writer
1516
from ..rules import (
1617
SEMRA_EVIDENCE_PREFIX,
1718
SEMRA_MAPPING_PREFIX,
@@ -329,7 +330,8 @@ def _concept_to_row(
329330
) -> Sequence[str]:
330331
concept_curie = concept.curie
331332
if add_labels:
332-
name = concept.name or get_name_by_curie(concept_curie) or ""
333+
with logging_redirect_tqdm():
334+
name = concept.name or get_name_by_reference(concept) or ""
333335
else:
334336
name = concept.name or ""
335337
return (

0 commit comments

Comments
 (0)