Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 13 additions & 7 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[build-system]
requires = ["uv_build>=0.6.6,<0.7"]
requires = ["uv_build>=0.9.6,<1.0"]
build-backend = "uv_build"

[project]
Expand All @@ -20,7 +20,6 @@ classifiers = [
"Development Status :: 4 - Beta",
"Environment :: Console",
"Intended Audience :: Developers",
"License :: OSI Approved :: MIT License",
"Operating System :: OS Independent",
"Framework :: Pytest",
"Framework :: tox",
Expand All @@ -45,13 +44,11 @@ keywords = [

# License Information.
# See PEP-639 at https://peps.python.org/pep-0639/#add-license-files-key
license-files = [
"LICENSE",
]
license = { file = "LICENSE" }

requires-python = ">=3.10"
dependencies = [
"pystow",
"pystow>=0.8.0",
"pandas",
"pydantic",
"tqdm",
Expand All @@ -69,13 +66,22 @@ dependencies = [
"pyyaml",
"humanize",
"tabulate",
"sssom-pydantic",
]

[dependency-groups]
typing = [
"mypy",
"pydantic",
"types-requests",
"types-tabulate",
"types-PyYAML",
]

[project.optional-dependencies]
tests = [
"pytest",
"coverage[toml]",
"sssom>=0.4.16",
"httpx",
]
docs = [
Expand Down
1 change: 1 addition & 0 deletions src/semra/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@

#: The prefix used in CURIEs representing evidences
SEMRA_EVIDENCE_PREFIX = "semra.evidence"
SEMRA_EVIDENCE_URI_PREFIX = "https://w3id.org/biopragmatics/semra/evidence/"
SEMRA_EVIDENCE = bioregistry.Resource(prefix=SEMRA_EVIDENCE_PREFIX, name="SeMRA Evidence")

#: The prefix used in CURIEs representing mappings sets
Expand Down
8 changes: 4 additions & 4 deletions src/semra/database.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,18 +61,18 @@
from bioontologies.robot import write_getter_warnings
from pydantic import BaseModel
from pyobo.getters import NoBuildError
from pystow.utils import gzip_compress, safe_open_writer
from tabulate import tabulate
from tqdm.auto import tqdm
from tqdm.contrib.logging import logging_redirect_tqdm
from zenodo_client import update_zenodo

from semra import Mapping
from semra.io import from_jsonl, from_pyobo, write_jsonl, write_neo4j, write_sssom
from semra.io.io_utils import safe_open_writer
from semra.pipeline import REFRESH_RAW_OPTION, REFRESH_SOURCE_OPTION
from semra.sources import SOURCE_RESOLVER
from semra.sources.wikidata import get_wikidata_mappings_by_prefix
from semra.utils import get_jinja_template, gzip_path
from semra.utils import get_jinja_template

__all__ = [
"build",
Expand Down Expand Up @@ -245,8 +245,8 @@ def build(
write_neo4j(mappings, NEO4J_DIR, compress="after")

# gzip these after the fact to avoid SIGKILLs
jsonl_gz_path = gzip_path(JSONL_PATH)
sssom_gz_path = gzip_path(SSSOM_PATH)
jsonl_gz_path = gzip_compress(JSONL_PATH)
sssom_gz_path = gzip_compress(SSSOM_PATH)

timedelta = time.time() - start
statistics = Statistics(
Expand Down
130 changes: 55 additions & 75 deletions src/semra/io/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from __future__ import annotations

import gzip
import csv
import logging
import pickle
import typing as t
Expand All @@ -16,16 +16,16 @@
import pydantic
import requests
import yaml
from pystow.utils import (
iter_pydantic_jsonl,
safe_open,
stream_write_pydantic_jsonl,
write_pydantic_jsonl,
)
from tqdm.autonotebook import tqdm
from tqdm.contrib.logging import logging_redirect_tqdm

from .io_utils import (
CONFIDENCE_PRECISION,
get_confidence_str,
get_name_by_reference,
safe_open,
safe_open_writer,
)
from .io_utils import CONFIDENCE_PRECISION, get_confidence_str, get_name_by_reference
from ..rules import CURIE_TO_JUSTIFICATION, CURIE_TO_RELATION
from ..struct import Evidence, Mapping, MappingSet, ReasonedEvidence, Reference, SimpleEvidence
from ..vocabulary import UNSPECIFIED_MAPPING
Expand Down Expand Up @@ -679,6 +679,7 @@ def write_sssom(
add_labels: bool = ...,
prune: bool = ...,
stream: Literal[True] = True,
mapping_set_id: str | None = ...,
) -> Generator[Mapping]: ...


Expand All @@ -691,6 +692,7 @@ def write_sssom(
add_labels: bool = ...,
prune: bool = ...,
stream: Literal[False] = False,
mapping_set_id: str | None = ...,
) -> None: ...


Expand All @@ -701,18 +703,20 @@ def write_sssom(
add_labels: bool = False,
prune: bool = True,
stream: bool = False,
mapping_set_id: str | None = None,
) -> None | Generator[Mapping]:
"""Export mappings as an SSSOM file (could be lossy)."""
if not prune:
if stream:
return _write_sssom_stream(mappings, file, stream=stream, add_labels=add_labels)
else:
return _write_sssom_stream(mappings, file, stream=stream, add_labels=add_labels)
return _write_sssom_stream( # type:ignore[no-any-return,call-overload]
mappings, file, stream=stream, add_labels=add_labels, mapping_set_id=mapping_set_id
)
elif stream:
raise ValueError("can not prune and stream at the same time")
else:
df = get_sssom_df(mappings, add_labels=add_labels)
df.to_csv(file, sep="\t", index=False)
with safe_open(file, operation="write", representation="text") as f:
_write_mapping_set_id(f, mapping_set_id)
df.to_csv(f, sep="\t", index=False)
return None


Expand All @@ -724,6 +728,7 @@ def _write_sssom_stream(
*,
stream: Literal[False] = False,
add_labels: bool = ...,
mapping_set_id: str | None = ...,
) -> None: ...


Expand All @@ -735,6 +740,7 @@ def _write_sssom_stream(
*,
stream: Literal[True] = True,
add_labels: bool = ...,
mapping_set_id: str | None = ...,
) -> Generator[Mapping]: ...


Expand All @@ -744,24 +750,37 @@ def _write_sssom_stream(
*,
stream: bool = False,
add_labels: bool = False,
mapping_set_id: str | None = None,
) -> Generator[Mapping] | None:
fallback_mapping_set_id = _get_fallback_mapping_set_id()
it = tqdm(mappings, desc="Writing SSSOM", leave=False, unit="mapping", unit_scale=True)
yv = _stream_write_sssom(
file, it, fallback_mapping_set_id, add_labels=add_labels, mapping_set_id=mapping_set_id
)
if stream:
return _stream_write_sssom(file, it, fallback_mapping_set_id, add_labels=add_labels)
return yv
else:
for _ in _stream_write_sssom(file, it, fallback_mapping_set_id, add_labels=add_labels):
for _ in yv:
pass
return None


def _write_mapping_set_id(file: TextIO, mapping_set_id: str | None) -> None:
    """Write the ``#mapping_set_id`` comment line to an open text file.

    :param file: A writable text handle; exactly one line is written to it.
    :param mapping_set_id: The identifier to record. When ``None``, the value
        returned by ``_get_fallback_mapping_set_id()`` is written instead.
    """
    resolved_id = _get_fallback_mapping_set_id() if mapping_set_id is None else mapping_set_id
    file.write(f"#mapping_set_id: {resolved_id}\n")


def _stream_write_sssom(
path: str | Path | TextIO,
mappings: Iterable[Mapping],
fallback_mapping_set_id: str,
add_labels: bool = False,
mapping_set_id: str | None = None,
) -> Generator[Mapping]:
with safe_open_writer(path) as writer:
with safe_open(path, operation="write", representation="text") as file:
_write_mapping_set_id(file, mapping_set_id)
writer = csv.writer(file, delimiter="\t", quoting=csv.QUOTE_NONE)
writer.writerow(SSSOM_DEFAULT_COLUMNS)
for mapping in mappings:
for evidence in mapping.evidence:
Expand All @@ -775,24 +794,14 @@ def _stream_write_sssom(

def write_pickle(mappings: list[Mapping], path: str | Path) -> None:
    """Write the mappings as a pickle.

    :param mappings: The mappings to serialize.
    :param path: The destination path, passed unchanged to ``safe_open``.
    """
    # Serialize exactly once. The body previously carried a hand-rolled
    # gzip/plain branch followed by this same write, which would have pickled
    # the list to the same path twice.
    # NOTE(review): this relies on ``safe_open`` choosing gzip for ``.gz``
    # paths, as the removed branch did explicitly - confirm against pystow.
    with safe_open(path, representation="binary", operation="write") as file:
        pickle.dump(mappings, file, protocol=pickle.HIGHEST_PROTOCOL)


def from_pickle(path: str | Path) -> list[Mapping]:
    """Read the mappings from a pickle.

    :param path: The pickle file to read, passed unchanged to ``safe_open``.
    :returns: The unpickled object, cast (not validated) to a list of mappings.

    .. warning::

        Unpickling can execute arbitrary code. Only load files that come
        from a trusted source.
    """
    # A single read path. The body previously returned from both arms of a
    # hand-rolled gzip/plain branch, leaving this read unreachable.
    # NOTE(review): this relies on ``safe_open`` choosing gzip for ``.gz``
    # paths, as the removed branch did explicitly - confirm against pystow.
    with safe_open(path, representation="binary") as file:
        return cast(list[Mapping], pickle.load(file))


# docstr-coverage:excused `overload`
Expand Down Expand Up @@ -829,33 +838,34 @@ def write_jsonl(
unit_scale=True,
disable=not show_progress,
)
# need this to include the evidence_type
kwargs = {"exclude_defaults": False, "exclude_unset": False}
if stream:
return _stream_write_jsonl(models, path)
return stream_write_pydantic_jsonl(models, path, **kwargs)
else:
with safe_open(path, read=False) as file:
for model in models:
file.write(f"{model.model_dump_json(exclude_none=True)}\n")
write_pydantic_jsonl(models, path, **kwargs)
return None


def _stream_write_jsonl(models: Iterable[X], path: str | Path) -> Generator[X]:
    """Write models to a JSONL file one line at a time, yielding each after it is written.

    This is a generator, so the file is only opened, and lines only written,
    as the caller iterates.

    :param models: The models to serialize; each is dumped with
        ``model_dump_json(exclude_none=True)``.
    :param path: The destination path, opened through ``safe_open`` for writing.
    :yields: Each model, immediately after its line has been written.
    """
    with safe_open(path, read=False) as sink:
        for record in models:
            serialized = record.model_dump_json(exclude_none=True)
            sink.write(f"{serialized}\n")
            yield record


# docstr-coverage:excused `overload`
@overload
def from_jsonl(
path: str | Path, *, show_progress: bool = ..., stream: Literal[False] = False
path: str | Path,
*,
show_progress: bool = ...,
stream: Literal[False] = False,
failure_action: Literal["raise", "skip"] = ...,
) -> list[Mapping]: ...


# docstr-coverage:excused `overload`
@overload
def from_jsonl(
path: str | Path, *, show_progress: bool = ..., stream: Literal[True] = True
path: str | Path,
*,
show_progress: bool = ...,
stream: Literal[True] = True,
failure_action: Literal["raise", "skip"] = ...,
) -> Iterable[Mapping]: ...


Expand All @@ -867,38 +877,8 @@ def from_jsonl(
failure_action: Literal["raise", "skip"] = "skip",
) -> list[Mapping] | Generator[Mapping]:
"""Read a list of Mapping objects from a JSONL file."""
rv = _iter_read_jsonl(path, show_progress=show_progress, failure_action=failure_action)
rv = iter_pydantic_jsonl(path, Mapping, progress=show_progress, failure_action=failure_action)
if stream:
return rv
return rv # type:ignore[return-value]
else:
return list(rv)


def _iter_read_jsonl(
    path: str | Path,
    *,
    show_progress: bool = False,
    failure_action: Literal["raise", "skip"] = "skip",
) -> Generator[Mapping]:
    """Lazily parse a JSONL file into mapping objects, one per line.

    :param path: The JSONL file to read, opened through ``safe_open``.
    :param show_progress: If true, display a progress bar while reading.
    :param failure_action: What to do when a line fails validation.
        ``"raise"`` re-raises the :class:`pydantic.ValidationError`; any other
        value logs the zero-based line index at debug level and moves on.
    :yields: One mapping for each line that validates.
    """
    with safe_open(path, read=True) as file:
        lines = tqdm(
            file,
            desc="Reading mappings",
            leave=False,
            unit="mapping",
            unit_scale=True,
            disable=not show_progress,
        )
        for index, raw_line in enumerate(lines):
            try:
                mapping = Mapping.model_validate_json(raw_line.strip())
            except pydantic.ValidationError:
                if failure_action == "raise":
                    raise
                logger.debug("[line:%d] failed to parse JSON", index)
                continue
            # The yield stays outside the try so that an exception thrown
            # into the generator is never mistaken for a parse failure.
            yield mapping
Loading
Loading