Better text

cthoyt · cthoyt · commit e983c1a698b2 · 2025-07-04T00:56:40.000+02:00
diff --git a/README.md b/README.md
@@ -71,29 +71,29 @@ mapping = Mapping(
 
 ### Assembly
 
-Mappings can be assembled from many source formats using functions in the
-`semra.io` submodule:
+Mappings can be assembled from many source formats using I/O functions exposed
+through the top-level `semra` package:
 
 ```python
-import semra.io
+import semra
 
 # load mappings from any standardized SSSOM file as a file path or URL, via `pandas.read_csv`
 sssom_url = "https://w3id.org/biopragmatics/biomappings/sssom/biomappings.sssom.tsv"
-mappings = semra.io.from_sssom(
+mappings = semra.from_sssom(
     sssom_url, license="spdx:CC0-1.0", mapping_set_title="biomappings",
 )
 
 # alternatively, metadata can be passed via a file/URL
-mappings_alt = semra.io.from_sssom(
+mappings_alt = semra.from_sssom(
     sssom_url,
     metadata="https://w3id.org/biopragmatics/biomappings/sssom/biomappings.sssom.yml"
 )
 
 # load mappings from the Gene Ontology (via OBO format)
-go_mappings = semra.io.from_pyobo("go")
+go_mappings = semra.from_pyobo("go")
 
 # load mappings from the Uber Anatomy Ontology (via OWL format)
-uberon_mappings = semra.io.from_bioontologies("uberon")
+uberon_mappings = semra.from_bioontologies("uberon")
 ```
 
 SeMRA also implements custom importers in the `semra.sources` submodule. It's
@@ -281,7 +281,7 @@ these references can be standardized in a deterministic and principled way.
 
 ```python
 import chembl_downloader
-import semra.io
+import semra
 from semra.api import prioritize_df
 
 # A dataframe of indication-disease pairs, where the
@@ -291,7 +291,7 @@ df = chembl_downloader.query("SELECT DISTINCT drugind_id, efo_id FROM DRUG_INDIC
 # a pre-calculated prioritization of diseases and phenotypes from MONDO, DOID,
 # HPO, ICD, GARD, and more.
 url = "https://zenodo.org/records/15164180/files/priority.sssom.tsv?download=1"
-mappings = semra.io.from_sssom(url)
+mappings = semra.from_sssom(url)
 
 # the dataframe will now have a new column with standardized references
 prioritize_df(mappings, df, column="efo_id", target_column="priority_indication_curie")
diff --git a/docs/source/index.rst b/docs/source/index.rst
@@ -31,12 +31,13 @@ the digital humanities. Get started by loading external mappings:
 
 .. code-block:: python
 
-    import semra.io
+    import semra
 
-    # load mappings from any standardized SSSOM file as a file path or URL, via `pandas.read_csv`
-    sssom_url = "https://w3id.org/biopragmatics/biomappings/sssom/biomappings.sssom.tsv"
-    mappings = semra.io.from_sssom(
-        sssom_url, license="spdx:CC0-1.0", mapping_set_title="biomappings",
+    mappings = semra.from_sssom(
+        # load mappings from any standardized SSSOM file as a file path or URL
+        "https://w3id.org/biopragmatics/biomappings/sssom/biomappings.sssom.tsv",
+        license="spdx:CC0-1.0",
+        mapping_set_title="biomappings",
     )
 
 Or by creating your own mappings:
@@ -97,9 +98,9 @@ web application for your use-case specific mapping database.
 SeMRA isn't itself a curation tool, but it has the option to integrate :mod:`biomappings`
 in deployments of its local web application for curation purposes.
 
-SeMRA isn't an tool for merging ontologies like `CoMerger <https://arxiv.org/abs/2005.02659>`_,
-but it outputs detailed and comprehensive semantic mappings that are critical
-as input for such tools.
+SeMRA isn't a tool for merging ontologies like `CoMerger <https://arxiv.org/abs/2005.02659>`_
+or `OntoMerger <https://arxiv.org/abs/2206.02238>`_, but it outputs detailed
+and comprehensive semantic mappings that are critical as input for such tools.
 
 Artifacts Overview
 ------------------
@@ -149,11 +150,11 @@ Table of Contents
     :name: start
 
     installation
-    tutorial
-    io
     pipeline
     artifacts
+    tutorial
     struct
+    io
     reference
     cli
 
diff --git a/docs/source/pipeline.rst b/docs/source/pipeline.rst
@@ -3,3 +3,4 @@ Mapping Assembly Pipeline
 
 .. automodapi:: semra.pipeline
     :no-heading:
+    :no-inheritance-diagram:
diff --git a/docs/source/reference.rst b/docs/source/reference.rst
@@ -1,7 +1,11 @@
 Reference
 =========
 
+This section documents several SeMRA submodules with low-level functionality. You can use
+these to build your own mapping processing workflows and I/O.
+
 .. automodapi:: semra.api
+    :no-inheritance-diagram:
 
 .. automodapi:: semra.inference
 
diff --git a/docs/source/tutorial.rst b/docs/source/tutorial.rst
@@ -1,12 +1,5 @@
-Usage
-=====
-
-1. I/O
-2. How to make a configuration and run it
-3. How to apply results
-
-Data Science Tutorial
----------------------
+Prioritizing CURIEs in a Dataframe
+==================================
 
 SeMRA provides tools for data scientists to standardize references using semantic
 mappings.
diff --git a/notebooks/gilda_reprocess.py b/notebooks/gilda_reprocess.py
@@ -5,18 +5,18 @@
 import pystow
 from gilda import Grounder
 from gilda.grounder import load_entries_from_terms_file
-from gilda.resources import get_grounding_terms, resource_dir
+from gilda.resources import get_grounding_terms
 
 from semra.gilda_utils import (
     GILDA_TO_BIOREGISTRY,
     print_scored_matches,
-    standardize_terms,
-    update_terms,
+    standardize_gilda_terms,
+    update_gilda_terms,
 )
-from semra.pipeline import Configuration, Input, Mutation, get_priority_mappings_from_config
+from semra.pipeline import AssembleReturnType, Configuration, Input, Mutation, assemble
 
 MODULE = pystow.module("semra", "gilda-demo")
-PROCESSED_GILDA_TERMS_PATH = resource_dir.joinpath("grounding_terms_standardized.tsv.gz")
+PROCESSED_GILDA_TERMS_PATH = MODULE.join(name="grounding_terms_standardized.tsv.gz")
 
 PRIORITY = [
     "HP",
@@ -42,6 +42,8 @@
 PRIORITY = [GILDA_TO_BIOREGISTRY[p] for p in PRIORITY]
 
 CONFIGURATION = Configuration(
+    key="gilda",
+    name="Gilda Reprocessing",
     inputs=[
         Input(source="biomappings"),
         Input(source="gilda"),
@@ -72,14 +74,14 @@ def _get_terms() -> list[gilda.Term]:
     from gilda.generate_terms import dump_terms
 
     terms: list[gilda.Term] = list(load_entries_from_terms_file(get_grounding_terms()))
-    terms = standardize_terms(terms)
+    terms = standardize_gilda_terms(terms)
     dump_terms(terms, PROCESSED_GILDA_TERMS_PATH)
     return terms
 
 
-def main():
+def main() -> None:
     """Reprocess the gilda default lexical index."""
-    mappings = get_priority_mappings_from_config(CONFIGURATION)
+    mappings = assemble(CONFIGURATION, return_type=AssembleReturnType.priority)
     if not mappings:
         raise ValueError("Bad mapping priority definition resulted in no mappings")
 
@@ -91,7 +93,7 @@ def main():
     if missing:
         raise ValueError(f"Missing: {sorted(missing)}")
 
-    terms = update_terms(terms, mappings)
+    terms = update_gilda_terms(terms, mappings)
 
     grounder = Grounder(terms)
     s = "Pelvic lipomatosis"
diff --git a/src/semra/__init__.py b/src/semra/__init__.py
@@ -1,5 +1,6 @@
 """Semantic Mapping Reasoner and Assembler."""
 
+from semra.io import from_bioontologies, from_jsonl, from_pyobo, from_sssom
 from semra.pipeline import Configuration, Input, Mutation
 from semra.struct import Evidence, Mapping, MappingSet, ReasonedEvidence, Reference, SimpleEvidence
 from semra.vocabulary import (
@@ -33,4 +34,8 @@
     "ReasonedEvidence",
     "Reference",
     "SimpleEvidence",
+    "from_bioontologies",
+    "from_jsonl",
+    "from_pyobo",
+    "from_sssom",
 ]
diff --git a/src/semra/gilda_utils.py b/src/semra/gilda_utils.py
@@ -19,7 +19,7 @@
 from semra.struct import Mapping
 
 __all__ = [
-    "update_terms",
+    "update_gilda_terms",
 ]
 
 logger = logging.getLogger(__name__)
@@ -49,7 +49,7 @@
 REVERSE_GILDA_MAP = {v: k for k, v in GILDA_TO_BIOREGISTRY.items()}
 
 
-def update_terms(terms: list[gilda.Term], mappings: list[Mapping]) -> list[gilda.Term]:
+def update_gilda_terms(terms: list[gilda.Term], mappings: list[Mapping]) -> list[gilda.Term]:
     """Use a priority mapping to re-write terms with priority groundings.
 
     :param terms: A list of Gilda term objects
@@ -93,7 +93,7 @@ def update_terms(terms: list[gilda.Term], mappings: list[Mapping]) -> list[gilda
         source_terms = terms_index.pop(mapping.subject.pair, None)
         if source_terms:
             terms_index[mapping.object.pair].extend(
-                make_new_term(term, mapping.object.prefix, mapping.object.identifier)
+                make_new_gilda_term(term, mapping.object.prefix, mapping.object.identifier)
                 for term in source_terms
             )
 
@@ -102,16 +102,16 @@ def update_terms(terms: list[gilda.Term], mappings: list[Mapping]) -> list[gilda
     return cast(list[gilda.Term], gilda.term.filter_out_duplicates(new_terms))
 
 
-def standardize_terms(
+def standardize_gilda_terms(
     terms: t.Iterable[gilda.Term], *, multiprocessing: bool = True
 ) -> list[gilda.Term]:
     """Standardize a list of terms."""
     if not multiprocessing:
-        return [standardize_term(t) for t in terms]
+        return [standardize_gilda_term(t) for t in terms]
     return cast(
         list[gilda.Term],
         process_map(
-            standardize_term,
+            standardize_gilda_term,
             terms,
             unit="term",
             unit_scale=True,
@@ -121,7 +121,7 @@ def standardize_terms(
     )
 
 
-def standardize_term(term: gilda.Term) -> gilda.Term:
+def standardize_gilda_term(term: gilda.Term) -> gilda.Term:
     """Standardize a term's prefix and identifier to the Bioregistry standard."""
     prefix = bioregistry.normalize_prefix(term.db)
     if prefix is None:
@@ -137,7 +137,7 @@ def standardize_term(term: gilda.Term) -> gilda.Term:
     return term
 
 
-def make_new_term(
+def make_new_gilda_term(
     term: gilda.Term,
     target_db: str,
     target_id: str,
diff --git a/src/semra/pipeline.py b/src/semra/pipeline.py
diff --git a/src/semra/templates/config-summary.md b/src/semra/templates/config-summary.md
diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py

Original file line number	Diff line number	Diff line change
`@@ -3,3 +3,4 @@ Mapping Assembly Pipeline`
`3`	`3`
`4`	`4`	`.. automodapi:: semra.pipeline`
`5`	`5`	`:no-heading:`
	`6`	`+ :no-inheritance-diagram:`