From 563f56d9ccfa556fb0a3973510d45ec23a854443 Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Tue, 8 Jul 2025 12:04:20 -0400 Subject: [PATCH 1/6] Add cypher docs --- docs/source/cypher.rst | 119 +++++++++++++++++++++++++++++++++++++++ docs/source/index.rst | 1 + src/semra/io/neo4j_io.py | 20 ++++++- 3 files changed, 139 insertions(+), 1 deletion(-) create mode 100644 docs/source/cypher.rst diff --git a/docs/source/cypher.rst b/docs/source/cypher.rst new file mode 100644 index 00000000..542a0013 --- /dev/null +++ b/docs/source/cypher.rst @@ -0,0 +1,119 @@ +Querying with Cypher +==================== + +SeMRA constructs locally-deployable Neo4j graph databases that +can be queried directly with the `Cypher query language `_. +By default, this works by the Docker image exposing port 7687. + +Alternatively, you can navigate to http://localhost:7474 for +a graphical front-end to Neo4j where you can type in Cypher +queries and interact with the results. + +In the following examples, we'll use the cell and cell lines +database. + +Data Model +---------- +.. todo:: put data model overview image here + +Lookup by CURIE +--------------- +Look up a concept (e.g., a cell line) by its CURIE: + +.. code-block:: cypher + + MATCH (n:concept) + WHERE n.curie = "cellosaurus:0440" + RETURN n + +The same is possible for mappings, evidences, and mapping sets. +Each of these three types of entities has SeMRA-specific CURIE +generation. For a mapping: + +.. code-block:: cypher + + MATCH (m:mapping) + WHERE m.curie = "..." + RETURN m + +For an evidence: + +.. code-block:: cypher + + MATCH (e:evidence) + WHERE e.curie = "..." + RETURN e + +For a mapping set: + +.. code-block:: cypher + + MATCH (s:mappingset) + WHERE s.curie = "..." + RETURN s + +Cypher also lets you return certain parts from each record. 
+The list of what fields are available can be found in the following documentation: + +=========== ============================================== +Concept :data:`semra.io.neo4j_io.CONCEPT_NODES_HEADER` +Mapping :data:`semra.io.neo4j_io.MAPPING_NODES_HEADER` +Evidence :data:`semra.io.neo4j_io.EVIDENCE_NODES_HEADER` +Mapping Set :data:`semra.io.neo4j_io.MAPPING_NODES_HEADER` +=========== ============================================== + +For example, you can look up a concept +by its CURIE and return specific parts, such as the name: + +.. code-block:: cypher + + MATCH (n:concept) + WHERE n.curie = "cellosaurus:0440" + RETURN n.name + + +Traversing Mappings +------------------- + + +Get all mappings where ``cellosaurus:0440`` +is the source: + +.. code-block:: cypher + + MATCH (m:mapping)-[:`owl:annotatedSource`]->(source:concept) + WHERE source.curie = "cellosaurus:0440" + RETURN m + +Get all targets where there exists a mapping with ``cellosaurus:0440`` +is the source: + +.. code-block:: cypher + + MATCH + (m:mapping)-[:`owl:annotatedSource`]->(source:concept), + (m)-[:`owl:annotatedTarget`]->(target:concept) + WHERE source.curie = "cellosaurus:0440" + RETURN target + +Get all mappings (with associated evidences, mapping sets, and authors) +where ``cellosaurus:0440`` is the source: + +.. code-block:: cypher + + MATCH + (m:mapping)-[:`owl:annotatedSource`]->(source:concept) , + (m)-[:hasEvidence]->(e:evidence) + WHERE source.curie = "cellosaurus:0440" + OPTIONAL MATCH + (e)-[:fromSet]->(mset:mappingset) + OPTIONAL MATCH + (e)-[:hasAuthor]->(author) + RETURN m + +Neo4j Output Reference +---------------------- +.. 
automodapi:: semra.io.neo4j_io + :skip: write_neo4j + :include-all-objects: + :no-heading: diff --git a/docs/source/index.rst b/docs/source/index.rst index 96287302..1f70b286 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -157,6 +157,7 @@ Table of Contents io reference cli + cypher Indices and Tables ------------------ diff --git a/src/semra/io/neo4j_io.py b/src/semra/io/neo4j_io.py index 775549bb..96bad432 100644 --- a/src/semra/io/neo4j_io.py +++ b/src/semra/io/neo4j_io.py @@ -27,6 +27,16 @@ __all__ = [ "write_neo4j", + "MAPPING_NODES_HEADER", + "EVIDENCE_NODES_HEADER", + "MAPPING_SET_NODES_HEADER", + "CONCEPT_NODES_HEADER", + "EDGES_HEADER", + "EDGES_SUPPLEMENT_HEADER", + "HAS_EVIDENCE_PREDICATE", + "FROM_SET_PREDICATE", + "DERIVED_PREDICATE", + "HAS_AUTHOR_PREDICATE", ] HERE = Path(__file__).parent.resolve() @@ -39,7 +49,9 @@ PYTHON = "python3.13" +#: The column headers for the concept nodes in the SeMRA Neo4j graph database export CONCEPT_NODES_HEADER = ["curie:ID", "prefix", "name", "priority:boolean"] +#: The column headers for the mapping nodes in the SeMRA Neo4j graph database export MAPPING_NODES_HEADER = [ "curie:ID", "prefix", @@ -49,6 +61,7 @@ "secondary:boolean", "tertiary:boolean", ] +#: The column headers for evidence nodes in the SeMRA Neo4j graph database export EVIDENCE_NODES_HEADER = [ "curie:ID", "prefix", @@ -65,6 +78,8 @@ "version", "confidence:float", ] + +#: The column headers for properties attached to simple mappings EDGES_HEADER = [ ":START_ID", ":TYPE", @@ -75,7 +90,10 @@ "tertiary:boolean", "mapping_sets:string[]", ] -# for extra edges that aren't mapping edges +#: for extra edges that aren't mapping edges, such as +#: those with :data:`HAS_EVIDENCE_PREDICATE`, +#: :data:`FROM_SET_PREDICATE`, :data:`DERIVED_PREDICATE`, +#: and :data:`HAS_AUTHOR_PREDICATE` EDGES_SUPPLEMENT_HEADER = [ ":START_ID", ":TYPE", From 787d7ec0856ba90fa22949bc0ae6c84c77388130 Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Tue, 8 Jul 
2025 12:17:47 -0400 Subject: [PATCH 2/6] Update cypher.rst --- docs/source/cypher.rst | 125 ++++++++++++++++++++++++++++++----------- 1 file changed, 93 insertions(+), 32 deletions(-) diff --git a/docs/source/cypher.rst b/docs/source/cypher.rst index 542a0013..f40a97de 100644 --- a/docs/source/cypher.rst +++ b/docs/source/cypher.rst @@ -1,23 +1,27 @@ Querying with Cypher ==================== -SeMRA constructs locally-deployable Neo4j graph databases that -can be queried directly with the `Cypher query language `_. -By default, this works by the Docker image exposing port 7687. +SeMRA constructs locally-deployable Neo4j graph databases that can be queried directly +with the `Cypher query language +`_. By default, this works +by the Docker image exposing port 7687. -Alternatively, you can navigate to http://localhost:7474 for -a graphical front-end to Neo4j where you can type in Cypher -queries and interact with the results. +Alternatively, you can navigate to http://localhost:7474 for a graphical front-end to +Neo4j where you can type in Cypher queries and interact with the results. -In the following examples, we'll use the cell and cell lines -database. +In the following examples, we'll use the cell and cell lines database. Data Model ---------- + .. todo:: put data model overview image here Lookup by CURIE --------------- + +The following Cypher queries allow for looking up concepts, mappings, evidences, and +mapping sets in a Neo4j database output by SeMRA (using :func:`semra.write_neo4j`). + Look up a concept (e.g., a cell line) by its CURIE: .. code-block:: cypher @@ -26,9 +30,8 @@ Look up a concept (e.g., a cell line) by its CURIE: WHERE n.curie = "cellosaurus:0440" RETURN n -The same is possible for mappings, evidences, and mapping sets. -Each of these three types of entities has SeMRA-specific CURIE -generation. For a mapping: +The same is possible for mappings, evidences, and mapping sets. 
Each of these three +types of entities has SeMRA-specific CURIE generation. For a mapping: .. code-block:: cypher @@ -52,18 +55,18 @@ For a mapping set: WHERE s.curie = "..." RETURN s -Cypher also lets you return certain parts from each record. -The list of what fields are available can be found in the following documentation: +Cypher also lets you return certain parts from each record. The list of what fields are +available can be found in the following documentation: -=========== ============================================== +=========== =============================================== Concept :data:`semra.io.neo4j_io.CONCEPT_NODES_HEADER` Mapping :data:`semra.io.neo4j_io.MAPPING_NODES_HEADER` Evidence :data:`semra.io.neo4j_io.EVIDENCE_NODES_HEADER` Mapping Set :data:`semra.io.neo4j_io.MAPPING_NODES_HEADER` -=========== ============================================== +=========== =============================================== -For example, you can look up a concept -by its CURIE and return specific parts, such as the name: +For example, you can look up a concept by its CURIE and return specific parts, such as +the name: .. code-block:: cypher @@ -71,48 +74,106 @@ by its CURIE and return specific parts, such as the name: WHERE n.curie = "cellosaurus:0440" RETURN n.name - Traversing Mappings ------------------- - -Get all mappings where ``cellosaurus:0440`` -is the source: +Get all targets for exact match mappings where ``cellosaurus:0440`` is the source: .. code-block:: cypher - MATCH (m:mapping)-[:`owl:annotatedSource`]->(source:concept) + MATCH + (source:concept)-[:`skos:exactMatch`]->(target:concept) WHERE source.curie = "cellosaurus:0440" - RETURN m + RETURN target -Get all targets where there exists a mapping with ``cellosaurus:0440`` -is the source: +The same query can be reified using ``owl:annotatedSource``, ``owl:annotatedTarget``, +and the ``mapping`` node type: .. 
code-block:: cypher MATCH - (m:mapping)-[:`owl:annotatedSource`]->(source:concept), - (m)-[:`owl:annotatedTarget`]->(target:concept) - WHERE source.curie = "cellosaurus:0440" + (m:mapping)-[:`owl:annotatedSource`]->(source:concept) , + (m)-[:`owl:annotatedTarget`]->(target:concept) + WHERE source.curie = "cellosaurus:0440" and m.predicate = "skos:exactMatch" + RETURN target + +After reifying, you can extend the query to return evidences. In the interactive view, +returning multiple elements will also automatically show edges between them: + +.. code-block:: cypher + + MATCH + (m:mapping)-[:`owl:annotatedSource`]->(source:concept) , + (m)-[:`owl:annotatedTarget`]->(target:concept) , + (m)-[:hasEvidence]->(e:evidence) + WHERE source.curie = "cellosaurus:0440" and m.predicate = "skos:exactMatch" + RETURN source, target, m, e + +Reification is useful for doing complex filters, e.g., on mapping justification. The +following query returns exact matches to ``cellosaurus:0440`` that have manual mapping +justification: + +.. code-block:: cypher + + MATCH + (m:mapping)-[:`owl:annotatedSource`]->(source:concept) , + (m)-[:`owl:annotatedTarget`]->(target:concept) , + (m)-[:hasEvidence]->(e:evidence) + WHERE + source.curie = "cellosaurus:0440" + and m.predicate = "skos:exactMatch" + and e.mapping_justification = "semapv:ManualMappingCuration" RETURN target -Get all mappings (with associated evidences, mapping sets, and authors) -where ``cellosaurus:0440`` is the source: +The previous query can be reformulated to filter for minimum confidence: .. code-block:: cypher MATCH (m:mapping)-[:`owl:annotatedSource`]->(source:concept) , + (m)-[:`owl:annotatedTarget`]->(target:concept) , + (m)-[:hasEvidence]->(e:evidence) + WHERE + source.curie = "cellosaurus:0440" + and m.predicate = "skos:exactMatch" + and e.confidence > 0.3 + RETURN target + +It can also be extended to return the authors of the evidences: + +.. 
code-block:: cypher + + MATCH + (m:mapping)-[:`owl:annotatedSource`]->(source:concept) , + (m)-[:`owl:annotatedTarget`]->(target:concept) , + (m)-[:hasEvidence]->(e:evidence) , + (e)-[:hasAuthor]->(author:concept) + WHERE + source.curie = "cellosaurus:0440" + and m.predicate = "skos:exactMatch" + and e.mapping_justification = "semapv:ManualMappingCuration" + RETURN target, author + +The following query gets all mappings (with associated evidences, mapping sets, and +authors) where ``cellosaurus:0440`` is the source, with optional matches for mapping +sets and authors: + +.. code-block:: cypher + + MATCH + (m:mapping)-[:`owl:annotatedSource`]->(source:concept) , + (m:mapping)-[:`owl:annotatedTarget`]->(target:concept) , (m)-[:hasEvidence]->(e:evidence) WHERE source.curie = "cellosaurus:0440" OPTIONAL MATCH (e)-[:fromSet]->(mset:mappingset) OPTIONAL MATCH - (e)-[:hasAuthor]->(author) - RETURN m + (e)-[:hasAuthor]->(author:concept) + RETURN source, target, m, e, mset, author Neo4j Output Reference ---------------------- + .. 
automodapi:: semra.io.neo4j_io :skip: write_neo4j :include-all-objects: From e82adf0db004385e086bb39780dd47c179774d23 Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Tue, 8 Jul 2025 12:17:52 -0400 Subject: [PATCH 3/6] Update neo4j_io.py --- src/semra/io/neo4j_io.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/semra/io/neo4j_io.py b/src/semra/io/neo4j_io.py index 96bad432..decbbc7d 100644 --- a/src/semra/io/neo4j_io.py +++ b/src/semra/io/neo4j_io.py @@ -26,17 +26,17 @@ from ..utils import gzip_path __all__ = [ - "write_neo4j", - "MAPPING_NODES_HEADER", - "EVIDENCE_NODES_HEADER", - "MAPPING_SET_NODES_HEADER", "CONCEPT_NODES_HEADER", + "DERIVED_PREDICATE", "EDGES_HEADER", "EDGES_SUPPLEMENT_HEADER", - "HAS_EVIDENCE_PREDICATE", + "EVIDENCE_NODES_HEADER", "FROM_SET_PREDICATE", - "DERIVED_PREDICATE", "HAS_AUTHOR_PREDICATE", + "HAS_EVIDENCE_PREDICATE", + "MAPPING_NODES_HEADER", + "MAPPING_SET_NODES_HEADER", + "write_neo4j", ] HERE = Path(__file__).parent.resolve() From 1d16937d5ba143baa98ef92a457aa7b62d28d846 Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Tue, 8 Jul 2025 12:28:48 -0400 Subject: [PATCH 4/6] Add chart --- docs/source/cypher.rst | 2 +- docs/source/img/graph-schema.svg | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) create mode 100644 docs/source/img/graph-schema.svg diff --git a/docs/source/cypher.rst b/docs/source/cypher.rst index f40a97de..8655db60 100644 --- a/docs/source/cypher.rst +++ b/docs/source/cypher.rst @@ -14,7 +14,7 @@ In the following examples, we'll use the cell and cell lines database. Data Model ---------- -.. todo:: put data model overview image here +.. 
image:: img/graph-schema.svg Lookup by CURIE --------------- diff --git a/docs/source/img/graph-schema.svg b/docs/source/img/graph-schema.svg new file mode 100644 index 00000000..7a2ee15d --- /dev/null +++ b/docs/source/img/graph-schema.svg @@ -0,0 +1 @@ + \ No newline at end of file From 93a3ace729cc252d4f34ca7ae33e6b39fbe806df Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Tue, 8 Jul 2025 12:35:23 -0400 Subject: [PATCH 5/6] Update --- docs/source/cypher.rst | 27 ++++++++++++++++----------- src/semra/pipeline.py | 4 +++- 2 files changed, 19 insertions(+), 12 deletions(-) diff --git a/docs/source/cypher.rst b/docs/source/cypher.rst index 8655db60..2fa06850 100644 --- a/docs/source/cypher.rst +++ b/docs/source/cypher.rst @@ -1,26 +1,31 @@ Querying with Cypher ==================== -SeMRA constructs locally-deployable Neo4j graph databases that can be queried directly -with the `Cypher query language -`_. By default, this works -by the Docker image exposing port 7687. +SeMRA constructs data artifacts and docker configuration for locally deploying a Neo4j +graph database and a web application via :func:`semra.io.write_neo4j` (for example +outputs, see :mod:`semra.database` or :mod:`semra.landscape`). The resulting graph +database can be queried directly with the `Cypher query language +`_ in one of the following +ways: -Alternatively, you can navigate to http://localhost:7474 for a graphical front-end to -Neo4j where you can type in Cypher queries and interact with the results. +1. By connecting with a client via the ``bolt`` protocol on port 7687, which is exposed + in the Dockerfile +2. By navigating to http://localhost:7474 in the web browser to use Neo4j's built-in + graphical front-end, where you can type in Cypher queries and interact with the + results. -In the following examples, we'll use the cell and cell lines database. - -Data Model ----------- +The contents of the graph database have the following schema: .. 
image:: img/graph-schema.svg +Below, some example Cypher queries are given to show what is possible by direct querying +of the database. + Lookup by CURIE --------------- The following Cypher queries allow for looking up concepts, mappings, evidences, and -mapping sets in a Neo4j database output by SeMRA (using :func:`semra.write_neo4j`). +mapping sets. Look up a concept (e.g., a cell line) by its CURIE: diff --git a/src/semra/pipeline.py b/src/semra/pipeline.py index 01171bd8..7a1a38dd 100644 --- a/src/semra/pipeline.py +++ b/src/semra/pipeline.py @@ -44,7 +44,9 @@ confidence=0.99, extras={"version": "22Q4", "standardize": True, "license": "CC-BY-4.0"}, ), - Input(prefix="ccle", source="pyobo", confidence=0.99, extras={"version": "2019"}), + Input( + prefix="ccle", source="pyobo", confidence=0.99, extras={"version": "2019"} + ), Input(prefix="ncit", source="pyobo", confidence=0.99), Input(prefix="umls", source="pyobo", confidence=0.99), ], From e9242a37a1da9022936dcaeaefddb3f893ca370a Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Tue, 8 Jul 2025 12:37:22 -0400 Subject: [PATCH 6/6] Update pipeline.py --- src/semra/pipeline.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/semra/pipeline.py b/src/semra/pipeline.py index 7a1a38dd..01171bd8 100644 --- a/src/semra/pipeline.py +++ b/src/semra/pipeline.py @@ -44,9 +44,7 @@ confidence=0.99, extras={"version": "22Q4", "standardize": True, "license": "CC-BY-4.0"}, ), - Input( - prefix="ccle", source="pyobo", confidence=0.99, extras={"version": "2019"} - ), + Input(prefix="ccle", source="pyobo", confidence=0.99, extras={"version": "2019"}), Input(prefix="ncit", source="pyobo", confidence=0.99), Input(prefix="umls", source="pyobo", confidence=0.99), ],