
Commit e899537

Merge pull request #206 from TranslatorSRI/add-kgx-conversion
This PR moves the KGX exporter from NodeNorm (specifically, https://github.com/TranslatorSRI/NodeNormalization/blob/68096b2f16e6c2eedb699178ace71cea98dc794f/node_normalizer/loader.py#L70-L208) into this repo and sets up Snakemake to generate the KGX files with every Babel run. Closes #61. Closes NCATSTranslator/NodeNormalization#95.
2 parents 9071e56 + 7b04ddb commit e899537
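To make the conversion concrete before diving into the diff: each compendium line is a clique of equivalent identifiers, and the exporter emits one KGX node per identifier plus one biolink:same_as edge per identifier pair. A sketch with hypothetical CURIEs and label (the real compendia have the same shape):

    # One compendium line (input); identifiers here are hypothetical:
    {"type": "biolink:ChemicalEntity", "identifiers": [{"i": "CHEBI:15377", "l": "water"}, {"i": "MESH:D014867"}]}

    # KGX nodes written (one JSONL line per identifier):
    {"id": "CHEBI:15377", "name": "water", "category": "biolink:ChemicalEntity", "equivalent_identifiers": ["CHEBI:15377", "MESH:D014867"]}
    {"id": "MESH:D014867", "name": "", "category": "biolink:ChemicalEntity", "equivalent_identifiers": ["CHEBI:15377", "MESH:D014867"]}

    # KGX edge written (one per identifier pair; id is an MD5 over subject + object + source filename):
    {"id": "<md5 hex digest>", "subject": "CHEBI:15377", "predicate": "biolink:same_as", "object": "MESH:D014867"}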

7 files changed: +221 −3 lines changed


Snakefile

Lines changed: 5 additions & 0 deletions
@@ -13,6 +13,9 @@ include: "src/snakefiles/taxon.snakefile"
 include: "src/snakefiles/genefamily.snakefile"
 include: "src/snakefiles/leftover_umls.snakefile"
 include: "src/snakefiles/macromolecular_complex.snakefile"
+include: "src/snakefiles/exports.snakefile"
+
+
 
 rule all:
     input:
@@ -28,6 +31,8 @@ rule all:
         config['output_directory'] + '/reports/umls_done',
         config['output_directory'] + '/reports/macromolecular_complex_done',
         config['output_directory'] + '/reports/drugchemical_done',
+        # Check if we have exported the compendia as KGX.
+        config['output_directory'] + '/kgx/done',
     output:
         x = config['output_directory'] + '/reports/all_done'
     shell:
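The new '/kgx/done' input is a sentinel file rather than a real product: rule all only needs a marker proving the export ran, and export_all_to_kgx (added below in src/snakefiles/exports.snakefile) writes it once every per-compendium KGX file exists. A minimal sketch of that pattern, with hypothetical rule and file names:

    # Sentinel-file pattern (names hypothetical):
    rule make_sentinel:
        input:
            "outputs/a_nodes.jsonl",    # the real products
            "outputs/a_edges.jsonl",
        output:
            x = "outputs/done"          # empty marker that `rule all` depends on
        shell:
            "echo 'done' >> {output.x}"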

config.json

Lines changed: 4 additions & 1 deletion
@@ -55,5 +55,8 @@
 
     "genefamily_labels": ["PANTHER.FAMILY","HGNC.FAMILY"],
     "genefamily_ids": ["PANTHER.FAMILY","HGNC.FAMILY"],
-    "genefamily_outputs": ["GeneFamily.txt"]
+    "genefamily_outputs": ["GeneFamily.txt"],
+
+    "umls_outputs": ["umls.txt"],
+    "macromolecularcomplex_outputs": ["MacromolecularComplex.txt"]
 }

kubernetes/babel-outputs.k8s.yaml

Lines changed: 1 addition & 1 deletion
@@ -15,5 +15,5 @@ spec:
   - ReadWriteOnce
   resources:
     requests:
-      storage: 400Gi
+      storage: 500Gi
   storageClassName: basic

src/exporters/kgx.py

Lines changed: 164 additions & 0 deletions
@@ -0,0 +1,164 @@
# Once we generate the compendium files, we need to convert them into the
# Knowledge Graph Exchange (KGX, https://github.com/biolink/kgx) format.
# This file provides code for doing that, based on the code from
# https://github.com/TranslatorSRI/NodeNormalization/blob/68096b2f16e6c2eedb699178ace71cea98dc794f/node_normalizer/loader.py#L70-L208

import hashlib
import json
import os
from itertools import combinations

import logging
from src.util import LoggingUtil

# Default logger for this file.
logger = LoggingUtil.init_logging(__name__, level=logging.INFO)


def convert_compendium_to_kgx(compendium_filename, kgx_nodes_filename, kgx_edges_filename):
    """
    Convert a compendium file to KGX (https://github.com/biolink/kgx) format.

    Based on the code in https://github.com/TranslatorSRI/NodeNormalization/blob/68096b2f16e6c2eedb699178ace71cea98dc794f/node_normalizer/loader.py#L70-L208

    :param compendium_filename: The compendium file to convert.
    :param kgx_nodes_filename: The KGX nodes file to write out.
    :param kgx_edges_filename: The KGX edges file to write out.
    """

    logger.info(f"convert_compendium_to_kgx({compendium_filename}, {kgx_nodes_filename}, {kgx_edges_filename})")

    # Set up data structures.
    nodes: list = []
    edges: list = []
    pass_nodes: list = []

    count_lines = 0
    count_nodes = 0
    count_edges = 0

    # Used to count batches of 10000 lines to process together.
    batch_size = 10000
    line_counter = 0

    # Make the output directories if they don't exist.
    os.makedirs(os.path.dirname(kgx_nodes_filename), exist_ok=True)
    os.makedirs(os.path.dirname(kgx_edges_filename), exist_ok=True)

    # Open the compendium file for reading.
    with open(compendium_filename, "r", encoding="utf-8") as compendium:
        # Open the nodes and edges files for writing.
        with \
                open(kgx_nodes_filename, "w", encoding="utf-8") as node_file, \
                open(kgx_edges_filename, "w", encoding="utf-8") as edge_file:

            # set the flag for suppressing the leading "\n" on the first batch written
            first = True

            # At this point we should validate the compendium file, but the report
            # has already run, so hopefully it's already validated?

            # for each line in the file
            for line in compendium:
                # increment the record counter
                line_counter += 1

                # clear storage for this pass
                pass_nodes.clear()

                # load the line into memory
                instance: dict = json.loads(line)

                # all ids (even the root one) are in the equivalent identifiers
                if len(instance["identifiers"]) > 0:
                    # loop through each identifier and create a node
                    for equiv_id in instance["identifiers"]:
                        # check to see if there is a label; if there is, use it
                        if "l" in equiv_id:
                            name = equiv_id["l"]
                        else:
                            name = ""

                        # add the node to the ones in this pass
                        pass_nodes.append(
                            {
                                "id": equiv_id["i"],
                                "name": name,
                                "category": instance["type"],
                                "equivalent_identifiers": list(x["i"] for x in instance["identifiers"]),
                            }
                        )

                    # get the combinations of the nodes in this pass
                    combos = combinations(pass_nodes, 2)

                    # for all the node combinations create an edge between them
                    for c in combos:
                        # create a unique id
                        record_id: str = c[0]["id"] + c[1]["id"] + f"{compendium_filename}"

                        # save the edge
                        edges.append(
                            {
                                "id": f'{hashlib.md5(record_id.encode("utf-8")).hexdigest()}',
                                "subject": c[0]["id"],
                                "predicate": "biolink:same_as",
                                "object": c[1]["id"],
                            }
                        )

                # save the nodes in this pass to the big list
                nodes.extend(pass_nodes)

                # did we reach the write threshold
                if line_counter == batch_size:
                    # first time in doesn't get a leading newline
                    if first:
                        prefix = ""
                    else:
                        prefix = "\n"

                    # reset the first record flag
                    first = False

                    # get all the nodes in a string and write them out
                    nodes_to_write = prefix + "\n".join([json.dumps(node) for node in nodes])
                    node_file.write(nodes_to_write)
                    count_nodes += len(nodes)

                    # are there any edges to output
                    if len(edges) > 0:
                        # get all the edges in a string and write them out
                        edges_to_write = prefix + "\n".join([json.dumps(edge) for edge in edges])
                        edge_file.write(edges_to_write)
                        count_edges += len(edges)

                    # reset for the next group
                    nodes.clear()
                    edges.clear()

                    # Count total lines
                    count_lines += line_counter
                    logger.info(f"Processed {count_lines} lines from {compendium_filename}")

                    # reset the line counter for the next group
                    line_counter = 0

            # pick up any remainders in the file
            if len(nodes) > 0:
                nodes_to_write = "\n" + "\n".join([json.dumps(node) for node in nodes])
                node_file.write(nodes_to_write)
                count_nodes += len(nodes)

            if len(edges) > 0:
                edges_to_write = "\n" + "\n".join([json.dumps(edge) for edge in edges])
                edge_file.write(edges_to_write)
                count_edges += len(edges)

            # Count total lines
            count_lines += line_counter
            logger.info(f"Processed a total of {count_lines} lines from {compendium_filename}")

    logger.info(f"Converted {compendium_filename} to KGX: " +
                f"wrote {count_nodes} nodes to {kgx_nodes_filename} and " +
                f"wrote {count_edges} edges to {kgx_edges_filename}.")

src/snakefiles/datacollect.snakefile

Lines changed: 1 addition & 1 deletion
@@ -368,7 +368,7 @@ rule get_panther_pathways:
     output:
         outfile = config['download_directory'] + '/PANTHER.PATHWAY/SequenceAssociationPathway3.6.7.txt'
     run:
-        pantherpathways.pull_panther_pathways()
+        pantherpathways.pull_panther_pathways(output.outfile)
 
 rule get_panther_pathway_labels:
     input:

src/snakefiles/exports.snakefile

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
from src.snakefiles.util import get_all_compendia
2+
import src.exporters.kgx as kgx
3+
import os
4+
5+
### Export compendia/synonyms into downstream outputs
6+
7+
# Export all compendia to KGX, then create `babel_outputs/kgx/done` to signal that we're done.
8+
rule export_all_to_kgx:
9+
input:
10+
nodes_files=expand("{od}/kgx/{fn}",
11+
od=config['output_directory'],
12+
fn=map(lambda fn: os.path.splitext(fn)[0] + '_nodes.jsonl', get_all_compendia(config))
13+
),
14+
edges_files=expand("{od}/kgx/{fn}",
15+
od=config['output_directory'],
16+
fn=map(lambda fn: os.path.splitext(fn)[0] + '_edges.jsonl', get_all_compendia(config))
17+
)
18+
output:
19+
x = config['output_directory'] + '/kgx/done',
20+
shell:
21+
"echo 'done' >> {output.x}"
22+
23+
24+
# Generic rule for generating the KGX files for a particular compendia file.
25+
rule generate_kgx:
26+
input:
27+
compendium_file=config['output_directory'] + "/compendia/{filename}.txt",
28+
output:
29+
nodes_file=config['output_directory'] + "/kgx/{filename}_nodes.jsonl",
30+
edges_file=config['output_directory'] + "/kgx/{filename}_edges.jsonl",
31+
run:
32+
kgx.convert_compendium_to_kgx(input.compendium_file, output.nodes_file, output.edges_file)
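For each name returned by get_all_compendia, the expand/map pair in export_all_to_kgx swaps the .txt extension for _nodes.jsonl and _edges.jsonl under the kgx/ directory; those paths then match generate_kgx's outputs, so Snakemake runs one conversion per compendium. Illustratively (output directory hypothetical):

    # What the input: block of export_all_to_kgx evaluates to, for a two-file example.
    import os

    compendia = ["GeneFamily.txt", "umls.txt"]    # a subset of get_all_compendia(config)
    od = "babel_outputs"                          # stand-in for config['output_directory']

    nodes_files = [f"{od}/kgx/{os.path.splitext(fn)[0]}_nodes.jsonl" for fn in compendia]
    # -> ['babel_outputs/kgx/GeneFamily_nodes.jsonl', 'babel_outputs/kgx/umls_nodes.jsonl']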

src/snakefiles/util.py

Lines changed: 14 additions & 0 deletions
@@ -0,0 +1,14 @@
# Shared code used by Snakemake files

# List of all the compendia files that need to be converted.
def get_all_compendia(config):
    return (config['anatomy_outputs'] +
            config['chemical_outputs'] +
            config['disease_outputs'] +
            config['gene_outputs'] +
            config['genefamily_outputs'] +
            config['process_outputs'] +
            config['protein_outputs'] +
            config['taxon_outputs'] +
            config['umls_outputs'] +
            config['macromolecularcomplex_outputs'])
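get_all_compendia simply concatenates the per-type output lists from config.json, including the umls_outputs and macromolecularcomplex_outputs keys added above. A toy call with an abbreviated config; any filename not shown in this diff is hypothetical:

    config = {
        'anatomy_outputs': ['AnatomicalEntity.txt'],   # hypothetical
        'chemical_outputs': [], 'disease_outputs': [], 'gene_outputs': [],
        'genefamily_outputs': ['GeneFamily.txt'],
        'process_outputs': [], 'protein_outputs': [], 'taxon_outputs': [],
        'umls_outputs': ['umls.txt'],
        'macromolecularcomplex_outputs': ['MacromolecularComplex.txt'],
    }
    print(get_all_compendia(config))
    # ['AnatomicalEntity.txt', 'GeneFamily.txt', 'umls.txt', 'MacromolecularComplex.txt']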
