
Commit e899537

Merge pull request #206 from TranslatorSRI/add-kgx-conversion
This PR moves the KGX exporter from NodeNorm (specifically, https://github.com/TranslatorSRI/NodeNormalization/blob/68096b2f16e6c2eedb699178ace71cea98dc794f/node_normalizer/loader.py#L70-L208) into this repo and sets up Snakemake to generate the KGX files with every Babel run. Closes #61. Closes NCATSTranslator/NodeNormalization#95.
2 parents 9071e56 + 7b04ddb commit e899537
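To make the conversion concrete before diving into the diff: each compendium line is a clique of equivalent identifiers, and the exporter emits one KGX node per identifier plus one biolink:same_as edge per identifier pair. A sketch with hypothetical CURIEs and label (the real compendia have the same shape):

    # One compendium line (input); identifiers here are hypothetical:
    {"type": "biolink:ChemicalEntity", "identifiers": [{"i": "CHEBI:15377", "l": "water"}, {"i": "MESH:D014867"}]}

    # KGX nodes written (one JSONL line per identifier):
    {"id": "CHEBI:15377", "name": "water", "category": "biolink:ChemicalEntity", "equivalent_identifiers": ["CHEBI:15377", "MESH:D014867"]}
    {"id": "MESH:D014867", "name": "", "category": "biolink:ChemicalEntity", "equivalent_identifiers": ["CHEBI:15377", "MESH:D014867"]}

    # KGX edge written (one per identifier pair; id is an MD5 over subject + object + source filename):
    {"id": "<md5 hex digest>", "subject": "CHEBI:15377", "predicate": "biolink:same_as", "object": "MESH:D014867"}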

7 files changed: +221 −3 lines changed


Snakefile

Lines changed: 5 additions & 0 deletions
@@ -13,6 +13,9 @@ include: "src/snakefiles/taxon.snakefile"
 include: "src/snakefiles/genefamily.snakefile"
 include: "src/snakefiles/leftover_umls.snakefile"
 include: "src/snakefiles/macromolecular_complex.snakefile"
+include: "src/snakefiles/exports.snakefile"
+
+
 
 rule all:
     input:
@@ -28,6 +31,8 @@ rule all:
         config['output_directory'] + '/reports/umls_done',
         config['output_directory'] + '/reports/macromolecular_complex_done',
         config['output_directory'] + '/reports/drugchemical_done',
+        # Check if we have exported the compendia as KGX.
+        config['output_directory'] + '/kgx/done',
     output:
         x = config['output_directory'] + '/reports/all_done'
     shell:
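The new '/kgx/done' input is a sentinel file rather than a real product: rule all only needs a marker proving the export ran, and export_all_to_kgx (added below in src/snakefiles/exports.snakefile) writes it once every per-compendium KGX file exists. A minimal sketch of that pattern, with hypothetical rule and file names:

    # Sentinel-file pattern (names hypothetical):
    rule make_sentinel:
        input:
            "outputs/a_nodes.jsonl",    # the real products
            "outputs/a_edges.jsonl",
        output:
            x = "outputs/done"          # empty marker that `rule all` depends on
        shell:
            "echo 'done' >> {output.x}"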

config.json

Lines changed: 4 additions & 1 deletion
@@ -55,5 +55,8 @@
 
     "genefamily_labels": ["PANTHER.FAMILY","HGNC.FAMILY"],
     "genefamily_ids": ["PANTHER.FAMILY","HGNC.FAMILY"],
-    "genefamily_outputs": ["GeneFamily.txt"]
+    "genefamily_outputs": ["GeneFamily.txt"],
+
+    "umls_outputs": ["umls.txt"],
+    "macromolecularcomplex_outputs": ["MacromolecularComplex.txt"]
 }

kubernetes/babel-outputs.k8s.yaml

Lines changed: 1 addition & 1 deletion
@@ -15,5 +15,5 @@ spec:
   - ReadWriteOnce
   resources:
     requests:
-      storage: 400Gi
+      storage: 500Gi
   storageClassName: basic

src/exporters/kgx.py

Lines changed: 164 additions & 0 deletions
@@ -0,0 +1,164 @@
# Once we generate the compendium files, we need to convert them into the
# Knowledge Graph Exchange (KGX, https://github.com/biolink/kgx) format.
# This file provides code for doing that, based on the code from
# https://github.com/TranslatorSRI/NodeNormalization/blob/68096b2f16e6c2eedb699178ace71cea98dc794f/node_normalizer/loader.py#L70-L208

import hashlib
import json
import os
from itertools import combinations

import logging
from src.util import LoggingUtil

# Default logger for this file.
logger = LoggingUtil.init_logging(__name__, level=logging.INFO)


def convert_compendium_to_kgx(compendium_filename, kgx_nodes_filename, kgx_edges_filename):
    """
    Convert a compendium file to KGX (https://github.com/biolink/kgx) format.

    Based on the code in https://github.com/TranslatorSRI/NodeNormalization/blob/68096b2f16e6c2eedb699178ace71cea98dc794f/node_normalizer/loader.py#L70-L208

    :param compendium_filename: The compendium file to convert.
    :param kgx_nodes_filename: The KGX nodes file to write out.
    :param kgx_edges_filename: The KGX edges file to write out.
    """

    logger.info(f"convert_compendium_to_kgx({compendium_filename}, {kgx_nodes_filename}, {kgx_edges_filename})")

    # Set up data structures.
    nodes: list = []
    edges: list = []
    pass_nodes: list = []

    count_lines = 0
    count_nodes = 0
    count_edges = 0

    # Used to count batches of 10000 lines to process together.
    batch_size = 10000
    line_counter = 0

    # Make the output directories if they don't exist.
    os.makedirs(os.path.dirname(kgx_nodes_filename), exist_ok=True)
    os.makedirs(os.path.dirname(kgx_edges_filename), exist_ok=True)

    # Open the compendium file for reading.
    with open(compendium_filename, "r", encoding="utf-8") as compendium:
        # Open the nodes and edges files for writing.
        with \
                open(kgx_nodes_filename, "w", encoding="utf-8") as node_file, \
                open(kgx_edges_filename, "w", encoding="utf-8") as edge_file:

            # set the flag for suppressing the leading "\n" on the first batch written
            first = True

            # At this point we should validate the compendium file, but the report
            # has already run, so hopefully it's already validated?

            # for each line in the file
            for line in compendium:
                # increment the record counter
                line_counter += 1

                # clear storage for this pass
                pass_nodes.clear()

                # load the line into memory
                instance: dict = json.loads(line)

                # all ids (even the root one) are in the equivalent identifiers
                if len(instance["identifiers"]) > 0:
                    # loop through each identifier and create a node
                    for equiv_id in instance["identifiers"]:
                        # check to see if there is a label; if there is, use it
                        if "l" in equiv_id:
                            name = equiv_id["l"]
                        else:
                            name = ""

                        # add the node to the ones in this pass
                        pass_nodes.append(
                            {
                                "id": equiv_id["i"],
                                "name": name,
                                "category": instance["type"],
                                "equivalent_identifiers": list(x["i"] for x in instance["identifiers"]),
                            }
                        )

                    # get the combinations of the nodes in this pass
                    combos = combinations(pass_nodes, 2)

                    # for all the node combinations create an edge between them
                    for c in combos:
                        # create a unique id
                        record_id: str = c[0]["id"] + c[1]["id"] + f"{compendium_filename}"

                        # save the edge
                        edges.append(
                            {
                                "id": f'{hashlib.md5(record_id.encode("utf-8")).hexdigest()}',
                                "subject": c[0]["id"],
                                "predicate": "biolink:same_as",
                                "object": c[1]["id"],
                            }
                        )

                # save the nodes in this pass to the big list
                nodes.extend(pass_nodes)

                # did we reach the write threshold
                if line_counter == batch_size:
                    # first time in doesn't get a leading newline
                    if first:
                        prefix = ""
                    else:
                        prefix = "\n"

                    # reset the first record flag
                    first = False

                    # get all the nodes in a string and write them out
                    nodes_to_write = prefix + "\n".join([json.dumps(node) for node in nodes])
                    node_file.write(nodes_to_write)
                    count_nodes += len(nodes)

                    # are there any edges to output
                    if len(edges) > 0:
                        # get all the edges in a string and write them out
                        edges_to_write = prefix + "\n".join([json.dumps(edge) for edge in edges])
                        edge_file.write(edges_to_write)
                        count_edges += len(edges)

                    # reset for the next group
                    nodes.clear()
                    edges.clear()

                    # Count total lines
                    count_lines += line_counter
                    logger.info(f"Processed {count_lines} lines from {compendium_filename}")

                    # reset the line counter for the next group
                    line_counter = 0

            # pick up any remainders in the file
            if len(nodes) > 0:
                nodes_to_write = "\n" + "\n".join([json.dumps(node) for node in nodes])
                node_file.write(nodes_to_write)
                count_nodes += len(nodes)

            if len(edges) > 0:
                edges_to_write = "\n" + "\n".join([json.dumps(edge) for edge in edges])
                edge_file.write(edges_to_write)
                count_edges += len(edges)

            # Count total lines
            count_lines += line_counter
            logger.info(f"Processed a total of {count_lines} lines from {compendium_filename}")

    logger.info(f"Converted {compendium_filename} to KGX: " +
                f"wrote {count_nodes} nodes to {kgx_nodes_filename} and " +
                f"wrote {count_edges} edges to {kgx_edges_filename}.")

src/snakefiles/datacollect.snakefile

Lines changed: 1 addition & 1 deletion
@@ -368,7 +368,7 @@ rule get_panther_pathways:
     output:
         outfile = config['download_directory'] + '/PANTHER.PATHWAY/SequenceAssociationPathway3.6.7.txt'
     run:
-        pantherpathways.pull_panther_pathways()
+        pantherpathways.pull_panther_pathways(output.outfile)
 
 rule get_panther_pathway_labels:
     input:

src/snakefiles/exports.snakefile

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
from src.snakefiles.util import get_all_compendia
2+
import src.exporters.kgx as kgx
3+
import os
4+
5+
### Export compendia/synonyms into downstream outputs
6+
7+
# Export all compendia to KGX, then create `babel_outputs/kgx/done` to signal that we're done.
8+
rule export_all_to_kgx:
9+
input:
10+
nodes_files=expand("{od}/kgx/{fn}",
11+
od=config['output_directory'],
12+
fn=map(lambda fn: os.path.splitext(fn)[0] + '_nodes.jsonl', get_all_compendia(config))
13+
),
14+
edges_files=expand("{od}/kgx/{fn}",
15+
od=config['output_directory'],
16+
fn=map(lambda fn: os.path.splitext(fn)[0] + '_edges.jsonl', get_all_compendia(config))
17+
)
18+
output:
19+
x = config['output_directory'] + '/kgx/done',
20+
shell:
21+
"echo 'done' >> {output.x}"
22+
23+
24+
# Generic rule for generating the KGX files for a particular compendia file.
25+
rule generate_kgx:
26+
input:
27+
compendium_file=config['output_directory'] + "/compendia/{filename}.txt",
28+
output:
29+
nodes_file=config['output_directory'] + "/kgx/{filename}_nodes.jsonl",
30+
edges_file=config['output_directory'] + "/kgx/{filename}_edges.jsonl",
31+
run:
32+
kgx.convert_compendium_to_kgx(input.compendium_file, output.nodes_file, output.edges_file)
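For each name returned by get_all_compendia, the expand/map pair in export_all_to_kgx swaps the .txt extension for _nodes.jsonl and _edges.jsonl under the kgx/ directory; those paths then match generate_kgx's outputs, so Snakemake runs one conversion per compendium. Illustratively (output directory hypothetical):

    # What the input: block of export_all_to_kgx evaluates to, for a two-file example.
    import os

    compendia = ["GeneFamily.txt", "umls.txt"]    # a subset of get_all_compendia(config)
    od = "babel_outputs"                          # stand-in for config['output_directory']

    nodes_files = [f"{od}/kgx/{os.path.splitext(fn)[0]}_nodes.jsonl" for fn in compendia]
    # -> ['babel_outputs/kgx/GeneFamily_nodes.jsonl', 'babel_outputs/kgx/umls_nodes.jsonl']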

src/snakefiles/util.py

Lines changed: 14 additions & 0 deletions
@@ -0,0 +1,14 @@
# Shared code used by Snakemake files

# List of all the compendia files that need to be converted.
def get_all_compendia(config):
    return (config['anatomy_outputs'] +
            config['chemical_outputs'] +
            config['disease_outputs'] +
            config['gene_outputs'] +
            config['genefamily_outputs'] +
            config['process_outputs'] +
            config['protein_outputs'] +
            config['taxon_outputs'] +
            config['umls_outputs'] +
            config['macromolecularcomplex_outputs'])
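get_all_compendia simply concatenates the per-type output lists from config.json, including the umls_outputs and macromolecularcomplex_outputs keys added above. A toy call with an abbreviated config; any filename not shown in this diff is hypothetical:

    config = {
        'anatomy_outputs': ['AnatomicalEntity.txt'],   # hypothetical
        'chemical_outputs': [], 'disease_outputs': [], 'gene_outputs': [],
        'genefamily_outputs': ['GeneFamily.txt'],
        'process_outputs': [], 'protein_outputs': [], 'taxon_outputs': [],
        'umls_outputs': ['umls.txt'],
        'macromolecularcomplex_outputs': ['MacromolecularComplex.txt'],
    }
    print(get_all_compendia(config))
    # ['AnatomicalEntity.txt', 'GeneFamily.txt', 'umls.txt', 'MacromolecularComplex.txt']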
