# Once we generate the compendium files, we need to convert them into the
# Knowledge Graph Exchange (KGX, https://github.com/biolink/kgx) format.
# This file provides code for doing that, based on the code from
# https://github.com/TranslatorSRI/NodeNormalization/blob/68096b2f16e6c2eedb699178ace71cea98dc794f/node_normalizer/loader.py#L70-L208

import hashlib
import json
import logging
import os
from itertools import combinations

from src.util import LoggingUtil

# Default logger for this file.
logger = LoggingUtil.init_logging(__name__, level=logging.INFO)


def convert_compendium_to_kgx(compendium_filename, kgx_nodes_filename, kgx_edges_filename):
    """
    Convert a compendium file to KGX (https://github.com/biolink/kgx) format.

    Based on the code in https://github.com/TranslatorSRI/NodeNormalization/blob/68096b2f16e6c2eedb699178ace71cea98dc794f/node_normalizer/loader.py#L70-L208

    :param compendium_filename: The compendium file to convert.
    :param kgx_nodes_filename: The KGX nodes file to write out.
    :param kgx_edges_filename: The KGX edges file to write out.
    """

    logger.info(f"convert_compendium_to_kgx({compendium_filename}, {kgx_nodes_filename}, {kgx_edges_filename})")

    # Set up data structures.
    nodes: list = []
    edges: list = []
    pass_nodes: list = []

    count_lines = 0
    count_nodes = 0
    count_edges = 0

    # Process the input in batches of 10000 lines; line_counter tracks how
    # many lines are in the current batch.
    batch_size = 10000
    line_counter = 0

    # Make the output directories if they don't exist.
    os.makedirs(os.path.dirname(kgx_nodes_filename), exist_ok=True)
    os.makedirs(os.path.dirname(kgx_edges_filename), exist_ok=True)

    # Open the compendium file for reading.
    with open(compendium_filename, "r", encoding="utf-8") as compendium:
        # Open the nodes and edges files for writing.
        with open(kgx_nodes_filename, "w", encoding="utf-8") as node_file, \
                open(kgx_edges_filename, "w", encoding="utf-8") as edge_file:
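
            # The nodes and edges files are written in KGX's JSON Lines style:
            # one JSON object per line, with batches separated by newlines.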

            # set the flag for suppressing the leading "\n" separator before
            # the first batch of records is written out
            first = True

            # At this point we should validate the compendium file, but the report
            # has already run, so hopefully it's already validated?

            # for each line in the file
            for line in compendium:
                # increment the record counter
                line_counter += 1

                # clear storage for this pass
                pass_nodes.clear()

                # load the line into memory
                instance: dict = json.loads(line)
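                # Each compendium line is a JSON object; an illustrative
                # (hypothetical) example of the expected shape:
                #   {"type": "biolink:Gene",
                #    "identifiers": [{"i": "NCBIGene:1017", "l": "CDK2"},
                #                    {"i": "ENSEMBL:ENSG00000123374"}]}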

                # all ids (even the root one) are in the equivalent identifiers
                if len(instance["identifiers"]) > 0:
                    # loop through each identifier and create a node
                    for equiv_id in instance["identifiers"]:
                        # check to see if there is a label; if there is, use it
                        if "l" in equiv_id:
                            name = equiv_id["l"]
                        else:
                            name = ""

                        # add the node to the ones in this pass
                        pass_nodes.append(
                            {
                                "id": equiv_id["i"],
                                "name": name,
                                "category": instance["type"],
                                "equivalent_identifiers": [x["i"] for x in instance["identifiers"]],
                            }
                        )

                    # get the combinations of the nodes in this pass
                    combos = combinations(pass_nodes, 2)
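                    # (combinations() yields each unordered pair exactly once,
                    # so a clique of n identifiers produces n * (n - 1) / 2
                    # edges; e.g. 4 equivalent identifiers yield 6 edges)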

                    # for all the node combinations, create an edge between them
                    for c in combos:
                        # create a unique id
                        record_id: str = c[0]["id"] + c[1]["id"] + f"{compendium_filename}"
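                        # (an md5 over subject + object + source filename, taken
                        # below, gives a deterministic edge id, so re-running the
                        # conversion produces the same ids)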

                        # save the edge
                        edges.append(
                            {
                                "id": hashlib.md5(record_id.encode("utf-8")).hexdigest(),
                                "subject": c[0]["id"],
                                "predicate": "biolink:same_as",
                                "object": c[1]["id"],
                            }
                        )

                # save the nodes in this pass to the big list
                nodes.extend(pass_nodes)

                # did we reach the write threshold?
                if line_counter == batch_size:
                    # the first batch written doesn't get a leading newline
                    if first:
                        prefix = ""
                    else:
                        prefix = "\n"

                    # reset the first batch flag
                    first = False

                    # get all the nodes in a string and write them out
                    nodes_to_write = prefix + "\n".join([json.dumps(node) for node in nodes])
                    node_file.write(nodes_to_write)
                    count_nodes += len(nodes)

                    # are there any edges to output?
                    if len(edges) > 0:
                        # get all the edges in a string and write them out
                        edges_to_write = prefix + "\n".join([json.dumps(edge) for edge in edges])
                        edge_file.write(edges_to_write)
                        count_edges += len(edges)

                    # reset for the next group
                    nodes.clear()
                    edges.clear()

                    # count the total lines processed so far
                    count_lines += line_counter
                    logger.info(f"Processed {count_lines} lines from {compendium_filename}")

                    # reset the line counter for the next group
                    line_counter = 0

            # pick up any remainders in the file; as above, skip the leading
            # newline if nothing has been written yet (i.e. the whole file fit
            # into a single batch)
            if len(nodes) > 0:
                prefix = "" if first else "\n"
                nodes_to_write = prefix + "\n".join([json.dumps(node) for node in nodes])
                node_file.write(nodes_to_write)
                count_nodes += len(nodes)

            if len(edges) > 0:
                prefix = "" if first else "\n"
                edges_to_write = prefix + "\n".join([json.dumps(edge) for edge in edges])
                edge_file.write(edges_to_write)
                count_edges += len(edges)

    # count the total lines processed
    count_lines += line_counter
    logger.info(f"Processed a total of {count_lines} lines from {compendium_filename}")

    logger.info(f"Converted {compendium_filename} to KGX: " +
                f"wrote {count_nodes} nodes to {kgx_nodes_filename} and " +
                f"wrote {count_edges} edges to {kgx_edges_filename}.")
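

# A minimal usage sketch, assuming hypothetical file locations (a compendium at
# "compendia/Anatomy.txt" and a "kgx/" output directory); adjust the paths to
# your own layout.
if __name__ == "__main__":
    convert_compendium_to_kgx(
        "compendia/Anatomy.txt",       # hypothetical input compendium
        "kgx/Anatomy_nodes.jsonl",     # KGX nodes file to be written
        "kgx/Anatomy_edges.jsonl",     # KGX edges file to be written
    )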