100 changes: 58 additions & 42 deletions src/createcompendia/chemicals.py
@@ -1,4 +1,5 @@
import logging
import os
from collections import defaultdict
import jsonlines
import requests
@@ -74,35 +75,40 @@ def build_chemical_umls_relationships(mrconso, idfile,outfile):
def build_chemical_rxnorm_relationships(conso, idfile,outfile):
umls.build_sets(conso, idfile, outfile, {'MSH': MESH, 'DRUGBANK': DRUGBANK}, cui_prefix=RXCUI)

def write_pubchem_ids(labelfile,smilesfile,outfile):
def write_pubchem_ids(labeldir, smilesfile, outfile):
#Trying to be memory efficient here. We could just ingest the whole smilesfile which would make this code easier
# but since they're already sorted, let's give it a shot
with open(labelfile,'r') as inlabels, gzip.open(smilesfile, 'rt', encoding='utf-8') as insmiles, open(outfile,'w') as outf:
sn = -1
flag_file_ended = False
for labelline in inlabels:
x = labelline.split('\t')[0]
pn = int(x.split(':')[-1])
while not flag_file_ended and sn < pn:
line = insmiles.readline()
if line == '':
# Get this: an empty string from readline() means that we've reached end-of-file.
# (A '\n' would indicate that we've just read a blank line.)
flag_file_ended = True
break
smiline = line.strip().split('\t')
if len(smiline) != 2:
raise RuntimeError(f"Could not parse line from {smilesfile}: '{line}'")
sn = int(smiline[0])

if sn == pn:
#We have a smiles for this id
stype = get_type_from_smiles(smiline[1])
outf.write(f'{x}\t{stype}\n')
else:
#sn > pn, we went past it. No smiles for that
print('no smiles:',x,pn,sn)
outf.write(f'{x}\t{CHEMICAL_ENTITY}\n')
with gzip.open(smilesfile, 'rt', encoding='utf-8') as insmiles, open(outfile, 'w') as outf:
for labelfile in os.listdir(labeldir):
labelpath = os.path.join(labeldir, labelfile)
if not os.path.isfile(labelpath):
continue
with open(labelpath, 'r') as inlabels:
sn = -1
flag_file_ended = False
for labelline in inlabels:
x = labelline.split('\t')[0]
pn = int(x.split(':')[-1])
while not flag_file_ended and sn < pn:
line = insmiles.readline()
if line == '':
# Get this: an empty string from readline() means that we've reached end-of-file.
# (A '\n' would indicate that we've just read a blank line.)
flag_file_ended = True
break
smiline = line.strip().split('\t')
if len(smiline) != 2:
raise RuntimeError(f"Could not parse line from {smilesfile}: '{line}'")
sn = int(smiline[0])

if sn == pn:
#We have a smiles for this id
stype = get_type_from_smiles(smiline[1])
outf.write(f'{x}\t{stype}\n')
else:
#sn > pn, we went past it. No smiles for that
print('no smiles:',x,pn,sn)
outf.write(f'{x}\t{CHEMICAL_ENTITY}\n')


def write_mesh_ids(outfile):
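A note on the rewritten loop above: the SMILES stream is consumed forward-only, so this is a single-pass merge join over two ID-sorted inputs. That only holds if the label files are visited in ascending-CID order — os.listdir() makes no ordering promise — and an ID already read past at a file boundary is lost, so the production loop may want sorted(os.listdir(labeldir)) and some care at the seams. A minimal standalone sketch of the merge itself, with hypothetical file names, assuming both inputs are tab-separated and sorted by numeric ID:

import gzip

def merge_labels_with_smiles(labelpath, smilespath, outpath):
    # One sorted label file ('CURIE<TAB>label') against one sorted,
    # gzipped 'CID<TAB>SMILES' file, in a single forward pass over each.
    with open(labelpath) as labels, gzip.open(smilespath, 'rt') as smiles, \
            open(outpath, 'w') as out:
        sn, cur_smiles, ended = -1, None, False
        for labelline in labels:
            curie = labelline.split('\t')[0]
            pn = int(curie.split(':')[-1])   # numeric suffix, e.g. PUBCHEM.COMPOUND:2244 -> 2244
            while not ended and sn < pn:
                line = smiles.readline()
                if line == '':               # empty string: end-of-file
                    ended = True
                    break
                sn_str, cur_smiles = line.rstrip('\n').split('\t')
                sn = int(sn_str)
            if sn == pn:
                out.write(f'{curie}\t{cur_smiles}\n')   # IDs aligned: we have a SMILES
            # else: the stream has already passed pn, so this ID has no SMILES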
@@ -209,20 +215,25 @@ def write_drugbank_ids(infile,outfile):
outf.write(f'{dbid}\t{CHEMICAL_ENTITY}\n')
written.add(x[2])

def write_chemical_ids_from_labels_and_smiles(labelfile,smifile,outfile):
def write_chemical_ids_from_labels_and_smiles(labeldir,smifile,outfile):
smiles = {}
with open(smifile,'r') as inf:
for line in inf:
x = line.strip().split('\t')
smiles[x[0]] = x[1]
with open(labelfile,'r') as inf, open(outfile,'w') as outf:
for line in inf:
hmdbid = line.split('\t')[0]
if hmdbid in smiles:
ctype = get_type_from_smiles(smiles[hmdbid])
else:
ctype = CHEMICAL_ENTITY
outf.write(f'{hmdbid}\t{ctype}\n')
with open(outfile,'w') as outf:
for labelfile in os.listdir(labeldir):
labelpath = os.path.join(labeldir,labelfile)
if not os.path.isfile(labelpath):
continue
with open(labelpath,'r') as inf:
for line in inf:
hmdbid = line.split('\t')[0]
if hmdbid in smiles:
ctype = get_type_from_smiles(smiles[hmdbid])
else:
ctype = CHEMICAL_ENTITY
outf.write(f'{hmdbid}\t{ctype}\n')


def parse_smifile(infile,outfile,smicol,idcol,pref,stripquotes=False):
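The labelfile → labeldir change above is the same walk-and-filter idiom that now appears in write_pubchem_ids and in gene.py's write_mods_ids. If the pattern keeps spreading, a small shared generator could hold it in one place — a sketch only, with a hypothetical helper name:

import os

def iter_label_lines(labeldir):
    # Yield every line from every regular file in labeldir, skipping
    # subdirectories and anything else that is not a plain file.
    for name in sorted(os.listdir(labeldir)):    # sorted() for a stable order
        path = os.path.join(labeldir, name)
        if not os.path.isfile(path):
            continue
        with open(path, 'r') as inf:
            yield from inf

With that, the body of write_chemical_ids_from_labels_and_smiles collapses to a single for line in iter_label_lines(labeldir): loop.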
@@ -363,17 +374,22 @@ def make_pubchem_cas_concord(pubchemsynonyms, outfile):
if is_cas(x[1]):
outf.write(f'{x[0]}\txref\tCAS:{x[1]}\n')

def make_pubchem_mesh_concord(pubcheminput,meshlabels,outfile):
def make_pubchem_mesh_concord(pubcheminput,meshlabelsdir,outfile):
mesh_label_to_id={}
#Meshlabels has all kinds of stuff. e.g. these are both in there:
#MESH:D014867 Water
#MESH:M0022883 Water
#but we only want the ones that are MESH:D... or MESH:C....
with open(meshlabels,'r') as inf:
for line in inf:
x = line.strip().split('\t')
if x[0].split(':')[-1][0] in ['C','D']:
mesh_label_to_id[x[1]] = x[0]
for meshlabelsfile in os.listdir(meshlabelsdir):
meshlabels = os.path.join(meshlabelsdir,meshlabelsfile)
if not os.path.isfile(meshlabels):
continue
with open(meshlabels,'r') as inf:
for line in inf:
x = line.strip().split('\t')
if x[0].split(':')[-1][0] in ['C','D']:
mesh_label_to_id[x[1]] = x[0]

#The pubchem - mesh pairs are supposed to be ordered in this file such that the
# first mapping is the 'best' i.e. the one most frequently reported.
# We will only use the first one
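The body below this comment is truncated in the diff, but per the comment it keeps only the first (best) MESH mapping seen for each PubChem CID. The usual shape of that filter, sketched under assumed column layout (CID first, MeSH label second, tab-separated):

seen = set()
with open(pubcheminput, 'r') as inf, open(outfile, 'w') as outf:
    for line in inf:
        pubchem_id, mesh_label = line.strip().split('\t')[:2]
        if pubchem_id in seen:
            continue                  # a better mapping was already written
        seen.add(pubchem_id)
        if mesh_label in mesh_label_to_id:
            outf.write(f'{pubchem_id}\txref\t{mesh_label_to_id[mesh_label]}\n')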
4 changes: 2 additions & 2 deletions src/createcompendia/drugchemical.py
@@ -5,7 +5,7 @@
from src.categories import (CHEMICAL_ENTITY, DRUG, MOLECULAR_MIXTURE, FOOD, COMPLEX_MOLECULAR_MIXTURE,
SMALL_MOLECULE, NUCLEIC_ACID_ENTITY, MOLECULAR_ENTITY, FOOD_ADDITIVE,
ENVIRONMENTAL_FOOD_CONTAMINANT, PROCESSED_MATERIAL, CHEMICAL_MIXTURE, POLYPEPTIDE)
from src.babel_utils import glom, get_numerical_curie_suffix
from src.babel_utils import glom, get_numerical_curie_suffix, make_local_name
from collections import defaultdict
import os,json

@@ -378,7 +378,7 @@ def build_conflation(manual_concord_filename, rxn_concord, umls_concord, pubchem
glom(gloms, pairs_to_be_glommed)

# Set up a NodeFactory.
nodefactory = NodeFactory('', get_config()['biolink_version'])
nodefactory = NodeFactory(make_local_name(''), get_config()['biolink_version'])

# Write out all the resulting cliques.
written = set()
14 changes: 10 additions & 4 deletions src/createcompendia/gene.py
@@ -18,10 +18,16 @@

def write_mods_ids(dd,id,modlist):
for mod in modlist:
with open(f'{dd}/{mod}/labels','r') as inf, open(f'{id}/gene/ids/{mod}','w') as outf:
for line in inf:
x = line.split('\t')[0]
outf.write(f'{x}\n')
with open(f'{id}/gene/ids/{mod}','w') as outf:
for labelfile in os.listdir(f'{dd}/{mod}/labels'):
labelfile_path = f'{dd}/{mod}/labels/{labelfile}'
if not os.path.isfile(labelfile_path):
# Skip anything in the labels directory that isn't a regular file.
continue
with open(labelfile_path,'r') as inf:
for line in inf:
x = line.split('\t')[0]
outf.write(f'{x}\n')

def build_gene_ensembl_relationships(ensembl_dir, outfile):
"""Loop over all the ensembl species. Find any protein-coding gene"""
3 changes: 2 additions & 1 deletion src/createcompendia/leftover_umls.py
@@ -6,6 +6,7 @@
from snakemake.logging import Logger
from bmt import Toolkit

from src.babel_utils import make_local_name
from src.node import NodeFactory
from src.datahandlers import umls
from src.prefixes import UMLS
@@ -206,7 +207,7 @@ def umls_type_to_biolink_type(umls_tui):
reportf.write(f"Collected synonyms for {len(synonyms_by_id)} UMLS IDs into the leftover UMLS synonyms file.\n")

# Write out synonyms to synonym file.
node_factory = NodeFactory('babel_downloads/UMLS/labels', biolink_version)
node_factory = NodeFactory(make_local_name(''), biolink_version)
count_synonym_objs = 0
with jsonlines.open(umls_synonyms, 'w') as umls_synonymsf:
for id in synonyms_by_id:
2 changes: 1 addition & 1 deletion src/datahandlers/chembl.py
@@ -1,5 +1,5 @@
from src.prefixes import CHEMBLCOMPOUND
from src.babel_utils import pull_via_ftp, make_local_name
from src.babel_utils import pull_via_ftp
import ftplib
import pyoxigraph

1 change: 0 additions & 1 deletion src/datahandlers/clo.py
@@ -4,7 +4,6 @@
from src.prefixes import CLO
from src.categories import CELL_LINE
from src.babel_utils import pull_via_urllib
from src.babel_utils import make_local_name
from src.util import Text, LoggingUtil
import pyoxigraph

30 changes: 4 additions & 26 deletions src/datahandlers/datacollect.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from src.ubergraph import UberGraph
from src.babel_utils import make_local_name, pull_via_ftp
from src.babel_utils import make_local_name, pull_via_ftp, pull_via_urllib
from collections import defaultdict
import os, gzip
from json import loads,dumps
@@ -8,7 +8,7 @@ def pull_pubchem_labels():
print('LABEL PUBCHEM')
f_name = 'CID-Title.gz'
cname = pull_via_ftp('ftp.ncbi.nlm.nih.gov','/pubchem/Compound/Extras/', f_name, outfilename=f_name)
fname = make_local_name('labels', subpath='PUBCHEM.COMPOUND')
fname = make_local_name('pull_pubchem_labels', subpath='PUBCHEM.COMPOUND/labels')
with open(fname, 'w') as outf, gzip.open(cname,mode='rt',encoding='latin-1') as inf:
for line in inf:
x = line.strip().split('\t')
@@ -17,7 +17,7 @@ def pull_pubchem_synonyms():
def pull_pubchem_synonyms():
f_name = 'CID-Synonym-filtered.gz'
sname = pull_via_ftp('ftp.ncbi.nlm.nih.gov', '/pubchem/Compound/Extras/', f_name, outfilename=f_name)
fname = make_local_name('synonyms', subpath='PUBCHEM.COMPOUND')
fname = make_local_name('pull_pubchem_synonyms', subpath='PUBCHEM.COMPOUND/synonyms')
with open(fname, 'w') as outf, gzip.open(sname,mode='rt',encoding='latin-1') as inf:
for line in inf:
x = line.strip().split('\t')
@@ -31,28 +31,6 @@ def pull_pubchem():
pull_pubchem_labels()
pull_pubchem_synonyms()

def pull_hgnc():
data = pull_via_ftp('ftp.ebi.ac.uk', '/pub/databases/genenames/new/json', 'hgnc_complete_set.json')
hgnc_json = loads(data)
lname = make_local_name('labels', subpath='HGNC')
sname = make_local_name('synonyms', subpath='HGNC')
with open(lname,'w') as lfile, open(sname,'w') as sfile:
for gene in hgnc_json['response']['docs']:
hgnc_id =gene['hgnc_id']
symbol = gene['symbol']
lfile.write(f'{hgnc_id}\t{symbol}\n')
name = gene['name']
sfile.write(f'{hgnc_id}\thttp://www.geneontology.org/formats/oboInOwl#hasExactSynonym\t{name}\n')
if 'alias_symbol' in gene:
alias_symbols = gene['alias_symbol']
for asym in alias_symbols:
sfile.write(f'{hgnc_id}\thttp://www.geneontology.org/formats/oboInOwl#hasRelatedSynonym\t{asym}\n')
if 'alias_name' in gene:
alias_names = gene['alias_name']
for asym in alias_names:
sfile.write(f'{hgnc_id}\thttp://www.geneontology.org/formats/oboInOwl#hasRelatedSynonym\t{asym}\n')


def pull_prot(which,refresh):
#swissname = pull_via_ftplib('ftp.uniprot.org','/pub/databases/uniprot/current_release/knowledgebase/complete/',f'uniprot_{which}.fasta.gz',decompress_data=True,outfilename=f'uniprot_{which}.fasta')
if refresh:
@@ -82,7 +60,7 @@ def pull_prot(which,refresh):

def pull_prots(refresh_swiss=False,refresh_trembl=False):
swiss,labels = pull_prot('sprot',refresh_swiss)
fname = make_local_name('labels', subpath='UNIPROTKB')
fname = make_local_name('pull_prots', subpath='UNIPROTKB/labels')
with open(fname,'w') as synonyms:
for k,v in labels.items():
synonyms.write(f'{k}\t{v}\n')
3 changes: 1 addition & 2 deletions src/datahandlers/ec.py
@@ -1,9 +1,8 @@
from src.prefixes import EC
from src.categories import MOLECULAR_ACTIVITY
from src.babel_utils import pull_via_urllib
from src.babel_utils import make_local_name, pull_via_ftp
from src.babel_utils import make_local_name
import pyoxigraph
from collections import defaultdict


def pull_ec():
3 changes: 3 additions & 0 deletions src/datahandlers/efo.py
@@ -1,4 +1,5 @@
import logging
import os
import re

from src.prefixes import EFO,ORPHANET
@@ -159,6 +160,8 @@ def get_xrefs(self, iri, outfile):

def make_labels(labelfile,synfile):
m = EFOgraph()
os.makedirs(os.path.dirname(labelfile),exist_ok=True)
os.makedirs(os.path.dirname(synfile),exist_ok=True)
m.pull_EFO_labels_and_synonyms(labelfile,synfile)

def make_ids(roots,idfname):
4 changes: 2 additions & 2 deletions src/datahandlers/hgnc.py
@@ -12,8 +12,8 @@ def pull_hgnc():
def pull_hgnc_labels_and_synonyms(infile):
with open(infile,'r') as data:
hgnc_json = json.load(data)
lname = make_local_name('labels', subpath='HGNC')
sname = make_local_name('synonyms', subpath='HGNC')
lname = make_local_name('pull_hgnc_labels_and_synonyms', subpath='HGNC/labels')
sname = make_local_name('pull_hgnc_labels_and_synonyms', subpath='HGNC/synonyms')
with open(lname,'w') as lfile, open(sname,'w') as sfile:
for gene in hgnc_json['response']['docs']:
hgnc_id =gene['hgnc_id']
4 changes: 1 addition & 3 deletions src/datahandlers/hgncfamily.py
@@ -1,6 +1,4 @@
from pronto.utils.io import decompress

from src.babel_utils import make_local_name, pull_via_ftp, pull_via_urllib
from src.babel_utils import pull_via_urllib
from src.prefixes import HGNCFAMILY

def pull_hgncfamily():
4 changes: 2 additions & 2 deletions src/datahandlers/mesh.py
@@ -105,7 +105,7 @@ def pull_mesh_labels(self):
WHERE { ?term rdfs:label ?label }
ORDER BY ?term
"""
ofname = make_local_name('labels', subpath='MESH')
ofname = make_local_name('pull_mesh_labels', subpath='MESH/labels')
qres = self.m.query(s)
with open(ofname, 'w', encoding='utf8') as outf:
for row in list(qres):
@@ -148,7 +148,7 @@ def write_ids(meshmap,outfile,order=['biolink:CellularComponent','biolink:Cell',


# ifname = make_local_name('mesh.nt', subpath='MESH')
# ofname = make_local_name('labels', subpath='MESH')
# ofname = make_local_name('write_ids', subpath='MESH/labels')
# badlines = 0
# with open(ofname, 'w') as outf, open(ifname,'r') as data:
# for line in data:
4 changes: 3 additions & 1 deletion src/datahandlers/mods.py
@@ -22,7 +22,9 @@ def write_labels(dd):
for mod,prefix in modmap.items():
with open(f'{dd}/{prefix}/GENE-DESCRIPTION-JSON_{prefix}.json','r') as inf:
j = json.load(inf)
with open(f'{dd}/{prefix}/labels','w') as outf:

os.makedirs(f'{dd}/{prefix}/labels',exist_ok=True)
with open(f'{dd}/{prefix}/labels/write_labels','w') as outf:
for gene in j['data']:
gid = gene['gene_id'].split(':')[-1]
outf.write(f'{prefix}:{gid}\t{gene["gene_name"]}\n')
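This hunk shows the convention the whole PR moves to: what used to be a single 'labels' file per source becomes a 'labels' directory, with each producing function writing its own file inside, named after itself. Assuming make_local_name simply resolves a name under the configured babel_downloads tree (its usage elsewhere in this diff points that way), the layouts compare roughly as follows — a sketch, not the helper's actual code:

# Before: one flat file per source
#   make_local_name('labels', subpath='MESH')
#     -> babel_downloads/MESH/labels                      (a single file)
#
# After: a directory of per-producer files
#   make_local_name('pull_mesh_labels', subpath='MESH/labels')
#     -> babel_downloads/MESH/labels/pull_mesh_labels

# Consumers then iterate the directory instead of opening one file:
import os
labels_dir = 'babel_downloads/MESH/labels'   # hypothetical path; the real one comes from config
for name in os.listdir(labels_dir):
    path = os.path.join(labels_dir, name)
    if not os.path.isfile(path):
        continue
    with open(path, 'r') as inf:
        for line in inf:
            curie, label = line.rstrip('\n').split('\t')[:2]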
4 changes: 2 additions & 2 deletions src/datahandlers/ncbigene.py
@@ -10,8 +10,8 @@ def pull_ncbigene(filenames):
def pull_ncbigene_labels_synonyms_and_taxa():
# File format described here: https://ftp.ncbi.nlm.nih.gov/gene/DATA/README
ifname = make_local_name('gene_info.gz', subpath='NCBIGene')
labelname = make_local_name('labels', subpath='NCBIGene')
synname = make_local_name('synonyms', subpath='NCBIGene')
labelname = make_local_name('pull_ncbigene_labels_synonyms_and_taxa', subpath='NCBIGene/labels')
synname = make_local_name('pull_ncbigene_labels_synonyms_and_taxa', subpath='NCBIGene/synonyms')
taxaname = make_local_name('taxa', subpath='NCBIGene')
bad_gene_types = {'biological-region', 'other', 'unknown'}
with gzip.open(ifname, 'r') as inf, \
6 changes: 3 additions & 3 deletions src/datahandlers/obo.py
@@ -24,7 +24,7 @@ def pull_uber_labels(expected):
ldict[p].add( ( unit['iri'], unit['label'] ) )
for p in ldict:
if p not in ['http','ro'] and not p.startswith('t') and not '#' in p:
fname = make_local_name('labels',subpath=p)
fname = make_local_name('pull_uber_labels',subpath=p + "/labels")
with open(fname,'w') as outf:
for unit in ldict[p]:
outf.write(f'{unit[0]}\t{unit[1]}\n')
@@ -39,7 +39,7 @@ def pull_uber_descriptions(expected):
ldict[p].add( ( unit['iri'], unit['description'] ) )
for p in ldict:
if p not in ['http','ro'] and not p.startswith('t') and not '#' in p:
fname = make_local_name('descriptions',subpath=p)
fname = make_local_name('pull_uber_descriptions',subpath=p + "/descriptions")
with open(fname,'w') as outf:
for unit in ldict[p]:
outf.write(f'{unit[0]}\t{unit[1]}\n')
@@ -57,7 +57,7 @@ def pull_uber_synonyms(expected):
# we are going to make some zero-length files for it
for p in expected:
if p not in ['http','ro'] and not p.startswith('t') and not '#' in p:
fname = make_local_name('synonyms',subpath=p)
fname = make_local_name('pull_uber_synonyms',subpath=p + "/synonyms")
with open(fname,'w') as outf:
for unit in ldict[p]:
outf.write(f'{unit[0]}\t{unit[1]}\t{unit[2]}\n')
2 changes: 1 addition & 1 deletion src/datahandlers/pantherfamily.py
@@ -1,4 +1,4 @@
from src.babel_utils import make_local_name, pull_via_ftp
from src.babel_utils import pull_via_ftp
from src.prefixes import PANTHERFAMILY

def pull_pantherfamily():