From 9f21c46ac8c6131ae3d791daa13e2f90f1d6f3a6 Mon Sep 17 00:00:00 2001
From: Gaurav Vaidya <gaurav@renci.org>
Date: Fri, 25 Apr 2025 15:38:13 -0400
Subject: [PATCH 1/6] Cleaned up make_local_name() usage; changed labels,
 synonyms and descriptions to dirs.

---
 src/datahandlers/chembl.py        |  2 +-
 src/datahandlers/clo.py           |  1 -
 src/datahandlers/datacollect.py   | 30 ++++--------------------------
 src/datahandlers/ec.py            |  3 +--
 src/datahandlers/hgnc.py          |  4 ++--
 src/datahandlers/hgncfamily.py    |  4 +---
 src/datahandlers/mesh.py          |  4 ++--
 src/datahandlers/ncbigene.py      |  4 ++--
 src/datahandlers/obo.py           |  6 +++---
 src/datahandlers/pantherfamily.py |  2 +-
 src/datahandlers/umls.py          |  8 ++++----
 src/datahandlers/uniprotkb.py     |  4 +---
 12 files changed, 22 insertions(+), 50 deletions(-)

diff --git a/src/datahandlers/chembl.py b/src/datahandlers/chembl.py
index 271ffd9b..48c0d2c3 100644
--- a/src/datahandlers/chembl.py
+++ b/src/datahandlers/chembl.py
@@ -1,5 +1,5 @@
 from src.prefixes import CHEMBLCOMPOUND
-from src.babel_utils import pull_via_ftp, make_local_name
+from src.babel_utils import pull_via_ftp
 import ftplib
 import pyoxigraph
 
diff --git a/src/datahandlers/clo.py b/src/datahandlers/clo.py
index 018f8d44..66167071 100644
--- a/src/datahandlers/clo.py
+++ b/src/datahandlers/clo.py
@@ -4,7 +4,6 @@
 from src.prefixes import CLO
 from src.categories import CELL_LINE
 from src.babel_utils import pull_via_urllib
-from src.babel_utils import make_local_name
 from src.util import Text, LoggingUtil
 import pyoxigraph
 
diff --git a/src/datahandlers/datacollect.py b/src/datahandlers/datacollect.py
index b5137fbd..bd992048 100644
--- a/src/datahandlers/datacollect.py
+++ b/src/datahandlers/datacollect.py
@@ -1,5 +1,5 @@
 from src.ubergraph import UberGraph
-from src.babel_utils import make_local_name, pull_via_ftp
+from src.babel_utils import make_local_name, pull_via_ftp, pull_via_urllib
 from collections import defaultdict
 import os, gzip
 from json import loads,dumps
@@ -8,7 +8,7 @@ def pull_pubchem_labels():
     print('LABEL PUBCHEM')
     f_name =  'CID-Title.gz'
     cname = pull_via_ftp('ftp.ncbi.nlm.nih.gov','/pubchem/Compound/Extras/', f_name, outfilename=f_name)
-    fname = make_local_name('labels', subpath='PUBCHEM.COMPOUND')
+    fname = make_local_name('pull_pubchem_labels', subpath='PUBCHEM.COMPOUND/labels')
     with open(fname, 'w') as outf, gzip.open(cname,mode='rt',encoding='latin-1') as inf:
         for line in inf:
             x = line.strip().split('\t')
@@ -17,7 +17,7 @@ def pull_pubchem_labels():
 def pull_pubchem_synonyms():
     f_name = 'CID-Synonym-filtered.gz'
     sname = pull_via_ftp('ftp.ncbi.nlm.nih.gov', '/pubchem/Compound/Extras/', f_name, outfilename=f_name)
-    fname = make_local_name('synonyms', subpath='PUBCHEM.COMPOUND')
+    fname = make_local_name('pull_pubchem_synonyms', subpath='PUBCHEM.COMPOUND/synonyms')
     with open(fname, 'w') as outf, gzip.open(sname,mode='rt',encoding='latin-1') as inf:
         for line in inf:
             x = line.strip().split('\t')
@@ -31,28 +31,6 @@ def pull_pubchem():
     pull_pubchem_labels()
     pull_pubchem_synonyms()
 
-def pull_hgnc():
-    data = pull_via_ftp('ftp.ebi.ac.uk', '/pub/databases/genenames/new/json', 'hgnc_complete_set.json')
-    hgnc_json = loads(data)
-    lname = make_local_name('labels', subpath='HGNC')
-    sname = make_local_name('synonyms', subpath='HGNC')
-    with open(lname,'w') as lfile, open(sname,'w') as sfile:
-        for gene in hgnc_json['response']['docs']:
-            hgnc_id =gene['hgnc_id']
-            symbol = gene['symbol']
-            lfile.write(f'{hgnc_id}\t{symbol}\n')
-            name = gene['name']
-            sfile.write(f'{hgnc_id}\thttp://www.geneontology.org/formats/oboInOwl#hasExactSynonym\t{name}\n')
-            if 'alias_symbol' in gene:
-                alias_symbols = gene['alias_symbol']
-                for asym in alias_symbols:
-                    sfile.write(f'{hgnc_id}\thttp://www.geneontology.org/formats/oboInOwl#hasRelatedSynonym\t{asym}\n')
-            if 'alias_name' in gene:
-                alias_names = gene['alias_name']
-                for asym in alias_names:
-                    sfile.write(f'{hgnc_id}\thttp://www.geneontology.org/formats/oboInOwl#hasRelatedSynonym\t{asym}\n')
-
-
 def pull_prot(which,refresh):
     #swissname = pull_via_ftplib('ftp.uniprot.org','/pub/databases/uniprot/current_release/knowledgebase/complete/',f'uniprot_{which}.fasta.gz',decompress_data=True,outfilename=f'uniprot_{which}.fasta')
     if refresh:
@@ -82,7 +60,7 @@ def pull_prot(which,refresh):
 
 def pull_prots(refresh_swiss=False,refresh_trembl=False):
     swiss,labels = pull_prot('sprot',refresh_swiss)
-    fname = make_local_name('labels', subpath='UNIPROTKB')
+    fname = make_local_name('pull_prots', subpath='UNIPROTKB/labels')
     with open(fname,'w') as synonyms:
         for k,v in labels.items():
             synonyms.write(f'{k}\t{v}\n')
diff --git a/src/datahandlers/ec.py b/src/datahandlers/ec.py
index 1d77c71b..58f02430 100644
--- a/src/datahandlers/ec.py
+++ b/src/datahandlers/ec.py
@@ -1,9 +1,8 @@
 from src.prefixes import EC
 from src.categories import MOLECULAR_ACTIVITY
 from src.babel_utils import pull_via_urllib
-from src.babel_utils import make_local_name, pull_via_ftp
+from src.babel_utils import make_local_name
 import pyoxigraph
-from collections import defaultdict
 
 
 def pull_ec():
diff --git a/src/datahandlers/hgnc.py b/src/datahandlers/hgnc.py
index 482f9d67..33401029 100644
--- a/src/datahandlers/hgnc.py
+++ b/src/datahandlers/hgnc.py
@@ -12,8 +12,8 @@ def pull_hgnc():
 def pull_hgnc_labels_and_synonyms(infile):
     with open(infile,'r') as data:
         hgnc_json = json.load(data)
-    lname = make_local_name('labels', subpath='HGNC')
-    sname = make_local_name('synonyms', subpath='HGNC')
+    lname = make_local_name('pull_hgnc_labels_and_synonyms', subpath='HGNC/labels')
+    sname = make_local_name('pull_hgnc_labels_and_synonyms', subpath='HGNC/synonyms')
     with open(lname,'w') as lfile, open(sname,'w') as sfile:
         for gene in hgnc_json['response']['docs']:
             hgnc_id =gene['hgnc_id']
diff --git a/src/datahandlers/hgncfamily.py b/src/datahandlers/hgncfamily.py
index cc6f8c13..07fca6fb 100644
--- a/src/datahandlers/hgncfamily.py
+++ b/src/datahandlers/hgncfamily.py
@@ -1,6 +1,4 @@
-from pronto.utils.io import decompress
-
-from src.babel_utils import make_local_name, pull_via_ftp, pull_via_urllib
+from src.babel_utils import pull_via_urllib
 from src.prefixes import HGNCFAMILY
 
 def pull_hgncfamily():
diff --git a/src/datahandlers/mesh.py b/src/datahandlers/mesh.py
index 6c198c4d..f71f9504 100644
--- a/src/datahandlers/mesh.py
+++ b/src/datahandlers/mesh.py
@@ -105,7 +105,7 @@ def pull_mesh_labels(self):
                 WHERE { ?term rdfs:label ?label }
                 ORDER BY ?term
         """
-        ofname = make_local_name('labels', subpath='MESH')
+        ofname = make_local_name('pull_mesh_labels', subpath='MESH/labels')
         qres = self.m.query(s)
         with open(ofname, 'w', encoding='utf8') as outf:
             for row in list(qres):
@@ -148,7 +148,7 @@ def write_ids(meshmap,outfile,order=['biolink:CellularComponent','biolink:Cell',
 
 
 #    ifname = make_local_name('mesh.nt', subpath='MESH')
-#    ofname = make_local_name('labels', subpath='MESH')
+#    ofname = make_local_name('MESH', subpath='MESH/labels')
 #    badlines = 0
 #    with open(ofname, 'w') as outf, open(ifname,'r') as data:
 #        for line in data:
diff --git a/src/datahandlers/ncbigene.py b/src/datahandlers/ncbigene.py
index eacfb163..a2648ee1 100644
--- a/src/datahandlers/ncbigene.py
+++ b/src/datahandlers/ncbigene.py
@@ -10,8 +10,8 @@ def pull_ncbigene(filenames):
 def pull_ncbigene_labels_synonyms_and_taxa():
     # File format described here: https://ftp.ncbi.nlm.nih.gov/gene/DATA/README
     ifname = make_local_name('gene_info.gz', subpath='NCBIGene')
-    labelname = make_local_name('labels', subpath='NCBIGene')
-    synname = make_local_name('synonyms', subpath='NCBIGene')
+    labelname = make_local_name('pull_ncbigene_labels_synonyms_and_taxa', subpath='NCBIGene/labels')
+    synname = make_local_name('pull_ncbigene_labels_synonyms_and_taxa', subpath='NCBIGene/synonyms')
     taxaname = make_local_name('taxa', subpath='NCBIGene')
     bad_gene_types = {'biological-region', 'other', 'unknown'}
     with gzip.open(ifname, 'r') as inf, \
diff --git a/src/datahandlers/obo.py b/src/datahandlers/obo.py
index 6b37e052..37f82341 100644
--- a/src/datahandlers/obo.py
+++ b/src/datahandlers/obo.py
@@ -24,7 +24,7 @@ def pull_uber_labels(expected):
         ldict[p].add( ( unit['iri'], unit['label'] ) )
     for p in ldict:
         if p not in ['http','ro'] and not p.startswith('t') and not '#' in p:
-            fname = make_local_name('labels',subpath=p)
+            fname = make_local_name('pull_uber_labels',subpath=p + "/labels")
             with open(fname,'w') as outf:
                 for unit in ldict[p]:
                     outf.write(f'{unit[0]}\t{unit[1]}\n')
@@ -39,7 +39,7 @@ def pull_uber_descriptions(expected):
         ldict[p].add( ( unit['iri'], unit['description'] ) )
     for p in ldict:
         if p not in ['http','ro'] and not p.startswith('t') and not '#' in p:
-            fname = make_local_name('descriptions',subpath=p)
+            fname = make_local_name('pull_uber_descriptions',subpath=p + "/descriptions")
             with open(fname,'w') as outf:
                 for unit in ldict[p]:
                     outf.write(f'{unit[0]}\t{unit[1]}\n')
@@ -57,7 +57,7 @@ def pull_uber_synonyms(expected):
     # we are going to make some zero-length files for it
     for p in expected:
         if p not in ['http','ro'] and not p.startswith('t') and not '#' in p:
-            fname = make_local_name('synonyms',subpath=p)
+            fname = make_local_name('pull_uber_synonyms',subpath=p + "/synonyms")
             with open(fname,'w') as outf:
                 for unit in ldict[p]:
                     outf.write(f'{unit[0]}\t{unit[1]}\t{unit[2]}\n')
diff --git a/src/datahandlers/pantherfamily.py b/src/datahandlers/pantherfamily.py
index f4a0c596..6cfea38a 100644
--- a/src/datahandlers/pantherfamily.py
+++ b/src/datahandlers/pantherfamily.py
@@ -1,4 +1,4 @@
-from src.babel_utils import make_local_name, pull_via_ftp
+from src.babel_utils import pull_via_ftp
 from src.prefixes import PANTHERFAMILY
 
 def pull_pantherfamily():
diff --git a/src/datahandlers/umls.py b/src/datahandlers/umls.py
index cd852b68..8a712335 100644
--- a/src/datahandlers/umls.py
+++ b/src/datahandlers/umls.py
@@ -305,8 +305,8 @@ def pull_umls(mrconso):
     """Run through MRCONSO.RRF creating label and synonym files for UMLS and SNOMEDCT"""
     rows = defaultdict(list)
     priority = read_umls_priority()
-    snomed_label_name = make_local_name('labels', subpath='SNOMEDCT')
-    snomed_syn_name = make_local_name('synonyms', subpath='SNOMEDCT')
+    snomed_label_name = make_local_name('pull_umls', subpath='SNOMEDCT/labels')
+    snomed_syn_name = make_local_name('pull_umls', subpath='SNOMEDCT/synonyms')
     with open(mrconso, 'r') as inf, open(snomed_label_name,'w') as snolabels, open(snomed_syn_name,'w') as snosyns:
         for line in inf:
             if not check_mrconso_line(line):
@@ -335,8 +335,8 @@ def pull_umls(mrconso):
                 #print(pkey)
                 pri = 1000000
             rows[cui].append( (pri,term,line) )
-    lname = make_local_name('labels', subpath='UMLS')
-    sname = make_local_name('synonyms', subpath='UMLS')
+    lname = make_local_name('pull_umls', subpath='UMLS/labels')
+    sname = make_local_name('pull_umls', subpath='UMLS/synonyms')
     re_numerical = re.compile(r"^\s*[+-]*[\d\.]+\s*$")
     with open(lname,'w') as labels, open(sname,'w') as synonyms:
         for cui,crows in rows.items():
diff --git a/src/datahandlers/uniprotkb.py b/src/datahandlers/uniprotkb.py
index cb65d33e..3c8a7c74 100644
--- a/src/datahandlers/uniprotkb.py
+++ b/src/datahandlers/uniprotkb.py
@@ -3,9 +3,7 @@
 import os
 
 import requests
-from requests import request
-
-from src.babel_utils import pull_via_urllib, make_local_name, pull_via_wget
+from src.babel_utils import make_local_name
 
 
 def readlabels(which):

From 04e5f3d4e3a08693bf13413b2d55627892653e64 Mon Sep 17 00:00:00 2001
From: Gaurav Vaidya <gaurav@renci.org>
Date: Fri, 25 Apr 2025 15:48:40 -0400
Subject: [PATCH 2/6] Standardized how we call NodeFactory().

---
 src/createcompendia/drugchemical.py  | 2 +-
 src/createcompendia/leftover_umls.py | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/createcompendia/drugchemical.py b/src/createcompendia/drugchemical.py
index 6c851941..1567d455 100644
--- a/src/createcompendia/drugchemical.py
+++ b/src/createcompendia/drugchemical.py
@@ -378,7 +378,7 @@ def build_conflation(manual_concord_filename, rxn_concord, umls_concord, pubchem
     glom(gloms, pairs_to_be_glommed)
 
     # Set up a NodeFactory.
-    nodefactory = NodeFactory('', get_config()['biolink_version'])
+    nodefactory = NodeFactory(make_local_name(''), get_config()['biolink_version'])
 
     # Write out all the resulting cliques.
     written = set()
diff --git a/src/createcompendia/leftover_umls.py b/src/createcompendia/leftover_umls.py
index f1d2bfb3..4fe6943f 100644
--- a/src/createcompendia/leftover_umls.py
+++ b/src/createcompendia/leftover_umls.py
@@ -6,6 +6,7 @@
 from snakemake.logging import Logger
 from bmt import Toolkit
 
+from src.babel_utils import make_local_name
 from src.node import NodeFactory
 from src.datahandlers import umls
 from src.prefixes import UMLS
@@ -206,7 +207,7 @@ def umls_type_to_biolink_type(umls_tui):
         reportf.write(f"Collected synonyms for {len(synonyms_by_id)} UMLS IDs into the leftover UMLS synonyms file.\n")
 
         # Write out synonyms to synonym file.
-        node_factory = NodeFactory('babel_downloads/UMLS/labels', biolink_version)
+        node_factory = NodeFactory(make_local_name(''), biolink_version)
         count_synonym_objs = 0
         with jsonlines.open(umls_synonyms, 'w') as umls_synonymsf:
             for id in synonyms_by_id:

From 1c1d353a44b28631bf71a715cfd2a5969ba606e5 Mon Sep 17 00:00:00 2001
From: Gaurav Vaidya <gaurav@renci.org>
Date: Fri, 25 Apr 2025 15:49:14 -0400
Subject: [PATCH 3/6] Rewrote write_mods_ids() to use `labels` as a directory.

---
 src/createcompendia/gene.py | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/src/createcompendia/gene.py b/src/createcompendia/gene.py
index 1c89da02..39a6c4f6 100644
--- a/src/createcompendia/gene.py
+++ b/src/createcompendia/gene.py
@@ -18,10 +18,16 @@
 
 def write_mods_ids(dd,id,modlist):
     for mod in modlist:
-        with open(f'{dd}/{mod}/labels','r') as inf, open(f'{id}/gene/ids/{mod}','w') as outf:
-            for line in inf:
-                x = line.split('\t')[0]
-                outf.write(f'{x}\n')
+        with open(f'{id}/gene/ids/{mod}','w') as outf:
+            for labelfile in os.listdir(f'{dd}/{mod}/labels'):
+                labelfile_path = f'{dd}/{mod}/labels/{labelfile}'
+                if not os.path.isfile(labelfile_path):
+                    # Skip anything that is not a regular file (e.g. subdirectories).
+                    continue
+                with open(labelfile_path,'r') as inf:
+                    for line in inf:
+                        x = line.split('\t')[0]
+                        outf.write(f'{x}\n')
 
 def build_gene_ensembl_relationships(ensembl_dir, outfile):
     """Loop over all the ensembl species.  Find any protein-coding gene"""

From c905dd6b7edccd6e082f081a477b7bd8617b38ae Mon Sep 17 00:00:00 2001
From: Gaurav Vaidya <gaurav@renci.org>
Date: Fri, 25 Apr 2025 15:50:17 -0400
Subject: [PATCH 4/6] Updated/cleaned up make_local_name() change.

---
 src/createcompendia/drugchemical.py | 2 +-
 src/datahandlers/mesh.py            | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/createcompendia/drugchemical.py b/src/createcompendia/drugchemical.py
index 1567d455..f80d94e8 100644
--- a/src/createcompendia/drugchemical.py
+++ b/src/createcompendia/drugchemical.py
@@ -5,7 +5,7 @@
 from src.categories import (CHEMICAL_ENTITY, DRUG, MOLECULAR_MIXTURE, FOOD, COMPLEX_MOLECULAR_MIXTURE,
                             SMALL_MOLECULE, NUCLEIC_ACID_ENTITY, MOLECULAR_ENTITY, FOOD_ADDITIVE,
                             ENVIRONMENTAL_FOOD_CONTAMINANT, PROCESSED_MATERIAL, CHEMICAL_MIXTURE, POLYPEPTIDE)
-from src.babel_utils import glom, get_numerical_curie_suffix
+from src.babel_utils import glom, get_numerical_curie_suffix, make_local_name
 from collections import defaultdict
 import os,json
 
diff --git a/src/datahandlers/mesh.py b/src/datahandlers/mesh.py
index f71f9504..d16bbc59 100644
--- a/src/datahandlers/mesh.py
+++ b/src/datahandlers/mesh.py
@@ -148,7 +148,7 @@ def write_ids(meshmap,outfile,order=['biolink:CellularComponent','biolink:Cell',
 
 
 #    ifname = make_local_name('mesh.nt', subpath='MESH')
-#    ofname = make_local_name('MESH', subpath='MESH/labels')
+#    ofname = make_local_name('write_ids', subpath='MESH/labels')
 #    badlines = 0
 #    with open(ofname, 'w') as outf, open(ifname,'r') as data:
 #        for line in data:

From e50995ced5526459a3e3b4d9a700dd91fa83b572 Mon Sep 17 00:00:00 2001
From: Gaurav Vaidya <gaurav@renci.org>
Date: Fri, 25 Apr 2025 15:51:42 -0400
Subject: [PATCH 5/6] Updated write_labels() to write a labels directory.

---
 src/datahandlers/mods.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/datahandlers/mods.py b/src/datahandlers/mods.py
index 3de6672b..82da5804 100644
--- a/src/datahandlers/mods.py
+++ b/src/datahandlers/mods.py
@@ -22,7 +22,9 @@ def write_labels(dd):
     for mod,prefix in modmap.items():
         with open(f'{dd}/{prefix}/GENE-DESCRIPTION-JSON_{prefix}.json','r') as inf:
             j = json.load(inf)
-        with open(f'{dd}/{prefix}/labels','w') as outf:
+
+        os.makedirs(f'{dd}/{prefix}/labels',exist_ok=True)
+        with open(f'{dd}/{prefix}/labels/write_labels','w') as outf:
             for gene in j['data']:
                 gid = gene['gene_id'].split(':')[-1]
                 outf.write(f'{prefix}:{gid}\t{gene["gene_name"]}\n')
\ No newline at end of file

From 728f60497353f5c7bd42269c94fdde0c3efc3b49 Mon Sep 17 00:00:00 2001
From: Gaurav Vaidya <gaurav@renci.org>
Date: Fri, 25 Apr 2025 16:19:18 -0400
Subject: [PATCH 6/6] Converted more label files into label directories.

---
 src/createcompendia/chemicals.py     | 100 ++++++++++++++++-----------
 src/datahandlers/efo.py              |   3 +
 src/snakefiles/anatomy.snakefile     |   4 +-
 src/snakefiles/cell_line.snakefile   |   2 +-
 src/snakefiles/chemical.snakefile    |  20 +++---
 src/snakefiles/datacollect.snakefile |  10 +--
 6 files changed, 79 insertions(+), 60 deletions(-)

diff --git a/src/createcompendia/chemicals.py b/src/createcompendia/chemicals.py
index 6acd293f..0d81e0d8 100644
--- a/src/createcompendia/chemicals.py
+++ b/src/createcompendia/chemicals.py
@@ -1,4 +1,5 @@
 import logging
+import os
 from collections import defaultdict
 import jsonlines
 import requests
@@ -74,35 +75,40 @@ def build_chemical_umls_relationships(mrconso, idfile,outfile):
 def build_chemical_rxnorm_relationships(conso, idfile,outfile):
     umls.build_sets(conso, idfile, outfile, {'MSH': MESH,  'DRUGBANK': DRUGBANK}, cui_prefix=RXCUI)
 
-def write_pubchem_ids(labelfile,smilesfile,outfile):
+def write_pubchem_ids(labeldir, smilesfile, outfile):
     #Trying to be memory efficient here.  We could just ingest the whole smilesfile which would make this code easier
     # but since they're already sorted, let's give it a shot
-    with open(labelfile,'r') as inlabels, gzip.open(smilesfile, 'rt', encoding='utf-8') as insmiles, open(outfile,'w') as outf:
-        sn = -1
-        flag_file_ended = False
-        for labelline in inlabels:
-            x = labelline.split('\t')[0]
-            pn = int(x.split(':')[-1])
-            while not flag_file_ended and sn < pn:
-                line = insmiles.readline()
-                if line == '':
-                    # Get this: a blank line in readline() means that we've reached the end-of-file.
-                    # (A '\n' would indicate that we've just read a blank line.)
-                    flag_file_ended = True
-                    break
-                smiline = line.strip().split('\t')
-                if len(smiline) != 2:
-                    raise RuntimeError(f"Could not parse line from {smilesfile}: '{line}'")
-                sn = int(smiline[0])
-
-            if sn == pn:
-                #We have a smiles for this id
-                stype = get_type_from_smiles(smiline[1])
-                outf.write(f'{x}\t{stype}\n')
-            else:
-                #sn > pn, we went past it.  No smiles for that
-                print('no smiles:',x,pn,sn)
-                outf.write(f'{x}\t{CHEMICAL_ENTITY}\n')
+    with gzip.open(smilesfile, 'rt', encoding='utf-8') as insmiles, open(outfile, 'w') as outf:
+        sn = -1  # Last CID read from smilesfile; the SMILES handle is shared by every label file, so this must NOT be reset per file.
+        flag_file_ended = False  # Set once smilesfile hits EOF; it stays exhausted for all remaining label files.
+        for labelfile in sorted(os.listdir(labeldir)):  # Deterministic order; the merge assumes CIDs ascend across files as well as within them.
+            labelpath = os.path.join(labeldir, labelfile)
+            if not os.path.isfile(labelpath):
+                continue
+            with open(labelpath, 'r') as inlabels:
+                for labelline in inlabels:
+                    x = labelline.split('\t')[0]
+                    pn = int(x.split(':')[-1])
+                    while not flag_file_ended and sn < pn:
+                        line = insmiles.readline()
+                        if line == '':
+                            # An empty string from readline() means that we've reached the end-of-file.
+                            # (A '\n' would indicate that we've just read a blank line.)
+                            flag_file_ended = True
+                            break
+                        smiline = line.strip().split('\t')
+                        if len(smiline) != 2:
+                            raise RuntimeError(f"Could not parse line from {smilesfile}: '{line}'")
+                        sn = int(smiline[0])
+
+                    if sn == pn:
+                        #We have a smiles for this id
+                        stype = get_type_from_smiles(smiline[1])
+                        outf.write(f'{x}\t{stype}\n')
+                    else:
+                        #sn > pn, we went past it.  No smiles for that
+                        print('no smiles:',x,pn,sn)
+                        outf.write(f'{x}\t{CHEMICAL_ENTITY}\n')
 
 
 def write_mesh_ids(outfile):
@@ -209,20 +215,25 @@ def write_drugbank_ids(infile,outfile):
                 outf.write(f'{dbid}\t{CHEMICAL_ENTITY}\n')
                 written.add(x[2])
 
-def write_chemical_ids_from_labels_and_smiles(labelfile,smifile,outfile):
+def write_chemical_ids_from_labels_and_smiles(labeldir,smifile,outfile):
     smiles = {}
     with open(smifile,'r') as inf:
         for line in inf:
             x = line.strip().split('\t')
             smiles[x[0]] = x[1]
-    with open(labelfile,'r') as inf, open(outfile,'w') as outf:
-        for line in inf:
-            hmdbid = line.split('\t')[0]
-            if hmdbid in smiles:
-                ctype = get_type_from_smiles(smiles[hmdbid])
-            else:
-                ctype = CHEMICAL_ENTITY
-            outf.write(f'{hmdbid}\t{ctype}\n')
+    with open(outfile,'w') as outf:
+        for labelfile in os.listdir(labeldir):
+            labelpath = os.path.join(labeldir,labelfile)
+            if not os.path.isfile(labelpath):
+                continue
+            with open(labelpath,'r') as inf:
+                for line in inf:
+                    hmdbid = line.split('\t')[0]
+                    if hmdbid in smiles:
+                        ctype = get_type_from_smiles(smiles[hmdbid])
+                    else:
+                        ctype = CHEMICAL_ENTITY
+                    outf.write(f'{hmdbid}\t{ctype}\n')
 
 
 def parse_smifile(infile,outfile,smicol,idcol,pref,stripquotes=False):
@@ -363,17 +374,22 @@ def make_pubchem_cas_concord(pubchemsynonyms, outfile):
             if is_cas(x[1]):
                 outf.write(f'{x[0]}\txref\tCAS:{x[1]}\n')
 
-def make_pubchem_mesh_concord(pubcheminput,meshlabels,outfile):
+def make_pubchem_mesh_concord(pubcheminput,meshlabelsdir,outfile):
     mesh_label_to_id={}
     #Meshlabels has all kinds of stuff. e.g. these are both in there:
     #MESH:D014867    Water
     #MESH:M0022883   Water
     #but we only want the ones that are MESH:D... or MESH:C....
-    with open(meshlabels,'r') as inf:
-        for line in inf:
-            x = line.strip().split('\t')
-            if x[0].split(':')[-1][0] in ['C','D']:
-                mesh_label_to_id[x[1]] = x[0]
+    for meshlabelsfile in os.listdir(meshlabelsdir):
+        meshlabels = os.path.join(meshlabelsdir,meshlabelsfile)
+        if not os.path.isfile(meshlabels):
+            continue
+        with open(meshlabels,'r') as inf:
+            for line in inf:
+                x = line.strip().split('\t')
+                if x[0].split(':')[-1][0] in ['C','D']:
+                    mesh_label_to_id[x[1]] = x[0]
+
     #The pubchem - mesh pairs are supposed to be ordered in this file such that the
     # first mapping is the 'best' i.e. the one most frequently reported.
     # We will only use the first one
diff --git a/src/datahandlers/efo.py b/src/datahandlers/efo.py
index 03fd59f1..ec81b412 100644
--- a/src/datahandlers/efo.py
+++ b/src/datahandlers/efo.py
@@ -1,4 +1,5 @@
 import logging
+import os
 import re
 
 from src.prefixes import EFO,ORPHANET
@@ -159,6 +160,8 @@ def get_xrefs(self, iri, outfile):
 
 def make_labels(labelfile,synfile):
     m = EFOgraph()
+    os.makedirs(os.path.dirname(labelfile),exist_ok=True)
+    os.makedirs(os.path.dirname(synfile),exist_ok=True)
     m.pull_EFO_labels_and_synonyms(labelfile,synfile)
 
 def make_ids(roots,idfname):
diff --git a/src/snakefiles/anatomy.snakefile b/src/snakefiles/anatomy.snakefile
index 571225d0..e04b731d 100644
--- a/src/snakefiles/anatomy.snakefile
+++ b/src/snakefiles/anatomy.snakefile
@@ -69,8 +69,8 @@ rule get_anatomy_umls_relationships:
 
 rule anatomy_compendia:
     input:
-        labels=expand("{dd}/{ap}/labels",dd=config['download_directory'],ap=config['anatomy_prefixes']),
-        synonyms=expand("{dd}/{ap}/synonyms",dd=config['download_directory'],ap=config['anatomy_prefixes']),
+        labels=directory(expand("{dd}/{ap}/labels",dd=config['download_directory'],ap=config['anatomy_prefixes'])),
+        synonyms=directory(expand("{dd}/{ap}/synonyms",dd=config['download_directory'],ap=config['anatomy_prefixes'])),
         concords=expand("{dd}/anatomy/concords/{ap}",dd=config['intermediate_directory'],ap=config['anatomy_concords']),
         idlists=expand("{dd}/anatomy/ids/{ap}",dd=config['intermediate_directory'],ap=config['anatomy_ids']),
         icrdf_filename=config['download_directory']+'/icRDF.tsv',
diff --git a/src/snakefiles/cell_line.snakefile b/src/snakefiles/cell_line.snakefile
index b8e72965..e9e4593f 100644
--- a/src/snakefiles/cell_line.snakefile
+++ b/src/snakefiles/cell_line.snakefile
@@ -22,7 +22,7 @@ rule get_clo_ids:
 rule cell_line_compendia:
     input:
         ids=config['intermediate_directory']+"/cell_line/ids/CLO",
-        labelfile=config['download_directory'] + '/CLO/labels',
+        labelfile=directory(config['download_directory'] + '/CLO/labels'),
         synonymfile=config['download_directory'] + '/CLO/synonyms',
         icrdf_filename=config['download_directory']+'/icRDF.tsv',
     output:
diff --git a/src/snakefiles/chemical.snakefile b/src/snakefiles/chemical.snakefile
index 249fe30c..1d85c8c1 100644
--- a/src/snakefiles/chemical.snakefile
+++ b/src/snakefiles/chemical.snakefile
@@ -28,23 +28,23 @@ rule chemical_mesh_ids:
 
 rule chemical_pubchem_ids:
     input:
-        infile=config['download_directory']+"/PUBCHEM.COMPOUND/labels",
+        labelsdir=directory(config['download_directory']+"/PUBCHEM.COMPOUND/labels"),
         smilesfile=config['download_directory']+"/PUBCHEM.COMPOUND/CID-SMILES.gz"
     output:
         outfile=config['intermediate_directory']+"/chemicals/ids/PUBCHEM.COMPOUND"
     run:
         #This one is a simple enough transform to do with awk
-        chemicals.write_pubchem_ids(input.infile,input.smilesfile,output.outfile)
+        chemicals.write_pubchem_ids(input.labelsdir,input.smilesfile,output.outfile)
         #"awk '{{print $1\"\tbiolink:ChemicalSubstance\"}}' {input.infile} > {output.outfile}"
 
 rule chemical_chembl_ids:
     input:
-        labelfile=config['download_directory']+"/CHEMBL.COMPOUND/labels",
+        labeldir=directory(config['download_directory']+"/CHEMBL.COMPOUND/labels"),
         smifile  =config['download_directory'] + "/CHEMBL.COMPOUND/smiles"
     output:
         outfile=config['intermediate_directory']+"/chemicals/ids/CHEMBL.COMPOUND"
     run:
-        chemicals.write_chemical_ids_from_labels_and_smiles(input.labelfile,input.smifile,output.outfile)
+        chemicals.write_chemical_ids_from_labels_and_smiles(input.labeldir,input.smifile,output.outfile)
 
 rule chemical_gtopdb_ids:
     input:
@@ -56,7 +56,7 @@ rule chemical_gtopdb_ids:
 
 rule chemical_kegg_ids:
     input:
-        infile=config['download_directory']+"/KEGG.COMPOUND/labels"
+        infile=config['download_directory']+"/KEGG.COMPOUND/labels/pull_kegg_compound_labels"
     output:
         outfile=config['intermediate_directory']+"/chemicals/ids/KEGG.COMPOUND"
     shell:
@@ -73,12 +73,12 @@ rule chemical_unii_ids:
 
 rule chemical_hmdb_ids:
     input:
-        labelfile=config['download_directory']+"/HMDB/labels",
+        labeldir=directory(config['download_directory']+"/HMDB/labels"),
         smifile=config['download_directory'] + "/HMDB/smiles"
     output:
         outfile=config['intermediate_directory']+"/chemicals/ids/HMDB"
     run:
-        chemicals.write_chemical_ids_from_labels_and_smiles(input.labelfile,input.smifile,output.outfile)
+        chemicals.write_chemical_ids_from_labels_and_smiles(input.labeldir,input.smifile,output.outfile)
 
 rule chemical_drugcentral_ids:
     input:
@@ -159,11 +159,11 @@ rule get_chemical_unichem_relationships:
 rule get_chemical_pubchem_mesh_concord:
     input:
         pubchemfile=config['download_directory'] + '/PUBCHEM.COMPOUND/CID-MeSH',
-        meshlabels=config['download_directory'] + '/MESH/labels'
+        meshlabelsdir=directory(config['download_directory'] + '/MESH/labels')
     output:
         outfile =  config['intermediate_directory'] + '/chemicals/concords/PUBCHEM_MESH'
     run:
-        chemicals.make_pubchem_mesh_concord(input.pubchemfile,input.meshlabels,output.outfile)
+        chemicals.make_pubchem_mesh_concord(input.pubchemfile,input.meshlabelsdir,output.outfile)
 
 rule get_chemical_pubchem_cas_concord:
     input:
@@ -201,7 +201,7 @@ rule chemical_unichem_concordia:
 
 rule untyped_chemical_compendia:
     input:
-        labels=expand("{dd}/{ap}/labels",dd=config['download_directory'],ap=config['chemical_labels']),
+        labeldirs=directory(expand("{dd}/{ap}/labels",dd=config['download_directory'],ap=config['chemical_labels'])),
         synonyms=expand("{dd}/{ap}/synonyms",dd=config['download_directory'],ap=config['chemical_synonyms']),
         unichemgroup = config['intermediate_directory']+'/chemicals/partials/UNICHEM',
         concords = expand('{dd}/chemicals/concords/{cc}',dd=config['intermediate_directory'], cc=config['chemical_concords'] ),
diff --git a/src/snakefiles/datacollect.snakefile b/src/snakefiles/datacollect.snakefile
index 4bb708dd..8e517601 100644
--- a/src/snakefiles/datacollect.snakefile
+++ b/src/snakefiles/datacollect.snakefile
@@ -54,8 +54,8 @@ rule get_EFO_labels:
     input:
         infile=config['download_directory'] + '/EFO/efo.owl'
     output:
-        labelfile=config['download_directory'] + '/EFO/labels',
-        synonymfile =config['download_directory'] + '/EFO/synonyms'
+        labelfile=config['download_directory'] + '/EFO/labels/get_EFO_labels',
+        synonymfile =config['download_directory'] + '/EFO/synonyms/get_EFO_labels'
     run:
         efo.make_labels(output.labelfile,output.synonymfile)
 
@@ -89,7 +89,7 @@ rule get_mods_labels:
     input:
         expand("{download_directory}/{mod}/GENE-DESCRIPTION-JSON_{mod}.json",download_directory=config['download_directory'], mod=config['mods']),
     output:
-        expand("{download_directory}/{mod}/labels",download_directory=config['download_directory'], mod=config['mods']),
+        expand("{download_directory}/{mod}/labels/write_labels",download_directory=config['download_directory'], mod=config['mods']),
     run:
         mods.write_labels(config['download_directory'])
 
@@ -118,7 +118,7 @@ rule get_uniprotkb_labels:
         sprot_input=config['download_directory']+'/UniProtKB/uniprot_sprot.fasta',
         trembl_input=config['download_directory']+'/UniProtKB/uniprot_trembl.fasta',
     output:
-        outfile=config['download_directory']+'/UniProtKB/labels'
+        outfile=config['download_directory']+'/UniProtKB/labels/pull_uniprot_labels'
     run:
         uniprotkb.pull_uniprot_labels(input.sprot_input,input.trembl_input,output.outfile)
 
@@ -471,7 +471,7 @@ rule gtopdb_labels_and_synonyms:
 
 rule keggcompound_labels:
     output:
-        labelfile=config['download_directory'] + '/KEGG.COMPOUND/labels'
+        labelfile=config['download_directory'] + '/KEGG.COMPOUND/labels/pull_kegg_compound_labels'
     run:
         kegg.pull_kegg_compound_labels(output.labelfile)