From ebabe70fd357c200f6756e41ce0b1080cf685b04 Mon Sep 17 00:00:00 2001 From: Jason Reilly Date: Wed, 30 Jul 2025 18:14:18 -0400 Subject: [PATCH 01/28] adding omykiss_gene_ensembl --- config.json | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/config.json b/config.json index eb602cb0..89373efd 100644 --- a/config.json +++ b/config.json @@ -133,7 +133,8 @@ "hgfemale_gene_ensembl", "charengus_gene_ensembl", "otshawytscha_gene_ensembl", - "aocellaris_gene_ensembl" + "aocellaris_gene_ensembl", + "omykiss_gene_ensembl" ], "duckdb_config": { From 90786c252f486237c1cc79bde6fb163a60c25db2 Mon Sep 17 00:00:00 2001 From: Jason Reilly Date: Wed, 30 Jul 2025 18:14:50 -0400 Subject: [PATCH 02/28] formatting --- tests/test_glom.py | 54 +++++++++++++++++++++++++--------------------- 1 file changed, 29 insertions(+), 25 deletions(-) diff --git a/tests/test_glom.py b/tests/test_glom.py index d90b8669..db6f84fc 100644 --- a/tests/test_glom.py +++ b/tests/test_glom.py @@ -3,54 +3,58 @@ """glom is a tool that looks at list of sets of values and combines them together if they share members""" + def test_uberon(): - uberon=[('UBERON:123',)] - dict={} - glom(dict,uberon,unique_prefixes='UBERON') - uber2 = [set(['UBERON:123','SOME:other'])] - glom(dict,uber2,unique_prefixes='UBERON') + uberon = [("UBERON:123",)] + dict = {} + glom(dict, uberon, unique_prefixes="UBERON") + uber2 = [{"UBERON:123", "SOME:other"}] + glom(dict, uber2, unique_prefixes="UBERON") print(dict) + def test_simple(): """Given 3 sets, 2 of which share a member, output 2 sets, with the sharing sets combined""" d = {} - eqs = [('1','2'), ('2','3'), ('4','5')] - glom(d,eqs) + eqs = [("1", "2"), ("2", "3"), ("4", "5")] + glom(d, eqs) + print(f"{d}") assert len(d) == 5 - assert d['1'] == d['2'] == d['3'] == {'1','2','3'} - assert d['4'] == d['5'] == {'4','5'} + assert d["1"] == d["2"] == d["3"] == {"1", "2", "3"} + assert d["4"] == d["5"] == {"4", "5"} + def test_two_calls(): """Test using glom iteratively. The first call joins the first two sets, then the second call joins the next two and the new set.""" d = {} - eqs = [('1','2'), ('2','3'), ('4','5'), ('6','7')] - oeqs = [('5','7')] - glom(d,eqs) - glom(d,oeqs) - assert d['1']==d['2']==d['3']=={'1','2','3'} - assert d['4']==d['5']==d['6']==d['7']=={'4','5','6','7'} + eqs = [("1", "2"), ("2", "3"), ("4", "5"), ("6", "7")] + oeqs = [("5", "7")] + glom(d, eqs) + glom(d, oeqs) + assert d["1"] == d["2"] == d["3"] == {"1", "2", "3"} + assert d["4"] == d["5"] == d["6"] == d["7"] == {"4", "5", "6", "7"} + def test_sets(): """Test using set() as opposed to {}""" d = {} - eqs = [{'1','2'}, set(['2','3']), set(['4','5']), set(['6','7'])] - oeqs = [{'5','7'}] - glom(d,eqs) - glom(d,oeqs) - assert d['1']==d['2']==d['3']=={'1','2','3'} - assert d['4']==d['5']==d['6']==d['7']=={'4','5','6','7'} + eqs = [{"1", "2"}, {"2", "3"}, {"4", "5"}, {"6", "7"}] + oeqs = [{"5", "7"}] + glom(d, eqs) + glom(d, oeqs) + assert d["1"] == d["2"] == d["3"] == {"1", "2", "3"} + assert d["4"] == d["5"] == d["6"] == d["7"] == {"4", "5", "6", "7"} + def test_bigger_sets(): """Test when the sets have more than two members. As of recent builds, we no longer expect this to work. 
Now glom only operates on new pairwise sets""" d = {} - eqs = [{'1','2','3'}, {'4','5','6'} ] + eqs = [{"1", "2", "3"}, {"4", "5", "6"}] try: - glom(d,eqs) + glom(d, eqs) assert False except ValueError: assert True - - From 3bae93d1e39aea916c44e921953ff65ea5c3e674 Mon Sep 17 00:00:00 2001 From: Jason Reilly Date: Wed, 30 Jul 2025 18:15:40 -0400 Subject: [PATCH 03/28] adding and 'datacollect' target, using rust on occasion --- src/snakefiles/datacollect.snakefile | 223 ++++++++++++++++++++++----- 1 file changed, 184 insertions(+), 39 deletions(-) diff --git a/src/snakefiles/datacollect.snakefile b/src/snakefiles/datacollect.snakefile index 89e61ac2..1f5d923f 100644 --- a/src/snakefiles/datacollect.snakefile +++ b/src/snakefiles/datacollect.snakefile @@ -56,8 +56,10 @@ rule get_EFO_labels: output: labelfile=config['download_directory'] + '/EFO/labels', synonymfile =config['download_directory'] + '/EFO/synonyms' - run: - efo.make_labels(output.labelfile,output.synonymfile) + # run: + # efo.make_labels(output.labelfile,output.synonymfile) # 21 seconds + shell: + "./babel_io/target/release/create_efo_labels --input {input.infile} --labels-output {output.labelfile} --synonyms-output {output.synonymfile}" ### Complex Portal # https://www.ebi.ac.uk/complexportal/ @@ -74,20 +76,22 @@ rule get_complexportal_labels_and_synonyms: output: lfile = config['download_directory']+'/ComplexPortal'+'/559292_labels.tsv', sfile = config['download_directory']+'/ComplexPortal'+'/559292_synonyms.tsv' - run: - complexportal.make_labels_and_synonyms(input.infile, output.lfile, output.sfile) + # run: + # complexportal.make_labels_and_synonyms(input.infile, output.lfile, output.sfile) + shell: + "./babel_io/target/release/create_complexportal_labels_and_synonyms --input {input.infile} --labels-output {output.lfile} --synonyms-output {output.sfile}" ### MODS rule get_mods: output: - expand("{download_directory}/{mod}/GENE-DESCRIPTION-JSON_{mod}.json", download_directory = config['download_directory'], mod = config['mods']), + expand("{download_directory}/{mod}/GENE-DESCRIPTION-JSON_{mod}_9.json", download_directory = config['download_directory'], mod = config['mods']), run: mods.pull_mods() rule get_mods_labels: input: - expand("{download_directory}/{mod}/GENE-DESCRIPTION-JSON_{mod}.json",download_directory=config['download_directory'], mod=config['mods']), + expand("{download_directory}/{mod}/GENE-DESCRIPTION-JSON_{mod}_9.json",download_directory=config['download_directory'], mod=config['mods']), output: expand("{download_directory}/{mod}/labels",download_directory=config['download_directory'], mod=config['mods']), run: @@ -119,8 +123,10 @@ rule get_uniprotkb_labels: trembl_input=config['download_directory']+'/UniProtKB/uniprot_trembl.fasta', output: outfile=config['download_directory']+'/UniProtKB/labels' - run: - uniprotkb.pull_uniprot_labels(input.sprot_input,input.trembl_input,output.outfile) + #run: + # uniprotkb.pull_uniprot_labels(input.sprot_input,input.trembl_input,output.outfile) + shell: + "./babel_io/target/release/create_uniprot_labels --sprot-input {input.sprot_input} --trembl-input {input.trembl_input} --output {output.outfile}" rule get_umls_gene_protein_mappings: output: @@ -148,8 +154,10 @@ rule get_mesh_labels: config['download_directory']+'/MESH/mesh.nt' output: config['download_directory']+'/MESH/labels' - run: - mesh.pull_mesh_labels() + # run: + # mesh.pull_mesh_labels() + shell: + "./babel_io/target/release/create_mesh_labels --input {input} --output {output}" rule get_mesh_synonyms: #We don't 
actually get any. Maybe we could from the nt? @@ -178,6 +186,8 @@ rule get_umls_labels_and_synonyms: config['download_directory']+'/SNOMEDCT/synonyms' run: umls.pull_umls(input.mrconso) + # shell: + # "./babel_io/target/release/create_umls_labels_and_synonyms --input {input.mrconso}" ### OBO Ontologies @@ -232,16 +242,22 @@ rule get_ncbigene_labels_synonyms_and_taxa: synonyms_filename=config['download_directory']+'/NCBIGene/synonyms', taxa_filename=config['download_directory']+'/NCBIGene/taxa', descriptions_filename=config['download_directory']+'/NCBIGene/descriptions', - run: - ncbigene.pull_ncbigene_labels_synonyms_and_taxa(input.gene_info_filename, output.labels_filename, output.synonyms_filename, output.taxa_filename, output.descriptions_filename) + # run: + # ncbigene.pull_ncbigene_labels_synonyms_and_taxa(input.gene_info_filename, output.labels_filename, output.synonyms_filename, output.taxa_filename, output.descriptions_filename) + shell: + "./babel_io/target/release/create_ncbigene_labels_synonyms_and_taxa -i {input.gene_info_filename} -l {output.labels_filename} -s {output.synonyms_filename} -t {output.taxa_filename} -d {output.descriptions_filename}" ### ENSEMBL rule get_ensembl: output: outfile=config['download_directory']+'/ENSEMBL/BioMartDownloadComplete' - run: - ensembl.pull_ensembl(output.outfile) + params: + output_dir=config['download_directory']+'/ENSEMBL' + # run: + # ensembl.pull_ensembl(output.outfile) + shell: + "./babel_io/target/release/pull_ensembl --ensembl-output-dir {params.output_dir}" ### HGNC @@ -252,13 +268,15 @@ rule get_hgnc: hgnc.pull_hgnc() rule get_hgnc_labels_and_synonyms: - output: - config['download_directory']+'/HGNC/labels', - config['download_directory']+'/HGNC/synonyms' input: infile=rules.get_hgnc.output.outfile - run: - hgnc.pull_hgnc_labels_and_synonyms(input.infile) + output: + labels_filename=config['download_directory']+'/HGNC/labels', + synonyms_filename=config['download_directory']+'/HGNC/synonyms' + # run: + # hgnc.pull_hgnc_labels_and_synonyms(input.infile) + shell: + "./babel_io/target/release/create_hgnc_labels_and_synonyms -i {input.infile} -l {output.labels_filename} -s {output.synonyms_filename}" ### HGNC.FAMILY @@ -273,8 +291,10 @@ rule get_hgncfamily_labels: infile=rules.get_hgncfamily.output.outfile output: outfile = config['download_directory'] + '/HGNC.FAMILY/labels', - run: - hgncfamily.pull_labels(input.infile,output.outfile) + # run: + # hgncfamily.pull_labels(input.infile,output.outfile) + shell: + "./babel_io/target/release/create_hgncfamily_labels -i {input.infile} -l {output.outfile}" ### PANTHER.FAMILY @@ -289,8 +309,10 @@ rule get_pantherfamily_labels: infile=rules.get_pantherfamily.output.outfile output: outfile = config['download_directory'] + '/PANTHER.FAMILY/labels', - run: - pantherfamily.pull_labels(input.infile,output.outfile) + # run: + # pantherfamily.pull_labels(input.infile,output.outfile) + shell: + "./babel_io/target/release/create_pantherfamily_labels -i {input.infile} -l {output.outfile}" ### OMIM @@ -324,8 +346,10 @@ rule get_doid_labels_and_synonyms: output: labelfile = config['download_directory'] + '/DOID/labels', synonymfile = config['download_directory'] + '/DOID/synonyms' - run: - doid.pull_doid_labels_and_synonyms(input.infile, output.labelfile, output.synonymfile) + # run: + # doid.pull_doid_labels_and_synonyms(input.infile, output.labelfile, output.synonymfile) + shell: + "./babel_io/target/release/create_doid_labels_and_synonyms -i {input.infile} -l {output.labelfile} -s 
{output.synonymfile}" ### Orphanet @@ -357,8 +381,10 @@ rule get_reactome_labels: infile=config['download_directory'] + '/REACT/Events.json', output: labelfile=config['download_directory'] + '/REACT/labels', - run: - reactome.make_labels(input.infile,output.labelfile) + # run: + # reactome.make_labels(input.infile,output.labelfile) + shell: + "./babel_io/target/release/create_reactome_labels -i {input.infile} -l {output.labelfile}" ### RHEA @@ -373,8 +399,10 @@ rule get_rhea_labels: infile=config['download_directory'] + '/RHEA/rhea.rdf', output: labelfile=config['download_directory'] + '/RHEA/labels', - run: - rhea.make_labels(output.labelfile) + # run: + # rhea.make_labels(output.labelfile) + shell: + "./babel_io/target/release/create_rhea_labels -i {input.infile} -l {output.labelfile}" ### EC @@ -390,8 +418,10 @@ rule get_EC_labels: output: labelfile=config['download_directory'] + '/EC/labels', synonymfile =config['download_directory'] + '/EC/synonyms' - run: - ec.make_labels(output.labelfile,output.synonymfile) + # run: + # ec.make_labels(output.labelfile,output.synonymfile) + shell: + "./babel_io/target/release/create_ec_labels -i {input.infile} -l {output.labelfile} -s {output.synonymfile}" ### SMPDB @@ -406,8 +436,10 @@ rule get_SMPDB_labels: infile=config['download_directory'] + '/SMPDB/smpdb_pathways.csv' output: labelfile=config['download_directory'] + '/SMPDB/labels' - run: - smpdb.make_labels(input.infile,output.labelfile) + # run: + # smpdb.make_labels(input.infile,output.labelfile) + shell: + "./babel_io/target/release/create_smpdb_labels -i {input.infile} -l {output.labelfile}" ### PantherPathways @@ -422,8 +454,10 @@ rule get_panther_pathway_labels: infile=config['download_directory'] + '/PANTHER.PATHWAY/SequenceAssociationPathway3.6.8.txt' output: labelfile=config['download_directory'] + '/PANTHER.PATHWAY/labels' - run: - pantherpathways.make_pathway_labels(input.infile,output.labelfile) + # run: + # pantherpathways.make_pathway_labels(input.infile,output.labelfile) + shell: + "./babel_io/target/release/create_pantherpathways_labels -i {input.infile} -l {output.labelfile}" ### Unichem @@ -439,8 +473,10 @@ rule filter_unichem: reffile=config['download_directory'] + '/UNICHEM/reference.tsv.gz', output: filteredreffile=config['download_directory'] + '/UNICHEM/reference.filtered.tsv', - run: - unichem.filter_unichem(input.reffile, output.filteredreffile) + # run: + # unichem.filter_unichem(input.reffile, output.filteredreffile) + shell: + "./babel_io/target/release/filter_unichem -i {input.reffile} -o {output.filteredreffile}" ### CHEMBL @@ -458,8 +494,10 @@ rule chembl_labels_and_smiles: output: outfile=config['download_directory']+'/CHEMBL.COMPOUND/labels', smifile=config['download_directory']+'/CHEMBL.COMPOUND/smiles' - run: - chembl.pull_chembl_labels_and_smiles(input.infile,input.ccofile,output.outfile,output.smifile) + # run: + # chembl.pull_chembl_labels_and_smiles(input.infile,input.ccofile,output.outfile,output.smifile) + shell: + "./babel_io/target/release/create_chembl_labels_and_smiles -i {input.infile} -c {input.ccofile} -l {output.outfile} -s {output.smifile}" ### DrugBank requires a login... but not for basic vocabulary information. 
rule get_drugbank_labels_and_synonyms: @@ -634,3 +672,110 @@ rule get_CLO_labels: synonymfile =config['download_directory'] + '/CLO/synonyms' run: clo.make_labels(input.infile, output.labelfile,output.synonymfile) + +rule datacollect: + input: + config['download_directory'] + '/EFO/labels', + config['download_directory'] + '/EFO/synonyms', + config['download_directory'] + '/ComplexPortal/559292.tsv', + config['download_directory'] + '/ComplexPortal/559292_labels.tsv', + config['download_directory'] + '/ComplexPortal/559292_synonyms.tsv', + expand("{download_directory}/{mod}/GENE-DESCRIPTION-JSON_{mod}_9.json", download_directory = config['download_directory'], mod = config['mods']), + expand("{download_directory}/{mod}/labels",download_directory=config['download_directory'], mod=config['mods']), + config['download_directory'] + '/UniProtKB/idmapping.dat', + config['download_directory'] + '/UniProtKB/uniprot_sprot.fasta', + config['download_directory'] + '/UniProtKB/uniprot_trembl.fasta', + config['download_directory'] + '/UniProtKB/labels', + config['download_directory'] + '/UMLS_UniProtKB/UMLS_UniProtKB.tsv', + config['output_directory'] + '/intermediate/gene/concords/UMLS_NCBIGene', + config['output_directory'] + '/intermediate/protein/concords/UMLS_UniProtKB', + config['download_directory'] + '/MESH/mesh.nt', + config['download_directory'] + '/MESH/labels', + config['download_directory'] + '/MESH/synonyms', + config['download_directory'] + '/UMLS/MRCONSO.RRF', + config['download_directory'] + '/UMLS/MRSTY.RRF', + config['download_directory'] + '/UMLS/MRREL.RRF', + config['download_directory'] + '/UMLS/labels', + config['download_directory'] + '/UMLS/synonyms', + config['download_directory'] + '/SNOMEDCT/labels', + config['download_directory'] + '/SNOMEDCT/synonyms', + config['download_directory'] + '/common/ubergraph/labels', + config['download_directory'] + '/common/ubergraph/synonyms.jsonl', + config['download_directory'] + '/common/ubergraph/descriptions.jsonl', + config['download_directory'] + '/icRDF.tsv', + expand("{download_directory}/NCBIGene/{ncbi_files}", download_directory=config['download_directory'],ncbi_files=config['ncbi_files']), + config['download_directory'] + '/NCBIGene/labels', + config['download_directory'] + '/NCBIGene/synonyms', + config['download_directory'] + '/NCBIGene/taxa', + config['download_directory'] + '/NCBIGene/descriptions', + config['download_directory'] + '/ENSEMBL/BioMartDownloadComplete', + config['download_directory'] + '/HGNC/hgnc_complete_set.json', + config['download_directory'] + '/HGNC/labels', + config['download_directory'] + '/HGNC/synonyms', + config['download_directory'] + '/HGNC.FAMILY/family.csv', + config['download_directory'] + '/HGNC.FAMILY/labels', + config['download_directory'] + '/PANTHER.FAMILY/family.csv', + config['download_directory'] + '/PANTHER.FAMILY/labels', + config['download_directory'] + '/OMIM/mim2gene.txt', + config['download_directory'] + '/NCIT/NCIt-SwissProt_Mapping.txt', + config['download_directory'] + '/DOID/doid.json', + config['download_directory'] + '/DOID/labels', + config['download_directory'] + '/DOID/synonyms', + config['download_directory'] + '/Orphanet/Orphanet_Nomenclature_Pack_EN.zip', + config['download_directory'] + '/Orphanet/labels', + config['download_directory'] + '/Orphanet/synonyms', + config['download_directory'] + '/REACT/Events.json', + config['download_directory'] + '/REACT/labels', + config['download_directory'] + '/RHEA/rhea.rdf', + config['download_directory'] + '/RHEA/labels', + 
config['download_directory'] + '/EC/enzyme.rdf', + config['download_directory'] + '/EC/labels', + config['download_directory'] + '/EC/synonyms', + config['download_directory'] + '/SMPDB/smpdb_pathways.csv', + config['download_directory'] + '/SMPDB/labels', + config['download_directory'] + '/PANTHER.PATHWAY/SequenceAssociationPathway3.6.8.txt', + config['download_directory'] + '/PANTHER.PATHWAY/labels', + config['download_directory'] + '/UNICHEM/structure.tsv.gz', + config['download_directory'] + '/UNICHEM/reference.tsv.gz', + config['download_directory'] + '/UNICHEM/reference.filtered.tsv', + config['download_directory'] + '/CHEMBL.COMPOUND/chembl_latest_molecule.ttl', + config['download_directory'] + '/CHEMBL.COMPOUND/cco.ttl', + config['download_directory'] + '/CHEMBL.COMPOUND/labels', + config['download_directory'] + '/CHEMBL.COMPOUND/smiles', + config['download_directory'] + '/DRUGBANK/drugbank vocabulary.csv', + config['download_directory'] + '/DRUGBANK/labels', + config['download_directory'] + '/DRUGBANK/synonyms', + config['download_directory'] + '/GTOPDB/ligands.tsv', + config['download_directory'] + '/GTOPDB/labels', + config['download_directory'] + '/GTOPDB/synonyms', + config['download_directory'] + '/KEGG.COMPOUND/labels', + config['download_directory'] + '/UNII/Latest_UNII_Names.txt', + config['download_directory'] + '/UNII/Latest_UNII_Records.txt', + config['download_directory'] + '/UNII/labels', + config['download_directory'] + '/UNII/synonyms', + config['download_directory'] + '/HMDB/hmdb_metabolites.xml', + config['download_directory'] + '/HMDB/labels', + config['download_directory'] + '/HMDB/synonyms', + config['download_directory'] + '/HMDB/smiles', + config['download_directory'] + '/PUBCHEM.COMPOUND/CID-MeSH', + config['download_directory'] + '/PUBCHEM.COMPOUND/CID-Synonym-filtered.gz', + config['download_directory'] + '/PUBCHEM.COMPOUND/CID-Title.gz', + config['download_directory'] + '/PUBCHEM.COMPOUND/CID-InChI-Key.gz', + config['download_directory'] + '/PUBCHEM.COMPOUND/CID-SMILES.gz', + config['download_directory'] + '/PUBCHEM.COMPOUND/labels', + config['download_directory'] + '/PUBCHEM.COMPOUND/synonyms', + config['download_directory'] + '/RxNorm/RXNCONSO.RRF', + config['download_directory'] + '/RxNorm/RXNREL.RRF', + config['download_directory'] + '/PUBCHEM.COMPOUND/RXNORM.json', + config['download_directory'] + '/DrugCentral/structures', + config['download_directory'] + '/DrugCentral/labels', + config['download_directory'] + '/DrugCentral/xrefs', + config['download_directory'] + '/NCBITaxon/taxdump.tar', + config['download_directory'] + '/NCBITaxon/labels', + config['download_directory'] + '/NCBITaxon/synonyms', + config['download_directory'] + '/NCBITaxon/properties.tsv.gz', + config['download_directory'] + '/CHEBI/ChEBI_complete.sdf', + config['download_directory'] + '/CHEBI/database_accession.tsv', + config['download_directory'] + '/CLO/clo.owl', + config['download_directory'] + '/CLO/labels', + config['download_directory'] + '/CLO/synonyms' From fbc25a3b302a73b795fdd1f316d7d1d718a04389 Mon Sep 17 00:00:00 2001 From: Jason Reilly Date: Wed, 30 Jul 2025 18:16:03 -0400 Subject: [PATCH 04/28] cleaning --- src/datahandlers/unichem.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/datahandlers/unichem.py b/src/datahandlers/unichem.py index 7755f440..85229130 100644 --- a/src/datahandlers/unichem.py +++ b/src/datahandlers/unichem.py @@ -8,7 +8,6 @@ data_sources: dict = {'1': CHEMBLCOMPOUND, '2': DRUGBANK, '4': GTOPDB, '6': KEGGCOMPOUND, '7': CHEBI, '14': UNII, 
'18': HMDB, '22': PUBCHEMCOMPOUND, '34': DRUGCENTRAL} - def pull_unichem(): """ Download UniChem files. """ pull_via_urllib('http://ftp.ebi.ac.uk/pub/databases/chembl/UniChem/data/table_dumps/', 'structure.tsv.gz', decompress=False, subpath='UNICHEM') From 6bee5867b6378119cac593691bd79dfc118ff7d8 Mon Sep 17 00:00:00 2001 From: Jason Reilly Date: Wed, 30 Jul 2025 18:16:21 -0400 Subject: [PATCH 05/28] using bulk_load --- src/datahandlers/rhea.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/datahandlers/rhea.py b/src/datahandlers/rhea.py index 6546e8ff..619c50df 100644 --- a/src/datahandlers/rhea.py +++ b/src/datahandlers/rhea.py @@ -15,9 +15,9 @@ def __init__(self): from datetime import datetime as dt print('loading rhea') start = dt.now() - self.m= pyoxigraph.MemoryStore() + self.m= pyoxigraph.Store() with open(ifname,'rb') as inf: - self.m.load(inf,'application/rdf+xml') + self.m.bulk_load(input=inf,format=pyoxigraph.RdfFormat.RDF_XML) end = dt.now() print('loading complete') print(f'took {end-start}') From a96d9a03fb68c915fdc52d66df63238dda1b8153 Mon Sep 17 00:00:00 2001 From: Jason Reilly Date: Wed, 30 Jul 2025 18:17:08 -0400 Subject: [PATCH 06/28] adding comments --- src/datahandlers/pantherfamily.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/datahandlers/pantherfamily.py b/src/datahandlers/pantherfamily.py index f4a0c596..2197739f 100644 --- a/src/datahandlers/pantherfamily.py +++ b/src/datahandlers/pantherfamily.py @@ -18,14 +18,15 @@ def pull_labels(infile,outfile): labels = {} done = set() with open(outfile,'w') as labelf: + # FIXME: first line should not be skipped for line in lines[1:]: parts = line.split('\t') if len(parts) < 5: continue sf = parts[SUBFAMILY_COLUMN] - mf = sf.split(':')[0] - mfname = parts[MAINFAMILY_NAME_COLUMN] - sfname = parts[SUBFAMILY_NAME_COLUMN] + mf = sf.split(':')[0] # PTHR10845:SF155 -> PTHR10845 + mfname = parts[MAINFAMILY_NAME_COLUMN] # REGULATOR OF G PROTEIN SIGNALING + sfname = parts[SUBFAMILY_NAME_COLUMN] # REGULATOR OF G-PROTEIN SIGNALING 18 if mf not in done: main_family = f'{PANTHERFAMILY}:{mf}' #panther_families.append(main_family) From 6c8a68952286c38fedb76d085d44a214e2120299 Mon Sep 17 00:00:00 2001 From: Jason Reilly Date: Wed, 30 Jul 2025 18:17:26 -0400 Subject: [PATCH 07/28] fixing download url --- src/datahandlers/mods.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/datahandlers/mods.py b/src/datahandlers/mods.py index 3de6672b..0f5d2c43 100644 --- a/src/datahandlers/mods.py +++ b/src/datahandlers/mods.py @@ -10,7 +10,10 @@ def pull_mods(): for mod in mods: subp = modmap[mod] - origname = pull_via_urllib('https://fms.alliancegenome.org/download/',f'GENE-DESCRIPTION-JSON_{mod}.json.gz',subpath=subp) + # https://www.alliancegenome.org/downloads#gene-descriptions + # https://download.alliancegenome.org/8.1.0/GENE-DESCRIPTION-JSON/SGD/GENE-DESCRIPTION-JSON_SGD_9.json.gz + # origname = pull_via_urllib('https://fms.alliancegenome.org/download/',f'GENE-DESCRIPTION-JSON_{mod}.json.gz',subpath=subp) + origname = pull_via_urllib(f'https://download.alliancegenome.org/8.1.0/GENE-DESCRIPTION-JSON/{mod}/',f'GENE-DESCRIPTION-JSON_{mod}_9.json.gz', subpath=subp) #This should be fine. But for the makefile it's nice if the directory in which this goes is the same as the {mod} in the filename. 
# And we'd like it to be the names of the prefixes if mod != modmap[mod]: @@ -20,7 +23,7 @@ def pull_mods(): def write_labels(dd): for mod,prefix in modmap.items(): - with open(f'{dd}/{prefix}/GENE-DESCRIPTION-JSON_{prefix}.json','r') as inf: + with open(f'{dd}/{prefix}/GENE-DESCRIPTION-JSON_{prefix}_9.json','r') as inf: j = json.load(inf) with open(f'{dd}/{prefix}/labels','w') as outf: for gene in j['data']: From c2d90b12826dce1a5d59e446fef23a09d1813d76 Mon Sep 17 00:00:00 2001 From: Jason Reilly Date: Wed, 30 Jul 2025 18:17:42 -0400 Subject: [PATCH 08/28] adding bulk_load --- src/datahandlers/mesh.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/datahandlers/mesh.py b/src/datahandlers/mesh.py index 6c198c4d..b90827e0 100644 --- a/src/datahandlers/mesh.py +++ b/src/datahandlers/mesh.py @@ -13,9 +13,9 @@ def __init__(self): from datetime import datetime as dt print('loading mesh.nt') start = dt.now() - self.m= pyoxigraph.MemoryStore() + self.m= pyoxigraph.Store() with open(ifname,'rb') as inf: - self.m.load(inf,'application/n-triples') + self.m.bulk_load(input=inf,format=pyoxigraph.RdfFormat.N_TRIPLES) end = dt.now() print('loading complete') print(f'took {end-start}') From a43bccd11dc54aac088fbb1e6905b3634e6c8a9f Mon Sep 17 00:00:00 2001 From: Jason Reilly Date: Wed, 30 Jul 2025 18:18:02 -0400 Subject: [PATCH 09/28] adding comment --- src/datahandlers/hgncfamily.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/datahandlers/hgncfamily.py b/src/datahandlers/hgncfamily.py index cc6f8c13..5fd9a784 100644 --- a/src/datahandlers/hgncfamily.py +++ b/src/datahandlers/hgncfamily.py @@ -21,6 +21,6 @@ def pull_labels(infile,outfile): if len(parts) < 10: continue i = f"{HGNCFAMILY}:{parts[0][1:-1]}" - l = parts[2][1:-1] + l = parts[2][1:-1] # FIXME...this is a bug since commas are used in the fields of a line outf.write(f'{i}\t{l}\n') From 73979fa1a1efe0a7f60afcb880319a2085a26fcd Mon Sep 17 00:00:00 2001 From: Jason Reilly Date: Wed, 30 Jul 2025 18:18:32 -0400 Subject: [PATCH 10/28] adding comments, but switch to rust solution --- src/datahandlers/ensembl.py | 30 +++++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) diff --git a/src/datahandlers/ensembl.py b/src/datahandlers/ensembl.py index b6c3593d..89ce6c0f 100644 --- a/src/datahandlers/ensembl.py +++ b/src/datahandlers/ensembl.py @@ -1,5 +1,8 @@ import traceback +import apybiomart +import pandas + from src.babel_utils import make_local_name, get_config from apybiomart import find_datasets, query, find_attributes import os @@ -13,14 +16,19 @@ # genes that can be gathered without downloading hundreds of gigs of other stuff. So, we'll use biomart to pull # just what we need. 
def pull_ensembl(complete_file): - f = find_datasets() + dataset_df = find_datasets() + + # dataset_url = "http://www.ensembl.org/biomart/martservice/biomart/martservice?type=datasets&mart=ENSEMBL_MART_ENSEMBL" + # dataset_df = pandas.read_csv(dataset_url, sep='\t', header=None, index_col=False) skip_dataset_ids = set(get_config()['ensembl_datasets_to_skip']) cols = {"ensembl_gene_id", "ensembl_peptide_id", "description", "external_gene_name", "external_gene_source", "external_synonym", "chromosome_name", "source", "gene_biotype", "entrezgene_id", "zfin_id_id", 'mgi_id', 'rgd_id', 'flybase_gene_id', 'sgd_gene', 'wormbase_gene'} - for ds in f['Dataset_ID']: + + # for ds in dataset_df[1]: + for ds in dataset_df['Dataset_ID']: print(ds) if ds in skip_dataset_ids: print(f'Skipping {ds} as it is included in skip_dataset_ids: {skip_dataset_ids}') @@ -32,9 +40,18 @@ def pull_ensembl(complete_file): if os.path.exists(outfile): continue try: - atts = find_attributes(ds) - existingatts = set(atts['Attribute_ID'].to_list()) + + attributes_df = find_attributes(ds) + # attributes_url = f"http://www.ensembl.org/biomart/martservice/biomart/martservice?type=attributes&dataset={ds}" + # attributes_df = pandas.read_csv(attributes_url, sep='\t', header=None, index_col=False) + + # existingatts = set(attributes_df[0].to_list()) + existingatts = set(attributes_df['Attribute_ID'].to_list()) attsIcanGet = cols.intersection(existingatts) + + # query_url = f"http://www.ensembl.org/biomart/martservice/biomart/martservice?type=attributes&dataset={ds}" + # query_df = pandas.read_csv(attributes_url, sep='\t', header=None, index_col=False) + df = query(attributes=list(attsIcanGet), filters={}, dataset=ds) df.to_csv(outfile, index=False, sep='\t') except Exception as exc: @@ -48,4 +65,7 @@ def pull_ensembl(complete_file): if __name__ == '__main__': - pull_ensembl() + # marts = apybiomart.find_marts() + # print(marts.head()) + + pull_ensembl("/tmp/asdfasdf.txt") From 5069a5cd2304c7ab80f50822a74ef9930b62bb40 Mon Sep 17 00:00:00 2001 From: Jason Reilly Date: Wed, 30 Jul 2025 18:18:50 -0400 Subject: [PATCH 11/28] using bulk_load --- src/datahandlers/efo.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/datahandlers/efo.py b/src/datahandlers/efo.py index 03fd59f1..6c468921 100644 --- a/src/datahandlers/efo.py +++ b/src/datahandlers/efo.py @@ -26,9 +26,9 @@ def __init__(self): from datetime import datetime as dt print('loading EFO') start = dt.now() - self.m= pyoxigraph.MemoryStore() + self.m= pyoxigraph.Store() with open(ifname,'rb') as inf: - self.m.load(inf,'application/rdf+xml',base_iri='http://example.org/') + self.m.bulk_load(input=inf,format=pyoxigraph.RdfFormat.RDF_XML,base_iri='http://example.org/') end = dt.now() print('loading complete') print(f'took {end-start}') @@ -119,6 +119,7 @@ def get_exacts(self, iri, outfile): outfile.write(f"{iri}\tskos:exactMatch\t{otherid}\n") nwrite += 1 return nwrite + def get_xrefs(self, iri, outfile): query = f""" prefix rdfs: From dc89d2624ca79add582f4bc31f1241cb84b19fee Mon Sep 17 00:00:00 2001 From: Jason Reilly Date: Wed, 30 Jul 2025 18:18:59 -0400 Subject: [PATCH 12/28] using bulk_load --- src/datahandlers/ec.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/datahandlers/ec.py b/src/datahandlers/ec.py index 1d77c71b..37fc16e0 100644 --- a/src/datahandlers/ec.py +++ b/src/datahandlers/ec.py @@ -23,9 +23,9 @@ def __init__(self): from datetime import datetime as dt print('loading EC') start = dt.now() - self.m= 
pyoxigraph.MemoryStore() + self.m= pyoxigraph.Store() with open(ifname,'rb') as inf: - self.m.load(inf,'application/rdf+xml',base_iri='http://example.org/') + self.m.bulk_load(input=inf,format=pyoxigraph.RdfFormat.RDF_XML,base_iri='http://example.org/') end = dt.now() print('loading complete') print(f'took {end-start}') From 39b4ba775a0df5f752f153cc578bbdc0b6ae3886 Mon Sep 17 00:00:00 2001 From: Jason Reilly Date: Wed, 30 Jul 2025 18:19:10 -0400 Subject: [PATCH 13/28] using bulk_load --- src/datahandlers/clo.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/datahandlers/clo.py b/src/datahandlers/clo.py index 018f8d44..e0251169 100644 --- a/src/datahandlers/clo.py +++ b/src/datahandlers/clo.py @@ -19,9 +19,9 @@ def __init__(self,ifname): from datetime import datetime as dt print('loading CLO') start = dt.now() - self.m= pyoxigraph.MemoryStore() + self.m= pyoxigraph.Store() with open(ifname,'rb') as inf: - self.m.load(inf,'application/rdf+xml',base_iri='http://example.org/') + self.m.bulk_load(input=inf, format=pyoxigraph.RdfFormat.RDF_XML, base_iri='http://example.org/') end = dt.now() print('loading complete') print(f'took {end-start}') From d9d0a735433b16c9b5e8301828bf237632b5e12c Mon Sep 17 00:00:00 2001 From: Jason Reilly Date: Wed, 30 Jul 2025 18:19:24 -0400 Subject: [PATCH 14/28] using bulk_load --- src/datahandlers/chembl.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/datahandlers/chembl.py b/src/datahandlers/chembl.py index ac8e6513..dff5dfde 100644 --- a/src/datahandlers/chembl.py +++ b/src/datahandlers/chembl.py @@ -1,3 +1,6 @@ +import os.path +import pathlib + from src.prefixes import CHEMBLCOMPOUND from src.babel_utils import pull_via_ftp, make_local_name import ftplib @@ -48,11 +51,11 @@ def __init__(self,ifname,ccofile): from datetime import datetime as dt print('loading chembl') start = dt.now() - self.m= pyoxigraph.MemoryStore() + self.m= pyoxigraph.Store() with open(ccofile,'rb') as inf: - self.m.load(inf,'application/turtle') + self.m.bulk_load(input=inf, format=pyoxigraph.RdfFormat.TURTLE) with open(ifname,'rb') as inf: - self.m.load(inf,'application/turtle') + self.m.bulk_load(input=inf, format=pyoxigraph.RdfFormat.TURTLE) end = dt.now() print('loading complete') print(f'took {end-start}') From 3ce12fa0a7a9027f74a67a11b616c9b886ef521d Mon Sep 17 00:00:00 2001 From: Jason Reilly Date: Wed, 30 Jul 2025 18:19:48 -0400 Subject: [PATCH 15/28] using snakemake.logger --- src/createcompendia/leftover_umls.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/createcompendia/leftover_umls.py b/src/createcompendia/leftover_umls.py index 604ac248..0c1082ac 100644 --- a/src/createcompendia/leftover_umls.py +++ b/src/createcompendia/leftover_umls.py @@ -1,9 +1,10 @@ +import logging from datetime import datetime import json import jsonlines from pathlib import Path -from snakemake.logging import Logger +import snakemake.logging from bmt import Toolkit from src.node import NodeFactory @@ -30,7 +31,7 @@ def write_leftover_umls(compendia, umls_labels_filename, mrconso, mrsty, synonym :return: Nothing. 
""" - logging = Logger() + logging = snakemake.logging.logger logging.info(f"write_leftover_umls({compendia}, {umls_labels_filename}, {mrconso}, {mrsty}, {synonyms}, {umls_compendium}, {umls_synonyms}, {report}, {biolink_version})") # For now, we have many more UMLS entities in MRCONSO than in the compendia, so From 650166bcfbdab922101692a3829d2fe159104976 Mon Sep 17 00:00:00 2001 From: Jason Reilly Date: Wed, 30 Jul 2025 18:20:08 -0400 Subject: [PATCH 16/28] adding condition for file size --- src/babel_utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/babel_utils.py b/src/babel_utils.py index deee0d5f..56d5c692 100644 --- a/src/babel_utils.py +++ b/src/babel_utils.py @@ -185,7 +185,7 @@ def pull_via_urllib(url: str, in_file_name: str, decompress = True, subpath=None # write out the data to the output file compressed_file.write(data) - if decompress: + if os.stat(dl_file_name).st_size > 0 and decompress: out_file_name = dl_file_name[:-3] # create the output text file @@ -201,6 +201,7 @@ def pull_via_urllib(url: str, in_file_name: str, decompress = True, subpath=None else: out_file_name = dl_file_name + # return the filename to the caller return out_file_name From 8e36c0f9934748ddec667c9b4ae1aca153b25665 Mon Sep 17 00:00:00 2001 From: Jason Reilly Date: Wed, 30 Jul 2025 18:20:28 -0400 Subject: [PATCH 17/28] incrementing pyoxigraph version --- requirements.txt | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/requirements.txt b/requirements.txt index e330149d..aaac1f26 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,15 +1,16 @@ apybiomart biopython bmt -datrie jsonlines pandas more-itertools -pyoxigraph~=0.2.5 +#pyoxigraph~=0.2.5 +pyoxigraph~=0.4.11 psycopg2-binary pytest pytest-cov -python-Levenshtein-wheels +#python-Levenshtein-wheels +python-levenshtein pyyaml requests snakemake From ea2b83fa473e54a285c727b3d0f44b508edeea1d Mon Sep 17 00:00:00 2001 From: Jason Reilly Date: Thu, 31 Jul 2025 08:33:04 -0400 Subject: [PATCH 18/28] fixing missed conflicts --- src/createcompendia/leftover_umls.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/createcompendia/leftover_umls.py b/src/createcompendia/leftover_umls.py index 506e8ac5..0fefccf9 100644 --- a/src/createcompendia/leftover_umls.py +++ b/src/createcompendia/leftover_umls.py @@ -4,13 +4,7 @@ import jsonlines from pathlib import Path -<<<<<<< HEAD import snakemake.logging -from bmt import Toolkit -======= -from snakemake.logging import Logger ->>>>>>> 80b225419bb30eafafcc82771983a66dc36156b7 - from src.node import NodeFactory from src.util import get_biolink_model_toolkit from src.datahandlers import umls From 55b59e8269820b0ffbf65817644cd0298db01210 Mon Sep 17 00:00:00 2001 From: Jason Reilly Date: Thu, 31 Jul 2025 08:35:55 -0400 Subject: [PATCH 19/28] initial commit --- babel_io/.gitignore | 3 + .../bin/create_chembl_labels_and_smiles.rs | 122 ++++++++ ...reate_complexportal_labels_and_synonyms.rs | 118 ++++++++ .../bin/create_doid_labels_and_synonyms.rs | 73 +++++ .../src/bin/create_ec_labels_and_synonyms.rs | 85 ++++++ babel_io/src/bin/create_efo_labels.rs | 96 +++++++ .../bin/create_hgnc_labels_and_synonyms.rs | 74 +++++ babel_io/src/bin/create_hgncfamily_labels.rs | 47 +++ babel_io/src/bin/create_mesh_labels.rs | 80 ++++++ ...reate_ncbigene_labels_synonyms_and_taxa.rs | 193 +++++++++++++ .../create_orphanet_labels_and_synonyms.rs | 67 +++++ .../src/bin/create_pantherfamily_labels.rs | 61 ++++ .../src/bin/create_pantherpathways_labels.rs | 83 ++++++ 
babel_io/src/bin/create_reactome_labels.rs | 54 ++++ babel_io/src/bin/create_rhea_labels.rs | 73 +++++ babel_io/src/bin/create_smpdb_labels.rs | 63 ++++ babel_io/src/bin/create_uniprot_labels.rs | 58 ++++ babel_io/src/bin/filter_unichem.rs | 72 +++++ babel_io/src/bin/pull_ensembl.rs | 264 +++++++++++++++++ babel_io/src/lib.rs | 271 ++++++++++++++++++ 20 files changed, 1957 insertions(+) create mode 100644 babel_io/.gitignore create mode 100644 babel_io/src/bin/create_chembl_labels_and_smiles.rs create mode 100644 babel_io/src/bin/create_complexportal_labels_and_synonyms.rs create mode 100644 babel_io/src/bin/create_doid_labels_and_synonyms.rs create mode 100644 babel_io/src/bin/create_ec_labels_and_synonyms.rs create mode 100644 babel_io/src/bin/create_efo_labels.rs create mode 100644 babel_io/src/bin/create_hgnc_labels_and_synonyms.rs create mode 100644 babel_io/src/bin/create_hgncfamily_labels.rs create mode 100644 babel_io/src/bin/create_mesh_labels.rs create mode 100644 babel_io/src/bin/create_ncbigene_labels_synonyms_and_taxa.rs create mode 100644 babel_io/src/bin/create_orphanet_labels_and_synonyms.rs create mode 100644 babel_io/src/bin/create_pantherfamily_labels.rs create mode 100644 babel_io/src/bin/create_pantherpathways_labels.rs create mode 100644 babel_io/src/bin/create_reactome_labels.rs create mode 100644 babel_io/src/bin/create_rhea_labels.rs create mode 100644 babel_io/src/bin/create_smpdb_labels.rs create mode 100644 babel_io/src/bin/create_uniprot_labels.rs create mode 100644 babel_io/src/bin/filter_unichem.rs create mode 100644 babel_io/src/bin/pull_ensembl.rs create mode 100644 babel_io/src/lib.rs diff --git a/babel_io/.gitignore b/babel_io/.gitignore new file mode 100644 index 00000000..e26eb48d --- /dev/null +++ b/babel_io/.gitignore @@ -0,0 +1,3 @@ +./target +.idea +./Cargo.lock \ No newline at end of file diff --git a/babel_io/src/bin/create_chembl_labels_and_smiles.rs b/babel_io/src/bin/create_chembl_labels_and_smiles.rs new file mode 100644 index 00000000..e64681b1 --- /dev/null +++ b/babel_io/src/bin/create_chembl_labels_and_smiles.rs @@ -0,0 +1,122 @@ +#[macro_use] +extern crate log; + +use clap::Parser; +use humantime::format_duration; +use itertools::Itertools; +use oxigraph::io::RdfFormat; +use oxigraph::sparql::QueryResults; +use oxigraph::store::Store; +use std::error::Error; +use std::fs; +use std::io::{BufReader, BufWriter, Write}; +use std::path; +use std::time::Instant; + +// NOTE: rust runs in 13s, python runs in 21s +#[derive(Parser, PartialEq, Debug)] +#[clap(author, version, about, long_about = None)] +struct Options { + #[clap(short, long, required = true)] + input: path::PathBuf, + + #[clap(short, long, required = true)] + cco: path::PathBuf, + + #[clap(short, long, required = true)] + labels_output: path::PathBuf, + + #[clap(short, long, required = true)] + smiles_output: path::PathBuf, +} + +#[tokio::main] +async fn main() -> Result<(), Box> { + let start = Instant::now(); + env_logger::init(); + + let options = Options::parse(); + debug!("{:?}", options); + + let store = Store::new()?; + let start_load = Instant::now(); + + // this file is small...no need for bulk loader + let cco_br = BufReader::new(fs::File::open(options.cco).unwrap()); + store.load_from_reader(RdfFormat::Turtle, cco_br).expect("Could not load input"); + + let input_br = BufReader::new(fs::File::open(options.input).unwrap()); + store + .bulk_loader() + .with_max_memory_size_in_megabytes(4 * 2048) + .with_num_threads(4) + .load_from_reader(RdfFormat::Turtle, input_br) + 
.expect("Could not load input"); + + info!("duration to load input: {}", format_duration(start_load.elapsed()).to_string()); + + let mut labels_bw = BufWriter::new(fs::File::create(options.labels_output.clone().as_path()).unwrap()); + + let query_statement = "PREFIX rdf: + PREFIX rdfs: + PREFIX cco: + SELECT ?molecule ?label + WHERE { + ?molecule a ?type . + ?type rdfs:subClassOf* cco:Substance . + ?molecule rdfs:label ?label . + }"; + + if let QueryResults::Solutions(solutions) = store.query(query_statement)? { + for qs in solutions.filter_map(Result::ok).into_iter() { + let iterm = qs.get("molecule").expect("molecule was None"); + let mut iterm = iterm.to_string(); + iterm = babel_io::trim_gt_and_lt(iterm); + + let iterm_split = iterm.split("/").collect_vec(); + let id = iterm_split.last().unwrap(); + + let label = qs.get("label").expect("label was None"); + let mut label = label.to_string(); + label = babel_io::trim_quotes(label); + + if id.to_string() == label { + continue; + } + write!(labels_bw, "CHEMBL.COMPOUND:{}\t{}\n", id, label).expect("Could not write triple"); + } + } + + let mut smiles_bw = BufWriter::new(fs::File::create(options.smiles_output.clone().as_path()).unwrap()); + + let query_statement = "PREFIX rdf: + PREFIX rdfs: + PREFIX cco: + PREFIX cheminf: + SELECT ?molecule ?smiles + WHERE { + ?molecule cheminf:SIO_000008 ?smile_entity . + ?smile_entity a cheminf:CHEMINF_000018 ; + cheminf:SIO_000300 ?smiles . + }"; + + if let QueryResults::Solutions(solutions) = store.query(query_statement)? { + for qs in solutions.filter_map(Result::ok).into_iter() { + let iterm = qs.get("molecule").expect("molecule was None"); + let mut iterm = iterm.to_string(); + iterm = babel_io::trim_gt_and_lt(iterm); + + let iterm_split = iterm.split("/").collect_vec(); + let id = iterm_split.last().unwrap(); + + let label = qs.get("smiles").expect("smiles was None"); + let mut label = label.to_string(); + label = babel_io::trim_quotes(label); + + write!(smiles_bw, "CHEMBL.COMPOUND:{}\t{}\n", id, label).expect("Could not write triple"); + } + } + + info!("Duration: {}", format_duration(start.elapsed()).to_string()); + Ok(()) +} diff --git a/babel_io/src/bin/create_complexportal_labels_and_synonyms.rs b/babel_io/src/bin/create_complexportal_labels_and_synonyms.rs new file mode 100644 index 00000000..fb84248c --- /dev/null +++ b/babel_io/src/bin/create_complexportal_labels_and_synonyms.rs @@ -0,0 +1,118 @@ +#[macro_use] +extern crate log; + +use clap::Parser; +use humantime::format_duration; +use itertools::Itertools; +use polars::prelude::*; +use std::error::Error; +use std::fs; +use std::path; +use std::time::Instant; + +#[derive(Parser, PartialEq, Debug)] +#[clap(author, version, about, long_about = None)] +struct Options { + #[clap(short, long, required = true)] + input: path::PathBuf, + + #[clap(short, long, required = true)] + labels_output: path::PathBuf, + + #[clap(short, long, required = true)] + synonyms_output: path::PathBuf, +} + +#[tokio::main] +async fn main() -> Result<(), Box> { + let start = Instant::now(); + env_logger::init(); + + let options = Options::parse(); + debug!("{:?}", options); + + // NOTE: this base implementation runs in 4ms, python version of this runs in 4s + // let br = BufReader::new(fs::File::open(options.input).unwrap()); + // + // let mut labels_bw = std::io::BufWriter::new(fs::File::create(options.labels_output.clone().as_path()).unwrap()); + // let mut synonyms_bw = std::io::BufWriter::new(fs::File::create(options.synonyms_output.clone().as_path()).unwrap()); + 
// + // let mut used_synonyms = HashSet::new(); + // + // for line in br.lines().skip(1) { + // let line = line.unwrap(); + // let line_split = line.split("\t").collect_vec(); + // let id = line_split.get(0).unwrap(); + // let label = line_split.get(1).unwrap(); + // write!(labels_bw, "ComplexPortal:{}\t{}\n", id, label).unwrap(); + // let synonyms = line_split.get(2).unwrap(); + // if !synonyms.to_string().eq("-") { + // let synonyms_split = synonyms.split("|").collect_vec(); + // for synonym in synonyms_split.into_iter().map(|a| a.to_string()) { + // if !used_synonyms.contains(&synonym) { + // write!(synonyms_bw, "ComplexPortal:{}\t{}\n", id, synonym).unwrap(); + // used_synonyms.insert(synonym); + // } + // } + // } + // } + + // NOTE: this polars implementation runs in 16ms + let usable_columns = vec!["#Complex ac", "Recommended name", "Aliases for complex"]; + + let df = polars::lazy::frame::LazyCsvReader::new(options.input.clone()) + .with_separator(b'\t') + .with_infer_schema_length(Some(0)) + .with_ignore_errors(true) + .with_truncate_ragged_lines(true) + .with_has_header(true) + .finish() + .unwrap() + .select(usable_columns.into_iter().map(|a| col(a)).collect_vec()) + .collect() + .unwrap(); + + // println!("{}", df.head(None)); + + let mut labels_df = df + .clone() + .lazy() + .select([ + concat_str([lit("ComplexPortal"), col("#Complex ac")], ":", true).alias("#Complex ac"), + col("Recommended name"), + ]) + .collect() + .unwrap(); + + let mut file = fs::File::create(options.labels_output).expect("could not create file"); + CsvWriter::new(&mut file) + .include_header(false) + .with_separator(b'\t') + .finish(&mut labels_df) + .unwrap(); + + let mut synonyms_df = df + .clone() + .lazy() + .filter(col("Aliases for complex").neq(lit("-"))) + .select([ + concat_str([lit("ComplexPortal"), col("#Complex ac")], ":", true).alias("#Complex ac"), + col("Aliases for complex").str().split(lit("|")).alias("Aliases for complex"), + ]) + .explode([col("Aliases for complex")]) + .unique(Some(vec!["Aliases for complex".to_string()]), UniqueKeepStrategy::First) + .collect() + .unwrap(); + + // println!("{}", synonyms_df.head(None)); + + let mut file = fs::File::create(options.synonyms_output).expect("could not create file"); + CsvWriter::new(&mut file) + .include_header(false) + .with_separator(b'\t') + .finish(&mut synonyms_df) + .unwrap(); + + info!("Duration: {}", format_duration(start.elapsed()).to_string()); + Ok(()) +} diff --git a/babel_io/src/bin/create_doid_labels_and_synonyms.rs b/babel_io/src/bin/create_doid_labels_and_synonyms.rs new file mode 100644 index 00000000..1fc88d07 --- /dev/null +++ b/babel_io/src/bin/create_doid_labels_and_synonyms.rs @@ -0,0 +1,73 @@ +#[macro_use] +extern crate log; + +use clap::Parser; +use humantime::format_duration; +use itertools::Itertools; +use serde_json::Value; +use std::error::Error; +use std::fs; +use std::io::Write; +use std::path; +use std::time::Instant; + +#[derive(Parser, PartialEq, Debug)] +#[clap(author, version, about, long_about = None)] +struct Options { + #[clap(short, long, required = true)] + input: path::PathBuf, + + #[clap(short, long, required = true)] + labels_output: path::PathBuf, + + #[clap(short, long, required = true)] + synonyms_output: path::PathBuf, +} +#[tokio::main] +async fn main() -> Result<(), Box> { + let start = Instant::now(); + env_logger::init(); + + let options = Options::parse(); + debug!("{:?}", options); + + let br = std::io::BufReader::new(fs::File::open(options.input).unwrap()); + let json_value: Value = 
serde_json::from_reader(br)?; + + let mut labels_bw = std::io::BufWriter::new(fs::File::create(options.labels_output.clone().as_path()).unwrap()); + let mut synonyms_bw = std::io::BufWriter::new(fs::File::create(options.synonyms_output.clone().as_path()).unwrap()); + + //NOTE: Python runs in 3s, rust runs < 1s + for entry in json_value["graphs"][0]["nodes"].as_array().unwrap().into_iter() { + if !entry["meta"].is_null() && !entry["meta"]["deprecated"].is_null() && entry["meta"]["deprecated"].as_bool().unwrap() == true { + continue; + } + let doid_id = entry["id"].as_str().unwrap(); + if !doid_id.starts_with("http://purl.obolibrary.org/obo/DOID_") { + continue; + } + let doid_id_split = doid_id.split("_").collect_vec(); + let doid_curie = format!("DOID:{}", doid_id_split.get(1).unwrap()); + + if !entry["lbl"].is_null() { + let label = entry["lbl"].as_str().unwrap(); + write!(&mut labels_bw, "{}\t{}\n", doid_curie, label).unwrap(); + write!(&mut synonyms_bw, "{}\tOIO:hasExactSynonym\t{}\n", doid_curie, label).unwrap(); + } + + if !entry["meta"].is_null() && !entry["meta"]["synonyms"].is_null() { + for synonym_entry in entry["meta"]["synonyms"].as_array().unwrap().into_iter() { + write!( + &mut synonyms_bw, + "{}\tOIO:hasExactSynonym\t{}\n", + doid_curie, + synonym_entry["val"].as_str().unwrap() + ) + .unwrap(); + } + } + } + + info!("Duration: {}", format_duration(start.elapsed()).to_string()); + Ok(()) +} diff --git a/babel_io/src/bin/create_ec_labels_and_synonyms.rs b/babel_io/src/bin/create_ec_labels_and_synonyms.rs new file mode 100644 index 00000000..1b812d14 --- /dev/null +++ b/babel_io/src/bin/create_ec_labels_and_synonyms.rs @@ -0,0 +1,85 @@ +#[macro_use] +extern crate log; + +use clap::Parser; +use humantime::format_duration; +use itertools::Itertools; +use oxigraph::io::RdfFormat; +use oxigraph::sparql::QueryResults; +use oxigraph::store::Store; +use std::error::Error; +use std::fs; +use std::io::{BufReader, BufWriter, Write}; +use std::path; +use std::time::Instant; + +// NOTE: rust runs in 13s, python runs in 21s +#[derive(Parser, PartialEq, Debug)] +#[clap(author, version, about, long_about = None)] +struct Options { + #[clap(short, long, required = true)] + input: path::PathBuf, + + #[clap(short, long, required = true)] + labels_output: path::PathBuf, + + #[clap(short, long, required = true)] + synonyms_output: path::PathBuf, +} + +#[tokio::main] +async fn main() -> Result<(), Box> { + let start = Instant::now(); + env_logger::init(); + + let options = Options::parse(); + debug!("{:?}", options); + + let br = BufReader::new(fs::File::open(options.input).unwrap()); + let store = Store::new()?; + let start_load = Instant::now(); + store + .bulk_loader() + .with_max_memory_size_in_megabytes(4 * 2048) + .with_num_threads(4) + .load_from_reader(RdfFormat::RdfXml, br) + .expect("Could not load input"); + info!("duration to load input: {}", format_duration(start_load.elapsed()).to_string()); + + let mut labels_bw = BufWriter::new(fs::File::create(options.labels_output.clone().as_path()).unwrap()); + let mut synonyms_bw = std::io::BufWriter::new(fs::File::create(options.synonyms_output.clone().as_path()).unwrap()); + + let label_types = vec!["skos:prefLabel", "skos:altLabel", "rdfs:label"]; + + for label_type in label_types.into_iter() { + let query_statement = format!( + "PREFIX skos: + PREFIX ec: + PREFIX rdfs: + SELECT DISTINCT ?x ?label WHERE {{ ?x {label_type} ?label }}" + ); + + if let QueryResults::Solutions(solutions) = store.query(query_statement.as_str())? 
{ + for qs in solutions.filter_map(Result::ok).into_iter() { + let iterm = qs.get("x").expect("acc was None"); + let mut iterm = iterm.to_string(); + iterm = babel_io::trim_gt_and_lt(iterm); + + let iterm_split = iterm.split("/").collect_vec(); + let id = iterm_split.last().unwrap(); + + let label = qs.get("label").expect("label was None"); + let label = label.to_string(); + // label = babel_io::trim_quotes(label); + + write!(synonyms_bw, "EC:{}\t{}\t{}\n", id, label_type, label).expect("Could not write triple"); + if label_type != "skos:altLabel" { + write!(labels_bw, "EC:{}\t{}\n", id, label).expect("Could not write triple"); + } + } + } + } + + info!("Duration: {}", format_duration(start.elapsed()).to_string()); + Ok(()) +} diff --git a/babel_io/src/bin/create_efo_labels.rs b/babel_io/src/bin/create_efo_labels.rs new file mode 100644 index 00000000..9fe719f0 --- /dev/null +++ b/babel_io/src/bin/create_efo_labels.rs @@ -0,0 +1,96 @@ +#[macro_use] +extern crate log; + +use clap::Parser; +use humantime::format_duration; +use itertools::Itertools; +use oxigraph::io::RdfFormat; +use oxigraph::sparql::QueryResults; +use oxigraph::store::Store; +use std::error::Error; +use std::fs; +use std::fs::File; +use std::io::{BufReader, Write}; +use std::path; +use std::time::Instant; + +#[derive(Parser, PartialEq, Debug)] +#[clap(author, version, about, long_about = None)] +struct Options { + #[clap(short, long, required = true)] + input: path::PathBuf, + + #[clap(short, long, required = true)] + labels_output: path::PathBuf, + + #[clap(short, long, required = true)] + synonyms_output: path::PathBuf, +} + +#[tokio::main] +async fn main() -> Result<(), Box> { + let start = Instant::now(); + env_logger::init(); + + let options = Options::parse(); + debug!("{:?}", options); + + let br = BufReader::new(File::open(options.input).unwrap()); + let store = Store::new()?; + let start_load = Instant::now(); + store + .bulk_loader() + .with_max_memory_size_in_megabytes(4 * 2048) + .with_num_threads(4) + .load_from_reader(RdfFormat::RdfXml, br) + .expect("Could not load input"); + info!("duration to load input: {}", format_duration(start_load.elapsed()).to_string()); + + let mut labels_bw = std::io::BufWriter::new(fs::File::create(options.labels_output.clone().as_path()).unwrap()); + let mut synonyms_bw = std::io::BufWriter::new(fs::File::create(options.synonyms_output.clone().as_path()).unwrap()); + + let label_types = vec!["skos:prefLabel", "skos:altLabel", "rdfs:label"]; + + let re = regex::Regex::new("^(.*?)(?:@[^@]*){0,1}$").unwrap(); + + for label_type in label_types.into_iter() { + let query_statement = format!( + "PREFIX skos: + PREFIX rdfs: + SELECT DISTINCT ?x ?label WHERE {{ ?x {label_type} ?label }}" + ); + if let QueryResults::Solutions(solutions) = store.query(query_statement.as_str())? 
{ + for qs in solutions.filter_map(Result::ok).into_iter() { + let x = qs.get("x").expect("x was None"); + let mut x = x.to_string(); + x = babel_io::trim_gt_and_lt(x); + + let label = qs.get("label").expect("x was None"); + let mut label = label.to_string(); + if label.contains("@") { + if let Some(captures) = re.captures(label.as_str()) { + label = captures.get(1).unwrap().as_str().to_string(); + } + } + + label = babel_io::trim_quotes(label); + label = label.trim().to_string(); + + let x_split = x.split("/").collect_vec(); + let efo = x_split.last().unwrap(); + if !efo.starts_with("EFO_") { + continue; + } + let efo_split = efo.split("_").collect_vec(); + let efo_id = efo_split.last().unwrap(); + write!(synonyms_bw, "EFO:{}\t{}\t{}\n", efo_id, label_type, label).expect("Could not write triple"); + if label_type != "skos:altLabel" { + write!(labels_bw, "EFO:{}\t{}\n", efo_id, label).expect("Could not write triple"); + } + } + } + } + + info!("Duration: {}", format_duration(start.elapsed()).to_string()); + Ok(()) +} diff --git a/babel_io/src/bin/create_hgnc_labels_and_synonyms.rs b/babel_io/src/bin/create_hgnc_labels_and_synonyms.rs new file mode 100644 index 00000000..dc9ad40a --- /dev/null +++ b/babel_io/src/bin/create_hgnc_labels_and_synonyms.rs @@ -0,0 +1,74 @@ +#[macro_use] +extern crate log; + +use clap::Parser; +use humantime::format_duration; +use serde_json::Value; +use std::error::Error; +use std::fs; +use std::io::Write; +use std::path; +use std::time::Instant; + +#[derive(Parser, PartialEq, Debug)] +#[clap(author, version, about, long_about = None)] +struct Options { + #[clap(short, long, required = true)] + input: path::PathBuf, + + #[clap(short, long, required = true)] + labels_output: path::PathBuf, + + #[clap(short, long, required = true)] + synonyms_output: path::PathBuf, +} +#[tokio::main] +async fn main() -> Result<(), Box> { + let start = Instant::now(); + env_logger::init(); + + let options = Options::parse(); + debug!("{:?}", options); + + let br = std::io::BufReader::new(fs::File::open(options.input).unwrap()); + let json_value: Value = serde_json::from_reader(br)?; + + let mut labels_bw = std::io::BufWriter::new(fs::File::create(options.labels_output.clone().as_path()).unwrap()); + let mut synonyms_bw = std::io::BufWriter::new(fs::File::create(options.synonyms_output.clone().as_path()).unwrap()); + + //NOTE: Python runs in 3s, rust runs < 1s + for gene in json_value["response"]["docs"].as_array().unwrap().into_iter() { + let hgnc_id = gene["hgnc_id"].clone(); + let symbol = gene["symbol"].clone(); + write!(&mut labels_bw, "{}\t{}\n", hgnc_id.as_str().unwrap(), symbol.as_str().unwrap()).unwrap(); + + let name = gene["name"].clone(); + write!( + &mut synonyms_bw, + "{}\t{}\t{}\n", + hgnc_id.as_str().unwrap(), + "http://www.geneontology.org/formats/oboInOwl#hasExactSynonym", + name.as_str().unwrap() + ) + .unwrap(); + + for alias_field in vec!["alias_symbol", "alias_name"].into_iter() { + if !gene[alias_field].is_null() { + let aliases = gene[alias_field].as_array().unwrap(); + for asym in aliases.into_iter() { + write!( + &mut synonyms_bw, + "{}\t{}\t{}\n", + hgnc_id.as_str().unwrap(), + "http://www.geneontology.org/formats/oboInOwl#hasRelatedSynonym", + asym.as_str().unwrap() + ) + .unwrap(); + } + } + } + } + + info!("Duration: {}", format_duration(start.elapsed()).to_string()); + Ok(()) +} diff --git a/babel_io/src/bin/create_hgncfamily_labels.rs b/babel_io/src/bin/create_hgncfamily_labels.rs new file mode 100644 index 00000000..c5792209 --- /dev/null +++ 
b/babel_io/src/bin/create_hgncfamily_labels.rs @@ -0,0 +1,47 @@ +#[macro_use] +extern crate log; + +use clap::Parser; +use humantime::format_duration; +use polars::prelude::*; +use std::error::Error; +use std::fs; +use std::path; +use std::time::Instant; + +#[derive(Parser, PartialEq, Debug)] +#[clap(author, version, about, long_about = None)] +struct Options { + #[clap(short, long, required = true)] + input: path::PathBuf, + + #[clap(short, long, required = true)] + labels_output: path::PathBuf, +} + +#[tokio::main] +async fn main() -> Result<(), Box> { + let start = Instant::now(); + env_logger::init(); + + let options = Options::parse(); + debug!("{:?}", options); + + let mut df = polars::lazy::frame::LazyCsvReader::new(options.input.clone()) + .with_infer_schema_length(Some(0)) + .with_ignore_errors(true) + .with_truncate_ragged_lines(true) + .with_has_header(true) + .finish() + .unwrap() + .with_column(concat_str([lit("HGNC.FAMILY"), col("id")], ":", true).alias("id")) + .select([col("id"), col("name")]) + .collect() + .unwrap(); + + let mut file = fs::File::create(options.labels_output).expect("could not create file"); + CsvWriter::new(&mut file).include_header(false).with_separator(b'\t').finish(&mut df).unwrap(); + + info!("Duration: {}", format_duration(start.elapsed()).to_string()); + Ok(()) +} diff --git a/babel_io/src/bin/create_mesh_labels.rs b/babel_io/src/bin/create_mesh_labels.rs new file mode 100644 index 00000000..8aa6b8c4 --- /dev/null +++ b/babel_io/src/bin/create_mesh_labels.rs @@ -0,0 +1,80 @@ +#[macro_use] +extern crate log; + +use clap::Parser; +use humantime::format_duration; +use itertools::Itertools; +use oxigraph::io::RdfFormat; +use oxigraph::sparql::QueryResults; +use oxigraph::store::Store; +use std::error::Error; +use std::fs; +use std::io; +use std::io::Write; +use std::path; +use std::time::Instant; + +#[derive(Parser, PartialEq, Debug)] +#[clap(author, version, about, long_about = None)] +struct Options { + #[clap(short, long, required = true)] + input: path::PathBuf, + + #[clap(short, long, required = true)] + output: path::PathBuf, +} + +#[tokio::main] +async fn main() -> Result<(), Box> { + let start = Instant::now(); + env_logger::init(); + + let options = Options::parse(); + debug!("{:?}", options); + + let br = io::BufReader::new(fs::File::open(options.input).unwrap()); + let store = Store::new()?; + let start_load = Instant::now(); + store + .bulk_loader() + .with_max_memory_size_in_megabytes(4 * 2048) + .with_num_threads(4) + .load_from_reader(RdfFormat::NTriples, br) + .expect("Could not load input"); + info!("duration to load input: {}", format_duration(start_load.elapsed()).to_string()); + + let mut output_bw = std::io::BufWriter::new(fs::File::create(options.output.clone().as_path()).unwrap()); + + let re = regex::Regex::new("^(.*?)(?:@[^@]*){0,1}$").unwrap(); + + let query_statement = r#"PREFIX rdfs: + PREFIX meshv: + PREFIX mesh: + + SELECT DISTINCT ?term ?label WHERE { ?term rdfs:label ?label } ORDER BY ?term"#; + + if let QueryResults::Solutions(solutions) = store.query(query_statement)? 
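+    // Each solution row pairs a MeSH descriptor IRI (?term) with its rdfs:label.
+    // The loop below takes the last path segment of the IRI as the MeSH ID, strips the
+    // language tag and surrounding quotes from the label, and writes "MESH:<id>\t<label>" lines.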
{ + for qs in solutions.filter_map(Result::ok).into_iter() { + let term = qs.get("term").expect("term was None"); + let mut term = term.to_string(); + term = babel_io::trim_gt_and_lt(term); + let term_split = term.split("/").collect_vec(); + let id = term_split.last().unwrap(); + + let label = qs.get("label").expect("x was None"); + let mut label = label.to_string(); + if label.contains("@") { + if let Some(captures) = re.captures(label.as_str()) { + label = captures.get(1).unwrap().as_str().to_string(); + } + } + label = babel_io::trim_quotes(label); + label = label.trim().to_string(); + + write!(output_bw, "MESH:{}\t{}\n", id, label).expect("Could not write triple"); + } + } + + info!("Duration: {}", format_duration(start.elapsed()).to_string()); + Ok(()) +} diff --git a/babel_io/src/bin/create_ncbigene_labels_synonyms_and_taxa.rs b/babel_io/src/bin/create_ncbigene_labels_synonyms_and_taxa.rs new file mode 100644 index 00000000..50a70d7a --- /dev/null +++ b/babel_io/src/bin/create_ncbigene_labels_synonyms_and_taxa.rs @@ -0,0 +1,193 @@ +#[macro_use] +extern crate log; + +use clap::Parser; +use humantime::format_duration; +use itertools::Itertools; +use polars::frame::DataFrame; +use polars::io::SerWriter; +use polars::prelude::*; +use std::error::Error; +use std::fs; +use std::path; +use std::path::PathBuf; +use std::time::Instant; + +#[derive(Parser, PartialEq, Debug)] +#[clap(author, version, about, long_about = None)] +struct Options { + #[clap(short, long, required = true)] + input: path::PathBuf, + + #[clap(short, long, required = true)] + labels_output: path::PathBuf, + + #[clap(short, long, required = true)] + synonyms_output: path::PathBuf, + + #[clap(short, long, required = true)] + taxa_output: path::PathBuf, + + #[clap(short, long, required = true)] + description_output: path::PathBuf, +} + +#[tokio::main] +async fn main() -> Result<(), Box> { + let start = Instant::now(); + env_logger::init(); + + let options = Options::parse(); + debug!("{:?}", options); + + // let br = BufReader::new(fs::File::open(options.input).unwrap()); + + // let mut labels_bw = std::io::BufWriter::new(fs::File::create(options.labels_output.as_path()).unwrap()); + // let mut synonyms_bw = std::io::BufWriter::new(fs::File::create(options.synonyms_output.as_path()).unwrap()); + // let mut taxa_bw = std::io::BufWriter::new(fs::File::create(options.taxa_output.as_path()).unwrap()); + // let mut description_bw = std::io::BufWriter::new(fs::File::create(options.description_output.as_path()).unwrap()); + + let usable_columns = vec![ + "#tax_id", + "GeneID", + "type_of_gene", + "Synonyms", + "Other_designations", + "Symbol_from_nomenclature_authority", + "Full_name_from_nomenclature_authority", + "Symbol", + "description", + ]; + + let df = polars::lazy::frame::LazyCsvReader::new(options.input.clone()) + .with_separator(b'\t') + .with_infer_schema_length(Some(0)) + .with_ignore_errors(true) + .with_truncate_ragged_lines(true) + .with_has_header(true) + .finish() + .unwrap() + .select(usable_columns.into_iter().map(|a| col(a)).collect_vec()) + .filter(col("type_of_gene").str().contains(lit("^(biological-region|other|unknown)$"), true).not()) + .with_column(concat_str([lit("NCBIGene"), col("GeneID")], ":", true).alias("GeneID")) + .with_column(concat_str([lit("NCBITaxon"), col("#tax_id")], ":", true).alias("#tax_id")) + .with_column( + concat_str( + [ + col("Full_name_from_nomenclature_authority"), + col("Synonyms"), + col("Other_designations"), + col("Symbol_from_nomenclature_authority"), + col("Symbol"), + ], + 
"|", + true, + ) + .str() + .split(lit("|")) + .alias("synonyms_concat"), + ) + .with_column( + col("synonyms_concat") + .list() + .eval(col("").filter(col("").is_in(lit("-")).not()), false) + .alias("synonyms_concat"), + ) + // .drop([col("Full_name_from_nomenclature_authority"), col("Other_designations")]) + .collect() + .unwrap(); + + debug!("shape: {:?}", df.shape()); + + // NOTE: python impl runs in 13m w/ streaming, rust runs in < 3m while holding data in memory + // TODO: these could be async & run in parallel + write_description(&df, &options.description_output); + write_taxa(&df, &options.taxa_output); + write_synonyms(&df, &options.synonyms_output); + write_labels(&df, &options.labels_output); + + info!("Duration: {}", format_duration(start.elapsed()).to_string()); + Ok(()) +} + +fn write_synonyms(df: &DataFrame, output: &PathBuf) { + let mut tmp_df = df + .clone() + .lazy() + .select([ + col("GeneID"), + lit("http://www.geneontology.org/formats/oboInOwl#hasSynonym"), + col("synonyms_concat"), + ]) + .explode([col("synonyms_concat")]) + .collect() + .unwrap(); + + // println!("{}", tmp_df.head(None)); + + let mut file = fs::File::create(output).expect("could not create file"); + CsvWriter::new(&mut file) + .include_header(false) + .with_separator(b'\t') + .finish(&mut tmp_df) + .unwrap(); +} + +fn write_description(df: &DataFrame, output: &PathBuf) { + let mut tmp_df = df.clone().lazy().select([col("GeneID"), col("description")]).collect().unwrap(); + + // println!("{}", tmp_df.head(None)); + + let mut file = fs::File::create(output).expect("could not create file"); + CsvWriter::new(&mut file) + .include_header(false) + .with_separator(b'\t') + .finish(&mut tmp_df) + .unwrap(); +} + +fn write_labels(df: &DataFrame, output: &PathBuf) { + let mut tmp_df = df + .clone() + .lazy() + .with_column( + when( + col("Symbol_from_nomenclature_authority") + .is_null() + .or(col("Symbol_from_nomenclature_authority").eq(lit("-"))), + ) + .then(col("Symbol")) + .otherwise(col("Symbol_from_nomenclature_authority")) + .alias("best_symbol"), + ) + .with_column( + when(col("best_symbol").is_null().and(col("synonyms_concat").list().len().gt(0))) + .then(col("synonyms_concat").list().first()) + .otherwise(col("best_symbol")), + ) + .select([col("GeneID"), col("best_symbol")]) + .collect() + .unwrap(); + + // println!("{}", tmp_df.head(None)); + + let mut file = fs::File::create(output).expect("could not create file"); + CsvWriter::new(&mut file) + .include_header(false) + .with_separator(b'\t') + .finish(&mut tmp_df) + .unwrap(); +} + +fn write_taxa(df: &DataFrame, output: &path::PathBuf) { + let mut tmp_df = df.clone().lazy().select([col("GeneID"), col("#tax_id")]).collect().unwrap(); + + // println!("{}", tmp_df.head(None)); + + let mut file = fs::File::create(output).expect("could not create file"); + CsvWriter::new(&mut file) + .include_header(false) + .with_separator(b'\t') + .finish(&mut tmp_df) + .unwrap(); +} diff --git a/babel_io/src/bin/create_orphanet_labels_and_synonyms.rs b/babel_io/src/bin/create_orphanet_labels_and_synonyms.rs new file mode 100644 index 00000000..f5d21e00 --- /dev/null +++ b/babel_io/src/bin/create_orphanet_labels_and_synonyms.rs @@ -0,0 +1,67 @@ +#[macro_use] +extern crate log; + +use clap::Parser; +use humantime::format_duration; +use std::error::Error; +use std::fs; +use std::fs::File; +use std::io::{Read, Write}; +use std::path; +use std::time::Instant; +use zip::ZipArchive; + +// NOTE: do not use, utf-8 conversion issues...retaining for S&Gs + +#[derive(Parser, 
PartialEq, Debug)] +#[clap(author, version, about, long_about = None)] +struct Options { + #[clap(short, long, required = true)] + input: path::PathBuf, + + #[clap(short, long, required = true)] + labels_output: path::PathBuf, + + #[clap(short, long, required = true)] + synonyms_output: path::PathBuf, +} +#[tokio::main] +async fn main() -> Result<(), Box> { + let start = Instant::now(); + env_logger::init(); + + let options = Options::parse(); + debug!("{:?}", options); + + let mut labels_bw = std::io::BufWriter::new(fs::File::create(options.labels_output.clone().as_path()).unwrap()); + let mut synonyms_bw = std::io::BufWriter::new(fs::File::create(options.synonyms_output.clone().as_path()).unwrap()); + + let file = File::open(options.input.clone()).unwrap(); + let mut archive = ZipArchive::new(file).unwrap(); + + let mut zip_file = archive.by_name("Orphanet_Nomenclature_Pack_en/ORPHAnomenclature_en.xml").unwrap(); + + let mut data = vec![]; + zip_file.read_to_end(&mut data).unwrap(); + let contents = String::from_utf8_lossy(data.as_slice()); + + let doc = roxmltree::Document::parse(contents.as_ref()).expect("Could not parse document"); + + doc.root().descendants().filter(|n| n.tag_name().name() == "Disorder").for_each(|a| { + let orpha_code = a.descendants().find(|b| b.tag_name().name() == "OrphaCode").unwrap().text().unwrap(); + let name = a.descendants().find(|b| b.tag_name().name() == "Name").unwrap().text().unwrap(); + let curie = format!("orphanet:{}", orpha_code); + write!(&mut labels_bw, "{}\t{}\n", curie, name).unwrap(); + write!(&mut synonyms_bw, "{}\tOIO:hasExactSynonym\t{}\n", curie, name).unwrap(); + match a.descendants().find(|b| b.tag_name().name() == "SynonymList") { + None => {} + Some(all_synonyms) => { + let all_synonyms_text = all_synonyms.text().unwrap(); + write!(&mut synonyms_bw, "{}\tOIO:hasExactSynonym\t{}\n", curie, all_synonyms_text).unwrap(); + } + } + }); + + info!("Duration: {}", format_duration(start.elapsed()).to_string()); + Ok(()) +} diff --git a/babel_io/src/bin/create_pantherfamily_labels.rs b/babel_io/src/bin/create_pantherfamily_labels.rs new file mode 100644 index 00000000..1bcef9cf --- /dev/null +++ b/babel_io/src/bin/create_pantherfamily_labels.rs @@ -0,0 +1,61 @@ +#[macro_use] +extern crate log; + +use clap::Parser; +use humantime::format_duration; +use itertools::Itertools; +use std::collections::HashSet; +use std::error::Error; +use std::fs; +use std::io::{BufRead, BufReader, Write}; +use std::path; +use std::time::Instant; + +#[derive(Parser, PartialEq, Debug)] +#[clap(author, version, about, long_about = None)] +struct Options { + #[clap(short, long, required = true)] + input: path::PathBuf, + + #[clap(short, long, required = true)] + labels_output: path::PathBuf, +} + +#[tokio::main] +async fn main() -> Result<(), Box> { + let start = Instant::now(); + env_logger::init(); + + let options = Options::parse(); + debug!("{:?}", options); + + let br = BufReader::new(fs::File::open(options.input).unwrap()); + + let mut labels_bw = std::io::BufWriter::new(fs::File::create(options.labels_output.clone().as_path()).unwrap()); + + let mut done = HashSet::new(); + + for line in br.lines() { + let line = line.unwrap(); + let line_split = line.split("\t").collect_vec(); + let sub_family = line_split.get(3).unwrap(); + let sub_family = sub_family.to_string(); + let main_family_split = sub_family.split(":").collect_vec(); + let main_family = main_family_split.get(0).unwrap(); + let main_family = main_family.to_string(); + let main_family_name = 
line_split.get(4).unwrap(); + let sub_family_name = line_split.get(5).unwrap(); + if !done.contains(&main_family) { + write!(labels_bw, "{}\t{}\n", format!("PANTHER.FAMILY:{}", main_family), main_family_name).unwrap(); + done.insert(main_family.to_string()); + } + + if !done.contains(&sub_family) { + write!(labels_bw, "{}\t{}\n", format!("PANTHER.FAMILY:{}", sub_family), sub_family_name).unwrap(); + done.insert(sub_family.to_string()); + } + } + + info!("Duration: {}", format_duration(start.elapsed()).to_string()); + Ok(()) +} diff --git a/babel_io/src/bin/create_pantherpathways_labels.rs b/babel_io/src/bin/create_pantherpathways_labels.rs new file mode 100644 index 00000000..8d85c293 --- /dev/null +++ b/babel_io/src/bin/create_pantherpathways_labels.rs @@ -0,0 +1,83 @@ +#[macro_use] +extern crate log; + +use clap::Parser; +use humantime::format_duration; +use itertools::Itertools; +use polars::prelude::*; +use std::error::Error; +use std::fs; +use std::path; +use std::time::Instant; + +#[derive(Parser, PartialEq, Debug)] +#[clap(author, version, about, long_about = None)] +struct Options { + #[clap(short, long, required = true)] + input: path::PathBuf, + + #[clap(short, long, required = true)] + labels_output: path::PathBuf, +} + +#[tokio::main] +async fn main() -> Result<(), Box> { + let start = Instant::now(); + env_logger::init(); + + let options = Options::parse(); + debug!("{:?}", options); + + // PANTHER.PATHWAY:P06217 Toll pathway-drosophila + // P06217 Toll pathway-drosophila P06348 SLMB DROME|FlyBase=FBgn0283468|UniProtKB=A0A0B4KHK1 Supernumerary limbs, isoform B IDA 9461217 PubMed PTHR44156:SF29 TNF RECEPTOR ASSOCIATED FACTOR 7 + + // NOTE: this polars implementation runs in 16ms + + let usable_columns = vec!["I", "II"]; + + let schema = Schema::from_iter(vec![ + Field::new("I".into(), DataType::String), + Field::new("II".into(), DataType::String), + Field::new("III".into(), DataType::String), + Field::new("IV".into(), DataType::String), + Field::new("V".into(), DataType::String), + Field::new("VI".into(), DataType::String), + Field::new("VII".into(), DataType::String), + Field::new("VIII".into(), DataType::String), + Field::new("IX".into(), DataType::String), + Field::new("X".into(), DataType::String), + Field::new("XI".into(), DataType::String), + ]); + + let df = LazyCsvReader::new(options.input.clone()) + .with_separator(b'\t') + .with_schema(Some(schema.into())) + .with_ignore_errors(true) + .with_truncate_ragged_lines(true) + .with_has_header(false) + .finish() + .unwrap() + .select(usable_columns.into_iter().map(|a| col(a)).collect_vec()) + .collect() + .unwrap(); + + // println!("{}", df.head(None)); + + let mut labels_df = df + .clone() + .lazy() + .select([concat_str([lit("PANTHER.PATHWAY"), col("I")], ":", true).alias("I"), col("II")]) + .unique(Some(vec!["I".into(), "II".into()]), UniqueKeepStrategy::First) + .collect() + .unwrap(); + + let mut file = fs::File::create(options.labels_output).expect("could not create file"); + CsvWriter::new(&mut file) + .include_header(false) + .with_separator(b'\t') + .finish(&mut labels_df) + .unwrap(); + + info!("Duration: {}", format_duration(start.elapsed()).to_string()); + Ok(()) +} diff --git a/babel_io/src/bin/create_reactome_labels.rs b/babel_io/src/bin/create_reactome_labels.rs new file mode 100644 index 00000000..3c05ea2b --- /dev/null +++ b/babel_io/src/bin/create_reactome_labels.rs @@ -0,0 +1,54 @@ +#[macro_use] +extern crate log; + +use clap::Parser; +use humantime::format_duration; +use serde_json::Value; +use 
std::error::Error; +use std::fs; +use std::io::{BufReader, BufWriter, Write}; +use std::path; +use std::time::Instant; + +#[derive(Parser, PartialEq, Debug)] +#[clap(author, version, about, long_about = None)] +struct Options { + #[clap(short, long, required = true)] + input: path::PathBuf, + + #[clap(short, long, required = true)] + labels_output: path::PathBuf, +} +#[tokio::main] +async fn main() -> Result<(), Box> { + let start = Instant::now(); + env_logger::init(); + + let options = Options::parse(); + debug!("{:?}", options); + + //NOTE: Python runs in 3s, rust runs < 1s + let br = BufReader::new(fs::File::open(options.input).unwrap()); + let json_value: Value = serde_json::from_reader(br)?; + + let mut labels_bw = BufWriter::new(fs::File::create(options.labels_output.clone().as_path()).unwrap()); + + for entry in json_value.as_array().unwrap().into_iter() { + parse_element_for_labels(&entry, &mut labels_bw); + } + + info!("Duration: {}", format_duration(start.elapsed()).to_string()); + Ok(()) +} + +fn parse_element_for_labels(entry: &Value, labels_bw: &mut BufWriter) { + let oid = entry["stId"].as_str().unwrap(); + let name = entry["name"].as_str().unwrap(); + let species = entry["species"].as_str().unwrap(); + write!(labels_bw, "REACT:{}\t{} ({})\n", oid, name, species).unwrap(); + if !entry["children"].is_null() { + for child_entry in entry["children"].as_array().unwrap().into_iter() { + parse_element_for_labels(child_entry, labels_bw); + } + } +} diff --git a/babel_io/src/bin/create_rhea_labels.rs b/babel_io/src/bin/create_rhea_labels.rs new file mode 100644 index 00000000..f3005e12 --- /dev/null +++ b/babel_io/src/bin/create_rhea_labels.rs @@ -0,0 +1,73 @@ +#[macro_use] +extern crate log; + +use clap::Parser; +use humantime::format_duration; +use itertools::Itertools; +use oxigraph::io::RdfFormat; +use oxigraph::sparql::QueryResults; +use oxigraph::store::Store; +use std::error::Error; +use std::fs; +use std::io::{BufReader, BufWriter, Write}; +use std::path; +use std::time::Instant; + +// NOTE: rust runs in 13s, python runs in 21s +#[derive(Parser, PartialEq, Debug)] +#[clap(author, version, about, long_about = None)] +struct Options { + #[clap(short, long, required = true)] + input: path::PathBuf, + + #[clap(short, long, required = true)] + labels_output: path::PathBuf, +} + +#[tokio::main] +async fn main() -> Result<(), Box> { + let start = Instant::now(); + env_logger::init(); + + let options = Options::parse(); + debug!("{:?}", options); + + let br = BufReader::new(fs::File::open(options.input).unwrap()); + let store = Store::new()?; + let start_load = Instant::now(); + store + .bulk_loader() + .with_max_memory_size_in_megabytes(4 * 2048) + .with_num_threads(4) + .load_from_reader(RdfFormat::RdfXml, br) + .expect("Could not load input"); + info!("duration to load input: {}", format_duration(start_load.elapsed()).to_string()); + + let mut labels_bw = BufWriter::new(fs::File::create(options.labels_output.clone().as_path()).unwrap()); + + let query_statement = r#"PREFIX rdfs: + PREFIX rh: + SELECT DISTINCT ?x ?acc ?label WHERE { + ?x rdfs:label ?label . + ?x rh:accession ?acc . + }"#; + + if let QueryResults::Solutions(solutions) = store.query(query_statement)? 
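+    // Each solution row carries a Rhea accession (e.g. "RHEA:12345") and its rdfs:label.
+    // The loop below keeps the numeric part of the accession and writes "RHEA:<id>\t<label>" lines.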
{ + for qs in solutions.filter_map(Result::ok).into_iter() { + let iterm = qs.get("acc").expect("acc was None"); + let mut iterm = iterm.to_string(); + iterm = babel_io::trim_quotes(iterm); + let rhea_iterm_split = iterm.split(":").collect_vec(); + let rhea_id = rhea_iterm_split.last().unwrap(); + + let label = qs.get("label").expect("label was None"); + let mut label = label.to_string(); + label = babel_io::trim_quotes(label); + + write!(labels_bw, "RHEA:{}\t{}\n", rhea_id, label).expect("Could not write triple"); + } + } + + info!("Duration: {}", format_duration(start.elapsed()).to_string()); + Ok(()) +} diff --git a/babel_io/src/bin/create_smpdb_labels.rs b/babel_io/src/bin/create_smpdb_labels.rs new file mode 100644 index 00000000..0d4b88e0 --- /dev/null +++ b/babel_io/src/bin/create_smpdb_labels.rs @@ -0,0 +1,63 @@ +#[macro_use] +extern crate log; + +use clap::Parser; +use humantime::format_duration; +use itertools::Itertools; +use polars::prelude::*; +use std::error::Error; +use std::fs; +use std::path; +use std::time::Instant; + +#[derive(Parser, PartialEq, Debug)] +#[clap(author, version, about, long_about = None)] +struct Options { + #[clap(short, long, required = true)] + input: path::PathBuf, + + #[clap(short, long, required = true)] + labels_output: path::PathBuf, +} + +#[tokio::main] +async fn main() -> Result<(), Box> { + let start = Instant::now(); + env_logger::init(); + + let options = Options::parse(); + debug!("{:?}", options); + + // NOTE: this polars implementation runs in 16ms + let usable_columns = vec!["SMPDB ID", "Name"]; + + let df = LazyCsvReader::new(options.input.clone()) + .with_infer_schema_length(Some(0)) + .with_ignore_errors(true) + .with_truncate_ragged_lines(true) + .with_has_header(true) + .finish() + .unwrap() + .select(usable_columns.into_iter().map(|a| col(a)).collect_vec()) + .collect() + .unwrap(); + + // println!("{}", df.head(None)); + + let mut labels_df = df + .clone() + .lazy() + .select([concat_str([lit("SMPDB"), col("SMPDB ID")], ":", true).alias("SMPDB ID"), col("Name")]) + .collect() + .unwrap(); + + let mut file = fs::File::create(options.labels_output).expect("could not create file"); + CsvWriter::new(&mut file) + .include_header(false) + .with_separator(b'\t') + .finish(&mut labels_df) + .unwrap(); + + info!("Duration: {}", format_duration(start.elapsed()).to_string()); + Ok(()) +} diff --git a/babel_io/src/bin/create_uniprot_labels.rs b/babel_io/src/bin/create_uniprot_labels.rs new file mode 100644 index 00000000..b9e27b36 --- /dev/null +++ b/babel_io/src/bin/create_uniprot_labels.rs @@ -0,0 +1,58 @@ +#[macro_use] +extern crate log; + +use clap::Parser; +use humantime::format_duration; +use itertools::Itertools; +use std::error::Error; +use std::fs; +use std::fs::File; +use std::io::{BufRead, Write}; +use std::path; +use std::time::Instant; + +#[derive(Parser, PartialEq, Debug)] +#[clap(author, version, about, long_about = None)] +struct Options { + #[clap(short, long, required = true)] + sprot_input: path::PathBuf, + + #[clap(short, long, required = true)] + trembl_input: path::PathBuf, + + #[clap(short, long, required = true)] + output: path::PathBuf, +} + +#[tokio::main] +async fn main() -> Result<(), Box> { + let start = Instant::now(); + env_logger::init(); + + let options = Options::parse(); + debug!("{:?}", options); + + let mut writer = std::io::BufWriter::new(fs::File::create(options.output.clone().as_path()).unwrap()); + + write_labels(&mut writer, options.sprot_input, "sprot".into()).unwrap(); + write_labels(&mut writer, 
options.trembl_input, "trembl".into()).unwrap(); + + info!("Duration: {}", format_duration(start.elapsed()).to_string()); + Ok(()) +} + +fn write_labels(writer: &mut std::io::BufWriter, input: path::PathBuf, which: String) -> Result<(), Box> { + let reader = std::io::BufReader::new(fs::File::open(input).unwrap()); + for line in reader.lines() { + let line = line.unwrap(); + if !line.starts_with(">") { + continue; + } + + let line_split = line.split('|').collect_vec(); + let name_split = line_split.get(2).unwrap().split(" OS=").collect_vec(); + write!(writer, "UniProtKB:{}\t{} ({})\n", line_split.get(1).unwrap(), name_split.get(0).unwrap(), which).unwrap(); + } + Ok(()) +} + diff --git a/babel_io/src/bin/filter_unichem.rs b/babel_io/src/bin/filter_unichem.rs new file mode 100644 index 00000000..2756abbf --- /dev/null +++ b/babel_io/src/bin/filter_unichem.rs @@ -0,0 +1,72 @@ +#[macro_use] +extern crate log; + +use clap::Parser; +use humantime::format_duration; +use polars::io::SerWriter; +use polars::prelude::{col, lit, CsvWriter, LazyFileListReader}; +use std::error::Error; +use std::fs; +use std::path; +use std::time::Instant; + +#[derive(Parser, PartialEq, Debug)] +#[clap(author, version, about, long_about = None)] +struct Options { + #[clap(short, long, required = true)] + input: path::PathBuf, + + #[clap(short, long, required = true)] + output: path::PathBuf, +} + +#[tokio::main] +async fn main() -> Result<(), Box> { + let start = Instant::now(); + env_logger::init(); + + let options = Options::parse(); + debug!("{:?}", options); + + // let reader = std::io::BufReader::new(fs::File::open(options.input).unwrap()); + // let mut writer = std::io::BufWriter::new(fs::File::create(options.output.clone().as_path()).unwrap()); + // + // write!(writer, "{}\n", "UCI\tSRC_ID\tSRC_COMPOUND_ID\tASSIGNMENT").unwrap(); + // + // for line in reader.lines().skip(1) { + // let line = line.unwrap(); + // let line_split = line.trim().split("\t"); + // + // write!(writer, "{}\n", "UCI\tSRC_ID\tSRC_COMPOUND_ID\tASSIGNMENT").unwrap(); + // } + let data_sources = std::collections::HashMap::from([ + ("1", "CHEMBL.COMPOUND"), + ("2", "DRUGBANK"), + ("4", "GTOPDB"), + ("6", "KEGG.COMPOUND"), + ("7", "CHEBI"), + ("14", "UNII"), + ("18", "HMDB"), + ("22", "PUBCHEM.COMPOUND"), + ("34", "DrugCentral"), + ]); + let re = format!("^({})$", itertools::join(data_sources.into_keys(), "|")); + let mut df = polars::lazy::frame::LazyCsvReader::new(options.input.clone()) + .with_separator(b'\t') + .with_infer_schema_length(Some(0)) + .with_ignore_errors(true) + .with_truncate_ragged_lines(true) + .with_has_header(true) + .finish() + .unwrap() + .filter(col("SRC_ID").str().contains(lit(re), true)) + .filter(col("ASSIGNMENT").eq(lit("1"))) + .collect() + .unwrap(); + + let mut file = fs::File::create(options.output).expect("could not create file"); + CsvWriter::new(&mut file).include_header(true).with_separator(b'\t').finish(&mut df).unwrap(); + + info!("Duration: {}", format_duration(start.elapsed()).to_string()); + Ok(()) +} diff --git a/babel_io/src/bin/pull_ensembl.rs b/babel_io/src/bin/pull_ensembl.rs new file mode 100644 index 00000000..30852171 --- /dev/null +++ b/babel_io/src/bin/pull_ensembl.rs @@ -0,0 +1,264 @@ +use async_once::AsyncOnce; +use clap::Parser; +use humantime::format_duration; +use itertools::{join, Itertools}; +use lazy_static::lazy_static; +use log::{debug, info}; +use polars::prelude::*; +use quick_xml::Writer; +use reqwest::header; +use reqwest::redirect::Policy; +use std::collections::HashSet; +use 
std::error::Error; +use std::fs::{create_dir_all, File}; +use std::io::{Cursor, Write}; +use std::path; +use std::time::{Duration, Instant}; + +lazy_static! { + pub static ref CSV_PARSE_OPTIONS: CsvParseOptions = CsvParseOptions::default().with_truncate_ragged_lines(true).with_separator(b'\t'); + + pub static ref REQWEST_CLIENT: AsyncOnce = AsyncOnce::new(async { + let mut headers = header::HeaderMap::new(); + // headers.insert(header::ACCEPT, header::HeaderValue::from_static("application/json")); + headers.insert(header::CONTENT_TYPE, header::HeaderValue::from_static("text/plain")); + let result = reqwest::Client::builder() + .redirect(Policy::limited(5)) + // .read_timeout(Duration::from_secs(1500)) + // .timeout(Duration::from_secs(1500)) + .default_headers(headers) + .build(); + + match result { + Ok(request_client) => request_client, + Err(e) => panic!("Could not create reqwest client: {}", e), + } + }); +} + +#[derive(Parser, PartialEq, Debug)] +#[clap(author, version, about, long_about = None)] +struct Options { + #[clap(short, long, required = true)] + ensembl_output_dir: path::PathBuf, +} + +#[tokio::main] +async fn main() -> Result<(), Box> { + let start = Instant::now(); + env_logger::init(); + + let options = Options::parse(); + debug!("{:?}", options); + + let datasets = pull_datasets().await.expect("Count not get datasets"); + debug!("datasets: {:?}", datasets); + + let usable_attribute_cols: HashSet = HashSet::from([ + "ensembl_gene_id", + "ensembl_peptide_id", + "description", + "external_gene_name", + "external_gene_source", + "external_synonym", + "chromosome_name", + "source", + "gene_biotype", + "entrezgene_id", + "zfin_id_id", + "mgi_id", + "rgd_id", + "flybase_gene_id", + "sgd_gene", + "wormbase_gene", + ]) + .into_iter() + .map(|a| a.to_string()) + .collect(); + + let request_client = REQWEST_CLIENT.get().await; + + for (idx, dataset_id) in datasets.iter().enumerate() { + let pull_dataset_start = Instant::now(); + info!("dataset_id: {}", dataset_id); + + let ensembl_output_dir = options.ensembl_output_dir.join(&dataset_id); + create_dir_all(&ensembl_output_dir).expect("Could not create dataset dir"); + let output_path = ensembl_output_dir.join("BioMart.tsv"); + if output_path.exists() { + continue; + } + let mut output_file = File::create(output_path).expect("could not create file"); + + let attributes = pull_attributes(&dataset_id, &usable_attribute_cols).await.expect("Could not get attributes"); + debug!("attributes: {:?}", attributes); + + let mut writer = Writer::new(Cursor::new(Vec::new())); + writer + .create_element("Query") + .with_attributes(vec![ + ("virtualSchemaName", "default"), + ("formatter", "TSV"), + ("header", "1"), + ("datasetConfigVersion", "0.6"), + ]) + .write_inner_content(|writer| { + writer + .create_element("Dataset") + .with_attributes(vec![("name", dataset_id.as_str()), ("interface", "default")]) + .write_inner_content(|writer| { + for attribute in attributes.iter() { + writer.create_element("Attribute").with_attribute(("name", attribute.as_str())).write_empty()?; + } + Ok(()) + }) + .unwrap(); + Ok(()) + }) + .unwrap(); + + let xml_output = writer.into_inner().into_inner(); + let xml_result = std::str::from_utf8(xml_output.as_slice()).unwrap(); + debug!("xml result: {}", xml_result); + + let query_response = request_client + .get("http://www.ensembl.org/biomart/martservice") + .query(&[("query", xml_result)]) + .send() + .await + .expect("Could not send query"); + + let query_response_text = query_response.text().await.expect("Could not 
get text from response"); + let handle = std::io::Cursor::new(query_response_text); + let mut df = CsvReadOptions::default() + .with_parse_options(CSV_PARSE_OPTIONS.clone()) + .with_has_header(true) + .with_ignore_errors(true) + .with_infer_schema_length(None) + .with_low_memory(true) + .into_reader_with_file_handle(handle) + .finish() + .unwrap(); + + CsvWriter::new(&mut output_file) + .include_header(true) + .with_separator(b'\t') + .finish(&mut df) + .expect("Could not write Ensembl output Dataframe"); + + info!( + "dataset_id: {}, finished {}/{}, duration to pull: {}", + dataset_id, + idx + 1, + datasets.len(), + format_duration(pull_dataset_start.elapsed()).to_string() + ); + } + + let mut w = File::create(options.ensembl_output_dir.join("BioMartDownloadComplete")).unwrap(); + + writeln!(&mut w, "{}", format!("Downloaded gene sets for {} data sets.", datasets.len())).unwrap(); + + info!("Duration: {}", format_duration(start.elapsed()).to_string()); + Ok(()) +} + +async fn pull_datasets() -> Result, Box> { + let dataset_url = "http://www.ensembl.org/biomart/martservice/biomart/martservice?type=datasets&mart=ENSEMBL_MART_ENSEMBL"; + + let request_client = REQWEST_CLIENT.get().await; + + let dataset_response = request_client.get(dataset_url).send().await?; + let dataset_text = dataset_response.text().await.unwrap(); + let filtered_data = dataset_text + .lines() + .filter_map(|line| if line.trim().is_empty() { None } else { Some(line.trim().to_string()) }) + .collect_vec(); + let joined_filtered_data = filtered_data.join("\n"); + + let handle = Cursor::new(joined_filtered_data); + let dataset_df = CsvReadOptions::default() + .with_parse_options(CSV_PARSE_OPTIONS.clone()) + .with_has_header(false) + .with_ignore_errors(true) + .with_infer_schema_length(None) + .into_reader_with_file_handle(handle) + .finish() + .unwrap(); + // println!("{}", dataset_df.head(None)); + + let datasets_to_skip = join( + &vec![ + "elucius_gene_ensembl", + "hgfemale_gene_ensembl", + "charengus_gene_ensembl", + "otshawytscha_gene_ensembl", + "aocellaris_gene_ensembl", + "omykiss_gene_ensembl", + ], + "|", + ); + let reg = format!("^({})$", datasets_to_skip); + debug!("regex: {}", reg); + + let filtered_dataset_df = dataset_df + .clone() + .lazy() + .select([col("column_2").alias("dataset_id")]) + .filter(col("dataset_id").str().contains(lit(reg), true).not()) + .collect() + .unwrap(); + + let datasets: Vec = filtered_dataset_df + .column("dataset_id") + .unwrap() + .str() + .unwrap() + .into_iter() + .filter_map(|a| a.map(String::from)) + .collect(); + + Ok(datasets) +} + +async fn pull_attributes(dataset_id: &String, usable_attribute_cols: &HashSet) -> Result, Box> { + let request_client = REQWEST_CLIENT.get().await; + let attributes_url = format!( + "http://www.ensembl.org/biomart/martservice/biomart/martservice?type=attributes&dataset={}", + dataset_id + ); + + let attributes_response = request_client.get(attributes_url).send().await?; + let attributes_text = attributes_response.text().await.unwrap(); + let filtered_attributes_text = attributes_text + .lines() + .filter_map(|line| if line.trim().is_empty() { None } else { Some(line.trim().to_string()) }) + .collect_vec(); + let joined_filtered_attributes_text = filtered_attributes_text.join("\n"); + + let handle = Cursor::new(joined_filtered_attributes_text); + let attributes_df = CsvReadOptions::default() + .with_parse_options(CSV_PARSE_OPTIONS.clone()) + .with_has_header(false) + .with_ignore_errors(true) + .with_infer_schema_length(None) + 
.into_reader_with_file_handle(handle) + .finish() + .unwrap(); + // println!("{}", attributes_df.head(None)); + + let filtered_attributes_df = attributes_df.clone().lazy().select([col("column_1").alias("attribute_id")]).collect().unwrap(); + + let attributes: HashSet = filtered_attributes_df + .column("attribute_id") + .unwrap() + .str() + .unwrap() + .into_iter() + .filter_map(|a| a.map(String::from)) + .collect(); + + let intersection = usable_attribute_cols.intersection(&attributes).into_iter().cloned().collect_vec(); + + Ok(intersection) +} diff --git a/babel_io/src/lib.rs b/babel_io/src/lib.rs new file mode 100644 index 00000000..193640c7 --- /dev/null +++ b/babel_io/src/lib.rs @@ -0,0 +1,271 @@ +pub fn trim_gt_and_lt(mut x: String) -> String { + if x.starts_with("<") { + x = x.strip_prefix("<").unwrap().to_string(); + } + if x.ends_with(">") { + x = x.strip_suffix(">").unwrap().to_string(); + } + x +} + +pub fn trim_quotes(mut label: String) -> String { + if label.starts_with("\"") { + label = label.strip_prefix("\"").unwrap().to_string(); + } + + if label.ends_with("\"") { + label = label.strip_suffix("\"").unwrap().to_string(); + } + label +} + +// extern crate core; +// +// use itertools::Itertools; +// use pyo3::prelude::*; +// use pyo3::types::PySet; +// use std::collections::{HashMap, HashSet}; +// use std::fs; +// use std::fs::File; +// use std::hash::Hash; +// use std::io::prelude::*; +// use std::io::{BufRead, BufReader, BufWriter}; +// use std::path::{Path, PathBuf}; +// +// #[pyfunction] +// pub fn pull_uniprot_labels(input: &str, which: &str) -> PyResult { +// let input_path = PathBuf::from(input); +// +// let output_file_name = format!("uniprot_{}.output.txt", which); +// let mut output_path = input_path.with_file_name(output_file_name); +// if !output_path.exists() { +// let br = BufReader::new(File::open(input_path.as_path()).unwrap()); +// let mut bw = BufWriter::new(File::create(output_path.as_path()).unwrap()); +// +// for line in br.lines() { +// let line = line.unwrap(); +// if !line.starts_with(">") { +// continue; +// } +// let line_split = line.split("|").collect_vec(); +// let name_split = line_split[2].split(" OS=").collect_vec(); +// let entry = format!("UniProtKB:{}\t{} ({})\n", line_split[1], name_split[0], which); +// bw.write_all(entry.as_bytes()).unwrap(); +// } +// } +// +// Ok(output_path.display().to_string()) +// } +// +// #[pyfunction] +// pub fn merge_uniprot_label_files(inputs: Vec<&str>, output: &str, remove_inputs: bool) -> PyResult { +// let output_path = PathBuf::from(output); +// let mut bw = BufWriter::new(File::create(output_path.as_path()).unwrap()); +// inputs.clone().into_iter().map(|input| PathBuf::from(input)).for_each(|input_path| { +// let br = BufReader::new(File::open(input_path.as_path()).unwrap()); +// for line in br.lines() { +// let line = line.unwrap(); +// bw.write_all(format!("{}\n", line).as_bytes()).unwrap(); +// } +// }); +// +// if remove_inputs { +// inputs.iter().for_each(|input| fs::remove_file(input).unwrap()); +// } +// +// Ok(output_path.display().to_string()) +// } +// +// #[pyfunction] +// pub fn read_identifier_file(input: &str) -> PyResult<(Vec>, HashMap)> { +// let input_path = PathBuf::from(input); +// let br = BufReader::new(File::open(input_path.as_path()).unwrap()); +// let mut types: HashMap = HashMap::new(); +// let mut identifiers = vec![]; +// for line in br.lines() { +// let line = line.unwrap(); +// let x = line.trim().split('\t').collect_vec(); +// identifiers.push(vec![x[0].to_string()]); +// if 
x.len() > 1 { +// types.insert(x[0].into(), x[1].into()); +// } +// } +// +// Ok((identifiers, types)) +// } +// +// #[pyfunction] +// pub fn glom(conc_set: HashSet, newgroups: Vec>, unique_prefixes: Vec) -> PyResult> { +// let mut n = 0; +// let bad = 0; +// let shit_prefixes = vec!["KEGG", "PUBCHEM"]; +// let test_id = "xUBERON:0002262"; +// // let mut excised = vec![]; +// +// for xgroup in newgroups { +// if xgroup.len() > 2 { +// println!("{:?}", xgroup); +// panic!("nope"); +// } +// n = n + 1; +// if xgroup.contains(&test_id.to_string()) { +// println!("{:?}", xgroup); +// } +// +// let existing_sets_w_x = xgroup +// .clone() +// .into_iter() +// .filter(|x| conc_set.contains(x)) +// .map(|x| (conc_set.get(&x).unwrap(), x)) +// .collect_vec(); +// +// let existing_sets: Vec = existing_sets_w_x.clone().into_iter().map(|a| a.0.clone()).collect_vec(); +// let x = existing_sets_w_x.iter().map(|a| a.1.clone()).collect_vec(); +// let mut newset = existing_sets.clone(); +// newset.dedup(); +// xgroup.iter().for_each(|a| newset.push(a.clone())); +// +// if newset.contains(&test_id.to_string()) { +// println!("hiset: {:?}", newset); +// println!("input_set: {:?}", xgroup); +// println!("esets"); +// // existing_sets.iter().for_each(|a| println!("{} {}", a, xgroup)) +// } +// +// newset.iter().for_each(|entry| { +// let prefix = entry.split(':').next().unwrap(); +// if shit_prefixes.contains(&prefix) { +// println!("entry: {}, prefix: {}", entry, prefix); +// panic!("garbage"); +// } +// }); +// +// let setok = true; +// if xgroup.contains(&test_id.to_string()) { +// println!("setok: {}", setok); +// } +// +// unique_prefixes.iter().for_each(|up| { +// if xgroup.contains(&test_id.to_string()) { +// println!("up: {}", up); +// } +// // newset.iter().filter_map(|a| ); +// }); +// } +// +// Ok(conc_set.clone()) +// } +// +// #[pymodule] +// fn babel_io(_py: Python, m: &PyModule) -> PyResult<()> { +// m.add_function(wrap_pyfunction!(pull_uniprot_labels, m)?)?; +// m.add_function(wrap_pyfunction!(merge_uniprot_label_files, m)?)?; +// Ok(()) +// } +// +// #[cfg(test)] +// mod tests { +// use crate::glom; +// use itertools::{Itertools, TupleWindows}; +// use std::collections::HashSet; +// +// #[test] +// fn test_glom() { +// let local_glom = |conc_set: HashSet, mut newgroups: Vec<(String, String)>, unique_prefixes: Vec| -> HashSet { +// let mut n = 0; +// let bad = 0; +// let shit_prefixes = vec!["KEGG", "PUBCHEM"]; +// let test_id = "xUBERON:0002262"; +// // let mut excised = vec![]; +// +// for xgroup in newgroups.iter_mut() { +// if xgroup.len() > 2 { +// println!("{:?}", xgroup); +// panic!("nope"); +// } +// n = n + 1; +// if xgroup.contains(&test_id.to_string()) { +// println!("{:?}", xgroup); +// } +// +// let existing_sets_w_x = xgroup +// .clone() +// .into_iter() +// .filter(|x| conc_set.contains(x)) +// .map(|x| (conc_set.get(&x).unwrap(), x)) +// .collect_vec(); +// +// let existing_sets: Vec = existing_sets_w_x.clone().into_iter().map(|a| a.0.clone()).collect_vec(); +// let x = existing_sets_w_x.iter().map(|a| a.1.clone()).collect_vec(); +// let mut newset = existing_sets.clone(); +// newset.dedup(); +// xgroup.iter().for_each(|a| newset.push(a.clone())); +// +// if newset.contains(&test_id.to_string()) { +// println!("hiset: {:?}", newset); +// println!("input_set: {:?}", xgroup); +// println!("esets"); +// // existing_sets.iter().for_each(|a| println!("{} {}", a, xgroup)) +// } +// +// newset.iter().for_each(|entry| { +// let prefix = entry.split(':').next().unwrap(); +// if 
shit_prefixes.contains(&prefix) { +// println!("entry: {}, prefix: {}", entry, prefix); +// panic!("garbage"); +// } +// }); +// +// let setok = true; +// if xgroup.contains(&test_id.to_string()) { +// println!("setok: {}", setok); +// } +// +// unique_prefixes.iter().for_each(|up| { +// if xgroup.clcontains(&test_id.to_string()) { +// println!("up: {}", up); +// } +// // newset.iter().filter_map(|a| ); +// }); +// } +// conc_set.clone() +// }; +// +// // let v: TupleWindows<_, (String, String)> = vec!["1", "2", "3", "4", "5", "6", "7"] +// // .into_iter() +// // .map(|a| a.to_string()) +// // .collect_vec() +// // .into_iter() +// // .tuple_windows(); +// // println!("{:?}", v.collect::>()); +// let mut conc_set = std::collections::HashSet::new(); +// let newgroups: Vec<(String, String)> = vec![ +// ("UMLS:C0000005".to_string(), String::new()), +// ("UMLS:C0000052".to_string(), String::new()), +// ("UMLS:C0000084".to_string(), String::new()), +// ("UMLS:C0000107".to_string(), String::new()), +// ("UMLS:C0000132".to_string(), String::new()), +// ("UMLS:C0000152".to_string(), String::new()), +// ("UMLS:C0000165".to_string(), String::new()), +// ("UMLS:C0000184".to_string(), String::new()), +// ("UMLS:C0000189".to_string(), String::new()), +// ("UMLS:C0000246".to_string(), String::new()), +// ("UMLS:C0000254".to_string(), String::new()), +// ("UMLS:C0000257".to_string(), String::new()), +// ("UMLS:C0000291".to_string(), String::new()), +// ("UMLS:C0000324".to_string(), String::new()), +// ("UMLS:C0000340".to_string(), String::new()), +// ("UMLS:C0000353".to_string(), String::new()), +// ("UMLS:C0000359".to_string(), String::new()), +// ("UMLS:C0000360".to_string(), String::new()), +// ]; +// println!("conc_set before glom: {:?}", conc_set); +// let conc_set = local_glom(conc_set, newgroups, vec!["UniProtKB".to_string(), "PR".to_string()]); +// println!("conc_set after glom: {:?}", conc_set); +// // glom(d, eqs) +// // print(f"{d}") +// // assert len(d) == 5 +// // assert d["1"] == d["2"] == d["3"] == {"1", "2", "3"} +// // assert d["4"] == d["5"] == {"4", "5"} +// } +// } From ea34ccb2bd9129e3380cfe0bff453e350daecee3 Mon Sep 17 00:00:00 2001 From: Jason Reilly Date: Thu, 31 Jul 2025 08:36:11 -0400 Subject: [PATCH 20/28] initial commit --- babel_io/Cargo.toml | 32 ++++++++++++++++++++++++++++++++ babel_io/rustfmt.toml | 4 ++++ 2 files changed, 36 insertions(+) create mode 100644 babel_io/Cargo.toml create mode 100644 babel_io/rustfmt.toml diff --git a/babel_io/Cargo.toml b/babel_io/Cargo.toml new file mode 100644 index 00000000..2abb96ae --- /dev/null +++ b/babel_io/Cargo.toml @@ -0,0 +1,32 @@ +[package] +name = "babel_io" +version = "0.1.0" +edition = "2024" + +[lib] +name = "babel_io" + +[dependencies] +async_once = "^0.2" +csv = "^1.3" +clap = { version = "^4.5", features = ["derive"] } +env_logger = "^0.11" +humantime = "^2.2" +itertools = "^0.14" +lazy_static = "^1.3" +log = { version = "^0.4", features = ["std"] } +polars = { version = "^0.45", features = ["default", "cloud", "concat_str", "string_pad", "dtype-array", "strings", "regex", "json", "cross_join", "lazy", "coalesce", "polars-lazy", "parquet", "find_many", "csv", "decompress", "list_eval", "is_in"] } +oxigraph = "^0.4" +rand = "^0.9" +rayon = "^1.10" +regex = "^1.11" +reqwest = { version = "^0.12", features = ["default", "json"] } +roxmltree = "^0.20" +serde = { version = "^1.0", features = ["derive", "serde_derive"] } +serde_derive = "^1.0" +serde_json = "^1.0" +serde_with = { version = "^3.12", features = ["std", "macros", 
"json"] } +tokio = { version = "^1.45", features = ["rt", "rt-multi-thread", "macros"] } +uuid = { version = "^1.1", features = ["v4"] } +quick-xml = "^0.38" +zip = "^4.2" diff --git a/babel_io/rustfmt.toml b/babel_io/rustfmt.toml new file mode 100644 index 00000000..fc5e46a1 --- /dev/null +++ b/babel_io/rustfmt.toml @@ -0,0 +1,4 @@ +max_width = 160 +newline_style = "Unix" +use_field_init_shorthand = true +use_try_shorthand = true From 811deae7c90ae1be37c7a0da04d0ec9bf42d8d4e Mon Sep 17 00:00:00 2001 From: Jason Reilly Date: Thu, 25 Sep 2025 11:38:54 -0400 Subject: [PATCH 21/28] initial commit --- babel_io/src/bin/build_compendia.rs | 65 +++++++++++++++++++++++++++++ 1 file changed, 65 insertions(+) create mode 100644 babel_io/src/bin/build_compendia.rs diff --git a/babel_io/src/bin/build_compendia.rs b/babel_io/src/bin/build_compendia.rs new file mode 100644 index 00000000..9e6e2483 --- /dev/null +++ b/babel_io/src/bin/build_compendia.rs @@ -0,0 +1,65 @@ +#[macro_use] +extern crate log; + +use clap::Parser; +use humantime::format_duration; +use itertools::Itertools; +use oxigraph::io::RdfFormat; +use oxigraph::sparql::QueryResults; +use oxigraph::store::Store; +use std::collections::HashSet; +use std::error::Error; +use std::fs::read_to_string; +use std::io::{BufReader, BufWriter}; +use std::time::Instant; +use std::{fs, path}; + +#[derive(Parser, PartialEq, Debug)] +#[clap(author, version, about, long_about = None)] +struct Options { + #[clap(short, long, required = true)] + concordances: Vec, + + #[clap(short, long, required = true)] + identifiers: Vec, + + #[clap(short = 'z', long, required = true)] + ic_rdf: path::PathBuf, +} +#[tokio::main] +async fn main() -> Result<(), Box> { + let start = Instant::now(); + env_logger::init(); + + let options = Options::parse(); + debug!("{:?}", options); + + let dicts = HashSet::new(); + let types = HashSet::new(); + + for ifile in options.identifiers { + let asdf = read_to_string(ifile).unwrap(); + for line in asdf.lines() {} + // new_identifiers, new_types = read_identifier_file(ifile) + // + // + // + // types = {} + // identifiers = list() + // with open(infile,'r') as inf: + // for line in inf: + // x = line.strip().split('\t') + // identifiers.append((x[0],)) + // if len(x) > 1: + // types[x[0]] = x[1] + // return identifiers,types + // + // + // + // glom(dicts, new_identifiers, unique_prefixes=[UBERON, GO]) + // types.update(new_types) + } + + info!("Duration: {}", format_duration(start.elapsed()).to_string()); + Ok(()) +} From 48ade29cd1af25e1542fc1de8248dcd37bbbe695 Mon Sep 17 00:00:00 2001 From: Jason Reilly Date: Thu, 25 Sep 2025 11:39:10 -0400 Subject: [PATCH 22/28] adding targets --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 638f85ec..0388e479 100644 --- a/.gitignore +++ b/.gitignore @@ -7,3 +7,4 @@ babel_outputs/ .snakemake/ .env .idea +**/target/ \ No newline at end of file From 9e7d8cc3e5c32122496af4b908022cc1f0e4bad1 Mon Sep 17 00:00:00 2001 From: Jason Reilly Date: Thu, 25 Sep 2025 11:39:26 -0400 Subject: [PATCH 23/28] renaming --- requirements.txt => requirements.in | 3 --- 1 file changed, 3 deletions(-) rename requirements.txt => requirements.in (85%) diff --git a/requirements.txt b/requirements.in similarity index 85% rename from requirements.txt rename to requirements.in index b6d6479a..45be78fc 100644 --- a/requirements.txt +++ b/requirements.in @@ -4,13 +4,10 @@ bmt jsonlines pandas more-itertools -#pyoxigraph~=0.2.5 pyoxigraph~=0.4.11 psycopg2-binary pytest pytest-cov 
-#python-Levenshtein-wheels -python-levenshtein pyyaml requests PuLP==2.7.0 From 1bde21dd922283205f5d6558ca880f3c91ea79e8 Mon Sep 17 00:00:00 2001 From: Jason Reilly Date: Thu, 25 Sep 2025 11:39:44 -0400 Subject: [PATCH 24/28] incrementing snakemake --- requirements.in | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements.in b/requirements.in index 45be78fc..4d09b5eb 100644 --- a/requirements.in +++ b/requirements.in @@ -11,7 +11,8 @@ pytest-cov pyyaml requests PuLP==2.7.0 -snakemake==7.32.4 +#snakemake==7.32.4 +snakemake==9.9.0 sparqlwrapper # Added by Gaurav, Jan 2022 xmltodict From b142c29f81f4118fa6b1b50919a78ae763832aa0 Mon Sep 17 00:00:00 2001 From: Jason Reilly Date: Thu, 25 Sep 2025 11:39:57 -0400 Subject: [PATCH 25/28] updating --- requirements.lock | 443 ++++++++++++++++++++++++++++++++-------------- 1 file changed, 311 insertions(+), 132 deletions(-) diff --git a/requirements.lock b/requirements.lock index 987e81f5..8dc2ba53 100644 --- a/requirements.lock +++ b/requirements.lock @@ -1,168 +1,347 @@ -aiohttp==3.8.4 -aiosignal==1.3.1 -airium==0.2.5 +# +# This file is autogenerated by pip-compile with Python 3.13 +# by the following command: +# pip-compile requirements.in +aiohappyeyeballs==2.6.1 + # via aiohttp +aiohttp==3.12.15 + # via apybiomart +aiosignal==1.4.0 +airium==0.2.7 + # via + # linkml-renderer + # oaklib +alabaster==1.0.0 + # via sphinx +annotated-types==0.7.0 + # via pydantic antlr4-python3-runtime==4.9.3 -anyio==3.7.0 + # linkml + # pyjsg + # pyshexc appdirs==1.4.4 + # snakemake apybiomart==0.5.3 -async-timeout==4.0.2 + # via -r requirements.in +argparse-dataclass==2.0.0 + # snakemake-interface-common + # snakemake-interface-executor-plugins + # yte +arrow==1.3.0 + # via isoduration asyncio==3.4.3 -attrs==23.1.0 -Babel==2.12.1 -bcp47==0.0.4 -beautifulsoup4==4.12.2 -biopython==1.81 -bmt==1.1.1 -cattrs==23.1.2 -certifi==2023.5.7 -cffconvert==2.0.0 +attrs==25.3.0 + # aiohttp + # cattrs + # jsonlines + # jsonschema + # referencing + # requests-cache +babel==2.17.0 +bcp47==0.1.0 + # via funowl +beautifulsoup4==4.13.4 +biopython==1.85 +bmt==1.4.5 +cattrs==25.1.1 + # via requests-cache +certifi==2025.7.14 + # via requests +CFGraph==0.2.1 + # via pyshex chardet==5.2.0 -charset-normalizer==3.1.0 -class-resolver==0.4.2 -click==8.1.7 -colorama==0.4.6 -ConfigArgParse==1.7 -connection-pool==0.0.3 -coverage==7.3.0 -curies==0.6.0 -datrie==0.8.2 -Deprecated==1.2.14 + # pronto + # pyshex +charset-normalizer==3.4.2 +class-resolver==0.6.0 + # ols-client +click==8.2.1 + # json-flattener + # linkml-runtime + # more-click + # prefixcommons + # pystow + # semsql + # sphinx-click + # sssom +conda-inject==1.3.2 + # via snakemake +ConfigArgParse==1.7.1 +connection_pool==0.0.3 +coverage==7.10.1 + # via pytest-cov +curies==0.10.19 + # -r requirements.in + # kgcl-schema + # prefixmaps +Deprecated==1.2.18 + # via linkml-runtime deprecation==2.1.0 -docopt==0.6.2 -docutils==0.20.1 -dpath==2.1.6 -EditorConfig==0.12.3 -elasticsearch==7.16.3 + # bmt +docutils==0.21.2 + # sphinx +dpath==2.2.0 +duckdb==1.3.2 +et_xmlfile==2.0.0 + # via openpyxl eutils==0.6.0 -fastapi==0.95.0 -fastjsonschema==2.18.0 -fastobo==0.12.2 -frozenlist==1.3.3 + # via oaklib +fastjsonschema==2.21.1 + # via nbformat +fastobo==0.13.0 + # via pronto +fqdn==1.5.1 + # via jsonschema +frozenlist==1.7.0 + # aiosignal funowl==0.2.3 -ghp-import==2.1.0 -gitdb==4.0.10 -GitPython==3.1.34 -greenlet==2.0.1 -gunicorn==20.1.0 -h11==0.14.0 +gitdb==4.0.12 + # via gitpython +GitPython==3.1.45 +graphviz==0.21 + # via 
linkml +greenlet==3.2.3 + # via sqlalchemy hbreader==0.9.1 + # jsonasobj2 humanfriendly==10.0 -idna==3.4 -ijson==3.2.3 -importlib-metadata==6.8.0 -iniconfig==2.0.0 -isodate==0.6.1 -itsdangerous==2.1.2 -Jinja2==3.1.2 -jsbeautifier==1.14.9 +idna==3.10 + # requests + # url-normalize + # yarl +ijson==3.4.0 + # via ndex2 +imagesize==1.4.1 +immutables==0.21 +importlib_resources==6.5.2 + # via sssom +iniconfig==2.1.0 + # via pytest +isodate==0.7.2 +isoduration==20.11.0 +Jinja2==3.1.6 json-flattener==0.1.9 jsonasobj==1.3.1 + # funowl jsonasobj2==1.0.4 jsonlines==4.0.0 -jsonschema==3.2.0 -jupyter_core==5.3.1 +jsonpointer==3.0.0 +jsonschema==4.25.0 + # nbformat +jsonschema-specifications==2025.4.1 +jupyter_core==5.8.1 kgcl-rdflib==0.5.0 -kgcl-schema==0.6.0 -lark==1.1.7 -linkml-renderer==0.3.0 -linkml-runtime==1.5.6 -lxml==4.9.3 -Markdown==3.4.4 -MarkupSafe==2.1.3 -mergedeep==1.3.4 -mistune==2.0.3 -mkdocs==1.5.2 -mkdocs-material==9.2.7 -mkdocs-material-extensions==1.1.1 -mkdocs-mermaid2-plugin==0.6.0 +kgcl_schema==0.6.9 + # kgcl-rdflib +lark==1.2.2 + # via kgcl-schema +linkml==1.9.3 +linkml-renderer==0.3.1 +linkml-runtime==1.9.4 + # sssom-schema +lxml==6.0.0 + # via eutils +MarkupSafe==3.0.2 + # via jinja2 more-click==0.1.2 -more-itertools==10.1.0 -multidict==6.0.4 -nbformat==5.9.2 -ndex2==3.5.1 -networkx==3.1 -numpy==1.25.2 -oaklib==0.5.18 + # via ols-client +more-itertools==10.7.0 +multidict==6.6.3 +nbformat==5.10.4 +ndex2==3.11.0 +networkx==3.5 + # ndex2 +numpy==2.3.2 + # biopython + # pandas + # pansql + # scipy +oaklib==0.5.33 ols-client==0.1.4 ontoportal-client==0.0.4 -packaging==23.1 -paginate==0.5.6 -pandas==2.1.0 +openpyxl==3.1.5 +packaging==25.0 + # deprecation + # pytest +pandas==2.3.1 + # apybiomart pansql==0.0.1 -pathspec==0.11.2 -plac==1.3.5 -platformdirs==3.10.0 -pluggy==1.0.0 +parse==1.20.2 +platformdirs==4.3.8 + # jupyter-core +pluggy==1.6.0 + # pytest-cov prefixcommons==0.1.12 -prefixmaps==0.1.5 -pronto==2.5.5 -psutil==5.9.5 -psycopg2-binary==2.9.7 +prefixmaps==0.2.6 +pronto==2.7.0 +propcache==0.3.2 +psutil==7.0.0 +psycopg2-binary==2.9.10 PuLP==2.7.0 -pydantic==1.10.9 -Pygments==2.16.1 +pydantic==2.11.7 + # curies +pydantic_core==2.33.2 +Pygments==2.19.2 PyJSG==0.11.10 -pykwalify==1.8.0 -pymdown-extensions==10.3 -pyoxigraph==0.2.5 -pyparsing==3.1.1 -pyrsistent==0.17.3 -pysolr==3.9.0 -pystow==0.5.0 -pytest==7.3.2 -pytest-cov==4.1.0 + # shexjsg +pyoxigraph==0.4.11 +pyparsing==3.2.3 + # via rdflib +PyShEx==0.8.1 +PyShExC==0.9.1 +pysolr==3.10.0 +pystow==0.7.1 + # ontoportal-client +pytest==8.4.1 + # pytest-logging +pytest-cov==6.2.1 pytest-logging==2015.11.4 -python-dateutil==2.8.2 -python-Levenshtein-wheels==0.13.2 + # via prefixcommons +python-dateutil==2.9.0.post0 + # arrow PyTrie==0.4.0 -pytz==2021.1 -PyYAML==6.0.1 -pyyaml_env_tag==0.1 + # via curies +pytz==2025.2 + # eutils +PyYAML==6.0.2 + # conda-inject ratelimit==2.2.1 -rdflib==7.0.0 +rdflib==7.1.4 + # cfgraph + # rdflib-jsonld + # rdflib-shim + # sparqlslurper + # sparqlwrapper rdflib-jsonld==0.6.1 + # via rdflib-shim rdflib-shim==1.0.3 -redis==4.4.2 -regex==2022.10.31 -requests==2.28.2 -requests-cache==1.1.0 +referencing==0.36.2 + # jsonschema-specifications +requests==2.32.4 + # pysolr + # requests-toolbelt +requests-cache==1.2.1 requests-toolbelt==1.0.0 reretry==0.11.8 + # snakemake-interface-storage-plugins +rfc3339-validator==0.1.4 rfc3987==1.3.8 -ruamel.yaml==0.17.26 -ruamel.yaml.clib==0.2.7 -scipy==1.11.2 -semsimian==0.2.1 -semsql==0.3.2 -six==1.16.0 -smart-open==6.3.0 -smmap==5.0.0 -snakemake==7.32.3 -sniffio==1.3.0 
+roman-numerals-py==3.1.0 +rpds-py==0.26.0 +scipy==1.16.1 +semsql==0.4.0 +ShExJSG==0.8.2 +six==1.17.0 + # python-dateutil + # rfc3339-validator +smart_open==7.3.0.post1 +smmap==5.0.2 + # via gitdb +snakemake==9.9.0 +snakemake-interface-common==1.21.0 + # snakemake-interface-logger-plugins + # snakemake-interface-report-plugins +snakemake-interface-executor-plugins==9.3.9 +snakemake-interface-logger-plugins==1.2.4 +snakemake-interface-report-plugins==1.2.0 +snakemake-interface-storage-plugins==4.2.2 +snowballstemmer==3.0.1 sortedcontainers==2.4.0 -soupsieve==2.5 + # via pytrie +soupsieve==2.7 + # via beautifulsoup4 +sparqlslurper==0.5.1 SPARQLWrapper==2.0.0 -SQLAlchemy==2.0.20 +Sphinx==8.2.3 + # via sphinx-click +sphinx-click==6.0.0 +sphinxcontrib-applehelp==2.0.0 +sphinxcontrib-devhelp==2.0.0 +sphinxcontrib-htmlhelp==2.1.0 +sphinxcontrib-jsmath==1.0.1 +sphinxcontrib-qthelp==2.0.0 +sphinxcontrib-serializinghtml==2.0.0 +SQLAlchemy==2.0.42 + # sqlalchemy-utils SQLAlchemy-Utils==0.38.3 -sssom==0.3.40 -sssom-schema==0.15.0 -starlette==0.26.1 -stopit==1.1.2 + # via semsql +sssom==0.4.16 +sssom-schema==1.0.0 stringcase==1.2.0 + # via bmt tabulate==0.9.0 +tenacity==8.5.0 throttler==1.2.2 -toposort==1.10 -tqdm==4.66.1 -traitlets==5.9.0 -typing_extensions==4.6.3 -tzdata==2023.3 -url-normalize==1.4.3 -urllib3==1.26.16 -uvicorn==0.22.0 -validators==0.22.0 -watchdog==3.0.0 -wrapt==1.15.0 -xmltodict==0.13.0 -yarl==1.9.2 -yte==1.5.1 -zipp==3.16.2 +tqdm==4.67.1 + # via pystow +traitlets==5.14.3 +types-python-dateutil==2.9.0.20250708 + # via arrow +typing_extensions==4.14.1 + # beautifulsoup4 + # class-resolver + # pydantic + # pydantic-core + # sqlalchemy + # typing-inspection +typing-inspection==0.4.1 +tzdata==2025.2 + # via pandas +uri-template==1.3.0 +url-normalize==2.2.1 +urllib3==2.5.0 +validators==0.35.0 +watchdog==6.0.0 +webcolors==24.11.1 +wheel==0.45.1 +wrapt==1.17.2 + # deprecated + # smart-open +xmltodict==0.14.2 +yarl==1.20.1 +yte==1.9.0 + +# The following packages are considered to be unsafe in a requirements file: +# setuptools +## The following requirements were added by pip freeze: +anyio==4.9.0 +build==1.2.2.post1 +CacheControl==0.14.3 +cffi==1.17.1 +cleo==2.1.0 +crashtest==0.4.1 +cryptography==45.0.5 +distlib==0.3.9 +dulwich==0.22.8 +filelock==3.18.0 +findpython==0.6.3 +h11==0.16.0 +httpcore==1.0.9 +httpx==0.28.1 +installer==0.7.0 +jaraco.classes==3.4.0 +jaraco.context==6.0.1 +jaraco.functools==4.2.1 +jeepney==0.9.0 +keyring==25.6.0 +Levenshtein==0.27.1 +msgpack==1.1.1 +pbs-installer==2025.7.8 +pip-tools==7.5.0 +pipdeptree==2.28.0 +pkginfo==1.12.1.2 +plac==1.4.5 +poetry==2.1.3 +poetry-core==2.1.3 +pycparser==2.22 +pyproject_hooks==1.2.0 +python-Levenshtein==0.27.1 +RapidFuzz==3.13.0 +SecretStorage==3.3.3 +setuptools==80.9.0 +shellingham==1.5.4 +sniffio==1.3.1 +tomlkit==0.13.3 +trove-classifiers==2025.5.9.12 +virtualenv==20.31.2 +zstandard==0.23.0 From b54e41c88ba05d8a2b2638fe3777581f2e60a3ca Mon Sep 17 00:00:00 2001 From: Jason Reilly Date: Thu, 25 Sep 2025 11:41:44 -0400 Subject: [PATCH 26/28] adding pull_via_urllib --- src/datahandlers/datacollect.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/datahandlers/datacollect.py b/src/datahandlers/datacollect.py index b5137fbd..f5b710a1 100644 --- a/src/datahandlers/datacollect.py +++ b/src/datahandlers/datacollect.py @@ -1,5 +1,5 @@ from src.ubergraph import UberGraph -from src.babel_utils import make_local_name, pull_via_ftp +from src.babel_utils import make_local_name, pull_via_ftp, pull_via_urllib from collections 
import defaultdict import os, gzip from json import loads,dumps From 5e2bfb833ff2fd6a4a07d2dc879c0ca2cbba4f14 Mon Sep 17 00:00:00 2001 From: Jason Reilly Date: Thu, 25 Sep 2025 11:42:15 -0400 Subject: [PATCH 27/28] adding rust impls --- src/snakefiles/anatomy.snakefile | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/src/snakefiles/anatomy.snakefile b/src/snakefiles/anatomy.snakefile index 6e80a026..3db5a212 100644 --- a/src/snakefiles/anatomy.snakefile +++ b/src/snakefiles/anatomy.snakefile @@ -1,3 +1,5 @@ +from snakemake_interface_executor_plugins.utils import join_cli_args + import src.createcompendia.anatomy as anatomy import src.assess_compendia as assessments import src.snakefiles.util as util @@ -67,18 +69,26 @@ rule get_anatomy_umls_relationships: run: anatomy.build_anatomy_umls_relationships(input.mrconso, input.infile, output.outfile) +def add_flag(files): + return {'-c ': '{wildcards.token}'.format(wildcards=files)} + rule anatomy_compendia: input: labels=os.path.join(config["download_directory"], 'common', config["common"]["labels"][0]), synonyms=os.path.join(config["download_directory"], 'common', config["common"]["synonyms"][0]), - concords=expand("{dd}/anatomy/concords/{ap}",dd=config['intermediate_directory'],ap=config['anatomy_concords']), - idlists=expand("{dd}/anatomy/ids/{ap}",dd=config['intermediate_directory'],ap=config['anatomy_ids']), + concords=expand("{dd}/anatomy/concords/{ap}", dd=config['intermediate_directory'], ap=config['anatomy_concords']), + idlists=expand("{dd}/anatomy/ids/{ap}", dd=config['intermediate_directory'], ap=config['anatomy_ids']), icrdf_filename=config['download_directory']+'/icRDF.tsv', + params: + flagged_concords = " ".join(["-c " + config['intermediate_directory'] + "/anatomy/concords/" + a for a in config['anatomy_concords']]), + flagged_ids = " ".join(["-i " + config['intermediate_directory'] + "/anatomy/ids/" + a for a in config['anatomy_ids']]) output: expand("{od}/compendia/{ap}", od = config['output_directory'], ap = config['anatomy_outputs']), temp(expand("{od}/synonyms/{ap}", od = config['output_directory'], ap = config['anatomy_outputs'])) - run: - anatomy.build_compendia(input.concords, input.idlists, input.icrdf_filename) + # run: + # anatomy.build_compendia(input.concords, input.idlists, input.icrdf_filename) + shell: + "./babel_io/target/release/build_compendia {params.flagged_concords} {params.flagged_ids} -z {input.icrdf_filename}" rule check_anatomy_completeness: input: From b08465cc0101c64664ca9c2cf44bd184ddd795f4 Mon Sep 17 00:00:00 2001 From: Jason Reilly Date: Thu, 25 Sep 2025 11:42:44 -0400 Subject: [PATCH 28/28] initial commit --- requirements.txt | 594 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 594 insertions(+) create mode 100644 requirements.txt diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 00000000..25318166 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,594 @@ +# +# This file is autogenerated by pip-compile with Python 3.13 +# by the following command: +# +# pip-compile requirements.in +# +aiohappyeyeballs==2.6.1 + # via aiohttp +aiohttp==3.12.15 + # via apybiomart +aiosignal==1.4.0 + # via aiohttp +airium==0.2.7 + # via + # linkml-renderer + # oaklib +alabaster==1.0.0 + # via sphinx +annotated-types==0.7.0 + # via pydantic +antlr4-python3-runtime==4.9.3 + # via + # linkml + # pyjsg + # pyshexc +appdirs==1.4.4 + # via + # oaklib + # snakemake +apybiomart==0.5.3 + # via -r requirements.in +argparse-dataclass==2.0.0 + # via + # 
snakemake-interface-common + # snakemake-interface-executor-plugins + # yte +arrow==1.3.0 + # via isoduration +asyncio==3.4.3 + # via apybiomart +attrs==25.3.0 + # via + # aiohttp + # cattrs + # jsonlines + # jsonschema + # referencing + # requests-cache +babel==2.17.0 + # via sphinx +bcp47==0.1.0 + # via funowl +beautifulsoup4==4.13.4 + # via -r requirements.in +biopython==1.85 + # via -r requirements.in +bmt==1.4.5 + # via -r requirements.in +cattrs==25.1.1 + # via requests-cache +certifi==2025.7.14 + # via requests +cfgraph==0.2.1 + # via pyshex +chardet==5.2.0 + # via + # pronto + # pyshex + # pyshexc +charset-normalizer==3.4.2 + # via requests +class-resolver==0.6.0 + # via + # oaklib + # ols-client +click==8.2.1 + # via + # json-flattener + # linkml + # linkml-renderer + # linkml-runtime + # more-click + # oaklib + # ols-client + # prefixcommons + # pystow + # semsql + # sphinx-click + # sssom +conda-inject==1.3.2 + # via snakemake +configargparse==1.7.1 + # via + # snakemake + # snakemake-interface-common +connection-pool==0.0.3 + # via snakemake +coverage[toml]==7.10.1 + # via pytest-cov +curies==0.10.19 + # via + # -r requirements.in + # kgcl-schema + # linkml-runtime + # oaklib + # prefixmaps + # sssom +deprecated==1.2.18 + # via linkml-runtime +deprecation==2.1.0 + # via + # bmt + # sssom +docutils==0.21.2 + # via + # snakemake + # sphinx + # sphinx-click +dpath==2.2.0 + # via + # snakemake + # yte +duckdb==1.3.2 + # via -r requirements.in +et-xmlfile==2.0.0 + # via openpyxl +eutils==0.6.0 + # via oaklib +fastjsonschema==2.21.1 + # via nbformat +fastobo==0.13.0 + # via pronto +fqdn==1.5.1 + # via jsonschema +frozenlist==1.7.0 + # via + # aiohttp + # aiosignal +funowl==0.2.3 + # via oaklib +gitdb==4.0.12 + # via gitpython +gitpython==3.1.45 + # via snakemake +graphviz==0.21 + # via linkml +greenlet==3.2.3 + # via sqlalchemy +hbreader==0.9.1 + # via + # jsonasobj2 + # linkml + # linkml-runtime +humanfriendly==10.0 + # via snakemake +idna==3.10 + # via + # jsonschema + # requests + # url-normalize + # yarl +ijson==3.4.0 + # via ndex2 +imagesize==1.4.1 + # via sphinx +immutables==0.21 + # via snakemake +importlib-resources==6.5.2 + # via sssom +iniconfig==2.1.0 + # via pytest +isodate==0.7.2 + # via linkml +isoduration==20.11.0 + # via jsonschema +jinja2==3.1.6 + # via + # linkml + # snakemake + # sphinx +json-flattener==0.1.9 + # via linkml-runtime +jsonasobj==1.3.1 + # via + # funowl + # pyjsg + # pyshexc +jsonasobj2==1.0.4 + # via + # linkml + # linkml-runtime +jsonlines==4.0.0 + # via + # -r requirements.in + # oaklib +jsonpointer==3.0.0 + # via jsonschema +jsonschema[format]==4.25.0 + # via + # linkml + # linkml-runtime + # nbformat + # snakemake +jsonschema-specifications==2025.4.1 + # via jsonschema +jupyter-core==5.8.1 + # via nbformat +kgcl-rdflib==0.5.0 + # via oaklib +kgcl-schema==0.6.9 + # via + # kgcl-rdflib + # oaklib +lark==1.2.2 + # via kgcl-schema +linkml==1.9.3 + # via sssom +linkml-renderer==0.3.1 + # via oaklib +linkml-runtime==1.9.4 + # via + # bmt + # kgcl-rdflib + # kgcl-schema + # linkml + # linkml-renderer + # oaklib + # semsql + # sssom + # sssom-schema +lxml==6.0.0 + # via eutils +markupsafe==3.0.2 + # via jinja2 +more-click==0.1.2 + # via ols-client +more-itertools==10.7.0 + # via -r requirements.in +multidict==6.6.3 + # via + # aiohttp + # yarl +nbformat==5.10.4 + # via snakemake +ndex2==3.11.0 + # via oaklib +networkx[networkx]==3.5 + # via + # ndex2 + # oaklib + # pronto + # sssom +numpy==2.3.2 + # via + # biopython + # ndex2 + # pandas + # pansql + # 
scipy +oaklib==0.5.33 + # via -r requirements.in +ols-client==0.1.4 + # via oaklib +ontoportal-client==0.0.4 + # via oaklib +openpyxl==3.1.5 + # via linkml +packaging==25.0 + # via + # deprecation + # pytest + # snakemake + # snakemake-interface-common + # sphinx +pandas==2.3.1 + # via + # -r requirements.in + # apybiomart + # ndex2 + # pansql + # sssom +pansql[pansql]==0.0.1 + # via sssom +parse==1.20.2 + # via linkml +platformdirs==4.3.8 + # via + # jupyter-core + # requests-cache +pluggy==1.6.0 + # via + # pytest + # pytest-cov +prefixcommons==0.1.12 + # via + # linkml + # linkml-runtime +prefixmaps==0.2.6 + # via + # kgcl-schema + # linkml + # linkml-runtime + # oaklib +pronto==2.7.0 + # via oaklib +propcache==0.3.2 + # via + # aiohttp + # yarl +psutil==7.0.0 + # via snakemake +psycopg2-binary==2.9.10 + # via -r requirements.in +pulp==2.7.0 + # via + # -r requirements.in + # snakemake +pydantic==2.11.7 + # via + # curies + # linkml + # linkml-renderer + # linkml-runtime + # oaklib +pydantic-core==2.33.2 + # via pydantic +pygments==2.19.2 + # via + # pytest + # sphinx +pyjsg==0.11.10 + # via + # funowl + # linkml + # pyshexc + # shexjsg +pyoxigraph==0.4.11 + # via -r requirements.in +pyparsing==3.2.3 + # via rdflib +pyshex==0.8.1 + # via linkml +pyshexc==0.9.1 + # via + # linkml + # pyshex +pysolr==3.10.0 + # via oaklib +pystow==0.7.1 + # via + # oaklib + # ols-client + # ontoportal-client +pytest==8.4.1 + # via + # -r requirements.in + # pytest-cov + # pytest-logging +pytest-cov==6.2.1 + # via -r requirements.in +pytest-logging==2015.11.4 + # via prefixcommons +python-dateutil==2.9.0.post0 + # via + # arrow + # linkml + # pandas + # pronto +pytrie==0.4.0 + # via curies +pytz==2025.2 + # via + # eutils + # pandas +pyyaml==6.0.2 + # via + # -r requirements.in + # conda-inject + # json-flattener + # linkml + # linkml-runtime + # prefixcommons + # prefixmaps + # snakemake + # sssom + # yte +ratelimit==2.2.1 + # via oaklib +rdflib==7.1.4 + # via + # cfgraph + # funowl + # linkml + # linkml-runtime + # rdflib-jsonld + # rdflib-shim + # sparqlslurper + # sparqlwrapper + # sssom +rdflib-jsonld==0.6.1 + # via rdflib-shim +rdflib-shim==1.0.3 + # via + # funowl + # pyshex + # pyshexc + # sparqlslurper +referencing==0.36.2 + # via + # jsonschema + # jsonschema-specifications + # snakemake +requests==2.32.4 + # via + # -r requirements.in + # apybiomart + # eutils + # linkml + # linkml-runtime + # ndex2 + # ols-client + # prefixcommons + # pyshex + # pysolr + # pystow + # requests-cache + # requests-toolbelt + # snakemake + # sphinx +requests-cache==1.2.1 + # via oaklib +requests-toolbelt==1.0.0 + # via ndex2 +reretry==0.11.8 + # via + # snakemake + # snakemake-interface-storage-plugins +rfc3339-validator==0.1.4 + # via jsonschema +rfc3987==1.3.8 + # via + # funowl + # jsonschema +roman-numerals-py==3.1.0 + # via sphinx +rpds-py==0.26.0 + # via + # jsonschema + # referencing +scipy[scipy]==1.16.1 + # via sssom +semsql==0.4.0 + # via oaklib +shexjsg==0.8.2 + # via + # pyshex + # pyshexc +six==1.17.0 + # via + # ndex2 + # python-dateutil + # rfc3339-validator +smart-open==7.3.0.post1 + # via snakemake +smmap==5.0.2 + # via gitdb +snakemake==9.9.0 + # via -r requirements.in +snakemake-interface-common==1.21.0 + # via + # snakemake + # snakemake-interface-executor-plugins + # snakemake-interface-logger-plugins + # snakemake-interface-report-plugins + # snakemake-interface-storage-plugins +snakemake-interface-executor-plugins==9.3.9 + # via snakemake +snakemake-interface-logger-plugins==1.2.4 + # via 
snakemake +snakemake-interface-report-plugins==1.2.0 + # via snakemake +snakemake-interface-storage-plugins==4.2.2 + # via snakemake +snowballstemmer==3.0.1 + # via sphinx +sortedcontainers==2.4.0 + # via pytrie +soupsieve==2.7 + # via beautifulsoup4 +sparqlslurper==0.5.1 + # via pyshex +sparqlwrapper==2.0.0 + # via + # -r requirements.in + # oaklib + # pyshex + # sparqlslurper + # sssom +sphinx==8.2.3 + # via sphinx-click +sphinx-click==6.0.0 + # via linkml +sphinxcontrib-applehelp==2.0.0 + # via sphinx +sphinxcontrib-devhelp==2.0.0 + # via sphinx +sphinxcontrib-htmlhelp==2.1.0 + # via sphinx +sphinxcontrib-jsmath==1.0.1 + # via sphinx +sphinxcontrib-qthelp==2.0.0 + # via sphinx +sphinxcontrib-serializinghtml==2.0.0 + # via sphinx +sqlalchemy==2.0.42 + # via + # linkml + # oaklib + # pansql + # sqlalchemy-utils +sqlalchemy-utils==0.38.3 + # via semsql +sssom==0.4.16 + # via oaklib +sssom-schema==1.0.0 + # via sssom +stringcase==1.2.0 + # via bmt +tabulate==0.9.0 + # via snakemake +tenacity==8.5.0 + # via oaklib +throttler==1.2.2 + # via + # snakemake + # snakemake-interface-executor-plugins + # snakemake-interface-storage-plugins +tqdm==4.67.1 + # via pystow +traitlets==5.14.3 + # via + # jupyter-core + # nbformat +types-python-dateutil==2.9.0.20250708 + # via arrow +typing-extensions==4.14.1 + # via + # beautifulsoup4 + # cattrs + # class-resolver + # curies + # ontoportal-client + # pydantic + # pydantic-core + # pystow + # sqlalchemy + # typing-inspection +typing-inspection==0.4.1 + # via pydantic +tzdata==2025.2 + # via pandas +uri-template==1.3.0 + # via jsonschema +url-normalize==2.2.1 + # via requests-cache +urllib3==2.5.0 + # via + # ndex2 + # pyshex + # requests + # requests-cache +validators==0.35.0 + # via sssom +watchdog==6.0.0 + # via linkml +webcolors==24.11.1 + # via jsonschema +wheel==0.45.1 + # via -r requirements.in +wrapt==1.17.2 + # via + # deprecated + # smart-open + # snakemake + # snakemake-interface-storage-plugins +xmltodict==0.14.2 + # via -r requirements.in +yarl==1.20.1 + # via aiohttp +yte==1.9.0 + # via snakemake + +# The following packages are considered to be unsafe in a requirements file: +# setuptools
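
Editor's note on the anatomy_compendia change in PATCH 27/28: the rule drops its Python run: block (anatomy.build_compendia) and instead shells out to the babel_io Rust binary, passing every concord file as a repeated -c flag, every id file as a repeated -i flag, and the icRDF file via -z. The plain-Python sketch below (not part of the patch series) only illustrates how the flagged_concords / flagged_ids params expand the config into that command line; the config values shown ("UBERON", "CL", "NCIT", the directory paths) are illustrative assumptions, since the actual anatomy_concords / anatomy_ids lists and the babel_io argument parser are not included in these patches.

# Minimal sketch of how the params in rule anatomy_compendia expand into CLI flags.
# Only the flag-building expressions mirror the patch; the config entries below
# are hypothetical example values.
config = {
    "intermediate_directory": "babel_downloads/intermediate",
    "download_directory": "babel_downloads",
    "anatomy_concords": ["UBERON", "CL"],        # assumed example values
    "anatomy_ids": ["UBERON", "CL", "NCIT"],     # assumed example values
}

flagged_concords = " ".join(
    ["-c " + config["intermediate_directory"] + "/anatomy/concords/" + a
     for a in config["anatomy_concords"]]
)
flagged_ids = " ".join(
    ["-i " + config["intermediate_directory"] + "/anatomy/ids/" + a
     for a in config["anatomy_ids"]]
)
icrdf_filename = config["download_directory"] + "/icRDF.tsv"

# Snakemake substitutes these params into the shell: directive, yielding roughly:
#   ./babel_io/target/release/build_compendia \
#       -c .../anatomy/concords/UBERON -c .../anatomy/concords/CL \
#       -i .../anatomy/ids/UBERON -i .../anatomy/ids/CL -i .../anatomy/ids/NCIT \
#       -z babel_downloads/icRDF.tsv
print(f"./babel_io/target/release/build_compendia {flagged_concords} {flagged_ids} -z {icrdf_filename}")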