From ebabe70fd357c200f6756e41ce0b1080cf685b04 Mon Sep 17 00:00:00 2001 From: Jason Reilly Date: Wed, 30 Jul 2025 18:14:18 -0400 Subject: [PATCH 01/28] adding omykiss_gene_ensembl --- config.json | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/config.json b/config.json index eb602cb0..89373efd 100644 --- a/config.json +++ b/config.json @@ -133,7 +133,8 @@ "hgfemale_gene_ensembl", "charengus_gene_ensembl", "otshawytscha_gene_ensembl", - "aocellaris_gene_ensembl" + "aocellaris_gene_ensembl", + "omykiss_gene_ensembl" ], "duckdb_config": { From 90786c252f486237c1cc79bde6fb163a60c25db2 Mon Sep 17 00:00:00 2001 From: Jason Reilly Date: Wed, 30 Jul 2025 18:14:50 -0400 Subject: [PATCH 02/28] formatting --- tests/test_glom.py | 54 +++++++++++++++++++++++++--------------------- 1 file changed, 29 insertions(+), 25 deletions(-) diff --git a/tests/test_glom.py b/tests/test_glom.py index d90b8669..db6f84fc 100644 --- a/tests/test_glom.py +++ b/tests/test_glom.py @@ -3,54 +3,58 @@ """glom is a tool that looks at list of sets of values and combines them together if they share members""" + def test_uberon(): - uberon=[('UBERON:123',)] - dict={} - glom(dict,uberon,unique_prefixes='UBERON') - uber2 = [set(['UBERON:123','SOME:other'])] - glom(dict,uber2,unique_prefixes='UBERON') + uberon = [("UBERON:123",)] + dict = {} + glom(dict, uberon, unique_prefixes="UBERON") + uber2 = [{"UBERON:123", "SOME:other"}] + glom(dict, uber2, unique_prefixes="UBERON") print(dict) + def test_simple(): """Given 3 sets, 2 of which share a member, output 2 sets, with the sharing sets combined""" d = {} - eqs = [('1','2'), ('2','3'), ('4','5')] - glom(d,eqs) + eqs = [("1", "2"), ("2", "3"), ("4", "5")] + glom(d, eqs) + print(f"{d}") assert len(d) == 5 - assert d['1'] == d['2'] == d['3'] == {'1','2','3'} - assert d['4'] == d['5'] == {'4','5'} + assert d["1"] == d["2"] == d["3"] == {"1", "2", "3"} + assert d["4"] == d["5"] == {"4", "5"} + def test_two_calls(): """Test using glom iteratively. The first call joins the first two sets, then the second call joins the next two and the new set.""" d = {} - eqs = [('1','2'), ('2','3'), ('4','5'), ('6','7')] - oeqs = [('5','7')] - glom(d,eqs) - glom(d,oeqs) - assert d['1']==d['2']==d['3']=={'1','2','3'} - assert d['4']==d['5']==d['6']==d['7']=={'4','5','6','7'} + eqs = [("1", "2"), ("2", "3"), ("4", "5"), ("6", "7")] + oeqs = [("5", "7")] + glom(d, eqs) + glom(d, oeqs) + assert d["1"] == d["2"] == d["3"] == {"1", "2", "3"} + assert d["4"] == d["5"] == d["6"] == d["7"] == {"4", "5", "6", "7"} + def test_sets(): """Test using set() as opposed to {}""" d = {} - eqs = [{'1','2'}, set(['2','3']), set(['4','5']), set(['6','7'])] - oeqs = [{'5','7'}] - glom(d,eqs) - glom(d,oeqs) - assert d['1']==d['2']==d['3']=={'1','2','3'} - assert d['4']==d['5']==d['6']==d['7']=={'4','5','6','7'} + eqs = [{"1", "2"}, {"2", "3"}, {"4", "5"}, {"6", "7"}] + oeqs = [{"5", "7"}] + glom(d, eqs) + glom(d, oeqs) + assert d["1"] == d["2"] == d["3"] == {"1", "2", "3"} + assert d["4"] == d["5"] == d["6"] == d["7"] == {"4", "5", "6", "7"} + def test_bigger_sets(): """Test when the sets have more than two members. As of recent builds, we no longer expect this to work. 
Now glom only operates on new pairwise sets""" d = {} - eqs = [{'1','2','3'}, {'4','5','6'} ] + eqs = [{"1", "2", "3"}, {"4", "5", "6"}] try: - glom(d,eqs) + glom(d, eqs) assert False except ValueError: assert True - - From 3bae93d1e39aea916c44e921953ff65ea5c3e674 Mon Sep 17 00:00:00 2001 From: Jason Reilly Date: Wed, 30 Jul 2025 18:15:40 -0400 Subject: [PATCH 03/28] adding and 'datacollect' target, using rust on occasion --- src/snakefiles/datacollect.snakefile | 223 ++++++++++++++++++++++----- 1 file changed, 184 insertions(+), 39 deletions(-) diff --git a/src/snakefiles/datacollect.snakefile b/src/snakefiles/datacollect.snakefile index 89e61ac2..1f5d923f 100644 --- a/src/snakefiles/datacollect.snakefile +++ b/src/snakefiles/datacollect.snakefile @@ -56,8 +56,10 @@ rule get_EFO_labels: output: labelfile=config['download_directory'] + '/EFO/labels', synonymfile =config['download_directory'] + '/EFO/synonyms' - run: - efo.make_labels(output.labelfile,output.synonymfile) + # run: + # efo.make_labels(output.labelfile,output.synonymfile) # 21 seconds + shell: + "./babel_io/target/release/create_efo_labels --input {input.infile} --labels-output {output.labelfile} --synonyms-output {output.synonymfile}" ### Complex Portal # https://www.ebi.ac.uk/complexportal/ @@ -74,20 +76,22 @@ rule get_complexportal_labels_and_synonyms: output: lfile = config['download_directory']+'/ComplexPortal'+'/559292_labels.tsv', sfile = config['download_directory']+'/ComplexPortal'+'/559292_synonyms.tsv' - run: - complexportal.make_labels_and_synonyms(input.infile, output.lfile, output.sfile) + # run: + # complexportal.make_labels_and_synonyms(input.infile, output.lfile, output.sfile) + shell: + "./babel_io/target/release/create_complexportal_labels_and_synonyms --input {input.infile} --labels-output {output.lfile} --synonyms-output {output.sfile}" ### MODS rule get_mods: output: - expand("{download_directory}/{mod}/GENE-DESCRIPTION-JSON_{mod}.json", download_directory = config['download_directory'], mod = config['mods']), + expand("{download_directory}/{mod}/GENE-DESCRIPTION-JSON_{mod}_9.json", download_directory = config['download_directory'], mod = config['mods']), run: mods.pull_mods() rule get_mods_labels: input: - expand("{download_directory}/{mod}/GENE-DESCRIPTION-JSON_{mod}.json",download_directory=config['download_directory'], mod=config['mods']), + expand("{download_directory}/{mod}/GENE-DESCRIPTION-JSON_{mod}_9.json",download_directory=config['download_directory'], mod=config['mods']), output: expand("{download_directory}/{mod}/labels",download_directory=config['download_directory'], mod=config['mods']), run: @@ -119,8 +123,10 @@ rule get_uniprotkb_labels: trembl_input=config['download_directory']+'/UniProtKB/uniprot_trembl.fasta', output: outfile=config['download_directory']+'/UniProtKB/labels' - run: - uniprotkb.pull_uniprot_labels(input.sprot_input,input.trembl_input,output.outfile) + #run: + # uniprotkb.pull_uniprot_labels(input.sprot_input,input.trembl_input,output.outfile) + shell: + "./babel_io/target/release/create_uniprot_labels --sprot-input {input.sprot_input} --trembl-input {input.trembl_input} --output {output.outfile}" rule get_umls_gene_protein_mappings: output: @@ -148,8 +154,10 @@ rule get_mesh_labels: config['download_directory']+'/MESH/mesh.nt' output: config['download_directory']+'/MESH/labels' - run: - mesh.pull_mesh_labels() + # run: + # mesh.pull_mesh_labels() + shell: + "./babel_io/target/release/create_mesh_labels --input {input} --output {output}" rule get_mesh_synonyms: #We don't 
actually get any. Maybe we could from the nt? @@ -178,6 +186,8 @@ rule get_umls_labels_and_synonyms: config['download_directory']+'/SNOMEDCT/synonyms' run: umls.pull_umls(input.mrconso) + # shell: + # "./babel_io/target/release/create_umls_labels_and_synonyms --input {input.mrconso}" ### OBO Ontologies @@ -232,16 +242,22 @@ rule get_ncbigene_labels_synonyms_and_taxa: synonyms_filename=config['download_directory']+'/NCBIGene/synonyms', taxa_filename=config['download_directory']+'/NCBIGene/taxa', descriptions_filename=config['download_directory']+'/NCBIGene/descriptions', - run: - ncbigene.pull_ncbigene_labels_synonyms_and_taxa(input.gene_info_filename, output.labels_filename, output.synonyms_filename, output.taxa_filename, output.descriptions_filename) + # run: + # ncbigene.pull_ncbigene_labels_synonyms_and_taxa(input.gene_info_filename, output.labels_filename, output.synonyms_filename, output.taxa_filename, output.descriptions_filename) + shell: + "./babel_io/target/release/create_ncbigene_labels_synonyms_and_taxa -i {input.gene_info_filename} -l {output.labels_filename} -s {output.synonyms_filename} -t {output.taxa_filename} -d {output.descriptions_filename}" ### ENSEMBL rule get_ensembl: output: outfile=config['download_directory']+'/ENSEMBL/BioMartDownloadComplete' - run: - ensembl.pull_ensembl(output.outfile) + params: + output_dir=config['download_directory']+'/ENSEMBL' + # run: + # ensembl.pull_ensembl(output.outfile) + shell: + "./babel_io/target/release/pull_ensembl --ensembl-output-dir {params.output_dir}" ### HGNC @@ -252,13 +268,15 @@ rule get_hgnc: hgnc.pull_hgnc() rule get_hgnc_labels_and_synonyms: - output: - config['download_directory']+'/HGNC/labels', - config['download_directory']+'/HGNC/synonyms' input: infile=rules.get_hgnc.output.outfile - run: - hgnc.pull_hgnc_labels_and_synonyms(input.infile) + output: + labels_filename=config['download_directory']+'/HGNC/labels', + synonyms_filename=config['download_directory']+'/HGNC/synonyms' + # run: + # hgnc.pull_hgnc_labels_and_synonyms(input.infile) + shell: + "./babel_io/target/release/create_hgnc_labels_and_synonyms -i {input.infile} -l {output.labels_filename} -s {output.synonyms_filename}" ### HGNC.FAMILY @@ -273,8 +291,10 @@ rule get_hgncfamily_labels: infile=rules.get_hgncfamily.output.outfile output: outfile = config['download_directory'] + '/HGNC.FAMILY/labels', - run: - hgncfamily.pull_labels(input.infile,output.outfile) + # run: + # hgncfamily.pull_labels(input.infile,output.outfile) + shell: + "./babel_io/target/release/create_hgncfamily_labels -i {input.infile} -l {output.outfile}" ### PANTHER.FAMILY @@ -289,8 +309,10 @@ rule get_pantherfamily_labels: infile=rules.get_pantherfamily.output.outfile output: outfile = config['download_directory'] + '/PANTHER.FAMILY/labels', - run: - pantherfamily.pull_labels(input.infile,output.outfile) + # run: + # pantherfamily.pull_labels(input.infile,output.outfile) + shell: + "./babel_io/target/release/create_pantherfamily_labels -i {input.infile} -l {output.outfile}" ### OMIM @@ -324,8 +346,10 @@ rule get_doid_labels_and_synonyms: output: labelfile = config['download_directory'] + '/DOID/labels', synonymfile = config['download_directory'] + '/DOID/synonyms' - run: - doid.pull_doid_labels_and_synonyms(input.infile, output.labelfile, output.synonymfile) + # run: + # doid.pull_doid_labels_and_synonyms(input.infile, output.labelfile, output.synonymfile) + shell: + "./babel_io/target/release/create_doid_labels_and_synonyms -i {input.infile} -l {output.labelfile} -s 
{output.synonymfile}" ### Orphanet @@ -357,8 +381,10 @@ rule get_reactome_labels: infile=config['download_directory'] + '/REACT/Events.json', output: labelfile=config['download_directory'] + '/REACT/labels', - run: - reactome.make_labels(input.infile,output.labelfile) + # run: + # reactome.make_labels(input.infile,output.labelfile) + shell: + "./babel_io/target/release/create_reactome_labels -i {input.infile} -l {output.labelfile}" ### RHEA @@ -373,8 +399,10 @@ rule get_rhea_labels: infile=config['download_directory'] + '/RHEA/rhea.rdf', output: labelfile=config['download_directory'] + '/RHEA/labels', - run: - rhea.make_labels(output.labelfile) + # run: + # rhea.make_labels(output.labelfile) + shell: + "./babel_io/target/release/create_rhea_labels -i {input.infile} -l {output.labelfile}" ### EC @@ -390,8 +418,10 @@ rule get_EC_labels: output: labelfile=config['download_directory'] + '/EC/labels', synonymfile =config['download_directory'] + '/EC/synonyms' - run: - ec.make_labels(output.labelfile,output.synonymfile) + # run: + # ec.make_labels(output.labelfile,output.synonymfile) + shell: + "./babel_io/target/release/create_ec_labels -i {input.infile} -l {output.labelfile} -s {output.synonymfile}" ### SMPDB @@ -406,8 +436,10 @@ rule get_SMPDB_labels: infile=config['download_directory'] + '/SMPDB/smpdb_pathways.csv' output: labelfile=config['download_directory'] + '/SMPDB/labels' - run: - smpdb.make_labels(input.infile,output.labelfile) + # run: + # smpdb.make_labels(input.infile,output.labelfile) + shell: + "./babel_io/target/release/create_smpdb_labels -i {input.infile} -l {output.labelfile}" ### PantherPathways @@ -422,8 +454,10 @@ rule get_panther_pathway_labels: infile=config['download_directory'] + '/PANTHER.PATHWAY/SequenceAssociationPathway3.6.8.txt' output: labelfile=config['download_directory'] + '/PANTHER.PATHWAY/labels' - run: - pantherpathways.make_pathway_labels(input.infile,output.labelfile) + # run: + # pantherpathways.make_pathway_labels(input.infile,output.labelfile) + shell: + "./babel_io/target/release/create_pantherpathways_labels -i {input.infile} -l {output.labelfile}" ### Unichem @@ -439,8 +473,10 @@ rule filter_unichem: reffile=config['download_directory'] + '/UNICHEM/reference.tsv.gz', output: filteredreffile=config['download_directory'] + '/UNICHEM/reference.filtered.tsv', - run: - unichem.filter_unichem(input.reffile, output.filteredreffile) + # run: + # unichem.filter_unichem(input.reffile, output.filteredreffile) + shell: + "./babel_io/target/release/filter_unichem -i {input.reffile} -o {output.filteredreffile}" ### CHEMBL @@ -458,8 +494,10 @@ rule chembl_labels_and_smiles: output: outfile=config['download_directory']+'/CHEMBL.COMPOUND/labels', smifile=config['download_directory']+'/CHEMBL.COMPOUND/smiles' - run: - chembl.pull_chembl_labels_and_smiles(input.infile,input.ccofile,output.outfile,output.smifile) + # run: + # chembl.pull_chembl_labels_and_smiles(input.infile,input.ccofile,output.outfile,output.smifile) + shell: + "./babel_io/target/release/create_chembl_labels_and_smiles -i {input.infile} -c {input.ccofile} -l {output.outfile} -s {output.smifile}" ### DrugBank requires a login... but not for basic vocabulary information. 
rule get_drugbank_labels_and_synonyms: @@ -634,3 +672,110 @@ rule get_CLO_labels: synonymfile =config['download_directory'] + '/CLO/synonyms' run: clo.make_labels(input.infile, output.labelfile,output.synonymfile) + +rule datacollect: + input: + config['download_directory'] + '/EFO/labels', + config['download_directory'] + '/EFO/synonyms', + config['download_directory'] + '/ComplexPortal/559292.tsv', + config['download_directory'] + '/ComplexPortal/559292_labels.tsv', + config['download_directory'] + '/ComplexPortal/559292_synonyms.tsv', + expand("{download_directory}/{mod}/GENE-DESCRIPTION-JSON_{mod}_9.json", download_directory = config['download_directory'], mod = config['mods']), + expand("{download_directory}/{mod}/labels",download_directory=config['download_directory'], mod=config['mods']), + config['download_directory'] + '/UniProtKB/idmapping.dat', + config['download_directory'] + '/UniProtKB/uniprot_sprot.fasta', + config['download_directory'] + '/UniProtKB/uniprot_trembl.fasta', + config['download_directory'] + '/UniProtKB/labels', + config['download_directory'] + '/UMLS_UniProtKB/UMLS_UniProtKB.tsv', + config['output_directory'] + '/intermediate/gene/concords/UMLS_NCBIGene', + config['output_directory'] + '/intermediate/protein/concords/UMLS_UniProtKB', + config['download_directory'] + '/MESH/mesh.nt', + config['download_directory'] + '/MESH/labels', + config['download_directory'] + '/MESH/synonyms', + config['download_directory'] + '/UMLS/MRCONSO.RRF', + config['download_directory'] + '/UMLS/MRSTY.RRF', + config['download_directory'] + '/UMLS/MRREL.RRF', + config['download_directory'] + '/UMLS/labels', + config['download_directory'] + '/UMLS/synonyms', + config['download_directory'] + '/SNOMEDCT/labels', + config['download_directory'] + '/SNOMEDCT/synonyms', + config['download_directory'] + '/common/ubergraph/labels', + config['download_directory'] + '/common/ubergraph/synonyms.jsonl', + config['download_directory'] + '/common/ubergraph/descriptions.jsonl', + config['download_directory'] + '/icRDF.tsv', + expand("{download_directory}/NCBIGene/{ncbi_files}", download_directory=config['download_directory'],ncbi_files=config['ncbi_files']), + config['download_directory'] + '/NCBIGene/labels', + config['download_directory'] + '/NCBIGene/synonyms', + config['download_directory'] + '/NCBIGene/taxa', + config['download_directory'] + '/NCBIGene/descriptions', + config['download_directory'] + '/ENSEMBL/BioMartDownloadComplete', + config['download_directory'] + '/HGNC/hgnc_complete_set.json', + config['download_directory'] + '/HGNC/labels', + config['download_directory'] + '/HGNC/synonyms', + config['download_directory'] + '/HGNC.FAMILY/family.csv', + config['download_directory'] + '/HGNC.FAMILY/labels', + config['download_directory'] + '/PANTHER.FAMILY/family.csv', + config['download_directory'] + '/PANTHER.FAMILY/labels', + config['download_directory'] + '/OMIM/mim2gene.txt', + config['download_directory'] + '/NCIT/NCIt-SwissProt_Mapping.txt', + config['download_directory'] + '/DOID/doid.json', + config['download_directory'] + '/DOID/labels', + config['download_directory'] + '/DOID/synonyms', + config['download_directory'] + '/Orphanet/Orphanet_Nomenclature_Pack_EN.zip', + config['download_directory'] + '/Orphanet/labels', + config['download_directory'] + '/Orphanet/synonyms', + config['download_directory'] + '/REACT/Events.json', + config['download_directory'] + '/REACT/labels', + config['download_directory'] + '/RHEA/rhea.rdf', + config['download_directory'] + '/RHEA/labels', + 
config['download_directory'] + '/EC/enzyme.rdf', + config['download_directory'] + '/EC/labels', + config['download_directory'] + '/EC/synonyms', + config['download_directory'] + '/SMPDB/smpdb_pathways.csv', + config['download_directory'] + '/SMPDB/labels', + config['download_directory'] + '/PANTHER.PATHWAY/SequenceAssociationPathway3.6.8.txt', + config['download_directory'] + '/PANTHER.PATHWAY/labels', + config['download_directory'] + '/UNICHEM/structure.tsv.gz', + config['download_directory'] + '/UNICHEM/reference.tsv.gz', + config['download_directory'] + '/UNICHEM/reference.filtered.tsv', + config['download_directory'] + '/CHEMBL.COMPOUND/chembl_latest_molecule.ttl', + config['download_directory'] + '/CHEMBL.COMPOUND/cco.ttl', + config['download_directory'] + '/CHEMBL.COMPOUND/labels', + config['download_directory'] + '/CHEMBL.COMPOUND/smiles', + config['download_directory'] + '/DRUGBANK/drugbank vocabulary.csv', + config['download_directory'] + '/DRUGBANK/labels', + config['download_directory'] + '/DRUGBANK/synonyms', + config['download_directory'] + '/GTOPDB/ligands.tsv', + config['download_directory'] + '/GTOPDB/labels', + config['download_directory'] + '/GTOPDB/synonyms', + config['download_directory'] + '/KEGG.COMPOUND/labels', + config['download_directory'] + '/UNII/Latest_UNII_Names.txt', + config['download_directory'] + '/UNII/Latest_UNII_Records.txt', + config['download_directory'] + '/UNII/labels', + config['download_directory'] + '/UNII/synonyms', + config['download_directory'] + '/HMDB/hmdb_metabolites.xml', + config['download_directory'] + '/HMDB/labels', + config['download_directory'] + '/HMDB/synonyms', + config['download_directory'] + '/HMDB/smiles', + config['download_directory'] + '/PUBCHEM.COMPOUND/CID-MeSH', + config['download_directory'] + '/PUBCHEM.COMPOUND/CID-Synonym-filtered.gz', + config['download_directory'] + '/PUBCHEM.COMPOUND/CID-Title.gz', + config['download_directory'] + '/PUBCHEM.COMPOUND/CID-InChI-Key.gz', + config['download_directory'] + '/PUBCHEM.COMPOUND/CID-SMILES.gz', + config['download_directory'] + '/PUBCHEM.COMPOUND/labels', + config['download_directory'] + '/PUBCHEM.COMPOUND/synonyms', + config['download_directory'] + '/RxNorm/RXNCONSO.RRF', + config['download_directory'] + '/RxNorm/RXNREL.RRF', + config['download_directory'] + '/PUBCHEM.COMPOUND/RXNORM.json', + config['download_directory'] + '/DrugCentral/structures', + config['download_directory'] + '/DrugCentral/labels', + config['download_directory'] + '/DrugCentral/xrefs', + config['download_directory'] + '/NCBITaxon/taxdump.tar', + config['download_directory'] + '/NCBITaxon/labels', + config['download_directory'] + '/NCBITaxon/synonyms', + config['download_directory'] + '/NCBITaxon/properties.tsv.gz', + config['download_directory'] + '/CHEBI/ChEBI_complete.sdf', + config['download_directory'] + '/CHEBI/database_accession.tsv', + config['download_directory'] + '/CLO/clo.owl', + config['download_directory'] + '/CLO/labels', + config['download_directory'] + '/CLO/synonyms' From fbc25a3b302a73b795fdd1f316d7d1d718a04389 Mon Sep 17 00:00:00 2001 From: Jason Reilly Date: Wed, 30 Jul 2025 18:16:03 -0400 Subject: [PATCH 04/28] cleaning --- src/datahandlers/unichem.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/datahandlers/unichem.py b/src/datahandlers/unichem.py index 7755f440..85229130 100644 --- a/src/datahandlers/unichem.py +++ b/src/datahandlers/unichem.py @@ -8,7 +8,6 @@ data_sources: dict = {'1': CHEMBLCOMPOUND, '2': DRUGBANK, '4': GTOPDB, '6': KEGGCOMPOUND, '7': CHEBI, '14': UNII, 
'18': HMDB, '22': PUBCHEMCOMPOUND, '34': DRUGCENTRAL} - def pull_unichem(): """ Download UniChem files. """ pull_via_urllib('http://ftp.ebi.ac.uk/pub/databases/chembl/UniChem/data/table_dumps/', 'structure.tsv.gz', decompress=False, subpath='UNICHEM') From 6bee5867b6378119cac593691bd79dfc118ff7d8 Mon Sep 17 00:00:00 2001 From: Jason Reilly Date: Wed, 30 Jul 2025 18:16:21 -0400 Subject: [PATCH 05/28] using bulk_load --- src/datahandlers/rhea.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/datahandlers/rhea.py b/src/datahandlers/rhea.py index 6546e8ff..619c50df 100644 --- a/src/datahandlers/rhea.py +++ b/src/datahandlers/rhea.py @@ -15,9 +15,9 @@ def __init__(self): from datetime import datetime as dt print('loading rhea') start = dt.now() - self.m= pyoxigraph.MemoryStore() + self.m= pyoxigraph.Store() with open(ifname,'rb') as inf: - self.m.load(inf,'application/rdf+xml') + self.m.bulk_load(input=inf,format=pyoxigraph.RdfFormat.RDF_XML) end = dt.now() print('loading complete') print(f'took {end-start}') From a96d9a03fb68c915fdc52d66df63238dda1b8153 Mon Sep 17 00:00:00 2001 From: Jason Reilly Date: Wed, 30 Jul 2025 18:17:08 -0400 Subject: [PATCH 06/28] adding comments --- src/datahandlers/pantherfamily.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/datahandlers/pantherfamily.py b/src/datahandlers/pantherfamily.py index f4a0c596..2197739f 100644 --- a/src/datahandlers/pantherfamily.py +++ b/src/datahandlers/pantherfamily.py @@ -18,14 +18,15 @@ def pull_labels(infile,outfile): labels = {} done = set() with open(outfile,'w') as labelf: + # FIXME: first line should not be skipped for line in lines[1:]: parts = line.split('\t') if len(parts) < 5: continue sf = parts[SUBFAMILY_COLUMN] - mf = sf.split(':')[0] - mfname = parts[MAINFAMILY_NAME_COLUMN] - sfname = parts[SUBFAMILY_NAME_COLUMN] + mf = sf.split(':')[0] # PTHR10845:SF155 -> PTHR10845 + mfname = parts[MAINFAMILY_NAME_COLUMN] # REGULATOR OF G PROTEIN SIGNALING + sfname = parts[SUBFAMILY_NAME_COLUMN] # REGULATOR OF G-PROTEIN SIGNALING 18 if mf not in done: main_family = f'{PANTHERFAMILY}:{mf}' #panther_families.append(main_family) From 6c8a68952286c38fedb76d085d44a214e2120299 Mon Sep 17 00:00:00 2001 From: Jason Reilly Date: Wed, 30 Jul 2025 18:17:26 -0400 Subject: [PATCH 07/28] fixing download url --- src/datahandlers/mods.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/datahandlers/mods.py b/src/datahandlers/mods.py index 3de6672b..0f5d2c43 100644 --- a/src/datahandlers/mods.py +++ b/src/datahandlers/mods.py @@ -10,7 +10,10 @@ def pull_mods(): for mod in mods: subp = modmap[mod] - origname = pull_via_urllib('https://fms.alliancegenome.org/download/',f'GENE-DESCRIPTION-JSON_{mod}.json.gz',subpath=subp) + # https://www.alliancegenome.org/downloads#gene-descriptions + # https://download.alliancegenome.org/8.1.0/GENE-DESCRIPTION-JSON/SGD/GENE-DESCRIPTION-JSON_SGD_9.json.gz + # origname = pull_via_urllib('https://fms.alliancegenome.org/download/',f'GENE-DESCRIPTION-JSON_{mod}.json.gz',subpath=subp) + origname = pull_via_urllib(f'https://download.alliancegenome.org/8.1.0/GENE-DESCRIPTION-JSON/{mod}/',f'GENE-DESCRIPTION-JSON_{mod}_9.json.gz', subpath=subp) #This should be fine. But for the makefile it's nice if the directory in which this goes is the same as the {mod} in the filename. 
# And we'd like it to be the names of the prefixes if mod != modmap[mod]: @@ -20,7 +23,7 @@ def pull_mods(): def write_labels(dd): for mod,prefix in modmap.items(): - with open(f'{dd}/{prefix}/GENE-DESCRIPTION-JSON_{prefix}.json','r') as inf: + with open(f'{dd}/{prefix}/GENE-DESCRIPTION-JSON_{prefix}_9.json','r') as inf: j = json.load(inf) with open(f'{dd}/{prefix}/labels','w') as outf: for gene in j['data']: From c2d90b12826dce1a5d59e446fef23a09d1813d76 Mon Sep 17 00:00:00 2001 From: Jason Reilly Date: Wed, 30 Jul 2025 18:17:42 -0400 Subject: [PATCH 08/28] adding bulk_load --- src/datahandlers/mesh.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/datahandlers/mesh.py b/src/datahandlers/mesh.py index 6c198c4d..b90827e0 100644 --- a/src/datahandlers/mesh.py +++ b/src/datahandlers/mesh.py @@ -13,9 +13,9 @@ def __init__(self): from datetime import datetime as dt print('loading mesh.nt') start = dt.now() - self.m= pyoxigraph.MemoryStore() + self.m= pyoxigraph.Store() with open(ifname,'rb') as inf: - self.m.load(inf,'application/n-triples') + self.m.bulk_load(input=inf,format=pyoxigraph.RdfFormat.N_TRIPLES) end = dt.now() print('loading complete') print(f'took {end-start}') From a43bccd11dc54aac088fbb1e6905b3634e6c8a9f Mon Sep 17 00:00:00 2001 From: Jason Reilly Date: Wed, 30 Jul 2025 18:18:02 -0400 Subject: [PATCH 09/28] adding comment --- src/datahandlers/hgncfamily.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/datahandlers/hgncfamily.py b/src/datahandlers/hgncfamily.py index cc6f8c13..5fd9a784 100644 --- a/src/datahandlers/hgncfamily.py +++ b/src/datahandlers/hgncfamily.py @@ -21,6 +21,6 @@ def pull_labels(infile,outfile): if len(parts) < 10: continue i = f"{HGNCFAMILY}:{parts[0][1:-1]}" - l = parts[2][1:-1] + l = parts[2][1:-1] # FIXME...this is a bug since commas are used in the fields of a line outf.write(f'{i}\t{l}\n') From 73979fa1a1efe0a7f60afcb880319a2085a26fcd Mon Sep 17 00:00:00 2001 From: Jason Reilly Date: Wed, 30 Jul 2025 18:18:32 -0400 Subject: [PATCH 10/28] adding comments, but switch to rust solution --- src/datahandlers/ensembl.py | 30 +++++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) diff --git a/src/datahandlers/ensembl.py b/src/datahandlers/ensembl.py index b6c3593d..89ce6c0f 100644 --- a/src/datahandlers/ensembl.py +++ b/src/datahandlers/ensembl.py @@ -1,5 +1,8 @@ import traceback +import apybiomart +import pandas + from src.babel_utils import make_local_name, get_config from apybiomart import find_datasets, query, find_attributes import os @@ -13,14 +16,19 @@ # genes that can be gathered without downloading hundreds of gigs of other stuff. So, we'll use biomart to pull # just what we need. 
def pull_ensembl(complete_file): - f = find_datasets() + dataset_df = find_datasets() + + # dataset_url = "http://www.ensembl.org/biomart/martservice/biomart/martservice?type=datasets&mart=ENSEMBL_MART_ENSEMBL" + # dataset_df = pandas.read_csv(dataset_url, sep='\t', header=None, index_col=False) skip_dataset_ids = set(get_config()['ensembl_datasets_to_skip']) cols = {"ensembl_gene_id", "ensembl_peptide_id", "description", "external_gene_name", "external_gene_source", "external_synonym", "chromosome_name", "source", "gene_biotype", "entrezgene_id", "zfin_id_id", 'mgi_id', 'rgd_id', 'flybase_gene_id', 'sgd_gene', 'wormbase_gene'} - for ds in f['Dataset_ID']: + + # for ds in dataset_df[1]: + for ds in dataset_df['Dataset_ID']: print(ds) if ds in skip_dataset_ids: print(f'Skipping {ds} as it is included in skip_dataset_ids: {skip_dataset_ids}') @@ -32,9 +40,18 @@ def pull_ensembl(complete_file): if os.path.exists(outfile): continue try: - atts = find_attributes(ds) - existingatts = set(atts['Attribute_ID'].to_list()) + + attributes_df = find_attributes(ds) + # attributes_url = f"http://www.ensembl.org/biomart/martservice/biomart/martservice?type=attributes&dataset={ds}" + # attributes_df = pandas.read_csv(attributes_url, sep='\t', header=None, index_col=False) + + # existingatts = set(attributes_df[0].to_list()) + existingatts = set(attributes_df['Attribute_ID'].to_list()) attsIcanGet = cols.intersection(existingatts) + + # query_url = f"http://www.ensembl.org/biomart/martservice/biomart/martservice?type=attributes&dataset={ds}" + # query_df = pandas.read_csv(attributes_url, sep='\t', header=None, index_col=False) + df = query(attributes=list(attsIcanGet), filters={}, dataset=ds) df.to_csv(outfile, index=False, sep='\t') except Exception as exc: @@ -48,4 +65,7 @@ def pull_ensembl(complete_file): if __name__ == '__main__': - pull_ensembl() + # marts = apybiomart.find_marts() + # print(marts.head()) + + pull_ensembl("/tmp/asdfasdf.txt") From 5069a5cd2304c7ab80f50822a74ef9930b62bb40 Mon Sep 17 00:00:00 2001 From: Jason Reilly Date: Wed, 30 Jul 2025 18:18:50 -0400 Subject: [PATCH 11/28] using bulk_load --- src/datahandlers/efo.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/datahandlers/efo.py b/src/datahandlers/efo.py index 03fd59f1..6c468921 100644 --- a/src/datahandlers/efo.py +++ b/src/datahandlers/efo.py @@ -26,9 +26,9 @@ def __init__(self): from datetime import datetime as dt print('loading EFO') start = dt.now() - self.m= pyoxigraph.MemoryStore() + self.m= pyoxigraph.Store() with open(ifname,'rb') as inf: - self.m.load(inf,'application/rdf+xml',base_iri='http://example.org/') + self.m.bulk_load(input=inf,format=pyoxigraph.RdfFormat.RDF_XML,base_iri='http://example.org/') end = dt.now() print('loading complete') print(f'took {end-start}') @@ -119,6 +119,7 @@ def get_exacts(self, iri, outfile): outfile.write(f"{iri}\tskos:exactMatch\t{otherid}\n") nwrite += 1 return nwrite + def get_xrefs(self, iri, outfile): query = f""" prefix rdfs: From dc89d2624ca79add582f4bc31f1241cb84b19fee Mon Sep 17 00:00:00 2001 From: Jason Reilly Date: Wed, 30 Jul 2025 18:18:59 -0400 Subject: [PATCH 12/28] using bulk_load --- src/datahandlers/ec.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/datahandlers/ec.py b/src/datahandlers/ec.py index 1d77c71b..37fc16e0 100644 --- a/src/datahandlers/ec.py +++ b/src/datahandlers/ec.py @@ -23,9 +23,9 @@ def __init__(self): from datetime import datetime as dt print('loading EC') start = dt.now() - self.m= 
pyoxigraph.MemoryStore() + self.m= pyoxigraph.Store() with open(ifname,'rb') as inf: - self.m.load(inf,'application/rdf+xml',base_iri='http://example.org/') + self.m.bulk_load(input=inf,format=pyoxigraph.RdfFormat.RDF_XML,base_iri='http://example.org/') end = dt.now() print('loading complete') print(f'took {end-start}') From 39b4ba775a0df5f752f153cc578bbdc0b6ae3886 Mon Sep 17 00:00:00 2001 From: Jason Reilly Date: Wed, 30 Jul 2025 18:19:10 -0400 Subject: [PATCH 13/28] using bulk_load --- src/datahandlers/clo.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/datahandlers/clo.py b/src/datahandlers/clo.py index 018f8d44..e0251169 100644 --- a/src/datahandlers/clo.py +++ b/src/datahandlers/clo.py @@ -19,9 +19,9 @@ def __init__(self,ifname): from datetime import datetime as dt print('loading CLO') start = dt.now() - self.m= pyoxigraph.MemoryStore() + self.m= pyoxigraph.Store() with open(ifname,'rb') as inf: - self.m.load(inf,'application/rdf+xml',base_iri='http://example.org/') + self.m.bulk_load(input=inf, format=pyoxigraph.RdfFormat.RDF_XML, base_iri='http://example.org/') end = dt.now() print('loading complete') print(f'took {end-start}') From d9d0a735433b16c9b5e8301828bf237632b5e12c Mon Sep 17 00:00:00 2001 From: Jason Reilly Date: Wed, 30 Jul 2025 18:19:24 -0400 Subject: [PATCH 14/28] using bulk_load --- src/datahandlers/chembl.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/datahandlers/chembl.py b/src/datahandlers/chembl.py index ac8e6513..dff5dfde 100644 --- a/src/datahandlers/chembl.py +++ b/src/datahandlers/chembl.py @@ -1,3 +1,6 @@ +import os.path +import pathlib + from src.prefixes import CHEMBLCOMPOUND from src.babel_utils import pull_via_ftp, make_local_name import ftplib @@ -48,11 +51,11 @@ def __init__(self,ifname,ccofile): from datetime import datetime as dt print('loading chembl') start = dt.now() - self.m= pyoxigraph.MemoryStore() + self.m= pyoxigraph.Store() with open(ccofile,'rb') as inf: - self.m.load(inf,'application/turtle') + self.m.bulk_load(input=inf, format=pyoxigraph.RdfFormat.TURTLE) with open(ifname,'rb') as inf: - self.m.load(inf,'application/turtle') + self.m.bulk_load(input=inf, format=pyoxigraph.RdfFormat.TURTLE) end = dt.now() print('loading complete') print(f'took {end-start}') From 3ce12fa0a7a9027f74a67a11b616c9b886ef521d Mon Sep 17 00:00:00 2001 From: Jason Reilly Date: Wed, 30 Jul 2025 18:19:48 -0400 Subject: [PATCH 15/28] using snakemake.logger --- src/createcompendia/leftover_umls.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/createcompendia/leftover_umls.py b/src/createcompendia/leftover_umls.py index 604ac248..0c1082ac 100644 --- a/src/createcompendia/leftover_umls.py +++ b/src/createcompendia/leftover_umls.py @@ -1,9 +1,10 @@ +import logging from datetime import datetime import json import jsonlines from pathlib import Path -from snakemake.logging import Logger +import snakemake.logging from bmt import Toolkit from src.node import NodeFactory @@ -30,7 +31,7 @@ def write_leftover_umls(compendia, umls_labels_filename, mrconso, mrsty, synonym :return: Nothing. 
""" - logging = Logger() + logging = snakemake.logging.logger logging.info(f"write_leftover_umls({compendia}, {umls_labels_filename}, {mrconso}, {mrsty}, {synonyms}, {umls_compendium}, {umls_synonyms}, {report}, {biolink_version})") # For now, we have many more UMLS entities in MRCONSO than in the compendia, so From 650166bcfbdab922101692a3829d2fe159104976 Mon Sep 17 00:00:00 2001 From: Jason Reilly Date: Wed, 30 Jul 2025 18:20:08 -0400 Subject: [PATCH 16/28] adding condition for file size --- src/babel_utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/babel_utils.py b/src/babel_utils.py index deee0d5f..56d5c692 100644 --- a/src/babel_utils.py +++ b/src/babel_utils.py @@ -185,7 +185,7 @@ def pull_via_urllib(url: str, in_file_name: str, decompress = True, subpath=None # write out the data to the output file compressed_file.write(data) - if decompress: + if os.stat(dl_file_name).st_size > 0 and decompress: out_file_name = dl_file_name[:-3] # create the output text file @@ -201,6 +201,7 @@ def pull_via_urllib(url: str, in_file_name: str, decompress = True, subpath=None else: out_file_name = dl_file_name + # return the filename to the caller return out_file_name From 8e36c0f9934748ddec667c9b4ae1aca153b25665 Mon Sep 17 00:00:00 2001 From: Jason Reilly Date: Wed, 30 Jul 2025 18:20:28 -0400 Subject: [PATCH 17/28] incrementing pyoxigraph version --- requirements.txt | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/requirements.txt b/requirements.txt index e330149d..aaac1f26 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,15 +1,16 @@ apybiomart biopython bmt -datrie jsonlines pandas more-itertools -pyoxigraph~=0.2.5 +#pyoxigraph~=0.2.5 +pyoxigraph~=0.4.11 psycopg2-binary pytest pytest-cov -python-Levenshtein-wheels +#python-Levenshtein-wheels +python-levenshtein pyyaml requests snakemake From ea2b83fa473e54a285c727b3d0f44b508edeea1d Mon Sep 17 00:00:00 2001 From: Jason Reilly Date: Thu, 31 Jul 2025 08:33:04 -0400 Subject: [PATCH 18/28] fixing missed conflicts --- src/createcompendia/leftover_umls.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/createcompendia/leftover_umls.py b/src/createcompendia/leftover_umls.py index 506e8ac5..0fefccf9 100644 --- a/src/createcompendia/leftover_umls.py +++ b/src/createcompendia/leftover_umls.py @@ -4,13 +4,7 @@ import jsonlines from pathlib import Path -<<<<<<< HEAD import snakemake.logging -from bmt import Toolkit -======= -from snakemake.logging import Logger ->>>>>>> 80b225419bb30eafafcc82771983a66dc36156b7 - from src.node import NodeFactory from src.util import get_biolink_model_toolkit from src.datahandlers import umls From 55b59e8269820b0ffbf65817644cd0298db01210 Mon Sep 17 00:00:00 2001 From: Jason Reilly Date: Thu, 31 Jul 2025 08:35:55 -0400 Subject: [PATCH 19/28] initial commit --- babel_io/.gitignore | 3 + .../bin/create_chembl_labels_and_smiles.rs | 122 ++++++++ ...reate_complexportal_labels_and_synonyms.rs | 118 ++++++++ .../bin/create_doid_labels_and_synonyms.rs | 73 +++++ .../src/bin/create_ec_labels_and_synonyms.rs | 85 ++++++ babel_io/src/bin/create_efo_labels.rs | 96 +++++++ .../bin/create_hgnc_labels_and_synonyms.rs | 74 +++++ babel_io/src/bin/create_hgncfamily_labels.rs | 47 +++ babel_io/src/bin/create_mesh_labels.rs | 80 ++++++ ...reate_ncbigene_labels_synonyms_and_taxa.rs | 193 +++++++++++++ .../create_orphanet_labels_and_synonyms.rs | 67 +++++ .../src/bin/create_pantherfamily_labels.rs | 61 ++++ .../src/bin/create_pantherpathways_labels.rs | 83 ++++++ 
babel_io/src/bin/create_reactome_labels.rs | 54 ++++ babel_io/src/bin/create_rhea_labels.rs | 73 +++++ babel_io/src/bin/create_smpdb_labels.rs | 63 ++++ babel_io/src/bin/create_uniprot_labels.rs | 58 ++++ babel_io/src/bin/filter_unichem.rs | 72 +++++ babel_io/src/bin/pull_ensembl.rs | 264 +++++++++++++++++ babel_io/src/lib.rs | 271 ++++++++++++++++++ 20 files changed, 1957 insertions(+) create mode 100644 babel_io/.gitignore create mode 100644 babel_io/src/bin/create_chembl_labels_and_smiles.rs create mode 100644 babel_io/src/bin/create_complexportal_labels_and_synonyms.rs create mode 100644 babel_io/src/bin/create_doid_labels_and_synonyms.rs create mode 100644 babel_io/src/bin/create_ec_labels_and_synonyms.rs create mode 100644 babel_io/src/bin/create_efo_labels.rs create mode 100644 babel_io/src/bin/create_hgnc_labels_and_synonyms.rs create mode 100644 babel_io/src/bin/create_hgncfamily_labels.rs create mode 100644 babel_io/src/bin/create_mesh_labels.rs create mode 100644 babel_io/src/bin/create_ncbigene_labels_synonyms_and_taxa.rs create mode 100644 babel_io/src/bin/create_orphanet_labels_and_synonyms.rs create mode 100644 babel_io/src/bin/create_pantherfamily_labels.rs create mode 100644 babel_io/src/bin/create_pantherpathways_labels.rs create mode 100644 babel_io/src/bin/create_reactome_labels.rs create mode 100644 babel_io/src/bin/create_rhea_labels.rs create mode 100644 babel_io/src/bin/create_smpdb_labels.rs create mode 100644 babel_io/src/bin/create_uniprot_labels.rs create mode 100644 babel_io/src/bin/filter_unichem.rs create mode 100644 babel_io/src/bin/pull_ensembl.rs create mode 100644 babel_io/src/lib.rs diff --git a/babel_io/.gitignore b/babel_io/.gitignore new file mode 100644 index 00000000..e26eb48d --- /dev/null +++ b/babel_io/.gitignore @@ -0,0 +1,3 @@ +./target +.idea +./Cargo.lock \ No newline at end of file diff --git a/babel_io/src/bin/create_chembl_labels_and_smiles.rs b/babel_io/src/bin/create_chembl_labels_and_smiles.rs new file mode 100644 index 00000000..e64681b1 --- /dev/null +++ b/babel_io/src/bin/create_chembl_labels_and_smiles.rs @@ -0,0 +1,122 @@ +#[macro_use] +extern crate log; + +use clap::Parser; +use humantime::format_duration; +use itertools::Itertools; +use oxigraph::io::RdfFormat; +use oxigraph::sparql::QueryResults; +use oxigraph::store::Store; +use std::error::Error; +use std::fs; +use std::io::{BufReader, BufWriter, Write}; +use std::path; +use std::time::Instant; + +// NOTE: rust runs in 13s, python runs in 21s +#[derive(Parser, PartialEq, Debug)] +#[clap(author, version, about, long_about = None)] +struct Options { + #[clap(short, long, required = true)] + input: path::PathBuf, + + #[clap(short, long, required = true)] + cco: path::PathBuf, + + #[clap(short, long, required = true)] + labels_output: path::PathBuf, + + #[clap(short, long, required = true)] + smiles_output: path::PathBuf, +} + +#[tokio::main] +async fn main() -> Result<(), Box> { + let start = Instant::now(); + env_logger::init(); + + let options = Options::parse(); + debug!("{:?}", options); + + let store = Store::new()?; + let start_load = Instant::now(); + + // this file is small...no need for bulk loader + let cco_br = BufReader::new(fs::File::open(options.cco).unwrap()); + store.load_from_reader(RdfFormat::Turtle, cco_br).expect("Could not load input"); + + let input_br = BufReader::new(fs::File::open(options.input).unwrap()); + store + .bulk_loader() + .with_max_memory_size_in_megabytes(4 * 2048) + .with_num_threads(4) + .load_from_reader(RdfFormat::Turtle, input_br) + 
.expect("Could not load input"); + + info!("duration to load input: {}", format_duration(start_load.elapsed()).to_string()); + + let mut labels_bw = BufWriter::new(fs::File::create(options.labels_output.clone().as_path()).unwrap()); + + let query_statement = "PREFIX rdf: + PREFIX rdfs: + PREFIX cco: + SELECT ?molecule ?label + WHERE { + ?molecule a ?type . + ?type rdfs:subClassOf* cco:Substance . + ?molecule rdfs:label ?label . + }"; + + if let QueryResults::Solutions(solutions) = store.query(query_statement)? { + for qs in solutions.filter_map(Result::ok).into_iter() { + let iterm = qs.get("molecule").expect("molecule was None"); + let mut iterm = iterm.to_string(); + iterm = babel_io::trim_gt_and_lt(iterm); + + let iterm_split = iterm.split("/").collect_vec(); + let id = iterm_split.last().unwrap(); + + let label = qs.get("label").expect("label was None"); + let mut label = label.to_string(); + label = babel_io::trim_quotes(label); + + if id.to_string() == label { + continue; + } + write!(labels_bw, "CHEMBL.COMPOUND:{}\t{}\n", id, label).expect("Could not write triple"); + } + } + + let mut smiles_bw = BufWriter::new(fs::File::create(options.smiles_output.clone().as_path()).unwrap()); + + let query_statement = "PREFIX rdf: + PREFIX rdfs: + PREFIX cco: + PREFIX cheminf: + SELECT ?molecule ?smiles + WHERE { + ?molecule cheminf:SIO_000008 ?smile_entity . + ?smile_entity a cheminf:CHEMINF_000018 ; + cheminf:SIO_000300 ?smiles . + }"; + + if let QueryResults::Solutions(solutions) = store.query(query_statement)? { + for qs in solutions.filter_map(Result::ok).into_iter() { + let iterm = qs.get("molecule").expect("molecule was None"); + let mut iterm = iterm.to_string(); + iterm = babel_io::trim_gt_and_lt(iterm); + + let iterm_split = iterm.split("/").collect_vec(); + let id = iterm_split.last().unwrap(); + + let label = qs.get("smiles").expect("smiles was None"); + let mut label = label.to_string(); + label = babel_io::trim_quotes(label); + + write!(smiles_bw, "CHEMBL.COMPOUND:{}\t{}\n", id, label).expect("Could not write triple"); + } + } + + info!("Duration: {}", format_duration(start.elapsed()).to_string()); + Ok(()) +} diff --git a/babel_io/src/bin/create_complexportal_labels_and_synonyms.rs b/babel_io/src/bin/create_complexportal_labels_and_synonyms.rs new file mode 100644 index 00000000..fb84248c --- /dev/null +++ b/babel_io/src/bin/create_complexportal_labels_and_synonyms.rs @@ -0,0 +1,118 @@ +#[macro_use] +extern crate log; + +use clap::Parser; +use humantime::format_duration; +use itertools::Itertools; +use polars::prelude::*; +use std::error::Error; +use std::fs; +use std::path; +use std::time::Instant; + +#[derive(Parser, PartialEq, Debug)] +#[clap(author, version, about, long_about = None)] +struct Options { + #[clap(short, long, required = true)] + input: path::PathBuf, + + #[clap(short, long, required = true)] + labels_output: path::PathBuf, + + #[clap(short, long, required = true)] + synonyms_output: path::PathBuf, +} + +#[tokio::main] +async fn main() -> Result<(), Box> { + let start = Instant::now(); + env_logger::init(); + + let options = Options::parse(); + debug!("{:?}", options); + + // NOTE: this base implementation runs in 4ms, python version of this runs in 4s + // let br = BufReader::new(fs::File::open(options.input).unwrap()); + // + // let mut labels_bw = std::io::BufWriter::new(fs::File::create(options.labels_output.clone().as_path()).unwrap()); + // let mut synonyms_bw = std::io::BufWriter::new(fs::File::create(options.synonyms_output.clone().as_path()).unwrap()); + 
// + // let mut used_synonyms = HashSet::new(); + // + // for line in br.lines().skip(1) { + // let line = line.unwrap(); + // let line_split = line.split("\t").collect_vec(); + // let id = line_split.get(0).unwrap(); + // let label = line_split.get(1).unwrap(); + // write!(labels_bw, "ComplexPortal:{}\t{}\n", id, label).unwrap(); + // let synonyms = line_split.get(2).unwrap(); + // if !synonyms.to_string().eq("-") { + // let synonyms_split = synonyms.split("|").collect_vec(); + // for synonym in synonyms_split.into_iter().map(|a| a.to_string()) { + // if !used_synonyms.contains(&synonym) { + // write!(synonyms_bw, "ComplexPortal:{}\t{}\n", id, synonym).unwrap(); + // used_synonyms.insert(synonym); + // } + // } + // } + // } + + // NOTE: this polars implementation runs in 16ms + let usable_columns = vec!["#Complex ac", "Recommended name", "Aliases for complex"]; + + let df = polars::lazy::frame::LazyCsvReader::new(options.input.clone()) + .with_separator(b'\t') + .with_infer_schema_length(Some(0)) + .with_ignore_errors(true) + .with_truncate_ragged_lines(true) + .with_has_header(true) + .finish() + .unwrap() + .select(usable_columns.into_iter().map(|a| col(a)).collect_vec()) + .collect() + .unwrap(); + + // println!("{}", df.head(None)); + + let mut labels_df = df + .clone() + .lazy() + .select([ + concat_str([lit("ComplexPortal"), col("#Complex ac")], ":", true).alias("#Complex ac"), + col("Recommended name"), + ]) + .collect() + .unwrap(); + + let mut file = fs::File::create(options.labels_output).expect("could not create file"); + CsvWriter::new(&mut file) + .include_header(false) + .with_separator(b'\t') + .finish(&mut labels_df) + .unwrap(); + + let mut synonyms_df = df + .clone() + .lazy() + .filter(col("Aliases for complex").neq(lit("-"))) + .select([ + concat_str([lit("ComplexPortal"), col("#Complex ac")], ":", true).alias("#Complex ac"), + col("Aliases for complex").str().split(lit("|")).alias("Aliases for complex"), + ]) + .explode([col("Aliases for complex")]) + .unique(Some(vec!["Aliases for complex".to_string()]), UniqueKeepStrategy::First) + .collect() + .unwrap(); + + // println!("{}", synonyms_df.head(None)); + + let mut file = fs::File::create(options.synonyms_output).expect("could not create file"); + CsvWriter::new(&mut file) + .include_header(false) + .with_separator(b'\t') + .finish(&mut synonyms_df) + .unwrap(); + + info!("Duration: {}", format_duration(start.elapsed()).to_string()); + Ok(()) +} diff --git a/babel_io/src/bin/create_doid_labels_and_synonyms.rs b/babel_io/src/bin/create_doid_labels_and_synonyms.rs new file mode 100644 index 00000000..1fc88d07 --- /dev/null +++ b/babel_io/src/bin/create_doid_labels_and_synonyms.rs @@ -0,0 +1,73 @@ +#[macro_use] +extern crate log; + +use clap::Parser; +use humantime::format_duration; +use itertools::Itertools; +use serde_json::Value; +use std::error::Error; +use std::fs; +use std::io::Write; +use std::path; +use std::time::Instant; + +#[derive(Parser, PartialEq, Debug)] +#[clap(author, version, about, long_about = None)] +struct Options { + #[clap(short, long, required = true)] + input: path::PathBuf, + + #[clap(short, long, required = true)] + labels_output: path::PathBuf, + + #[clap(short, long, required = true)] + synonyms_output: path::PathBuf, +} +#[tokio::main] +async fn main() -> Result<(), Box> { + let start = Instant::now(); + env_logger::init(); + + let options = Options::parse(); + debug!("{:?}", options); + + let br = std::io::BufReader::new(fs::File::open(options.input).unwrap()); + let json_value: Value = 
serde_json::from_reader(br)?; + + let mut labels_bw = std::io::BufWriter::new(fs::File::create(options.labels_output.clone().as_path()).unwrap()); + let mut synonyms_bw = std::io::BufWriter::new(fs::File::create(options.synonyms_output.clone().as_path()).unwrap()); + + //NOTE: Python runs in 3s, rust runs < 1s + for entry in json_value["graphs"][0]["nodes"].as_array().unwrap().into_iter() { + if !entry["meta"].is_null() && !entry["meta"]["deprecated"].is_null() && entry["meta"]["deprecated"].as_bool().unwrap() == true { + continue; + } + let doid_id = entry["id"].as_str().unwrap(); + if !doid_id.starts_with("http://purl.obolibrary.org/obo/DOID_") { + continue; + } + let doid_id_split = doid_id.split("_").collect_vec(); + let doid_curie = format!("DOID:{}", doid_id_split.get(1).unwrap()); + + if !entry["lbl"].is_null() { + let label = entry["lbl"].as_str().unwrap(); + write!(&mut labels_bw, "{}\t{}\n", doid_curie, label).unwrap(); + write!(&mut synonyms_bw, "{}\tOIO:hasExactSynonym\t{}\n", doid_curie, label).unwrap(); + } + + if !entry["meta"].is_null() && !entry["meta"]["synonyms"].is_null() { + for synonym_entry in entry["meta"]["synonyms"].as_array().unwrap().into_iter() { + write!( + &mut synonyms_bw, + "{}\tOIO:hasExactSynonym\t{}\n", + doid_curie, + synonym_entry["val"].as_str().unwrap() + ) + .unwrap(); + } + } + } + + info!("Duration: {}", format_duration(start.elapsed()).to_string()); + Ok(()) +} diff --git a/babel_io/src/bin/create_ec_labels_and_synonyms.rs b/babel_io/src/bin/create_ec_labels_and_synonyms.rs new file mode 100644 index 00000000..1b812d14 --- /dev/null +++ b/babel_io/src/bin/create_ec_labels_and_synonyms.rs @@ -0,0 +1,85 @@ +#[macro_use] +extern crate log; + +use clap::Parser; +use humantime::format_duration; +use itertools::Itertools; +use oxigraph::io::RdfFormat; +use oxigraph::sparql::QueryResults; +use oxigraph::store::Store; +use std::error::Error; +use std::fs; +use std::io::{BufReader, BufWriter, Write}; +use std::path; +use std::time::Instant; + +// NOTE: rust runs in 13s, python runs in 21s +#[derive(Parser, PartialEq, Debug)] +#[clap(author, version, about, long_about = None)] +struct Options { + #[clap(short, long, required = true)] + input: path::PathBuf, + + #[clap(short, long, required = true)] + labels_output: path::PathBuf, + + #[clap(short, long, required = true)] + synonyms_output: path::PathBuf, +} + +#[tokio::main] +async fn main() -> Result<(), Box> { + let start = Instant::now(); + env_logger::init(); + + let options = Options::parse(); + debug!("{:?}", options); + + let br = BufReader::new(fs::File::open(options.input).unwrap()); + let store = Store::new()?; + let start_load = Instant::now(); + store + .bulk_loader() + .with_max_memory_size_in_megabytes(4 * 2048) + .with_num_threads(4) + .load_from_reader(RdfFormat::RdfXml, br) + .expect("Could not load input"); + info!("duration to load input: {}", format_duration(start_load.elapsed()).to_string()); + + let mut labels_bw = BufWriter::new(fs::File::create(options.labels_output.clone().as_path()).unwrap()); + let mut synonyms_bw = std::io::BufWriter::new(fs::File::create(options.synonyms_output.clone().as_path()).unwrap()); + + let label_types = vec!["skos:prefLabel", "skos:altLabel", "rdfs:label"]; + + for label_type in label_types.into_iter() { + let query_statement = format!( + "PREFIX skos: + PREFIX ec: + PREFIX rdfs: + SELECT DISTINCT ?x ?label WHERE {{ ?x {label_type} ?label }}" + ); + + if let QueryResults::Solutions(solutions) = store.query(query_statement.as_str())? 
{ + for qs in solutions.filter_map(Result::ok).into_iter() { + let iterm = qs.get("x").expect("acc was None"); + let mut iterm = iterm.to_string(); + iterm = babel_io::trim_gt_and_lt(iterm); + + let iterm_split = iterm.split("/").collect_vec(); + let id = iterm_split.last().unwrap(); + + let label = qs.get("label").expect("label was None"); + let label = label.to_string(); + // label = babel_io::trim_quotes(label); + + write!(synonyms_bw, "EC:{}\t{}\t{}\n", id, label_type, label).expect("Could not write triple"); + if label_type != "skos:altLabel" { + write!(labels_bw, "EC:{}\t{}\n", id, label).expect("Could not write triple"); + } + } + } + } + + info!("Duration: {}", format_duration(start.elapsed()).to_string()); + Ok(()) +} diff --git a/babel_io/src/bin/create_efo_labels.rs b/babel_io/src/bin/create_efo_labels.rs new file mode 100644 index 00000000..9fe719f0 --- /dev/null +++ b/babel_io/src/bin/create_efo_labels.rs @@ -0,0 +1,96 @@ +#[macro_use] +extern crate log; + +use clap::Parser; +use humantime::format_duration; +use itertools::Itertools; +use oxigraph::io::RdfFormat; +use oxigraph::sparql::QueryResults; +use oxigraph::store::Store; +use std::error::Error; +use std::fs; +use std::fs::File; +use std::io::{BufReader, Write}; +use std::path; +use std::time::Instant; + +#[derive(Parser, PartialEq, Debug)] +#[clap(author, version, about, long_about = None)] +struct Options { + #[clap(short, long, required = true)] + input: path::PathBuf, + + #[clap(short, long, required = true)] + labels_output: path::PathBuf, + + #[clap(short, long, required = true)] + synonyms_output: path::PathBuf, +} + +#[tokio::main] +async fn main() -> Result<(), Box> { + let start = Instant::now(); + env_logger::init(); + + let options = Options::parse(); + debug!("{:?}", options); + + let br = BufReader::new(File::open(options.input).unwrap()); + let store = Store::new()?; + let start_load = Instant::now(); + store + .bulk_loader() + .with_max_memory_size_in_megabytes(4 * 2048) + .with_num_threads(4) + .load_from_reader(RdfFormat::RdfXml, br) + .expect("Could not load input"); + info!("duration to load input: {}", format_duration(start_load.elapsed()).to_string()); + + let mut labels_bw = std::io::BufWriter::new(fs::File::create(options.labels_output.clone().as_path()).unwrap()); + let mut synonyms_bw = std::io::BufWriter::new(fs::File::create(options.synonyms_output.clone().as_path()).unwrap()); + + let label_types = vec!["skos:prefLabel", "skos:altLabel", "rdfs:label"]; + + let re = regex::Regex::new("^(.*?)(?:@[^@]*){0,1}$").unwrap(); + + for label_type in label_types.into_iter() { + let query_statement = format!( + "PREFIX skos: + PREFIX rdfs: + SELECT DISTINCT ?x ?label WHERE {{ ?x {label_type} ?label }}" + ); + if let QueryResults::Solutions(solutions) = store.query(query_statement.as_str())? 
{ + for qs in solutions.filter_map(Result::ok).into_iter() { + let x = qs.get("x").expect("x was None"); + let mut x = x.to_string(); + x = babel_io::trim_gt_and_lt(x); + + let label = qs.get("label").expect("x was None"); + let mut label = label.to_string(); + if label.contains("@") { + if let Some(captures) = re.captures(label.as_str()) { + label = captures.get(1).unwrap().as_str().to_string(); + } + } + + label = babel_io::trim_quotes(label); + label = label.trim().to_string(); + + let x_split = x.split("/").collect_vec(); + let efo = x_split.last().unwrap(); + if !efo.starts_with("EFO_") { + continue; + } + let efo_split = efo.split("_").collect_vec(); + let efo_id = efo_split.last().unwrap(); + write!(synonyms_bw, "EFO:{}\t{}\t{}\n", efo_id, label_type, label).expect("Could not write triple"); + if label_type != "skos:altLabel" { + write!(labels_bw, "EFO:{}\t{}\n", efo_id, label).expect("Could not write triple"); + } + } + } + } + + info!("Duration: {}", format_duration(start.elapsed()).to_string()); + Ok(()) +} diff --git a/babel_io/src/bin/create_hgnc_labels_and_synonyms.rs b/babel_io/src/bin/create_hgnc_labels_and_synonyms.rs new file mode 100644 index 00000000..dc9ad40a --- /dev/null +++ b/babel_io/src/bin/create_hgnc_labels_and_synonyms.rs @@ -0,0 +1,74 @@ +#[macro_use] +extern crate log; + +use clap::Parser; +use humantime::format_duration; +use serde_json::Value; +use std::error::Error; +use std::fs; +use std::io::Write; +use std::path; +use std::time::Instant; + +#[derive(Parser, PartialEq, Debug)] +#[clap(author, version, about, long_about = None)] +struct Options { + #[clap(short, long, required = true)] + input: path::PathBuf, + + #[clap(short, long, required = true)] + labels_output: path::PathBuf, + + #[clap(short, long, required = true)] + synonyms_output: path::PathBuf, +} +#[tokio::main] +async fn main() -> Result<(), Box> { + let start = Instant::now(); + env_logger::init(); + + let options = Options::parse(); + debug!("{:?}", options); + + let br = std::io::BufReader::new(fs::File::open(options.input).unwrap()); + let json_value: Value = serde_json::from_reader(br)?; + + let mut labels_bw = std::io::BufWriter::new(fs::File::create(options.labels_output.clone().as_path()).unwrap()); + let mut synonyms_bw = std::io::BufWriter::new(fs::File::create(options.synonyms_output.clone().as_path()).unwrap()); + + //NOTE: Python runs in 3s, rust runs < 1s + for gene in json_value["response"]["docs"].as_array().unwrap().into_iter() { + let hgnc_id = gene["hgnc_id"].clone(); + let symbol = gene["symbol"].clone(); + write!(&mut labels_bw, "{}\t{}\n", hgnc_id.as_str().unwrap(), symbol.as_str().unwrap()).unwrap(); + + let name = gene["name"].clone(); + write!( + &mut synonyms_bw, + "{}\t{}\t{}\n", + hgnc_id.as_str().unwrap(), + "http://www.geneontology.org/formats/oboInOwl#hasExactSynonym", + name.as_str().unwrap() + ) + .unwrap(); + + for alias_field in vec!["alias_symbol", "alias_name"].into_iter() { + if !gene[alias_field].is_null() { + let aliases = gene[alias_field].as_array().unwrap(); + for asym in aliases.into_iter() { + write!( + &mut synonyms_bw, + "{}\t{}\t{}\n", + hgnc_id.as_str().unwrap(), + "http://www.geneontology.org/formats/oboInOwl#hasRelatedSynonym", + asym.as_str().unwrap() + ) + .unwrap(); + } + } + } + } + + info!("Duration: {}", format_duration(start.elapsed()).to_string()); + Ok(()) +} diff --git a/babel_io/src/bin/create_hgncfamily_labels.rs b/babel_io/src/bin/create_hgncfamily_labels.rs new file mode 100644 index 00000000..c5792209 --- /dev/null +++ 
b/babel_io/src/bin/create_hgncfamily_labels.rs @@ -0,0 +1,47 @@ +#[macro_use] +extern crate log; + +use clap::Parser; +use humantime::format_duration; +use polars::prelude::*; +use std::error::Error; +use std::fs; +use std::path; +use std::time::Instant; + +#[derive(Parser, PartialEq, Debug)] +#[clap(author, version, about, long_about = None)] +struct Options { + #[clap(short, long, required = true)] + input: path::PathBuf, + + #[clap(short, long, required = true)] + labels_output: path::PathBuf, +} + +#[tokio::main] +async fn main() -> Result<(), Box> { + let start = Instant::now(); + env_logger::init(); + + let options = Options::parse(); + debug!("{:?}", options); + + let mut df = polars::lazy::frame::LazyCsvReader::new(options.input.clone()) + .with_infer_schema_length(Some(0)) + .with_ignore_errors(true) + .with_truncate_ragged_lines(true) + .with_has_header(true) + .finish() + .unwrap() + .with_column(concat_str([lit("HGNC.FAMILY"), col("id")], ":", true).alias("id")) + .select([col("id"), col("name")]) + .collect() + .unwrap(); + + let mut file = fs::File::create(options.labels_output).expect("could not create file"); + CsvWriter::new(&mut file).include_header(false).with_separator(b'\t').finish(&mut df).unwrap(); + + info!("Duration: {}", format_duration(start.elapsed()).to_string()); + Ok(()) +} diff --git a/babel_io/src/bin/create_mesh_labels.rs b/babel_io/src/bin/create_mesh_labels.rs new file mode 100644 index 00000000..8aa6b8c4 --- /dev/null +++ b/babel_io/src/bin/create_mesh_labels.rs @@ -0,0 +1,80 @@ +#[macro_use] +extern crate log; + +use clap::Parser; +use humantime::format_duration; +use itertools::Itertools; +use oxigraph::io::RdfFormat; +use oxigraph::sparql::QueryResults; +use oxigraph::store::Store; +use std::error::Error; +use std::fs; +use std::io; +use std::io::Write; +use std::path; +use std::time::Instant; + +#[derive(Parser, PartialEq, Debug)] +#[clap(author, version, about, long_about = None)] +struct Options { + #[clap(short, long, required = true)] + input: path::PathBuf, + + #[clap(short, long, required = true)] + output: path::PathBuf, +} + +#[tokio::main] +async fn main() -> Result<(), Box> { + let start = Instant::now(); + env_logger::init(); + + let options = Options::parse(); + debug!("{:?}", options); + + let br = io::BufReader::new(fs::File::open(options.input).unwrap()); + let store = Store::new()?; + let start_load = Instant::now(); + store + .bulk_loader() + .with_max_memory_size_in_megabytes(4 * 2048) + .with_num_threads(4) + .load_from_reader(RdfFormat::NTriples, br) + .expect("Could not load input"); + info!("duration to load input: {}", format_duration(start_load.elapsed()).to_string()); + + let mut output_bw = std::io::BufWriter::new(fs::File::create(options.output.clone().as_path()).unwrap()); + + let re = regex::Regex::new("^(.*?)(?:@[^@]*){0,1}$").unwrap(); + + let query_statement = r#"PREFIX rdfs: + PREFIX meshv: + PREFIX mesh: + + SELECT DISTINCT ?term ?label WHERE { ?term rdfs:label ?label } ORDER BY ?term"#; + + if let QueryResults::Solutions(solutions) = store.query(query_statement)? 
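+    // Each solution row pairs a MeSH descriptor IRI (?term) with its rdfs:label.
+    // The loop below takes the last path segment of the IRI as the MeSH ID, strips the
+    // language tag and surrounding quotes from the label, and writes "MESH:<id>\t<label>" lines.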
{ + for qs in solutions.filter_map(Result::ok).into_iter() { + let term = qs.get("term").expect("term was None"); + let mut term = term.to_string(); + term = babel_io::trim_gt_and_lt(term); + let term_split = term.split("/").collect_vec(); + let id = term_split.last().unwrap(); + + let label = qs.get("label").expect("x was None"); + let mut label = label.to_string(); + if label.contains("@") { + if let Some(captures) = re.captures(label.as_str()) { + label = captures.get(1).unwrap().as_str().to_string(); + } + } + label = babel_io::trim_quotes(label); + label = label.trim().to_string(); + + write!(output_bw, "MESH:{}\t{}\n", id, label).expect("Could not write triple"); + } + } + + info!("Duration: {}", format_duration(start.elapsed()).to_string()); + Ok(()) +} diff --git a/babel_io/src/bin/create_ncbigene_labels_synonyms_and_taxa.rs b/babel_io/src/bin/create_ncbigene_labels_synonyms_and_taxa.rs new file mode 100644 index 00000000..50a70d7a --- /dev/null +++ b/babel_io/src/bin/create_ncbigene_labels_synonyms_and_taxa.rs @@ -0,0 +1,193 @@ +#[macro_use] +extern crate log; + +use clap::Parser; +use humantime::format_duration; +use itertools::Itertools; +use polars::frame::DataFrame; +use polars::io::SerWriter; +use polars::prelude::*; +use std::error::Error; +use std::fs; +use std::path; +use std::path::PathBuf; +use std::time::Instant; + +#[derive(Parser, PartialEq, Debug)] +#[clap(author, version, about, long_about = None)] +struct Options { + #[clap(short, long, required = true)] + input: path::PathBuf, + + #[clap(short, long, required = true)] + labels_output: path::PathBuf, + + #[clap(short, long, required = true)] + synonyms_output: path::PathBuf, + + #[clap(short, long, required = true)] + taxa_output: path::PathBuf, + + #[clap(short, long, required = true)] + description_output: path::PathBuf, +} + +#[tokio::main] +async fn main() -> Result<(), Box> { + let start = Instant::now(); + env_logger::init(); + + let options = Options::parse(); + debug!("{:?}", options); + + // let br = BufReader::new(fs::File::open(options.input).unwrap()); + + // let mut labels_bw = std::io::BufWriter::new(fs::File::create(options.labels_output.as_path()).unwrap()); + // let mut synonyms_bw = std::io::BufWriter::new(fs::File::create(options.synonyms_output.as_path()).unwrap()); + // let mut taxa_bw = std::io::BufWriter::new(fs::File::create(options.taxa_output.as_path()).unwrap()); + // let mut description_bw = std::io::BufWriter::new(fs::File::create(options.description_output.as_path()).unwrap()); + + let usable_columns = vec![ + "#tax_id", + "GeneID", + "type_of_gene", + "Synonyms", + "Other_designations", + "Symbol_from_nomenclature_authority", + "Full_name_from_nomenclature_authority", + "Symbol", + "description", + ]; + + let df = polars::lazy::frame::LazyCsvReader::new(options.input.clone()) + .with_separator(b'\t') + .with_infer_schema_length(Some(0)) + .with_ignore_errors(true) + .with_truncate_ragged_lines(true) + .with_has_header(true) + .finish() + .unwrap() + .select(usable_columns.into_iter().map(|a| col(a)).collect_vec()) + .filter(col("type_of_gene").str().contains(lit("^(biological-region|other|unknown)$"), true).not()) + .with_column(concat_str([lit("NCBIGene"), col("GeneID")], ":", true).alias("GeneID")) + .with_column(concat_str([lit("NCBITaxon"), col("#tax_id")], ":", true).alias("#tax_id")) + .with_column( + concat_str( + [ + col("Full_name_from_nomenclature_authority"), + col("Synonyms"), + col("Other_designations"), + col("Symbol_from_nomenclature_authority"), + col("Symbol"), + ], + 
"|", + true, + ) + .str() + .split(lit("|")) + .alias("synonyms_concat"), + ) + .with_column( + col("synonyms_concat") + .list() + .eval(col("").filter(col("").is_in(lit("-")).not()), false) + .alias("synonyms_concat"), + ) + // .drop([col("Full_name_from_nomenclature_authority"), col("Other_designations")]) + .collect() + .unwrap(); + + debug!("shape: {:?}", df.shape()); + + // NOTE: python impl runs in 13m w/ streaming, rust runs in < 3m while holding data in memory + // TODO: these could be async & run in parallel + write_description(&df, &options.description_output); + write_taxa(&df, &options.taxa_output); + write_synonyms(&df, &options.synonyms_output); + write_labels(&df, &options.labels_output); + + info!("Duration: {}", format_duration(start.elapsed()).to_string()); + Ok(()) +} + +fn write_synonyms(df: &DataFrame, output: &PathBuf) { + let mut tmp_df = df + .clone() + .lazy() + .select([ + col("GeneID"), + lit("http://www.geneontology.org/formats/oboInOwl#hasSynonym"), + col("synonyms_concat"), + ]) + .explode([col("synonyms_concat")]) + .collect() + .unwrap(); + + // println!("{}", tmp_df.head(None)); + + let mut file = fs::File::create(output).expect("could not create file"); + CsvWriter::new(&mut file) + .include_header(false) + .with_separator(b'\t') + .finish(&mut tmp_df) + .unwrap(); +} + +fn write_description(df: &DataFrame, output: &PathBuf) { + let mut tmp_df = df.clone().lazy().select([col("GeneID"), col("description")]).collect().unwrap(); + + // println!("{}", tmp_df.head(None)); + + let mut file = fs::File::create(output).expect("could not create file"); + CsvWriter::new(&mut file) + .include_header(false) + .with_separator(b'\t') + .finish(&mut tmp_df) + .unwrap(); +} + +fn write_labels(df: &DataFrame, output: &PathBuf) { + let mut tmp_df = df + .clone() + .lazy() + .with_column( + when( + col("Symbol_from_nomenclature_authority") + .is_null() + .or(col("Symbol_from_nomenclature_authority").eq(lit("-"))), + ) + .then(col("Symbol")) + .otherwise(col("Symbol_from_nomenclature_authority")) + .alias("best_symbol"), + ) + .with_column( + when(col("best_symbol").is_null().and(col("synonyms_concat").list().len().gt(0))) + .then(col("synonyms_concat").list().first()) + .otherwise(col("best_symbol")), + ) + .select([col("GeneID"), col("best_symbol")]) + .collect() + .unwrap(); + + // println!("{}", tmp_df.head(None)); + + let mut file = fs::File::create(output).expect("could not create file"); + CsvWriter::new(&mut file) + .include_header(false) + .with_separator(b'\t') + .finish(&mut tmp_df) + .unwrap(); +} + +fn write_taxa(df: &DataFrame, output: &path::PathBuf) { + let mut tmp_df = df.clone().lazy().select([col("GeneID"), col("#tax_id")]).collect().unwrap(); + + // println!("{}", tmp_df.head(None)); + + let mut file = fs::File::create(output).expect("could not create file"); + CsvWriter::new(&mut file) + .include_header(false) + .with_separator(b'\t') + .finish(&mut tmp_df) + .unwrap(); +} diff --git a/babel_io/src/bin/create_orphanet_labels_and_synonyms.rs b/babel_io/src/bin/create_orphanet_labels_and_synonyms.rs new file mode 100644 index 00000000..f5d21e00 --- /dev/null +++ b/babel_io/src/bin/create_orphanet_labels_and_synonyms.rs @@ -0,0 +1,67 @@ +#[macro_use] +extern crate log; + +use clap::Parser; +use humantime::format_duration; +use std::error::Error; +use std::fs; +use std::fs::File; +use std::io::{Read, Write}; +use std::path; +use std::time::Instant; +use zip::ZipArchive; + +// NOTE: do not use, utf-8 conversion issues...retaining for S&Gs + +#[derive(Parser, 
PartialEq, Debug)] +#[clap(author, version, about, long_about = None)] +struct Options { + #[clap(short, long, required = true)] + input: path::PathBuf, + + #[clap(short, long, required = true)] + labels_output: path::PathBuf, + + #[clap(short, long, required = true)] + synonyms_output: path::PathBuf, +} +#[tokio::main] +async fn main() -> Result<(), Box> { + let start = Instant::now(); + env_logger::init(); + + let options = Options::parse(); + debug!("{:?}", options); + + let mut labels_bw = std::io::BufWriter::new(fs::File::create(options.labels_output.clone().as_path()).unwrap()); + let mut synonyms_bw = std::io::BufWriter::new(fs::File::create(options.synonyms_output.clone().as_path()).unwrap()); + + let file = File::open(options.input.clone()).unwrap(); + let mut archive = ZipArchive::new(file).unwrap(); + + let mut zip_file = archive.by_name("Orphanet_Nomenclature_Pack_en/ORPHAnomenclature_en.xml").unwrap(); + + let mut data = vec![]; + zip_file.read_to_end(&mut data).unwrap(); + let contents = String::from_utf8_lossy(data.as_slice()); + + let doc = roxmltree::Document::parse(contents.as_ref()).expect("Could not parse document"); + + doc.root().descendants().filter(|n| n.tag_name().name() == "Disorder").for_each(|a| { + let orpha_code = a.descendants().find(|b| b.tag_name().name() == "OrphaCode").unwrap().text().unwrap(); + let name = a.descendants().find(|b| b.tag_name().name() == "Name").unwrap().text().unwrap(); + let curie = format!("orphanet:{}", orpha_code); + write!(&mut labels_bw, "{}\t{}\n", curie, name).unwrap(); + write!(&mut synonyms_bw, "{}\tOIO:hasExactSynonym\t{}\n", curie, name).unwrap(); + match a.descendants().find(|b| b.tag_name().name() == "SynonymList") { + None => {} + Some(all_synonyms) => { + let all_synonyms_text = all_synonyms.text().unwrap(); + write!(&mut synonyms_bw, "{}\tOIO:hasExactSynonym\t{}\n", curie, all_synonyms_text).unwrap(); + } + } + }); + + info!("Duration: {}", format_duration(start.elapsed()).to_string()); + Ok(()) +} diff --git a/babel_io/src/bin/create_pantherfamily_labels.rs b/babel_io/src/bin/create_pantherfamily_labels.rs new file mode 100644 index 00000000..1bcef9cf --- /dev/null +++ b/babel_io/src/bin/create_pantherfamily_labels.rs @@ -0,0 +1,61 @@ +#[macro_use] +extern crate log; + +use clap::Parser; +use humantime::format_duration; +use itertools::Itertools; +use std::collections::HashSet; +use std::error::Error; +use std::fs; +use std::io::{BufRead, BufReader, Write}; +use std::path; +use std::time::Instant; + +#[derive(Parser, PartialEq, Debug)] +#[clap(author, version, about, long_about = None)] +struct Options { + #[clap(short, long, required = true)] + input: path::PathBuf, + + #[clap(short, long, required = true)] + labels_output: path::PathBuf, +} + +#[tokio::main] +async fn main() -> Result<(), Box> { + let start = Instant::now(); + env_logger::init(); + + let options = Options::parse(); + debug!("{:?}", options); + + let br = BufReader::new(fs::File::open(options.input).unwrap()); + + let mut labels_bw = std::io::BufWriter::new(fs::File::create(options.labels_output.clone().as_path()).unwrap()); + + let mut done = HashSet::new(); + + for line in br.lines() { + let line = line.unwrap(); + let line_split = line.split("\t").collect_vec(); + let sub_family = line_split.get(3).unwrap(); + let sub_family = sub_family.to_string(); + let main_family_split = sub_family.split(":").collect_vec(); + let main_family = main_family_split.get(0).unwrap(); + let main_family = main_family.to_string(); + let main_family_name = 
line_split.get(4).unwrap(); + let sub_family_name = line_split.get(5).unwrap(); + if !done.contains(&main_family) { + write!(labels_bw, "{}\t{}\n", format!("PANTHER.FAMILY:{}", main_family), main_family_name).unwrap(); + done.insert(main_family.to_string()); + } + + if !done.contains(&sub_family) { + write!(labels_bw, "{}\t{}\n", format!("PANTHER.FAMILY:{}", sub_family), sub_family_name).unwrap(); + done.insert(sub_family.to_string()); + } + } + + info!("Duration: {}", format_duration(start.elapsed()).to_string()); + Ok(()) +} diff --git a/babel_io/src/bin/create_pantherpathways_labels.rs b/babel_io/src/bin/create_pantherpathways_labels.rs new file mode 100644 index 00000000..8d85c293 --- /dev/null +++ b/babel_io/src/bin/create_pantherpathways_labels.rs @@ -0,0 +1,83 @@ +#[macro_use] +extern crate log; + +use clap::Parser; +use humantime::format_duration; +use itertools::Itertools; +use polars::prelude::*; +use std::error::Error; +use std::fs; +use std::path; +use std::time::Instant; + +#[derive(Parser, PartialEq, Debug)] +#[clap(author, version, about, long_about = None)] +struct Options { + #[clap(short, long, required = true)] + input: path::PathBuf, + + #[clap(short, long, required = true)] + labels_output: path::PathBuf, +} + +#[tokio::main] +async fn main() -> Result<(), Box> { + let start = Instant::now(); + env_logger::init(); + + let options = Options::parse(); + debug!("{:?}", options); + + // PANTHER.PATHWAY:P06217 Toll pathway-drosophila + // P06217 Toll pathway-drosophila P06348 SLMB DROME|FlyBase=FBgn0283468|UniProtKB=A0A0B4KHK1 Supernumerary limbs, isoform B IDA 9461217 PubMed PTHR44156:SF29 TNF RECEPTOR ASSOCIATED FACTOR 7 + + // NOTE: this polars implementation runs in 16ms + + let usable_columns = vec!["I", "II"]; + + let schema = Schema::from_iter(vec![ + Field::new("I".into(), DataType::String), + Field::new("II".into(), DataType::String), + Field::new("III".into(), DataType::String), + Field::new("IV".into(), DataType::String), + Field::new("V".into(), DataType::String), + Field::new("VI".into(), DataType::String), + Field::new("VII".into(), DataType::String), + Field::new("VIII".into(), DataType::String), + Field::new("IX".into(), DataType::String), + Field::new("X".into(), DataType::String), + Field::new("XI".into(), DataType::String), + ]); + + let df = LazyCsvReader::new(options.input.clone()) + .with_separator(b'\t') + .with_schema(Some(schema.into())) + .with_ignore_errors(true) + .with_truncate_ragged_lines(true) + .with_has_header(false) + .finish() + .unwrap() + .select(usable_columns.into_iter().map(|a| col(a)).collect_vec()) + .collect() + .unwrap(); + + // println!("{}", df.head(None)); + + let mut labels_df = df + .clone() + .lazy() + .select([concat_str([lit("PANTHER.PATHWAY"), col("I")], ":", true).alias("I"), col("II")]) + .unique(Some(vec!["I".into(), "II".into()]), UniqueKeepStrategy::First) + .collect() + .unwrap(); + + let mut file = fs::File::create(options.labels_output).expect("could not create file"); + CsvWriter::new(&mut file) + .include_header(false) + .with_separator(b'\t') + .finish(&mut labels_df) + .unwrap(); + + info!("Duration: {}", format_duration(start.elapsed()).to_string()); + Ok(()) +} diff --git a/babel_io/src/bin/create_reactome_labels.rs b/babel_io/src/bin/create_reactome_labels.rs new file mode 100644 index 00000000..3c05ea2b --- /dev/null +++ b/babel_io/src/bin/create_reactome_labels.rs @@ -0,0 +1,54 @@ +#[macro_use] +extern crate log; + +use clap::Parser; +use humantime::format_duration; +use serde_json::Value; +use 
std::error::Error; +use std::fs; +use std::io::{BufReader, BufWriter, Write}; +use std::path; +use std::time::Instant; + +#[derive(Parser, PartialEq, Debug)] +#[clap(author, version, about, long_about = None)] +struct Options { + #[clap(short, long, required = true)] + input: path::PathBuf, + + #[clap(short, long, required = true)] + labels_output: path::PathBuf, +} +#[tokio::main] +async fn main() -> Result<(), Box> { + let start = Instant::now(); + env_logger::init(); + + let options = Options::parse(); + debug!("{:?}", options); + + //NOTE: Python runs in 3s, rust runs < 1s + let br = BufReader::new(fs::File::open(options.input).unwrap()); + let json_value: Value = serde_json::from_reader(br)?; + + let mut labels_bw = BufWriter::new(fs::File::create(options.labels_output.clone().as_path()).unwrap()); + + for entry in json_value.as_array().unwrap().into_iter() { + parse_element_for_labels(&entry, &mut labels_bw); + } + + info!("Duration: {}", format_duration(start.elapsed()).to_string()); + Ok(()) +} + +fn parse_element_for_labels(entry: &Value, labels_bw: &mut BufWriter) { + let oid = entry["stId"].as_str().unwrap(); + let name = entry["name"].as_str().unwrap(); + let species = entry["species"].as_str().unwrap(); + write!(labels_bw, "REACT:{}\t{} ({})\n", oid, name, species).unwrap(); + if !entry["children"].is_null() { + for child_entry in entry["children"].as_array().unwrap().into_iter() { + parse_element_for_labels(child_entry, labels_bw); + } + } +} diff --git a/babel_io/src/bin/create_rhea_labels.rs b/babel_io/src/bin/create_rhea_labels.rs new file mode 100644 index 00000000..f3005e12 --- /dev/null +++ b/babel_io/src/bin/create_rhea_labels.rs @@ -0,0 +1,73 @@ +#[macro_use] +extern crate log; + +use clap::Parser; +use humantime::format_duration; +use itertools::Itertools; +use oxigraph::io::RdfFormat; +use oxigraph::sparql::QueryResults; +use oxigraph::store::Store; +use std::error::Error; +use std::fs; +use std::io::{BufReader, BufWriter, Write}; +use std::path; +use std::time::Instant; + +// NOTE: rust runs in 13s, python runs in 21s +#[derive(Parser, PartialEq, Debug)] +#[clap(author, version, about, long_about = None)] +struct Options { + #[clap(short, long, required = true)] + input: path::PathBuf, + + #[clap(short, long, required = true)] + labels_output: path::PathBuf, +} + +#[tokio::main] +async fn main() -> Result<(), Box> { + let start = Instant::now(); + env_logger::init(); + + let options = Options::parse(); + debug!("{:?}", options); + + let br = BufReader::new(fs::File::open(options.input).unwrap()); + let store = Store::new()?; + let start_load = Instant::now(); + store + .bulk_loader() + .with_max_memory_size_in_megabytes(4 * 2048) + .with_num_threads(4) + .load_from_reader(RdfFormat::RdfXml, br) + .expect("Could not load input"); + info!("duration to load input: {}", format_duration(start_load.elapsed()).to_string()); + + let mut labels_bw = BufWriter::new(fs::File::create(options.labels_output.clone().as_path()).unwrap()); + + let query_statement = r#"PREFIX rdfs: + PREFIX rh: + SELECT DISTINCT ?x ?acc ?label WHERE { + ?x rdfs:label ?label . + ?x rh:accession ?acc . + }"#; + + if let QueryResults::Solutions(solutions) = store.query(query_statement)? 
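+    // Each solution row carries a Rhea accession (e.g. "RHEA:12345") and its rdfs:label.
+    // The loop below keeps the numeric part of the accession and writes "RHEA:<id>\t<label>" lines.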
{ + for qs in solutions.filter_map(Result::ok).into_iter() { + let iterm = qs.get("acc").expect("acc was None"); + let mut iterm = iterm.to_string(); + iterm = babel_io::trim_quotes(iterm); + let rhea_iterm_split = iterm.split(":").collect_vec(); + let rhea_id = rhea_iterm_split.last().unwrap(); + + let label = qs.get("label").expect("label was None"); + let mut label = label.to_string(); + label = babel_io::trim_quotes(label); + + write!(labels_bw, "RHEA:{}\t{}\n", rhea_id, label).expect("Could not write triple"); + } + } + + info!("Duration: {}", format_duration(start.elapsed()).to_string()); + Ok(()) +} diff --git a/babel_io/src/bin/create_smpdb_labels.rs b/babel_io/src/bin/create_smpdb_labels.rs new file mode 100644 index 00000000..0d4b88e0 --- /dev/null +++ b/babel_io/src/bin/create_smpdb_labels.rs @@ -0,0 +1,63 @@ +#[macro_use] +extern crate log; + +use clap::Parser; +use humantime::format_duration; +use itertools::Itertools; +use polars::prelude::*; +use std::error::Error; +use std::fs; +use std::path; +use std::time::Instant; + +#[derive(Parser, PartialEq, Debug)] +#[clap(author, version, about, long_about = None)] +struct Options { + #[clap(short, long, required = true)] + input: path::PathBuf, + + #[clap(short, long, required = true)] + labels_output: path::PathBuf, +} + +#[tokio::main] +async fn main() -> Result<(), Box> { + let start = Instant::now(); + env_logger::init(); + + let options = Options::parse(); + debug!("{:?}", options); + + // NOTE: this polars implementation runs in 16ms + let usable_columns = vec!["SMPDB ID", "Name"]; + + let df = LazyCsvReader::new(options.input.clone()) + .with_infer_schema_length(Some(0)) + .with_ignore_errors(true) + .with_truncate_ragged_lines(true) + .with_has_header(true) + .finish() + .unwrap() + .select(usable_columns.into_iter().map(|a| col(a)).collect_vec()) + .collect() + .unwrap(); + + // println!("{}", df.head(None)); + + let mut labels_df = df + .clone() + .lazy() + .select([concat_str([lit("SMPDB"), col("SMPDB ID")], ":", true).alias("SMPDB ID"), col("Name")]) + .collect() + .unwrap(); + + let mut file = fs::File::create(options.labels_output).expect("could not create file"); + CsvWriter::new(&mut file) + .include_header(false) + .with_separator(b'\t') + .finish(&mut labels_df) + .unwrap(); + + info!("Duration: {}", format_duration(start.elapsed()).to_string()); + Ok(()) +} diff --git a/babel_io/src/bin/create_uniprot_labels.rs b/babel_io/src/bin/create_uniprot_labels.rs new file mode 100644 index 00000000..b9e27b36 --- /dev/null +++ b/babel_io/src/bin/create_uniprot_labels.rs @@ -0,0 +1,58 @@ +#[macro_use] +extern crate log; + +use clap::Parser; +use humantime::format_duration; +use itertools::Itertools; +use std::error::Error; +use std::fs; +use std::fs::File; +use std::io::{BufRead, Write}; +use std::path; +use std::time::Instant; + +#[derive(Parser, PartialEq, Debug)] +#[clap(author, version, about, long_about = None)] +struct Options { + #[clap(short, long, required = true)] + sprot_input: path::PathBuf, + + #[clap(short, long, required = true)] + trembl_input: path::PathBuf, + + #[clap(short, long, required = true)] + output: path::PathBuf, +} + +#[tokio::main] +async fn main() -> Result<(), Box> { + let start = Instant::now(); + env_logger::init(); + + let options = Options::parse(); + debug!("{:?}", options); + + let mut writer = std::io::BufWriter::new(fs::File::create(options.output.clone().as_path()).unwrap()); + + write_labels(&mut writer, options.sprot_input, "sprot".into()).unwrap(); + write_labels(&mut writer, 
options.trembl_input, "trembl".into()).unwrap(); + + info!("Duration: {}", format_duration(start.elapsed()).to_string()); + Ok(()) +} + +fn write_labels(writer: &mut std::io::BufWriter, input: path::PathBuf, which: String) -> Result<(), Box> { + let reader = std::io::BufReader::new(fs::File::open(input).unwrap()); + for line in reader.lines() { + let line = line.unwrap(); + if !line.starts_with(">") { + continue; + } + + let line_split = line.split('|').collect_vec(); + let name_split = line_split.get(2).unwrap().split(" OS=").collect_vec(); + write!(writer, "UniProtKB:{}\t{} ({})\n", line_split.get(1).unwrap(), name_split.get(0).unwrap(), which).unwrap(); + } + Ok(()) +} + diff --git a/babel_io/src/bin/filter_unichem.rs b/babel_io/src/bin/filter_unichem.rs new file mode 100644 index 00000000..2756abbf --- /dev/null +++ b/babel_io/src/bin/filter_unichem.rs @@ -0,0 +1,72 @@ +#[macro_use] +extern crate log; + +use clap::Parser; +use humantime::format_duration; +use polars::io::SerWriter; +use polars::prelude::{col, lit, CsvWriter, LazyFileListReader}; +use std::error::Error; +use std::fs; +use std::path; +use std::time::Instant; + +#[derive(Parser, PartialEq, Debug)] +#[clap(author, version, about, long_about = None)] +struct Options { + #[clap(short, long, required = true)] + input: path::PathBuf, + + #[clap(short, long, required = true)] + output: path::PathBuf, +} + +#[tokio::main] +async fn main() -> Result<(), Box> { + let start = Instant::now(); + env_logger::init(); + + let options = Options::parse(); + debug!("{:?}", options); + + // let reader = std::io::BufReader::new(fs::File::open(options.input).unwrap()); + // let mut writer = std::io::BufWriter::new(fs::File::create(options.output.clone().as_path()).unwrap()); + // + // write!(writer, "{}\n", "UCI\tSRC_ID\tSRC_COMPOUND_ID\tASSIGNMENT").unwrap(); + // + // for line in reader.lines().skip(1) { + // let line = line.unwrap(); + // let line_split = line.trim().split("\t"); + // + // write!(writer, "{}\n", "UCI\tSRC_ID\tSRC_COMPOUND_ID\tASSIGNMENT").unwrap(); + // } + let data_sources = std::collections::HashMap::from([ + ("1", "CHEMBL.COMPOUND"), + ("2", "DRUGBANK"), + ("4", "GTOPDB"), + ("6", "KEGG.COMPOUND"), + ("7", "CHEBI"), + ("14", "UNII"), + ("18", "HMDB"), + ("22", "PUBCHEM.COMPOUND"), + ("34", "DrugCentral"), + ]); + let re = format!("^({})$", itertools::join(data_sources.into_keys(), "|")); + let mut df = polars::lazy::frame::LazyCsvReader::new(options.input.clone()) + .with_separator(b'\t') + .with_infer_schema_length(Some(0)) + .with_ignore_errors(true) + .with_truncate_ragged_lines(true) + .with_has_header(true) + .finish() + .unwrap() + .filter(col("SRC_ID").str().contains(lit(re), true)) + .filter(col("ASSIGNMENT").eq(lit("1"))) + .collect() + .unwrap(); + + let mut file = fs::File::create(options.output).expect("could not create file"); + CsvWriter::new(&mut file).include_header(true).with_separator(b'\t').finish(&mut df).unwrap(); + + info!("Duration: {}", format_duration(start.elapsed()).to_string()); + Ok(()) +} diff --git a/babel_io/src/bin/pull_ensembl.rs b/babel_io/src/bin/pull_ensembl.rs new file mode 100644 index 00000000..30852171 --- /dev/null +++ b/babel_io/src/bin/pull_ensembl.rs @@ -0,0 +1,264 @@ +use async_once::AsyncOnce; +use clap::Parser; +use humantime::format_duration; +use itertools::{join, Itertools}; +use lazy_static::lazy_static; +use log::{debug, info}; +use polars::prelude::*; +use quick_xml::Writer; +use reqwest::header; +use reqwest::redirect::Policy; +use std::collections::HashSet; +use 
std::error::Error; +use std::fs::{create_dir_all, File}; +use std::io::{Cursor, Write}; +use std::path; +use std::time::{Duration, Instant}; + +lazy_static! { + pub static ref CSV_PARSE_OPTIONS: CsvParseOptions = CsvParseOptions::default().with_truncate_ragged_lines(true).with_separator(b'\t'); + + pub static ref REQWEST_CLIENT: AsyncOnce = AsyncOnce::new(async { + let mut headers = header::HeaderMap::new(); + // headers.insert(header::ACCEPT, header::HeaderValue::from_static("application/json")); + headers.insert(header::CONTENT_TYPE, header::HeaderValue::from_static("text/plain")); + let result = reqwest::Client::builder() + .redirect(Policy::limited(5)) + // .read_timeout(Duration::from_secs(1500)) + // .timeout(Duration::from_secs(1500)) + .default_headers(headers) + .build(); + + match result { + Ok(request_client) => request_client, + Err(e) => panic!("Could not create reqwest client: {}", e), + } + }); +} + +#[derive(Parser, PartialEq, Debug)] +#[clap(author, version, about, long_about = None)] +struct Options { + #[clap(short, long, required = true)] + ensembl_output_dir: path::PathBuf, +} + +#[tokio::main] +async fn main() -> Result<(), Box> { + let start = Instant::now(); + env_logger::init(); + + let options = Options::parse(); + debug!("{:?}", options); + + let datasets = pull_datasets().await.expect("Count not get datasets"); + debug!("datasets: {:?}", datasets); + + let usable_attribute_cols: HashSet = HashSet::from([ + "ensembl_gene_id", + "ensembl_peptide_id", + "description", + "external_gene_name", + "external_gene_source", + "external_synonym", + "chromosome_name", + "source", + "gene_biotype", + "entrezgene_id", + "zfin_id_id", + "mgi_id", + "rgd_id", + "flybase_gene_id", + "sgd_gene", + "wormbase_gene", + ]) + .into_iter() + .map(|a| a.to_string()) + .collect(); + + let request_client = REQWEST_CLIENT.get().await; + + for (idx, dataset_id) in datasets.iter().enumerate() { + let pull_dataset_start = Instant::now(); + info!("dataset_id: {}", dataset_id); + + let ensembl_output_dir = options.ensembl_output_dir.join(&dataset_id); + create_dir_all(&ensembl_output_dir).expect("Could not create dataset dir"); + let output_path = ensembl_output_dir.join("BioMart.tsv"); + if output_path.exists() { + continue; + } + let mut output_file = File::create(output_path).expect("could not create file"); + + let attributes = pull_attributes(&dataset_id, &usable_attribute_cols).await.expect("Could not get attributes"); + debug!("attributes: {:?}", attributes); + + let mut writer = Writer::new(Cursor::new(Vec::new())); + writer + .create_element("Query") + .with_attributes(vec![ + ("virtualSchemaName", "default"), + ("formatter", "TSV"), + ("header", "1"), + ("datasetConfigVersion", "0.6"), + ]) + .write_inner_content(|writer| { + writer + .create_element("Dataset") + .with_attributes(vec![("name", dataset_id.as_str()), ("interface", "default")]) + .write_inner_content(|writer| { + for attribute in attributes.iter() { + writer.create_element("Attribute").with_attribute(("name", attribute.as_str())).write_empty()?; + } + Ok(()) + }) + .unwrap(); + Ok(()) + }) + .unwrap(); + + let xml_output = writer.into_inner().into_inner(); + let xml_result = std::str::from_utf8(xml_output.as_slice()).unwrap(); + debug!("xml result: {}", xml_result); + + let query_response = request_client + .get("http://www.ensembl.org/biomart/martservice") + .query(&[("query", xml_result)]) + .send() + .await + .expect("Could not send query"); + + let query_response_text = query_response.text().await.expect("Could not 
get text from response"); + let handle = std::io::Cursor::new(query_response_text); + let mut df = CsvReadOptions::default() + .with_parse_options(CSV_PARSE_OPTIONS.clone()) + .with_has_header(true) + .with_ignore_errors(true) + .with_infer_schema_length(None) + .with_low_memory(true) + .into_reader_with_file_handle(handle) + .finish() + .unwrap(); + + CsvWriter::new(&mut output_file) + .include_header(true) + .with_separator(b'\t') + .finish(&mut df) + .expect("Could not write Ensembl output Dataframe"); + + info!( + "dataset_id: {}, finished {}/{}, duration to pull: {}", + dataset_id, + idx + 1, + datasets.len(), + format_duration(pull_dataset_start.elapsed()).to_string() + ); + } + + let mut w = File::create(options.ensembl_output_dir.join("BioMartDownloadComplete")).unwrap(); + + writeln!(&mut w, "{}", format!("Downloaded gene sets for {} data sets.", datasets.len())).unwrap(); + + info!("Duration: {}", format_duration(start.elapsed()).to_string()); + Ok(()) +} + +async fn pull_datasets() -> Result, Box> { + let dataset_url = "http://www.ensembl.org/biomart/martservice/biomart/martservice?type=datasets&mart=ENSEMBL_MART_ENSEMBL"; + + let request_client = REQWEST_CLIENT.get().await; + + let dataset_response = request_client.get(dataset_url).send().await?; + let dataset_text = dataset_response.text().await.unwrap(); + let filtered_data = dataset_text + .lines() + .filter_map(|line| if line.trim().is_empty() { None } else { Some(line.trim().to_string()) }) + .collect_vec(); + let joined_filtered_data = filtered_data.join("\n"); + + let handle = Cursor::new(joined_filtered_data); + let dataset_df = CsvReadOptions::default() + .with_parse_options(CSV_PARSE_OPTIONS.clone()) + .with_has_header(false) + .with_ignore_errors(true) + .with_infer_schema_length(None) + .into_reader_with_file_handle(handle) + .finish() + .unwrap(); + // println!("{}", dataset_df.head(None)); + + let datasets_to_skip = join( + &vec![ + "elucius_gene_ensembl", + "hgfemale_gene_ensembl", + "charengus_gene_ensembl", + "otshawytscha_gene_ensembl", + "aocellaris_gene_ensembl", + "omykiss_gene_ensembl", + ], + "|", + ); + let reg = format!("^({})$", datasets_to_skip); + debug!("regex: {}", reg); + + let filtered_dataset_df = dataset_df + .clone() + .lazy() + .select([col("column_2").alias("dataset_id")]) + .filter(col("dataset_id").str().contains(lit(reg), true).not()) + .collect() + .unwrap(); + + let datasets: Vec = filtered_dataset_df + .column("dataset_id") + .unwrap() + .str() + .unwrap() + .into_iter() + .filter_map(|a| a.map(String::from)) + .collect(); + + Ok(datasets) +} + +async fn pull_attributes(dataset_id: &String, usable_attribute_cols: &HashSet) -> Result, Box> { + let request_client = REQWEST_CLIENT.get().await; + let attributes_url = format!( + "http://www.ensembl.org/biomart/martservice/biomart/martservice?type=attributes&dataset={}", + dataset_id + ); + + let attributes_response = request_client.get(attributes_url).send().await?; + let attributes_text = attributes_response.text().await.unwrap(); + let filtered_attributes_text = attributes_text + .lines() + .filter_map(|line| if line.trim().is_empty() { None } else { Some(line.trim().to_string()) }) + .collect_vec(); + let joined_filtered_attributes_text = filtered_attributes_text.join("\n"); + + let handle = Cursor::new(joined_filtered_attributes_text); + let attributes_df = CsvReadOptions::default() + .with_parse_options(CSV_PARSE_OPTIONS.clone()) + .with_has_header(false) + .with_ignore_errors(true) + .with_infer_schema_length(None) + 
.into_reader_with_file_handle(handle) + .finish() + .unwrap(); + // println!("{}", attributes_df.head(None)); + + let filtered_attributes_df = attributes_df.clone().lazy().select([col("column_1").alias("attribute_id")]).collect().unwrap(); + + let attributes: HashSet = filtered_attributes_df + .column("attribute_id") + .unwrap() + .str() + .unwrap() + .into_iter() + .filter_map(|a| a.map(String::from)) + .collect(); + + let intersection = usable_attribute_cols.intersection(&attributes).into_iter().cloned().collect_vec(); + + Ok(intersection) +} diff --git a/babel_io/src/lib.rs b/babel_io/src/lib.rs new file mode 100644 index 00000000..193640c7 --- /dev/null +++ b/babel_io/src/lib.rs @@ -0,0 +1,271 @@ +pub fn trim_gt_and_lt(mut x: String) -> String { + if x.starts_with("<") { + x = x.strip_prefix("<").unwrap().to_string(); + } + if x.ends_with(">") { + x = x.strip_suffix(">").unwrap().to_string(); + } + x +} + +pub fn trim_quotes(mut label: String) -> String { + if label.starts_with("\"") { + label = label.strip_prefix("\"").unwrap().to_string(); + } + + if label.ends_with("\"") { + label = label.strip_suffix("\"").unwrap().to_string(); + } + label +} + +// extern crate core; +// +// use itertools::Itertools; +// use pyo3::prelude::*; +// use pyo3::types::PySet; +// use std::collections::{HashMap, HashSet}; +// use std::fs; +// use std::fs::File; +// use std::hash::Hash; +// use std::io::prelude::*; +// use std::io::{BufRead, BufReader, BufWriter}; +// use std::path::{Path, PathBuf}; +// +// #[pyfunction] +// pub fn pull_uniprot_labels(input: &str, which: &str) -> PyResult { +// let input_path = PathBuf::from(input); +// +// let output_file_name = format!("uniprot_{}.output.txt", which); +// let mut output_path = input_path.with_file_name(output_file_name); +// if !output_path.exists() { +// let br = BufReader::new(File::open(input_path.as_path()).unwrap()); +// let mut bw = BufWriter::new(File::create(output_path.as_path()).unwrap()); +// +// for line in br.lines() { +// let line = line.unwrap(); +// if !line.starts_with(">") { +// continue; +// } +// let line_split = line.split("|").collect_vec(); +// let name_split = line_split[2].split(" OS=").collect_vec(); +// let entry = format!("UniProtKB:{}\t{} ({})\n", line_split[1], name_split[0], which); +// bw.write_all(entry.as_bytes()).unwrap(); +// } +// } +// +// Ok(output_path.display().to_string()) +// } +// +// #[pyfunction] +// pub fn merge_uniprot_label_files(inputs: Vec<&str>, output: &str, remove_inputs: bool) -> PyResult { +// let output_path = PathBuf::from(output); +// let mut bw = BufWriter::new(File::create(output_path.as_path()).unwrap()); +// inputs.clone().into_iter().map(|input| PathBuf::from(input)).for_each(|input_path| { +// let br = BufReader::new(File::open(input_path.as_path()).unwrap()); +// for line in br.lines() { +// let line = line.unwrap(); +// bw.write_all(format!("{}\n", line).as_bytes()).unwrap(); +// } +// }); +// +// if remove_inputs { +// inputs.iter().for_each(|input| fs::remove_file(input).unwrap()); +// } +// +// Ok(output_path.display().to_string()) +// } +// +// #[pyfunction] +// pub fn read_identifier_file(input: &str) -> PyResult<(Vec>, HashMap)> { +// let input_path = PathBuf::from(input); +// let br = BufReader::new(File::open(input_path.as_path()).unwrap()); +// let mut types: HashMap = HashMap::new(); +// let mut identifiers = vec![]; +// for line in br.lines() { +// let line = line.unwrap(); +// let x = line.trim().split('\t').collect_vec(); +// identifiers.push(vec![x[0].to_string()]); +// if 
x.len() > 1 { +// types.insert(x[0].into(), x[1].into()); +// } +// } +// +// Ok((identifiers, types)) +// } +// +// #[pyfunction] +// pub fn glom(conc_set: HashSet, newgroups: Vec>, unique_prefixes: Vec) -> PyResult> { +// let mut n = 0; +// let bad = 0; +// let shit_prefixes = vec!["KEGG", "PUBCHEM"]; +// let test_id = "xUBERON:0002262"; +// // let mut excised = vec![]; +// +// for xgroup in newgroups { +// if xgroup.len() > 2 { +// println!("{:?}", xgroup); +// panic!("nope"); +// } +// n = n + 1; +// if xgroup.contains(&test_id.to_string()) { +// println!("{:?}", xgroup); +// } +// +// let existing_sets_w_x = xgroup +// .clone() +// .into_iter() +// .filter(|x| conc_set.contains(x)) +// .map(|x| (conc_set.get(&x).unwrap(), x)) +// .collect_vec(); +// +// let existing_sets: Vec = existing_sets_w_x.clone().into_iter().map(|a| a.0.clone()).collect_vec(); +// let x = existing_sets_w_x.iter().map(|a| a.1.clone()).collect_vec(); +// let mut newset = existing_sets.clone(); +// newset.dedup(); +// xgroup.iter().for_each(|a| newset.push(a.clone())); +// +// if newset.contains(&test_id.to_string()) { +// println!("hiset: {:?}", newset); +// println!("input_set: {:?}", xgroup); +// println!("esets"); +// // existing_sets.iter().for_each(|a| println!("{} {}", a, xgroup)) +// } +// +// newset.iter().for_each(|entry| { +// let prefix = entry.split(':').next().unwrap(); +// if shit_prefixes.contains(&prefix) { +// println!("entry: {}, prefix: {}", entry, prefix); +// panic!("garbage"); +// } +// }); +// +// let setok = true; +// if xgroup.contains(&test_id.to_string()) { +// println!("setok: {}", setok); +// } +// +// unique_prefixes.iter().for_each(|up| { +// if xgroup.contains(&test_id.to_string()) { +// println!("up: {}", up); +// } +// // newset.iter().filter_map(|a| ); +// }); +// } +// +// Ok(conc_set.clone()) +// } +// +// #[pymodule] +// fn babel_io(_py: Python, m: &PyModule) -> PyResult<()> { +// m.add_function(wrap_pyfunction!(pull_uniprot_labels, m)?)?; +// m.add_function(wrap_pyfunction!(merge_uniprot_label_files, m)?)?; +// Ok(()) +// } +// +// #[cfg(test)] +// mod tests { +// use crate::glom; +// use itertools::{Itertools, TupleWindows}; +// use std::collections::HashSet; +// +// #[test] +// fn test_glom() { +// let local_glom = |conc_set: HashSet, mut newgroups: Vec<(String, String)>, unique_prefixes: Vec| -> HashSet { +// let mut n = 0; +// let bad = 0; +// let shit_prefixes = vec!["KEGG", "PUBCHEM"]; +// let test_id = "xUBERON:0002262"; +// // let mut excised = vec![]; +// +// for xgroup in newgroups.iter_mut() { +// if xgroup.len() > 2 { +// println!("{:?}", xgroup); +// panic!("nope"); +// } +// n = n + 1; +// if xgroup.contains(&test_id.to_string()) { +// println!("{:?}", xgroup); +// } +// +// let existing_sets_w_x = xgroup +// .clone() +// .into_iter() +// .filter(|x| conc_set.contains(x)) +// .map(|x| (conc_set.get(&x).unwrap(), x)) +// .collect_vec(); +// +// let existing_sets: Vec = existing_sets_w_x.clone().into_iter().map(|a| a.0.clone()).collect_vec(); +// let x = existing_sets_w_x.iter().map(|a| a.1.clone()).collect_vec(); +// let mut newset = existing_sets.clone(); +// newset.dedup(); +// xgroup.iter().for_each(|a| newset.push(a.clone())); +// +// if newset.contains(&test_id.to_string()) { +// println!("hiset: {:?}", newset); +// println!("input_set: {:?}", xgroup); +// println!("esets"); +// // existing_sets.iter().for_each(|a| println!("{} {}", a, xgroup)) +// } +// +// newset.iter().for_each(|entry| { +// let prefix = entry.split(':').next().unwrap(); +// if 
shit_prefixes.contains(&prefix) { +// println!("entry: {}, prefix: {}", entry, prefix); +// panic!("garbage"); +// } +// }); +// +// let setok = true; +// if xgroup.contains(&test_id.to_string()) { +// println!("setok: {}", setok); +// } +// +// unique_prefixes.iter().for_each(|up| { +// if xgroup.clcontains(&test_id.to_string()) { +// println!("up: {}", up); +// } +// // newset.iter().filter_map(|a| ); +// }); +// } +// conc_set.clone() +// }; +// +// // let v: TupleWindows<_, (String, String)> = vec!["1", "2", "3", "4", "5", "6", "7"] +// // .into_iter() +// // .map(|a| a.to_string()) +// // .collect_vec() +// // .into_iter() +// // .tuple_windows(); +// // println!("{:?}", v.collect::>()); +// let mut conc_set = std::collections::HashSet::new(); +// let newgroups: Vec<(String, String)> = vec![ +// ("UMLS:C0000005".to_string(), String::new()), +// ("UMLS:C0000052".to_string(), String::new()), +// ("UMLS:C0000084".to_string(), String::new()), +// ("UMLS:C0000107".to_string(), String::new()), +// ("UMLS:C0000132".to_string(), String::new()), +// ("UMLS:C0000152".to_string(), String::new()), +// ("UMLS:C0000165".to_string(), String::new()), +// ("UMLS:C0000184".to_string(), String::new()), +// ("UMLS:C0000189".to_string(), String::new()), +// ("UMLS:C0000246".to_string(), String::new()), +// ("UMLS:C0000254".to_string(), String::new()), +// ("UMLS:C0000257".to_string(), String::new()), +// ("UMLS:C0000291".to_string(), String::new()), +// ("UMLS:C0000324".to_string(), String::new()), +// ("UMLS:C0000340".to_string(), String::new()), +// ("UMLS:C0000353".to_string(), String::new()), +// ("UMLS:C0000359".to_string(), String::new()), +// ("UMLS:C0000360".to_string(), String::new()), +// ]; +// println!("conc_set before glom: {:?}", conc_set); +// let conc_set = local_glom(conc_set, newgroups, vec!["UniProtKB".to_string(), "PR".to_string()]); +// println!("conc_set after glom: {:?}", conc_set); +// // glom(d, eqs) +// // print(f"{d}") +// // assert len(d) == 5 +// // assert d["1"] == d["2"] == d["3"] == {"1", "2", "3"} +// // assert d["4"] == d["5"] == {"4", "5"} +// } +// } From ea34ccb2bd9129e3380cfe0bff453e350daecee3 Mon Sep 17 00:00:00 2001 From: Jason Reilly Date: Thu, 31 Jul 2025 08:36:11 -0400 Subject: [PATCH 20/28] initial commit --- babel_io/Cargo.toml | 32 ++++++++++++++++++++++++++++++++ babel_io/rustfmt.toml | 4 ++++ 2 files changed, 36 insertions(+) create mode 100644 babel_io/Cargo.toml create mode 100644 babel_io/rustfmt.toml diff --git a/babel_io/Cargo.toml b/babel_io/Cargo.toml new file mode 100644 index 00000000..2abb96ae --- /dev/null +++ b/babel_io/Cargo.toml @@ -0,0 +1,32 @@ +[package] +name = "babel_io" +version = "0.1.0" +edition = "2024" + +[lib] +name = "babel_io" + +[dependencies] +async_once = "^0.2" +csv = "^1.3" +clap = { version = "^4.5", features = ["derive"] } +env_logger = "^0.11" +humantime = "^2.2" +itertools = "^0.14" +lazy_static = "^1.3" +log = { version = "^0.4", features = ["std"] } +polars = { version = "^0.45", features = ["default", "cloud", "concat_str", "string_pad", "dtype-array", "strings", "regex", "json", "cross_join", "lazy", "coalesce", "polars-lazy", "parquet", "find_many", "csv", "decompress", "list_eval", "is_in"] } +oxigraph = "^0.4" +rand = "^0.9" +rayon = "^1.10" +regex = "^1.11" +reqwest = { version = "^0.12", features = ["default", "json"] } +roxmltree = "^0.20" +serde = { version = "^1.0", features = ["derive", "serde_derive"] } +serde_derive = "^1.0" +serde_json = "^1.0" +serde_with = { version = "^3.12", features = ["std", "macros", 
"json"] } +tokio = { version = "^1.45", features = ["rt", "rt-multi-thread", "macros"] } +uuid = { version = "^1.1", features = ["v4"] } +quick-xml = "^0.38" +zip = "^4.2" diff --git a/babel_io/rustfmt.toml b/babel_io/rustfmt.toml new file mode 100644 index 00000000..fc5e46a1 --- /dev/null +++ b/babel_io/rustfmt.toml @@ -0,0 +1,4 @@ +max_width = 160 +newline_style = "Unix" +use_field_init_shorthand = true +use_try_shorthand = true From 811deae7c90ae1be37c7a0da04d0ec9bf42d8d4e Mon Sep 17 00:00:00 2001 From: Jason Reilly Date: Thu, 25 Sep 2025 11:38:54 -0400 Subject: [PATCH 21/28] initial commit --- babel_io/src/bin/build_compendia.rs | 65 +++++++++++++++++++++++++++++ 1 file changed, 65 insertions(+) create mode 100644 babel_io/src/bin/build_compendia.rs diff --git a/babel_io/src/bin/build_compendia.rs b/babel_io/src/bin/build_compendia.rs new file mode 100644 index 00000000..9e6e2483 --- /dev/null +++ b/babel_io/src/bin/build_compendia.rs @@ -0,0 +1,65 @@ +#[macro_use] +extern crate log; + +use clap::Parser; +use humantime::format_duration; +use itertools::Itertools; +use oxigraph::io::RdfFormat; +use oxigraph::sparql::QueryResults; +use oxigraph::store::Store; +use std::collections::HashSet; +use std::error::Error; +use std::fs::read_to_string; +use std::io::{BufReader, BufWriter}; +use std::time::Instant; +use std::{fs, path}; + +#[derive(Parser, PartialEq, Debug)] +#[clap(author, version, about, long_about = None)] +struct Options { + #[clap(short, long, required = true)] + concordances: Vec, + + #[clap(short, long, required = true)] + identifiers: Vec, + + #[clap(short = 'z', long, required = true)] + ic_rdf: path::PathBuf, +} +#[tokio::main] +async fn main() -> Result<(), Box> { + let start = Instant::now(); + env_logger::init(); + + let options = Options::parse(); + debug!("{:?}", options); + + let dicts = HashSet::new(); + let types = HashSet::new(); + + for ifile in options.identifiers { + let asdf = read_to_string(ifile).unwrap(); + for line in asdf.lines() {} + // new_identifiers, new_types = read_identifier_file(ifile) + // + // + // + // types = {} + // identifiers = list() + // with open(infile,'r') as inf: + // for line in inf: + // x = line.strip().split('\t') + // identifiers.append((x[0],)) + // if len(x) > 1: + // types[x[0]] = x[1] + // return identifiers,types + // + // + // + // glom(dicts, new_identifiers, unique_prefixes=[UBERON, GO]) + // types.update(new_types) + } + + info!("Duration: {}", format_duration(start.elapsed()).to_string()); + Ok(()) +} From 48ade29cd1af25e1542fc1de8248dcd37bbbe695 Mon Sep 17 00:00:00 2001 From: Jason Reilly Date: Thu, 25 Sep 2025 11:39:10 -0400 Subject: [PATCH 22/28] adding targets --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 638f85ec..0388e479 100644 --- a/.gitignore +++ b/.gitignore @@ -7,3 +7,4 @@ babel_outputs/ .snakemake/ .env .idea +**/target/ \ No newline at end of file From 9e7d8cc3e5c32122496af4b908022cc1f0e4bad1 Mon Sep 17 00:00:00 2001 From: Jason Reilly Date: Thu, 25 Sep 2025 11:39:26 -0400 Subject: [PATCH 23/28] renaming --- requirements.txt => requirements.in | 3 --- 1 file changed, 3 deletions(-) rename requirements.txt => requirements.in (85%) diff --git a/requirements.txt b/requirements.in similarity index 85% rename from requirements.txt rename to requirements.in index b6d6479a..45be78fc 100644 --- a/requirements.txt +++ b/requirements.in @@ -4,13 +4,10 @@ bmt jsonlines pandas more-itertools -#pyoxigraph~=0.2.5 pyoxigraph~=0.4.11 psycopg2-binary pytest pytest-cov 
-#python-Levenshtein-wheels -python-levenshtein pyyaml requests PuLP==2.7.0 From 1bde21dd922283205f5d6558ca880f3c91ea79e8 Mon Sep 17 00:00:00 2001 From: Jason Reilly Date: Thu, 25 Sep 2025 11:39:44 -0400 Subject: [PATCH 24/28] incrementing snakemake --- requirements.in | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements.in b/requirements.in index 45be78fc..4d09b5eb 100644 --- a/requirements.in +++ b/requirements.in @@ -11,7 +11,8 @@ pytest-cov pyyaml requests PuLP==2.7.0 -snakemake==7.32.4 +#snakemake==7.32.4 +snakemake==9.9.0 sparqlwrapper # Added by Gaurav, Jan 2022 xmltodict From b142c29f81f4118fa6b1b50919a78ae763832aa0 Mon Sep 17 00:00:00 2001 From: Jason Reilly Date: Thu, 25 Sep 2025 11:39:57 -0400 Subject: [PATCH 25/28] updating --- requirements.lock | 443 ++++++++++++++++++++++++++++++++-------------- 1 file changed, 311 insertions(+), 132 deletions(-) diff --git a/requirements.lock b/requirements.lock index 987e81f5..8dc2ba53 100644 --- a/requirements.lock +++ b/requirements.lock @@ -1,168 +1,347 @@ -aiohttp==3.8.4 -aiosignal==1.3.1 -airium==0.2.5 +# +# This file is autogenerated by pip-compile with Python 3.13 +# by the following command: +# pip-compile requirements.in +aiohappyeyeballs==2.6.1 + # via aiohttp +aiohttp==3.12.15 + # via apybiomart +aiosignal==1.4.0 +airium==0.2.7 + # via + # linkml-renderer + # oaklib +alabaster==1.0.0 + # via sphinx +annotated-types==0.7.0 + # via pydantic antlr4-python3-runtime==4.9.3 -anyio==3.7.0 + # linkml + # pyjsg + # pyshexc appdirs==1.4.4 + # snakemake apybiomart==0.5.3 -async-timeout==4.0.2 + # via -r requirements.in +argparse-dataclass==2.0.0 + # snakemake-interface-common + # snakemake-interface-executor-plugins + # yte +arrow==1.3.0 + # via isoduration asyncio==3.4.3 -attrs==23.1.0 -Babel==2.12.1 -bcp47==0.0.4 -beautifulsoup4==4.12.2 -biopython==1.81 -bmt==1.1.1 -cattrs==23.1.2 -certifi==2023.5.7 -cffconvert==2.0.0 +attrs==25.3.0 + # aiohttp + # cattrs + # jsonlines + # jsonschema + # referencing + # requests-cache +babel==2.17.0 +bcp47==0.1.0 + # via funowl +beautifulsoup4==4.13.4 +biopython==1.85 +bmt==1.4.5 +cattrs==25.1.1 + # via requests-cache +certifi==2025.7.14 + # via requests +CFGraph==0.2.1 + # via pyshex chardet==5.2.0 -charset-normalizer==3.1.0 -class-resolver==0.4.2 -click==8.1.7 -colorama==0.4.6 -ConfigArgParse==1.7 -connection-pool==0.0.3 -coverage==7.3.0 -curies==0.6.0 -datrie==0.8.2 -Deprecated==1.2.14 + # pronto + # pyshex +charset-normalizer==3.4.2 +class-resolver==0.6.0 + # ols-client +click==8.2.1 + # json-flattener + # linkml-runtime + # more-click + # prefixcommons + # pystow + # semsql + # sphinx-click + # sssom +conda-inject==1.3.2 + # via snakemake +ConfigArgParse==1.7.1 +connection_pool==0.0.3 +coverage==7.10.1 + # via pytest-cov +curies==0.10.19 + # -r requirements.in + # kgcl-schema + # prefixmaps +Deprecated==1.2.18 + # via linkml-runtime deprecation==2.1.0 -docopt==0.6.2 -docutils==0.20.1 -dpath==2.1.6 -EditorConfig==0.12.3 -elasticsearch==7.16.3 + # bmt +docutils==0.21.2 + # sphinx +dpath==2.2.0 +duckdb==1.3.2 +et_xmlfile==2.0.0 + # via openpyxl eutils==0.6.0 -fastapi==0.95.0 -fastjsonschema==2.18.0 -fastobo==0.12.2 -frozenlist==1.3.3 + # via oaklib +fastjsonschema==2.21.1 + # via nbformat +fastobo==0.13.0 + # via pronto +fqdn==1.5.1 + # via jsonschema +frozenlist==1.7.0 + # aiosignal funowl==0.2.3 -ghp-import==2.1.0 -gitdb==4.0.10 -GitPython==3.1.34 -greenlet==2.0.1 -gunicorn==20.1.0 -h11==0.14.0 +gitdb==4.0.12 + # via gitpython +GitPython==3.1.45 +graphviz==0.21 + # via 
linkml +greenlet==3.2.3 + # via sqlalchemy hbreader==0.9.1 + # jsonasobj2 humanfriendly==10.0 -idna==3.4 -ijson==3.2.3 -importlib-metadata==6.8.0 -iniconfig==2.0.0 -isodate==0.6.1 -itsdangerous==2.1.2 -Jinja2==3.1.2 -jsbeautifier==1.14.9 +idna==3.10 + # requests + # url-normalize + # yarl +ijson==3.4.0 + # via ndex2 +imagesize==1.4.1 +immutables==0.21 +importlib_resources==6.5.2 + # via sssom +iniconfig==2.1.0 + # via pytest +isodate==0.7.2 +isoduration==20.11.0 +Jinja2==3.1.6 json-flattener==0.1.9 jsonasobj==1.3.1 + # funowl jsonasobj2==1.0.4 jsonlines==4.0.0 -jsonschema==3.2.0 -jupyter_core==5.3.1 +jsonpointer==3.0.0 +jsonschema==4.25.0 + # nbformat +jsonschema-specifications==2025.4.1 +jupyter_core==5.8.1 kgcl-rdflib==0.5.0 -kgcl-schema==0.6.0 -lark==1.1.7 -linkml-renderer==0.3.0 -linkml-runtime==1.5.6 -lxml==4.9.3 -Markdown==3.4.4 -MarkupSafe==2.1.3 -mergedeep==1.3.4 -mistune==2.0.3 -mkdocs==1.5.2 -mkdocs-material==9.2.7 -mkdocs-material-extensions==1.1.1 -mkdocs-mermaid2-plugin==0.6.0 +kgcl_schema==0.6.9 + # kgcl-rdflib +lark==1.2.2 + # via kgcl-schema +linkml==1.9.3 +linkml-renderer==0.3.1 +linkml-runtime==1.9.4 + # sssom-schema +lxml==6.0.0 + # via eutils +MarkupSafe==3.0.2 + # via jinja2 more-click==0.1.2 -more-itertools==10.1.0 -multidict==6.0.4 -nbformat==5.9.2 -ndex2==3.5.1 -networkx==3.1 -numpy==1.25.2 -oaklib==0.5.18 + # via ols-client +more-itertools==10.7.0 +multidict==6.6.3 +nbformat==5.10.4 +ndex2==3.11.0 +networkx==3.5 + # ndex2 +numpy==2.3.2 + # biopython + # pandas + # pansql + # scipy +oaklib==0.5.33 ols-client==0.1.4 ontoportal-client==0.0.4 -packaging==23.1 -paginate==0.5.6 -pandas==2.1.0 +openpyxl==3.1.5 +packaging==25.0 + # deprecation + # pytest +pandas==2.3.1 + # apybiomart pansql==0.0.1 -pathspec==0.11.2 -plac==1.3.5 -platformdirs==3.10.0 -pluggy==1.0.0 +parse==1.20.2 +platformdirs==4.3.8 + # jupyter-core +pluggy==1.6.0 + # pytest-cov prefixcommons==0.1.12 -prefixmaps==0.1.5 -pronto==2.5.5 -psutil==5.9.5 -psycopg2-binary==2.9.7 +prefixmaps==0.2.6 +pronto==2.7.0 +propcache==0.3.2 +psutil==7.0.0 +psycopg2-binary==2.9.10 PuLP==2.7.0 -pydantic==1.10.9 -Pygments==2.16.1 +pydantic==2.11.7 + # curies +pydantic_core==2.33.2 +Pygments==2.19.2 PyJSG==0.11.10 -pykwalify==1.8.0 -pymdown-extensions==10.3 -pyoxigraph==0.2.5 -pyparsing==3.1.1 -pyrsistent==0.17.3 -pysolr==3.9.0 -pystow==0.5.0 -pytest==7.3.2 -pytest-cov==4.1.0 + # shexjsg +pyoxigraph==0.4.11 +pyparsing==3.2.3 + # via rdflib +PyShEx==0.8.1 +PyShExC==0.9.1 +pysolr==3.10.0 +pystow==0.7.1 + # ontoportal-client +pytest==8.4.1 + # pytest-logging +pytest-cov==6.2.1 pytest-logging==2015.11.4 -python-dateutil==2.8.2 -python-Levenshtein-wheels==0.13.2 + # via prefixcommons +python-dateutil==2.9.0.post0 + # arrow PyTrie==0.4.0 -pytz==2021.1 -PyYAML==6.0.1 -pyyaml_env_tag==0.1 + # via curies +pytz==2025.2 + # eutils +PyYAML==6.0.2 + # conda-inject ratelimit==2.2.1 -rdflib==7.0.0 +rdflib==7.1.4 + # cfgraph + # rdflib-jsonld + # rdflib-shim + # sparqlslurper + # sparqlwrapper rdflib-jsonld==0.6.1 + # via rdflib-shim rdflib-shim==1.0.3 -redis==4.4.2 -regex==2022.10.31 -requests==2.28.2 -requests-cache==1.1.0 +referencing==0.36.2 + # jsonschema-specifications +requests==2.32.4 + # pysolr + # requests-toolbelt +requests-cache==1.2.1 requests-toolbelt==1.0.0 reretry==0.11.8 + # snakemake-interface-storage-plugins +rfc3339-validator==0.1.4 rfc3987==1.3.8 -ruamel.yaml==0.17.26 -ruamel.yaml.clib==0.2.7 -scipy==1.11.2 -semsimian==0.2.1 -semsql==0.3.2 -six==1.16.0 -smart-open==6.3.0 -smmap==5.0.0 -snakemake==7.32.3 -sniffio==1.3.0 
+roman-numerals-py==3.1.0 +rpds-py==0.26.0 +scipy==1.16.1 +semsql==0.4.0 +ShExJSG==0.8.2 +six==1.17.0 + # python-dateutil + # rfc3339-validator +smart_open==7.3.0.post1 +smmap==5.0.2 + # via gitdb +snakemake==9.9.0 +snakemake-interface-common==1.21.0 + # snakemake-interface-logger-plugins + # snakemake-interface-report-plugins +snakemake-interface-executor-plugins==9.3.9 +snakemake-interface-logger-plugins==1.2.4 +snakemake-interface-report-plugins==1.2.0 +snakemake-interface-storage-plugins==4.2.2 +snowballstemmer==3.0.1 sortedcontainers==2.4.0 -soupsieve==2.5 + # via pytrie +soupsieve==2.7 + # via beautifulsoup4 +sparqlslurper==0.5.1 SPARQLWrapper==2.0.0 -SQLAlchemy==2.0.20 +Sphinx==8.2.3 + # via sphinx-click +sphinx-click==6.0.0 +sphinxcontrib-applehelp==2.0.0 +sphinxcontrib-devhelp==2.0.0 +sphinxcontrib-htmlhelp==2.1.0 +sphinxcontrib-jsmath==1.0.1 +sphinxcontrib-qthelp==2.0.0 +sphinxcontrib-serializinghtml==2.0.0 +SQLAlchemy==2.0.42 + # sqlalchemy-utils SQLAlchemy-Utils==0.38.3 -sssom==0.3.40 -sssom-schema==0.15.0 -starlette==0.26.1 -stopit==1.1.2 + # via semsql +sssom==0.4.16 +sssom-schema==1.0.0 stringcase==1.2.0 + # via bmt tabulate==0.9.0 +tenacity==8.5.0 throttler==1.2.2 -toposort==1.10 -tqdm==4.66.1 -traitlets==5.9.0 -typing_extensions==4.6.3 -tzdata==2023.3 -url-normalize==1.4.3 -urllib3==1.26.16 -uvicorn==0.22.0 -validators==0.22.0 -watchdog==3.0.0 -wrapt==1.15.0 -xmltodict==0.13.0 -yarl==1.9.2 -yte==1.5.1 -zipp==3.16.2 +tqdm==4.67.1 + # via pystow +traitlets==5.14.3 +types-python-dateutil==2.9.0.20250708 + # via arrow +typing_extensions==4.14.1 + # beautifulsoup4 + # class-resolver + # pydantic + # pydantic-core + # sqlalchemy + # typing-inspection +typing-inspection==0.4.1 +tzdata==2025.2 + # via pandas +uri-template==1.3.0 +url-normalize==2.2.1 +urllib3==2.5.0 +validators==0.35.0 +watchdog==6.0.0 +webcolors==24.11.1 +wheel==0.45.1 +wrapt==1.17.2 + # deprecated + # smart-open +xmltodict==0.14.2 +yarl==1.20.1 +yte==1.9.0 + +# The following packages are considered to be unsafe in a requirements file: +# setuptools +## The following requirements were added by pip freeze: +anyio==4.9.0 +build==1.2.2.post1 +CacheControl==0.14.3 +cffi==1.17.1 +cleo==2.1.0 +crashtest==0.4.1 +cryptography==45.0.5 +distlib==0.3.9 +dulwich==0.22.8 +filelock==3.18.0 +findpython==0.6.3 +h11==0.16.0 +httpcore==1.0.9 +httpx==0.28.1 +installer==0.7.0 +jaraco.classes==3.4.0 +jaraco.context==6.0.1 +jaraco.functools==4.2.1 +jeepney==0.9.0 +keyring==25.6.0 +Levenshtein==0.27.1 +msgpack==1.1.1 +pbs-installer==2025.7.8 +pip-tools==7.5.0 +pipdeptree==2.28.0 +pkginfo==1.12.1.2 +plac==1.4.5 +poetry==2.1.3 +poetry-core==2.1.3 +pycparser==2.22 +pyproject_hooks==1.2.0 +python-Levenshtein==0.27.1 +RapidFuzz==3.13.0 +SecretStorage==3.3.3 +setuptools==80.9.0 +shellingham==1.5.4 +sniffio==1.3.1 +tomlkit==0.13.3 +trove-classifiers==2025.5.9.12 +virtualenv==20.31.2 +zstandard==0.23.0 From b54e41c88ba05d8a2b2638fe3777581f2e60a3ca Mon Sep 17 00:00:00 2001 From: Jason Reilly Date: Thu, 25 Sep 2025 11:41:44 -0400 Subject: [PATCH 26/28] adding pull_via_urllib --- src/datahandlers/datacollect.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/datahandlers/datacollect.py b/src/datahandlers/datacollect.py index b5137fbd..f5b710a1 100644 --- a/src/datahandlers/datacollect.py +++ b/src/datahandlers/datacollect.py @@ -1,5 +1,5 @@ from src.ubergraph import UberGraph -from src.babel_utils import make_local_name, pull_via_ftp +from src.babel_utils import make_local_name, pull_via_ftp, pull_via_urllib from collections 
import defaultdict import os, gzip from json import loads,dumps From 5e2bfb833ff2fd6a4a07d2dc879c0ca2cbba4f14 Mon Sep 17 00:00:00 2001 From: Jason Reilly Date: Thu, 25 Sep 2025 11:42:15 -0400 Subject: [PATCH 27/28] adding rust impls --- src/snakefiles/anatomy.snakefile | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/src/snakefiles/anatomy.snakefile b/src/snakefiles/anatomy.snakefile index 6e80a026..3db5a212 100644 --- a/src/snakefiles/anatomy.snakefile +++ b/src/snakefiles/anatomy.snakefile @@ -1,3 +1,5 @@ +from snakemake_interface_executor_plugins.utils import join_cli_args + import src.createcompendia.anatomy as anatomy import src.assess_compendia as assessments import src.snakefiles.util as util @@ -67,18 +69,26 @@ rule get_anatomy_umls_relationships: run: anatomy.build_anatomy_umls_relationships(input.mrconso, input.infile, output.outfile) +def add_flag(files): + return {'-c ': '{wildcards.token}'.format(wildcards=files)} + rule anatomy_compendia: input: labels=os.path.join(config["download_directory"], 'common', config["common"]["labels"][0]), synonyms=os.path.join(config["download_directory"], 'common', config["common"]["synonyms"][0]), - concords=expand("{dd}/anatomy/concords/{ap}",dd=config['intermediate_directory'],ap=config['anatomy_concords']), - idlists=expand("{dd}/anatomy/ids/{ap}",dd=config['intermediate_directory'],ap=config['anatomy_ids']), + concords=expand("{dd}/anatomy/concords/{ap}", dd=config['intermediate_directory'], ap=config['anatomy_concords']), + idlists=expand("{dd}/anatomy/ids/{ap}", dd=config['intermediate_directory'], ap=config['anatomy_ids']), icrdf_filename=config['download_directory']+'/icRDF.tsv', + params: + flagged_concords = " ".join(["-c " + config['intermediate_directory'] + "/anatomy/concords/" + a for a in config['anatomy_concords']]), + flagged_ids = " ".join(["-i " + config['intermediate_directory'] + "/anatomy/ids/" + a for a in config['anatomy_ids']]) output: expand("{od}/compendia/{ap}", od = config['output_directory'], ap = config['anatomy_outputs']), temp(expand("{od}/synonyms/{ap}", od = config['output_directory'], ap = config['anatomy_outputs'])) - run: - anatomy.build_compendia(input.concords, input.idlists, input.icrdf_filename) + # run: + # anatomy.build_compendia(input.concords, input.idlists, input.icrdf_filename) + shell: + "./babel_io/target/release/build_compendia {params.flagged_concords} {params.flagged_ids} -z {input.icrdf_filename}" rule check_anatomy_completeness: input: From b08465cc0101c64664ca9c2cf44bd184ddd795f4 Mon Sep 17 00:00:00 2001 From: Jason Reilly Date: Thu, 25 Sep 2025 11:42:44 -0400 Subject: [PATCH 28/28] initial commit --- requirements.txt | 594 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 594 insertions(+) create mode 100644 requirements.txt diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 00000000..25318166 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,594 @@ +# +# This file is autogenerated by pip-compile with Python 3.13 +# by the following command: +# +# pip-compile requirements.in +# +aiohappyeyeballs==2.6.1 + # via aiohttp +aiohttp==3.12.15 + # via apybiomart +aiosignal==1.4.0 + # via aiohttp +airium==0.2.7 + # via + # linkml-renderer + # oaklib +alabaster==1.0.0 + # via sphinx +annotated-types==0.7.0 + # via pydantic +antlr4-python3-runtime==4.9.3 + # via + # linkml + # pyjsg + # pyshexc +appdirs==1.4.4 + # via + # oaklib + # snakemake +apybiomart==0.5.3 + # via -r requirements.in +argparse-dataclass==2.0.0 + # via + # 
snakemake-interface-common + # snakemake-interface-executor-plugins + # yte +arrow==1.3.0 + # via isoduration +asyncio==3.4.3 + # via apybiomart +attrs==25.3.0 + # via + # aiohttp + # cattrs + # jsonlines + # jsonschema + # referencing + # requests-cache +babel==2.17.0 + # via sphinx +bcp47==0.1.0 + # via funowl +beautifulsoup4==4.13.4 + # via -r requirements.in +biopython==1.85 + # via -r requirements.in +bmt==1.4.5 + # via -r requirements.in +cattrs==25.1.1 + # via requests-cache +certifi==2025.7.14 + # via requests +cfgraph==0.2.1 + # via pyshex +chardet==5.2.0 + # via + # pronto + # pyshex + # pyshexc +charset-normalizer==3.4.2 + # via requests +class-resolver==0.6.0 + # via + # oaklib + # ols-client +click==8.2.1 + # via + # json-flattener + # linkml + # linkml-renderer + # linkml-runtime + # more-click + # oaklib + # ols-client + # prefixcommons + # pystow + # semsql + # sphinx-click + # sssom +conda-inject==1.3.2 + # via snakemake +configargparse==1.7.1 + # via + # snakemake + # snakemake-interface-common +connection-pool==0.0.3 + # via snakemake +coverage[toml]==7.10.1 + # via pytest-cov +curies==0.10.19 + # via + # -r requirements.in + # kgcl-schema + # linkml-runtime + # oaklib + # prefixmaps + # sssom +deprecated==1.2.18 + # via linkml-runtime +deprecation==2.1.0 + # via + # bmt + # sssom +docutils==0.21.2 + # via + # snakemake + # sphinx + # sphinx-click +dpath==2.2.0 + # via + # snakemake + # yte +duckdb==1.3.2 + # via -r requirements.in +et-xmlfile==2.0.0 + # via openpyxl +eutils==0.6.0 + # via oaklib +fastjsonschema==2.21.1 + # via nbformat +fastobo==0.13.0 + # via pronto +fqdn==1.5.1 + # via jsonschema +frozenlist==1.7.0 + # via + # aiohttp + # aiosignal +funowl==0.2.3 + # via oaklib +gitdb==4.0.12 + # via gitpython +gitpython==3.1.45 + # via snakemake +graphviz==0.21 + # via linkml +greenlet==3.2.3 + # via sqlalchemy +hbreader==0.9.1 + # via + # jsonasobj2 + # linkml + # linkml-runtime +humanfriendly==10.0 + # via snakemake +idna==3.10 + # via + # jsonschema + # requests + # url-normalize + # yarl +ijson==3.4.0 + # via ndex2 +imagesize==1.4.1 + # via sphinx +immutables==0.21 + # via snakemake +importlib-resources==6.5.2 + # via sssom +iniconfig==2.1.0 + # via pytest +isodate==0.7.2 + # via linkml +isoduration==20.11.0 + # via jsonschema +jinja2==3.1.6 + # via + # linkml + # snakemake + # sphinx +json-flattener==0.1.9 + # via linkml-runtime +jsonasobj==1.3.1 + # via + # funowl + # pyjsg + # pyshexc +jsonasobj2==1.0.4 + # via + # linkml + # linkml-runtime +jsonlines==4.0.0 + # via + # -r requirements.in + # oaklib +jsonpointer==3.0.0 + # via jsonschema +jsonschema[format]==4.25.0 + # via + # linkml + # linkml-runtime + # nbformat + # snakemake +jsonschema-specifications==2025.4.1 + # via jsonschema +jupyter-core==5.8.1 + # via nbformat +kgcl-rdflib==0.5.0 + # via oaklib +kgcl-schema==0.6.9 + # via + # kgcl-rdflib + # oaklib +lark==1.2.2 + # via kgcl-schema +linkml==1.9.3 + # via sssom +linkml-renderer==0.3.1 + # via oaklib +linkml-runtime==1.9.4 + # via + # bmt + # kgcl-rdflib + # kgcl-schema + # linkml + # linkml-renderer + # oaklib + # semsql + # sssom + # sssom-schema +lxml==6.0.0 + # via eutils +markupsafe==3.0.2 + # via jinja2 +more-click==0.1.2 + # via ols-client +more-itertools==10.7.0 + # via -r requirements.in +multidict==6.6.3 + # via + # aiohttp + # yarl +nbformat==5.10.4 + # via snakemake +ndex2==3.11.0 + # via oaklib +networkx[networkx]==3.5 + # via + # ndex2 + # oaklib + # pronto + # sssom +numpy==2.3.2 + # via + # biopython + # ndex2 + # pandas + # pansql + # 
scipy +oaklib==0.5.33 + # via -r requirements.in +ols-client==0.1.4 + # via oaklib +ontoportal-client==0.0.4 + # via oaklib +openpyxl==3.1.5 + # via linkml +packaging==25.0 + # via + # deprecation + # pytest + # snakemake + # snakemake-interface-common + # sphinx +pandas==2.3.1 + # via + # -r requirements.in + # apybiomart + # ndex2 + # pansql + # sssom +pansql[pansql]==0.0.1 + # via sssom +parse==1.20.2 + # via linkml +platformdirs==4.3.8 + # via + # jupyter-core + # requests-cache +pluggy==1.6.0 + # via + # pytest + # pytest-cov +prefixcommons==0.1.12 + # via + # linkml + # linkml-runtime +prefixmaps==0.2.6 + # via + # kgcl-schema + # linkml + # linkml-runtime + # oaklib +pronto==2.7.0 + # via oaklib +propcache==0.3.2 + # via + # aiohttp + # yarl +psutil==7.0.0 + # via snakemake +psycopg2-binary==2.9.10 + # via -r requirements.in +pulp==2.7.0 + # via + # -r requirements.in + # snakemake +pydantic==2.11.7 + # via + # curies + # linkml + # linkml-renderer + # linkml-runtime + # oaklib +pydantic-core==2.33.2 + # via pydantic +pygments==2.19.2 + # via + # pytest + # sphinx +pyjsg==0.11.10 + # via + # funowl + # linkml + # pyshexc + # shexjsg +pyoxigraph==0.4.11 + # via -r requirements.in +pyparsing==3.2.3 + # via rdflib +pyshex==0.8.1 + # via linkml +pyshexc==0.9.1 + # via + # linkml + # pyshex +pysolr==3.10.0 + # via oaklib +pystow==0.7.1 + # via + # oaklib + # ols-client + # ontoportal-client +pytest==8.4.1 + # via + # -r requirements.in + # pytest-cov + # pytest-logging +pytest-cov==6.2.1 + # via -r requirements.in +pytest-logging==2015.11.4 + # via prefixcommons +python-dateutil==2.9.0.post0 + # via + # arrow + # linkml + # pandas + # pronto +pytrie==0.4.0 + # via curies +pytz==2025.2 + # via + # eutils + # pandas +pyyaml==6.0.2 + # via + # -r requirements.in + # conda-inject + # json-flattener + # linkml + # linkml-runtime + # prefixcommons + # prefixmaps + # snakemake + # sssom + # yte +ratelimit==2.2.1 + # via oaklib +rdflib==7.1.4 + # via + # cfgraph + # funowl + # linkml + # linkml-runtime + # rdflib-jsonld + # rdflib-shim + # sparqlslurper + # sparqlwrapper + # sssom +rdflib-jsonld==0.6.1 + # via rdflib-shim +rdflib-shim==1.0.3 + # via + # funowl + # pyshex + # pyshexc + # sparqlslurper +referencing==0.36.2 + # via + # jsonschema + # jsonschema-specifications + # snakemake +requests==2.32.4 + # via + # -r requirements.in + # apybiomart + # eutils + # linkml + # linkml-runtime + # ndex2 + # ols-client + # prefixcommons + # pyshex + # pysolr + # pystow + # requests-cache + # requests-toolbelt + # snakemake + # sphinx +requests-cache==1.2.1 + # via oaklib +requests-toolbelt==1.0.0 + # via ndex2 +reretry==0.11.8 + # via + # snakemake + # snakemake-interface-storage-plugins +rfc3339-validator==0.1.4 + # via jsonschema +rfc3987==1.3.8 + # via + # funowl + # jsonschema +roman-numerals-py==3.1.0 + # via sphinx +rpds-py==0.26.0 + # via + # jsonschema + # referencing +scipy[scipy]==1.16.1 + # via sssom +semsql==0.4.0 + # via oaklib +shexjsg==0.8.2 + # via + # pyshex + # pyshexc +six==1.17.0 + # via + # ndex2 + # python-dateutil + # rfc3339-validator +smart-open==7.3.0.post1 + # via snakemake +smmap==5.0.2 + # via gitdb +snakemake==9.9.0 + # via -r requirements.in +snakemake-interface-common==1.21.0 + # via + # snakemake + # snakemake-interface-executor-plugins + # snakemake-interface-logger-plugins + # snakemake-interface-report-plugins + # snakemake-interface-storage-plugins +snakemake-interface-executor-plugins==9.3.9 + # via snakemake +snakemake-interface-logger-plugins==1.2.4 + # via 
snakemake +snakemake-interface-report-plugins==1.2.0 + # via snakemake +snakemake-interface-storage-plugins==4.2.2 + # via snakemake +snowballstemmer==3.0.1 + # via sphinx +sortedcontainers==2.4.0 + # via pytrie +soupsieve==2.7 + # via beautifulsoup4 +sparqlslurper==0.5.1 + # via pyshex +sparqlwrapper==2.0.0 + # via + # -r requirements.in + # oaklib + # pyshex + # sparqlslurper + # sssom +sphinx==8.2.3 + # via sphinx-click +sphinx-click==6.0.0 + # via linkml +sphinxcontrib-applehelp==2.0.0 + # via sphinx +sphinxcontrib-devhelp==2.0.0 + # via sphinx +sphinxcontrib-htmlhelp==2.1.0 + # via sphinx +sphinxcontrib-jsmath==1.0.1 + # via sphinx +sphinxcontrib-qthelp==2.0.0 + # via sphinx +sphinxcontrib-serializinghtml==2.0.0 + # via sphinx +sqlalchemy==2.0.42 + # via + # linkml + # oaklib + # pansql + # sqlalchemy-utils +sqlalchemy-utils==0.38.3 + # via semsql +sssom==0.4.16 + # via oaklib +sssom-schema==1.0.0 + # via sssom +stringcase==1.2.0 + # via bmt +tabulate==0.9.0 + # via snakemake +tenacity==8.5.0 + # via oaklib +throttler==1.2.2 + # via + # snakemake + # snakemake-interface-executor-plugins + # snakemake-interface-storage-plugins +tqdm==4.67.1 + # via pystow +traitlets==5.14.3 + # via + # jupyter-core + # nbformat +types-python-dateutil==2.9.0.20250708 + # via arrow +typing-extensions==4.14.1 + # via + # beautifulsoup4 + # cattrs + # class-resolver + # curies + # ontoportal-client + # pydantic + # pydantic-core + # pystow + # sqlalchemy + # typing-inspection +typing-inspection==0.4.1 + # via pydantic +tzdata==2025.2 + # via pandas +uri-template==1.3.0 + # via jsonschema +url-normalize==2.2.1 + # via requests-cache +urllib3==2.5.0 + # via + # ndex2 + # pyshex + # requests + # requests-cache +validators==0.35.0 + # via sssom +watchdog==6.0.0 + # via linkml +webcolors==24.11.1 + # via jsonschema +wheel==0.45.1 + # via -r requirements.in +wrapt==1.17.2 + # via + # deprecated + # smart-open + # snakemake + # snakemake-interface-storage-plugins +xmltodict==0.14.2 + # via -r requirements.in +yarl==1.20.1 + # via aiohttp +yte==1.9.0 + # via snakemake + +# The following packages are considered to be unsafe in a requirements file: +# setuptools
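
Editor's note on the anatomy_compendia change in PATCH 27/28: the rule drops its Python run: block (anatomy.build_compendia) and instead shells out to the babel_io Rust binary, passing every concord file as a repeated -c flag, every id file as a repeated -i flag, and the icRDF file via -z. The plain-Python sketch below (not part of the patch series) only illustrates how the flagged_concords / flagged_ids params expand the config into that command line; the config values shown ("UBERON", "CL", "NCIT", the directory paths) are illustrative assumptions, since the actual anatomy_concords / anatomy_ids lists and the babel_io argument parser are not included in these patches.

# Minimal sketch of how the params in rule anatomy_compendia expand into CLI flags.
# Only the flag-building expressions mirror the patch; the config entries below
# are hypothetical example values.
config = {
    "intermediate_directory": "babel_downloads/intermediate",
    "download_directory": "babel_downloads",
    "anatomy_concords": ["UBERON", "CL"],        # assumed example values
    "anatomy_ids": ["UBERON", "CL", "NCIT"],     # assumed example values
}

flagged_concords = " ".join(
    ["-c " + config["intermediate_directory"] + "/anatomy/concords/" + a
     for a in config["anatomy_concords"]]
)
flagged_ids = " ".join(
    ["-i " + config["intermediate_directory"] + "/anatomy/ids/" + a
     for a in config["anatomy_ids"]]
)
icrdf_filename = config["download_directory"] + "/icRDF.tsv"

# Snakemake substitutes these params into the shell: directive, yielding roughly:
#   ./babel_io/target/release/build_compendia \
#       -c .../anatomy/concords/UBERON -c .../anatomy/concords/CL \
#       -i .../anatomy/ids/UBERON -i .../anatomy/ids/CL -i .../anatomy/ids/NCIT \
#       -z babel_downloads/icRDF.tsv
print(f"./babel_io/target/release/build_compendia {flagged_concords} {flagged_ids} -z {icrdf_filename}")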