From 9f21c46ac8c6131ae3d791daa13e2f90f1d6f3a6 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Fri, 25 Apr 2025 15:38:13 -0400 Subject: [PATCH 1/6] Cleaned up make_local_name() usage; changed labels, synonyms and descriptions to dirs. --- src/datahandlers/chembl.py | 2 +- src/datahandlers/clo.py | 1 - src/datahandlers/datacollect.py | 30 ++++-------------------------- src/datahandlers/ec.py | 3 +-- src/datahandlers/hgnc.py | 4 ++-- src/datahandlers/hgncfamily.py | 4 +--- src/datahandlers/mesh.py | 4 ++-- src/datahandlers/ncbigene.py | 4 ++-- src/datahandlers/obo.py | 6 +++--- src/datahandlers/pantherfamily.py | 2 +- src/datahandlers/umls.py | 8 ++++---- src/datahandlers/uniprotkb.py | 4 +--- 12 files changed, 22 insertions(+), 50 deletions(-) diff --git a/src/datahandlers/chembl.py b/src/datahandlers/chembl.py index 271ffd9b..48c0d2c3 100644 --- a/src/datahandlers/chembl.py +++ b/src/datahandlers/chembl.py @@ -1,5 +1,5 @@ from src.prefixes import CHEMBLCOMPOUND -from src.babel_utils import pull_via_ftp, make_local_name +from src.babel_utils import pull_via_ftp import ftplib import pyoxigraph diff --git a/src/datahandlers/clo.py b/src/datahandlers/clo.py index 018f8d44..66167071 100644 --- a/src/datahandlers/clo.py +++ b/src/datahandlers/clo.py @@ -4,7 +4,6 @@ from src.prefixes import CLO from src.categories import CELL_LINE from src.babel_utils import pull_via_urllib -from src.babel_utils import make_local_name from src.util import Text, LoggingUtil import pyoxigraph diff --git a/src/datahandlers/datacollect.py b/src/datahandlers/datacollect.py index b5137fbd..bd992048 100644 --- a/src/datahandlers/datacollect.py +++ b/src/datahandlers/datacollect.py @@ -1,5 +1,5 @@ from src.ubergraph import UberGraph -from src.babel_utils import make_local_name, pull_via_ftp +from src.babel_utils import make_local_name, pull_via_ftp, pull_via_urllib from collections import defaultdict import os, gzip from json import loads,dumps @@ -8,7 +8,7 @@ def pull_pubchem_labels(): 
print('LABEL PUBCHEM') f_name = 'CID-Title.gz' cname = pull_via_ftp('ftp.ncbi.nlm.nih.gov','/pubchem/Compound/Extras/', f_name, outfilename=f_name) - fname = make_local_name('labels', subpath='PUBCHEM.COMPOUND') + fname = make_local_name('pull_pubchem_labels', subpath='PUBCHEM.COMPOUND/labels') with open(fname, 'w') as outf, gzip.open(cname,mode='rt',encoding='latin-1') as inf: for line in inf: x = line.strip().split('\t') @@ -17,7 +17,7 @@ def pull_pubchem_labels(): def pull_pubchem_synonyms(): f_name = 'CID-Synonym-filtered.gz' sname = pull_via_ftp('ftp.ncbi.nlm.nih.gov', '/pubchem/Compound/Extras/', f_name, outfilename=f_name) - fname = make_local_name('synonyms', subpath='PUBCHEM.COMPOUND') + fname = make_local_name('pull_pubchem_synonyms', subpath='PUBCHEM.COMPOUND/synonyms') with open(fname, 'w') as outf, gzip.open(sname,mode='rt',encoding='latin-1') as inf: for line in inf: x = line.strip().split('\t') @@ -31,28 +31,6 @@ def pull_pubchem(): pull_pubchem_labels() pull_pubchem_synonyms() -def pull_hgnc(): - data = pull_via_ftp('ftp.ebi.ac.uk', '/pub/databases/genenames/new/json', 'hgnc_complete_set.json') - hgnc_json = loads(data) - lname = make_local_name('labels', subpath='HGNC') - sname = make_local_name('synonyms', subpath='HGNC') - with open(lname,'w') as lfile, open(sname,'w') as sfile: - for gene in hgnc_json['response']['docs']: - hgnc_id =gene['hgnc_id'] - symbol = gene['symbol'] - lfile.write(f'{hgnc_id}\t{symbol}\n') - name = gene['name'] - sfile.write(f'{hgnc_id}\thttp://www.geneontology.org/formats/oboInOwl#hasExactSynonym\t{name}\n') - if 'alias_symbol' in gene: - alias_symbols = gene['alias_symbol'] - for asym in alias_symbols: - sfile.write(f'{hgnc_id}\thttp://www.geneontology.org/formats/oboInOwl#hasRelatedSynonym\t{asym}\n') - if 'alias_name' in gene: - alias_names = gene['alias_name'] - for asym in alias_names: - sfile.write(f'{hgnc_id}\thttp://www.geneontology.org/formats/oboInOwl#hasRelatedSynonym\t{asym}\n') - - def 
pull_prot(which,refresh): #swissname = pull_via_ftplib('ftp.uniprot.org','/pub/databases/uniprot/current_release/knowledgebase/complete/',f'uniprot_{which}.fasta.gz',decompress_data=True,outfilename=f'uniprot_{which}.fasta') if refresh: @@ -82,7 +60,7 @@ def pull_prot(which,refresh): def pull_prots(refresh_swiss=False,refresh_trembl=False): swiss,labels = pull_prot('sprot',refresh_swiss) - fname = make_local_name('labels', subpath='UNIPROTKB') + fname = make_local_name('pull_prots', subpath='UNIPROTKB/labels') with open(fname,'w') as synonyms: for k,v in labels.items(): synonyms.write(f'{k}\t{v}\n') diff --git a/src/datahandlers/ec.py b/src/datahandlers/ec.py index 1d77c71b..58f02430 100644 --- a/src/datahandlers/ec.py +++ b/src/datahandlers/ec.py @@ -1,9 +1,8 @@ from src.prefixes import EC from src.categories import MOLECULAR_ACTIVITY from src.babel_utils import pull_via_urllib -from src.babel_utils import make_local_name, pull_via_ftp +from src.babel_utils import make_local_name import pyoxigraph -from collections import defaultdict def pull_ec(): diff --git a/src/datahandlers/hgnc.py b/src/datahandlers/hgnc.py index 482f9d67..33401029 100644 --- a/src/datahandlers/hgnc.py +++ b/src/datahandlers/hgnc.py @@ -12,8 +12,8 @@ def pull_hgnc(): def pull_hgnc_labels_and_synonyms(infile): with open(infile,'r') as data: hgnc_json = json.load(data) - lname = make_local_name('labels', subpath='HGNC') - sname = make_local_name('synonyms', subpath='HGNC') + lname = make_local_name('pull_hgnc_labels_and_synonyms', subpath='HGNC/labels') + sname = make_local_name('pull_hgnc_labels_and_synonyms', subpath='HGNC/synonyms') with open(lname,'w') as lfile, open(sname,'w') as sfile: for gene in hgnc_json['response']['docs']: hgnc_id =gene['hgnc_id'] diff --git a/src/datahandlers/hgncfamily.py b/src/datahandlers/hgncfamily.py index cc6f8c13..07fca6fb 100644 --- a/src/datahandlers/hgncfamily.py +++ b/src/datahandlers/hgncfamily.py @@ -1,6 +1,4 @@ -from pronto.utils.io import decompress - 
-from src.babel_utils import make_local_name, pull_via_ftp, pull_via_urllib +from src.babel_utils import pull_via_urllib from src.prefixes import HGNCFAMILY def pull_hgncfamily(): diff --git a/src/datahandlers/mesh.py b/src/datahandlers/mesh.py index 6c198c4d..f71f9504 100644 --- a/src/datahandlers/mesh.py +++ b/src/datahandlers/mesh.py @@ -105,7 +105,7 @@ def pull_mesh_labels(self): WHERE { ?term rdfs:label ?label } ORDER BY ?term """ - ofname = make_local_name('labels', subpath='MESH') + ofname = make_local_name('pull_mesh_labels', subpath='MESH/labels') qres = self.m.query(s) with open(ofname, 'w', encoding='utf8') as outf: for row in list(qres): @@ -148,7 +148,7 @@ def write_ids(meshmap,outfile,order=['biolink:CellularComponent','biolink:Cell', # ifname = make_local_name('mesh.nt', subpath='MESH') -# ofname = make_local_name('labels', subpath='MESH') +# ofname = make_local_name('MESH', subpath='MESH/labels') # badlines = 0 # with open(ofname, 'w') as outf, open(ifname,'r') as data: # for line in data: diff --git a/src/datahandlers/ncbigene.py b/src/datahandlers/ncbigene.py index eacfb163..a2648ee1 100644 --- a/src/datahandlers/ncbigene.py +++ b/src/datahandlers/ncbigene.py @@ -10,8 +10,8 @@ def pull_ncbigene(filenames): def pull_ncbigene_labels_synonyms_and_taxa(): # File format described here: https://ftp.ncbi.nlm.nih.gov/gene/DATA/README ifname = make_local_name('gene_info.gz', subpath='NCBIGene') - labelname = make_local_name('labels', subpath='NCBIGene') - synname = make_local_name('synonyms', subpath='NCBIGene') + labelname = make_local_name('pull_ncbigene_labels_synonyms_and_taxa', subpath='NCBIGene/labels') + synname = make_local_name('pull_ncbigene_labels_synonyms_and_taxa', subpath='NCBIGene/synonyms') taxaname = make_local_name('taxa', subpath='NCBIGene') bad_gene_types = {'biological-region', 'other', 'unknown'} with gzip.open(ifname, 'r') as inf, \ diff --git a/src/datahandlers/obo.py b/src/datahandlers/obo.py index 6b37e052..37f82341 100644 --- 
a/src/datahandlers/obo.py +++ b/src/datahandlers/obo.py @@ -24,7 +24,7 @@ def pull_uber_labels(expected): ldict[p].add( ( unit['iri'], unit['label'] ) ) for p in ldict: if p not in ['http','ro'] and not p.startswith('t') and not '#' in p: - fname = make_local_name('labels',subpath=p) + fname = make_local_name('pull_uber_labels',subpath=p + "/labels") with open(fname,'w') as outf: for unit in ldict[p]: outf.write(f'{unit[0]}\t{unit[1]}\n') @@ -39,7 +39,7 @@ def pull_uber_descriptions(expected): ldict[p].add( ( unit['iri'], unit['description'] ) ) for p in ldict: if p not in ['http','ro'] and not p.startswith('t') and not '#' in p: - fname = make_local_name('descriptions',subpath=p) + fname = make_local_name('pull_uber_descriptions',subpath=p + "/descriptions") with open(fname,'w') as outf: for unit in ldict[p]: outf.write(f'{unit[0]}\t{unit[1]}\n') @@ -57,7 +57,7 @@ def pull_uber_synonyms(expected): # we are going to make some zero-length files for it for p in expected: if p not in ['http','ro'] and not p.startswith('t') and not '#' in p: - fname = make_local_name('synonyms',subpath=p) + fname = make_local_name('pull_uber_synonyms',subpath=p + "/synonyms") with open(fname,'w') as outf: for unit in ldict[p]: outf.write(f'{unit[0]}\t{unit[1]}\t{unit[2]}\n') diff --git a/src/datahandlers/pantherfamily.py b/src/datahandlers/pantherfamily.py index f4a0c596..6cfea38a 100644 --- a/src/datahandlers/pantherfamily.py +++ b/src/datahandlers/pantherfamily.py @@ -1,4 +1,4 @@ -from src.babel_utils import make_local_name, pull_via_ftp +from src.babel_utils import pull_via_ftp from src.prefixes import PANTHERFAMILY def pull_pantherfamily(): diff --git a/src/datahandlers/umls.py b/src/datahandlers/umls.py index cd852b68..8a712335 100644 --- a/src/datahandlers/umls.py +++ b/src/datahandlers/umls.py @@ -305,8 +305,8 @@ def pull_umls(mrconso): """Run through MRCONSO.RRF creating label and synonym files for UMLS and SNOMEDCT""" rows = defaultdict(list) priority = read_umls_priority() - 
snomed_label_name = make_local_name('labels', subpath='SNOMEDCT') - snomed_syn_name = make_local_name('synonyms', subpath='SNOMEDCT') + snomed_label_name = make_local_name('pull_umls', subpath='SNOMEDCT/labels') + snomed_syn_name = make_local_name('pull_umls', subpath='SNOMEDCT/synonyms') with open(mrconso, 'r') as inf, open(snomed_label_name,'w') as snolabels, open(snomed_syn_name,'w') as snosyns: for line in inf: if not check_mrconso_line(line): @@ -335,8 +335,8 @@ def pull_umls(mrconso): #print(pkey) pri = 1000000 rows[cui].append( (pri,term,line) ) - lname = make_local_name('labels', subpath='UMLS') - sname = make_local_name('synonyms', subpath='UMLS') + lname = make_local_name('pull_umls', subpath='UMLS/labels') + sname = make_local_name('pull_umls', subpath='UMLS/synonyms') re_numerical = re.compile(r"^\s*[+-]*[\d\.]+\s*$") with open(lname,'w') as labels, open(sname,'w') as synonyms: for cui,crows in rows.items(): diff --git a/src/datahandlers/uniprotkb.py b/src/datahandlers/uniprotkb.py index cb65d33e..3c8a7c74 100644 --- a/src/datahandlers/uniprotkb.py +++ b/src/datahandlers/uniprotkb.py @@ -3,9 +3,7 @@ import os import requests -from requests import request - -from src.babel_utils import pull_via_urllib, make_local_name, pull_via_wget +from src.babel_utils import make_local_name def readlabels(which): From 04e5f3d4e3a08693bf13413b2d55627892653e64 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Fri, 25 Apr 2025 15:48:40 -0400 Subject: [PATCH 2/6] Standardized how we call NodeFactory(). 
--- src/createcompendia/drugchemical.py | 2 +- src/createcompendia/leftover_umls.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/createcompendia/drugchemical.py b/src/createcompendia/drugchemical.py index 6c851941..1567d455 100644 --- a/src/createcompendia/drugchemical.py +++ b/src/createcompendia/drugchemical.py @@ -378,7 +378,7 @@ def build_conflation(manual_concord_filename, rxn_concord, umls_concord, pubchem glom(gloms, pairs_to_be_glommed) # Set up a NodeFactory. - nodefactory = NodeFactory('', get_config()['biolink_version']) + nodefactory = NodeFactory(make_local_name(''), get_config()['biolink_version']) # Write out all the resulting cliques. written = set() diff --git a/src/createcompendia/leftover_umls.py b/src/createcompendia/leftover_umls.py index f1d2bfb3..4fe6943f 100644 --- a/src/createcompendia/leftover_umls.py +++ b/src/createcompendia/leftover_umls.py @@ -6,6 +6,7 @@ from snakemake.logging import Logger from bmt import Toolkit +from src.babel_utils import make_local_name from src.node import NodeFactory from src.datahandlers import umls from src.prefixes import UMLS @@ -206,7 +207,7 @@ def umls_type_to_biolink_type(umls_tui): reportf.write(f"Collected synonyms for {len(synonyms_by_id)} UMLS IDs into the leftover UMLS synonyms file.\n") # Write out synonyms to synonym file. - node_factory = NodeFactory('babel_downloads/UMLS/labels', biolink_version) + node_factory = NodeFactory(make_local_name(''), biolink_version) count_synonym_objs = 0 with jsonlines.open(umls_synonyms, 'w') as umls_synonymsf: for id in synonyms_by_id: From 1c1d353a44b28631bf71a715cfd2a5969ba606e5 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Fri, 25 Apr 2025 15:49:14 -0400 Subject: [PATCH 3/6] Rewrote write_mods_ids() to use `labels` as a directory. 
--- src/createcompendia/gene.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/src/createcompendia/gene.py b/src/createcompendia/gene.py index 1c89da02..39a6c4f6 100644 --- a/src/createcompendia/gene.py +++ b/src/createcompendia/gene.py @@ -18,10 +18,16 @@ def write_mods_ids(dd,id,modlist): for mod in modlist: - with open(f'{dd}/{mod}/labels','r') as inf, open(f'{id}/gene/ids/{mod}','w') as outf: - for line in inf: - x = line.split('\t')[0] - outf.write(f'{x}\n') + with open(f'{id}/gene/ids/{mod}','w') as outf: + for labelfile in os.listdir(f'{dd}/{mod}/labels'): + labelfile_path = f'{dd}/{mod}/labels/{labelfile}' + if not os.path.isfile(labelfile_path): + # Skip anything that is not a regular file (e.g. subdirectories). + continue + with open(labelfile_path,'r') as inf: + for line in inf: + x = line.split('\t')[0] + outf.write(f'{x}\n') def build_gene_ensembl_relationships(ensembl_dir, outfile): """Loop over all the ensembl species. Find any protein-coding gene""" From c905dd6b7edccd6e082f081a477b7bd8617b38ae Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Fri, 25 Apr 2025 15:50:17 -0400 Subject: [PATCH 4/6] Updated/cleaned up make_local_name() change.
--- src/createcompendia/drugchemical.py | 2 +- src/datahandlers/mesh.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/createcompendia/drugchemical.py b/src/createcompendia/drugchemical.py index 1567d455..f80d94e8 100644 --- a/src/createcompendia/drugchemical.py +++ b/src/createcompendia/drugchemical.py @@ -5,7 +5,7 @@ from src.categories import (CHEMICAL_ENTITY, DRUG, MOLECULAR_MIXTURE, FOOD, COMPLEX_MOLECULAR_MIXTURE, SMALL_MOLECULE, NUCLEIC_ACID_ENTITY, MOLECULAR_ENTITY, FOOD_ADDITIVE, ENVIRONMENTAL_FOOD_CONTAMINANT, PROCESSED_MATERIAL, CHEMICAL_MIXTURE, POLYPEPTIDE) -from src.babel_utils import glom, get_numerical_curie_suffix +from src.babel_utils import glom, get_numerical_curie_suffix, make_local_name from collections import defaultdict import os,json diff --git a/src/datahandlers/mesh.py b/src/datahandlers/mesh.py index f71f9504..d16bbc59 100644 --- a/src/datahandlers/mesh.py +++ b/src/datahandlers/mesh.py @@ -148,7 +148,7 @@ def write_ids(meshmap,outfile,order=['biolink:CellularComponent','biolink:Cell', # ifname = make_local_name('mesh.nt', subpath='MESH') -# ofname = make_local_name('MESH', subpath='MESH/labels') +# ofname = make_local_name('write_ids', subpath='MESH/labels') # badlines = 0 # with open(ofname, 'w') as outf, open(ifname,'r') as data: # for line in data: From e50995ced5526459a3e3b4d9a700dd91fa83b572 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Fri, 25 Apr 2025 15:51:42 -0400 Subject: [PATCH 5/6] Updated write_labels() to write a labels directory. 
--- src/datahandlers/mods.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/datahandlers/mods.py b/src/datahandlers/mods.py index 3de6672b..82da5804 100644 --- a/src/datahandlers/mods.py +++ b/src/datahandlers/mods.py @@ -22,7 +22,9 @@ def write_labels(dd): for mod,prefix in modmap.items(): with open(f'{dd}/{prefix}/GENE-DESCRIPTION-JSON_{prefix}.json','r') as inf: j = json.load(inf) - with open(f'{dd}/{prefix}/labels','w') as outf: + + os.makedirs(f'{dd}/{prefix}/labels',exist_ok=True) + with open(f'{dd}/{prefix}/labels/write_labels','w') as outf: for gene in j['data']: gid = gene['gene_id'].split(':')[-1] outf.write(f'{prefix}:{gid}\t{gene["gene_name"]}\n') \ No newline at end of file From 728f60497353f5c7bd42269c94fdde0c3efc3b49 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Fri, 25 Apr 2025 16:19:18 -0400 Subject: [PATCH 6/6] Modified more labels as directory. --- src/createcompendia/chemicals.py | 100 ++++++++++++++++----------- src/datahandlers/efo.py | 3 + src/snakefiles/anatomy.snakefile | 4 +- src/snakefiles/cell_line.snakefile | 2 +- src/snakefiles/chemical.snakefile | 20 +++--- src/snakefiles/datacollect.snakefile | 10 +-- 6 files changed, 79 insertions(+), 60 deletions(-) diff --git a/src/createcompendia/chemicals.py b/src/createcompendia/chemicals.py index 6acd293f..0d81e0d8 100644 --- a/src/createcompendia/chemicals.py +++ b/src/createcompendia/chemicals.py @@ -1,4 +1,5 @@ import logging +import os from collections import defaultdict import jsonlines import requests @@ -74,35 +75,40 @@ def build_chemical_umls_relationships(mrconso, idfile,outfile): def build_chemical_rxnorm_relationships(conso, idfile,outfile): umls.build_sets(conso, idfile, outfile, {'MSH': MESH, 'DRUGBANK': DRUGBANK}, cui_prefix=RXCUI) -def write_pubchem_ids(labelfile,smilesfile,outfile): +def write_pubchem_ids(labeldir, smilesfile, outfile): #Trying to be memory efficient here. 
We could just ingest the whole smilesfile which would make this code easier # but since they're already sorted, let's give it a shot - with open(labelfile,'r') as inlabels, gzip.open(smilesfile, 'rt', encoding='utf-8') as insmiles, open(outfile,'w') as outf: - sn = -1 - flag_file_ended = False - for labelline in inlabels: - x = labelline.split('\t')[0] - pn = int(x.split(':')[-1]) - while not flag_file_ended and sn < pn: - line = insmiles.readline() - if line == '': - # Get this: a blank line in readline() means that we've reached the end-of-file. - # (A '\n' would indicate that we've just read a blank line.) - flag_file_ended = True - break - smiline = line.strip().split('\t') - if len(smiline) != 2: - raise RuntimeError(f"Could not parse line from {smilesfile}: '{line}'") - sn = int(smiline[0]) - - if sn == pn: - #We have a smiles for this id - stype = get_type_from_smiles(smiline[1]) - outf.write(f'{x}\t{stype}\n') - else: - #sn > pn, we went past it. No smiles for that - print('no smiles:',x,pn,sn) - outf.write(f'{x}\t{CHEMICAL_ENTITY}\n') + with gzip.open(smilesfile, 'rt', encoding='utf-8') as insmiles, open(outfile, 'w') as outf: + for labelfile in os.listdir(labeldir): + labelpath = os.path.join(labeldir, labelfile) + if not os.path.isfile(labelpath): + continue + with open(labelpath, 'r') as inlabels: + sn = -1 + flag_file_ended = False + for labelline in inlabels: + x = labelline.split('\t')[0] + pn = int(x.split(':')[-1]) + while not flag_file_ended and sn < pn: + line = insmiles.readline() + if line == '': + # Get this: a blank line in readline() means that we've reached the end-of-file. + # (A '\n' would indicate that we've just read a blank line.) 
+ flag_file_ended = True + break + smiline = line.strip().split('\t') + if len(smiline) != 2: + raise RuntimeError(f"Could not parse line from {smilesfile}: '{line}'") + sn = int(smiline[0]) + + if sn == pn: + #We have a smiles for this id + stype = get_type_from_smiles(smiline[1]) + outf.write(f'{x}\t{stype}\n') + else: + #sn > pn, we went past it. No smiles for that + print('no smiles:',x,pn,sn) + outf.write(f'{x}\t{CHEMICAL_ENTITY}\n') def write_mesh_ids(outfile): @@ -209,20 +215,25 @@ def write_drugbank_ids(infile,outfile): outf.write(f'{dbid}\t{CHEMICAL_ENTITY}\n') written.add(x[2]) -def write_chemical_ids_from_labels_and_smiles(labelfile,smifile,outfile): +def write_chemical_ids_from_labels_and_smiles(labeldir,smifile,outfile): smiles = {} with open(smifile,'r') as inf: for line in inf: x = line.strip().split('\t') smiles[x[0]] = x[1] - with open(labelfile,'r') as inf, open(outfile,'w') as outf: - for line in inf: - hmdbid = line.split('\t')[0] - if hmdbid in smiles: - ctype = get_type_from_smiles(smiles[hmdbid]) - else: - ctype = CHEMICAL_ENTITY - outf.write(f'{hmdbid}\t{ctype}\n') + with open(outfile,'w') as outf: + for labelfile in os.listdir(labeldir): + labelpath = os.path.join(labeldir,labelfile) + if not os.path.isfile(labelpath): + continue + with open(labelpath,'r') as inf: + for line in inf: + hmdbid = line.split('\t')[0] + if hmdbid in smiles: + ctype = get_type_from_smiles(smiles[hmdbid]) + else: + ctype = CHEMICAL_ENTITY + outf.write(f'{hmdbid}\t{ctype}\n') def parse_smifile(infile,outfile,smicol,idcol,pref,stripquotes=False): @@ -363,17 +374,22 @@ def make_pubchem_cas_concord(pubchemsynonyms, outfile): if is_cas(x[1]): outf.write(f'{x[0]}\txref\tCAS:{x[1]}\n') -def make_pubchem_mesh_concord(pubcheminput,meshlabels,outfile): +def make_pubchem_mesh_concord(pubcheminput,meshlabelsdir,outfile): mesh_label_to_id={} #Meshlabels has all kinds of stuff. e.g. 
these are both in there: #MESH:D014867 Water #MESH:M0022883 Water #but we only want the ones that are MESH:D... or MESH:C.... - with open(meshlabels,'r') as inf: - for line in inf: - x = line.strip().split('\t') - if x[0].split(':')[-1][0] in ['C','D']: - mesh_label_to_id[x[1]] = x[0] + for meshlabelsfile in os.listdir(meshlabelsdir): + meshlabels = os.path.join(meshlabelsdir,meshlabelsfile) + if not os.path.isfile(meshlabels): + continue + with open(meshlabels,'r') as inf: + for line in inf: + x = line.strip().split('\t') + if x[0].split(':')[-1][0] in ['C','D']: + mesh_label_to_id[x[1]] = x[0] + #The pubchem - mesh pairs are supposed to be ordered in this file such that the # first mapping is the 'best' i.e. the one most frequently reported. # We will only use the first one diff --git a/src/datahandlers/efo.py b/src/datahandlers/efo.py index 03fd59f1..ec81b412 100644 --- a/src/datahandlers/efo.py +++ b/src/datahandlers/efo.py @@ -1,4 +1,5 @@ import logging +import os import re from src.prefixes import EFO,ORPHANET @@ -159,6 +160,8 @@ def get_xrefs(self, iri, outfile): def make_labels(labelfile,synfile): m = EFOgraph() + os.makedirs(os.path.dirname(labelfile),exist_ok=True) + os.makedirs(os.path.dirname(synfile),exist_ok=True) m.pull_EFO_labels_and_synonyms(labelfile,synfile) def make_ids(roots,idfname): diff --git a/src/snakefiles/anatomy.snakefile b/src/snakefiles/anatomy.snakefile index 571225d0..e04b731d 100644 --- a/src/snakefiles/anatomy.snakefile +++ b/src/snakefiles/anatomy.snakefile @@ -69,8 +69,8 @@ rule get_anatomy_umls_relationships: rule anatomy_compendia: input: - labels=expand("{dd}/{ap}/labels",dd=config['download_directory'],ap=config['anatomy_prefixes']), - synonyms=expand("{dd}/{ap}/synonyms",dd=config['download_directory'],ap=config['anatomy_prefixes']), + labels=directory(expand("{dd}/{ap}/labels",dd=config['download_directory'],ap=config['anatomy_prefixes'])), + 
synonyms=directory(expand("{dd}/{ap}/synonyms",dd=config['download_directory'],ap=config['anatomy_prefixes'])), concords=expand("{dd}/anatomy/concords/{ap}",dd=config['intermediate_directory'],ap=config['anatomy_concords']), idlists=expand("{dd}/anatomy/ids/{ap}",dd=config['intermediate_directory'],ap=config['anatomy_ids']), icrdf_filename=config['download_directory']+'/icRDF.tsv', diff --git a/src/snakefiles/cell_line.snakefile b/src/snakefiles/cell_line.snakefile index b8e72965..e9e4593f 100644 --- a/src/snakefiles/cell_line.snakefile +++ b/src/snakefiles/cell_line.snakefile @@ -22,7 +22,7 @@ rule get_clo_ids: rule cell_line_compendia: input: ids=config['intermediate_directory']+"/cell_line/ids/CLO", - labelfile=config['download_directory'] + '/CLO/labels', + labelfile=directory(config['download_directory'] + '/CLO/labels'), synonymfile=config['download_directory'] + '/CLO/synonyms', icrdf_filename=config['download_directory']+'/icRDF.tsv', output: diff --git a/src/snakefiles/chemical.snakefile b/src/snakefiles/chemical.snakefile index 249fe30c..1d85c8c1 100644 --- a/src/snakefiles/chemical.snakefile +++ b/src/snakefiles/chemical.snakefile @@ -28,23 +28,23 @@ rule chemical_mesh_ids: rule chemical_pubchem_ids: input: - infile=config['download_directory']+"/PUBCHEM.COMPOUND/labels", + labelsdir=directory(config['download_directory']+"/PUBCHEM.COMPOUND/labels"), smilesfile=config['download_directory']+"/PUBCHEM.COMPOUND/CID-SMILES.gz" output: outfile=config['intermediate_directory']+"/chemicals/ids/PUBCHEM.COMPOUND" run: #This one is a simple enough transform to do with awk - chemicals.write_pubchem_ids(input.infile,input.smilesfile,output.outfile) + chemicals.write_pubchem_ids(input.labelsdir,input.smilesfile,output.outfile) #"awk '{{print $1\"\tbiolink:ChemicalSubstance\"}}' {input.infile} > {output.outfile}" rule chemical_chembl_ids: input: - labelfile=config['download_directory']+"/CHEMBL.COMPOUND/labels", + 
labeldir=directory(config['download_directory']+"/CHEMBL.COMPOUND/labels"), smifile =config['download_directory'] + "/CHEMBL.COMPOUND/smiles" output: outfile=config['intermediate_directory']+"/chemicals/ids/CHEMBL.COMPOUND" run: - chemicals.write_chemical_ids_from_labels_and_smiles(input.labelfile,input.smifile,output.outfile) + chemicals.write_chemical_ids_from_labels_and_smiles(input.labeldir,input.smifile,output.outfile) rule chemical_gtopdb_ids: input: @@ -56,7 +56,7 @@ rule chemical_gtopdb_ids: rule chemical_kegg_ids: input: - infile=config['download_directory']+"/KEGG.COMPOUND/labels" + infile=config['download_directory']+"/KEGG.COMPOUND/labels/pull_kegg_compound_labels" output: outfile=config['intermediate_directory']+"/chemicals/ids/KEGG.COMPOUND" shell: @@ -73,12 +73,12 @@ rule chemical_unii_ids: rule chemical_hmdb_ids: input: - labelfile=config['download_directory']+"/HMDB/labels", + labeldir=directory(config['download_directory']+"/HMDB/labels"), smifile=config['download_directory'] + "/HMDB/smiles" output: outfile=config['intermediate_directory']+"/chemicals/ids/HMDB" run: - chemicals.write_chemical_ids_from_labels_and_smiles(input.labelfile,input.smifile,output.outfile) + chemicals.write_chemical_ids_from_labels_and_smiles(input.labeldir,input.smifile,output.outfile) rule chemical_drugcentral_ids: input: @@ -159,11 +159,11 @@ rule get_chemical_unichem_relationships: rule get_chemical_pubchem_mesh_concord: input: pubchemfile=config['download_directory'] + '/PUBCHEM.COMPOUND/CID-MeSH', - meshlabels=config['download_directory'] + '/MESH/labels' + meshlabelsdir=directory(config['download_directory'] + '/MESH/labels') output: outfile = config['intermediate_directory'] + '/chemicals/concords/PUBCHEM_MESH' run: - chemicals.make_pubchem_mesh_concord(input.pubchemfile,input.meshlabels,output.outfile) + chemicals.make_pubchem_mesh_concord(input.pubchemfile,input.meshlabelsdir,output.outfile) rule get_chemical_pubchem_cas_concord: input: @@ -201,7 +201,7 @@ rule 
chemical_unichem_concordia: rule untyped_chemical_compendia: input: - labels=expand("{dd}/{ap}/labels",dd=config['download_directory'],ap=config['chemical_labels']), + labeldirs=directory(expand("{dd}/{ap}/labels",dd=config['download_directory'],ap=config['chemical_labels'])), synonyms=expand("{dd}/{ap}/synonyms",dd=config['download_directory'],ap=config['chemical_synonyms']), unichemgroup = config['intermediate_directory']+'/chemicals/partials/UNICHEM', concords = expand('{dd}/chemicals/concords/{cc}',dd=config['intermediate_directory'], cc=config['chemical_concords'] ), diff --git a/src/snakefiles/datacollect.snakefile b/src/snakefiles/datacollect.snakefile index 4bb708dd..8e517601 100644 --- a/src/snakefiles/datacollect.snakefile +++ b/src/snakefiles/datacollect.snakefile @@ -54,8 +54,8 @@ rule get_EFO_labels: input: infile=config['download_directory'] + '/EFO/efo.owl' output: - labelfile=config['download_directory'] + '/EFO/labels', - synonymfile =config['download_directory'] + '/EFO/synonyms' + labelfile=config['download_directory'] + '/EFO/labels/get_EFO_labels', + synonymfile =config['download_directory'] + '/EFO/synonyms/get_EFO_labels' run: efo.make_labels(output.labelfile,output.synonymfile) @@ -89,7 +89,7 @@ rule get_mods_labels: input: expand("{download_directory}/{mod}/GENE-DESCRIPTION-JSON_{mod}.json",download_directory=config['download_directory'], mod=config['mods']), output: - expand("{download_directory}/{mod}/labels",download_directory=config['download_directory'], mod=config['mods']), + expand("{download_directory}/{mod}/labels/write_labels",download_directory=config['download_directory'], mod=config['mods']), run: mods.write_labels(config['download_directory']) @@ -118,7 +118,7 @@ rule get_uniprotkb_labels: sprot_input=config['download_directory']+'/UniProtKB/uniprot_sprot.fasta', trembl_input=config['download_directory']+'/UniProtKB/uniprot_trembl.fasta', output: - outfile=config['download_directory']+'/UniProtKB/labels' + 
outfile=config['download_directory']+'/UniProtKB/labels/pull_uniprot_labels' run: uniprotkb.pull_uniprot_labels(input.sprot_input,input.trembl_input,output.outfile) @@ -471,7 +471,7 @@ rule gtopdb_labels_and_synonyms: rule keggcompound_labels: output: - labelfile=config['download_directory'] + '/KEGG.COMPOUND/labels' + labelfile=config['download_directory'] + '/KEGG.COMPOUND/labels/pull_kegg_compound_labels' run: kegg.pull_kegg_compound_labels(output.labelfile)