100 changes: 58 additions & 42 deletions src/createcompendia/chemicals.py
@@ -1,4 +1,5 @@
import logging
import os
from collections import defaultdict
import jsonlines
import requests
@@ -74,35 +75,40 @@ def build_chemical_umls_relationships(mrconso, idfile,outfile):
def build_chemical_rxnorm_relationships(conso, idfile,outfile):
umls.build_sets(conso, idfile, outfile, {'MSH': MESH, 'DRUGBANK': DRUGBANK}, cui_prefix=RXCUI)

def write_pubchem_ids(labelfile,smilesfile,outfile):
def write_pubchem_ids(labeldir, smilesfile, outfile):
#Trying to be memory efficient here. We could just ingest the whole smilesfile which would make this code easier
# but since they're already sorted, let's give it a shot
with open(labelfile,'r') as inlabels, gzip.open(smilesfile, 'rt', encoding='utf-8') as insmiles, open(outfile,'w') as outf:
sn = -1
flag_file_ended = False
for labelline in inlabels:
x = labelline.split('\t')[0]
pn = int(x.split(':')[-1])
while not flag_file_ended and sn < pn:
line = insmiles.readline()
if line == '':
# Get this: an empty string from readline() means that we've reached end-of-file.
# (A '\n' would indicate that we've just read a blank line.)
flag_file_ended = True
break
smiline = line.strip().split('\t')
if len(smiline) != 2:
raise RuntimeError(f"Could not parse line from {smilesfile}: '{line}'")
sn = int(smiline[0])

if sn == pn:
#We have a smiles for this id
stype = get_type_from_smiles(smiline[1])
outf.write(f'{x}\t{stype}\n')
else:
#sn > pn, we went past it. No smiles for that
print('no smiles:',x,pn,sn)
outf.write(f'{x}\t{CHEMICAL_ENTITY}\n')
with gzip.open(smilesfile, 'rt', encoding='utf-8') as insmiles, open(outfile, 'w') as outf:
for labelfile in os.listdir(labeldir):
labelpath = os.path.join(labeldir, labelfile)
if not os.path.isfile(labelpath):
continue
with open(labelpath, 'r') as inlabels:
sn = -1
flag_file_ended = False
for labelline in inlabels:
x = labelline.split('\t')[0]
pn = int(x.split(':')[-1])
while not flag_file_ended and sn < pn:
line = insmiles.readline()
if line == '':
# Get this: an empty string from readline() means that we've reached end-of-file.
# (A '\n' would indicate that we've just read a blank line.)
flag_file_ended = True
break
smiline = line.strip().split('\t')
if len(smiline) != 2:
raise RuntimeError(f"Could not parse line from {smilesfile}: '{line}'")
sn = int(smiline[0])

if sn == pn:
#We have a smiles for this id
stype = get_type_from_smiles(smiline[1])
outf.write(f'{x}\t{stype}\n')
else:
#sn > pn, we went past it. No smiles for that
print('no smiles:',x,pn,sn)
outf.write(f'{x}\t{CHEMICAL_ENTITY}\n')


def write_mesh_ids(outfile):
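A note on the rewritten loop above: the SMILES stream is consumed forward-only, so this is a single-pass merge join over two ID-sorted inputs. That only holds if the label files are visited in ascending-CID order — os.listdir() makes no ordering promise — and an ID already read past at a file boundary is lost, so the production loop may want sorted(os.listdir(labeldir)) and some care at the seams. A minimal standalone sketch of the merge itself, with hypothetical file names, assuming both inputs are tab-separated and sorted by numeric ID:

import gzip

def merge_labels_with_smiles(labelpath, smilespath, outpath):
    # One sorted label file ('CURIE<TAB>label') against one sorted,
    # gzipped 'CID<TAB>SMILES' file, in a single forward pass over each.
    with open(labelpath) as labels, gzip.open(smilespath, 'rt') as smiles, \
            open(outpath, 'w') as out:
        sn, cur_smiles, ended = -1, None, False
        for labelline in labels:
            curie = labelline.split('\t')[0]
            pn = int(curie.split(':')[-1])   # numeric suffix, e.g. PUBCHEM.COMPOUND:2244 -> 2244
            while not ended and sn < pn:
                line = smiles.readline()
                if line == '':               # empty string: end-of-file
                    ended = True
                    break
                sn_str, cur_smiles = line.rstrip('\n').split('\t')
                sn = int(sn_str)
            if sn == pn:
                out.write(f'{curie}\t{cur_smiles}\n')   # IDs aligned: we have a SMILES
            # else: the stream has already passed pn, so this ID has no SMILES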
@@ -209,20 +215,25 @@ def write_drugbank_ids(infile,outfile):
outf.write(f'{dbid}\t{CHEMICAL_ENTITY}\n')
written.add(x[2])

def write_chemical_ids_from_labels_and_smiles(labelfile,smifile,outfile):
def write_chemical_ids_from_labels_and_smiles(labeldir,smifile,outfile):
smiles = {}
with open(smifile,'r') as inf:
for line in inf:
x = line.strip().split('\t')
smiles[x[0]] = x[1]
with open(labelfile,'r') as inf, open(outfile,'w') as outf:
for line in inf:
hmdbid = line.split('\t')[0]
if hmdbid in smiles:
ctype = get_type_from_smiles(smiles[hmdbid])
else:
ctype = CHEMICAL_ENTITY
outf.write(f'{hmdbid}\t{ctype}\n')
with open(outfile,'w') as outf:
for labelfile in os.listdir(labeldir):
labelpath = os.path.join(labeldir,labelfile)
if not os.path.isfile(labelpath):
continue
with open(labelpath,'r') as inf:
for line in inf:
hmdbid = line.split('\t')[0]
if hmdbid in smiles:
ctype = get_type_from_smiles(smiles[hmdbid])
else:
ctype = CHEMICAL_ENTITY
outf.write(f'{hmdbid}\t{ctype}\n')


def parse_smifile(infile,outfile,smicol,idcol,pref,stripquotes=False):
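The labelfile → labeldir change above is the same walk-and-filter idiom that now appears in write_pubchem_ids and in gene.py's write_mods_ids. If the pattern keeps spreading, a small shared generator could hold it in one place — a sketch only, with a hypothetical helper name:

import os

def iter_label_lines(labeldir):
    # Yield every line from every regular file in labeldir, skipping
    # subdirectories and anything else that is not a plain file.
    for name in sorted(os.listdir(labeldir)):    # sorted() for a stable order
        path = os.path.join(labeldir, name)
        if not os.path.isfile(path):
            continue
        with open(path, 'r') as inf:
            yield from inf

With that, the body of write_chemical_ids_from_labels_and_smiles collapses to a single for line in iter_label_lines(labeldir): loop.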
@@ -363,17 +374,22 @@ def make_pubchem_cas_concord(pubchemsynonyms, outfile):
if is_cas(x[1]):
outf.write(f'{x[0]}\txref\tCAS:{x[1]}\n')

def make_pubchem_mesh_concord(pubcheminput,meshlabels,outfile):
def make_pubchem_mesh_concord(pubcheminput,meshlabelsdir,outfile):
mesh_label_to_id={}
#Meshlabels has all kinds of stuff. e.g. these are both in there:
#MESH:D014867 Water
#MESH:M0022883 Water
#but we only want the ones that are MESH:D... or MESH:C....
with open(meshlabels,'r') as inf:
for line in inf:
x = line.strip().split('\t')
if x[0].split(':')[-1][0] in ['C','D']:
mesh_label_to_id[x[1]] = x[0]
for meshlabelsfile in os.listdir(meshlabelsdir):
meshlabels = os.path.join(meshlabelsdir,meshlabelsfile)
if not os.path.isfile(meshlabels):
continue
with open(meshlabels,'r') as inf:
for line in inf:
x = line.strip().split('\t')
if x[0].split(':')[-1][0] in ['C','D']:
mesh_label_to_id[x[1]] = x[0]

#The pubchem - mesh pairs are supposed to be ordered in this file such that the
# first mapping is the 'best' i.e. the one most frequently reported.
# We will only use the first one
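The body below this comment is truncated in the diff, but per the comment it keeps only the first (best) MESH mapping seen for each PubChem CID. The usual shape of that filter, sketched under assumed column layout (CID first, MeSH label second, tab-separated):

seen = set()
with open(pubcheminput, 'r') as inf, open(outfile, 'w') as outf:
    for line in inf:
        pubchem_id, mesh_label = line.strip().split('\t')[:2]
        if pubchem_id in seen:
            continue                  # a better mapping was already written
        seen.add(pubchem_id)
        if mesh_label in mesh_label_to_id:
            outf.write(f'{pubchem_id}\txref\t{mesh_label_to_id[mesh_label]}\n')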
4 changes: 2 additions & 2 deletions src/createcompendia/drugchemical.py
@@ -5,7 +5,7 @@
from src.categories import (CHEMICAL_ENTITY, DRUG, MOLECULAR_MIXTURE, FOOD, COMPLEX_MOLECULAR_MIXTURE,
SMALL_MOLECULE, NUCLEIC_ACID_ENTITY, MOLECULAR_ENTITY, FOOD_ADDITIVE,
ENVIRONMENTAL_FOOD_CONTAMINANT, PROCESSED_MATERIAL, CHEMICAL_MIXTURE, POLYPEPTIDE)
from src.babel_utils import glom, get_numerical_curie_suffix
from src.babel_utils import glom, get_numerical_curie_suffix, make_local_name
from collections import defaultdict
import os,json

@@ -378,7 +378,7 @@ def build_conflation(manual_concord_filename, rxn_concord, umls_concord, pubchem
glom(gloms, pairs_to_be_glommed)

# Set up a NodeFactory.
nodefactory = NodeFactory('', get_config()['biolink_version'])
nodefactory = NodeFactory(make_local_name(''), get_config()['biolink_version'])

# Write out all the resulting cliques.
written = set()
14 changes: 10 additions & 4 deletions src/createcompendia/gene.py
@@ -18,10 +18,16 @@

def write_mods_ids(dd,id,modlist):
for mod in modlist:
with open(f'{dd}/{mod}/labels','r') as inf, open(f'{id}/gene/ids/{mod}','w') as outf:
for line in inf:
x = line.split('\t')[0]
outf.write(f'{x}\n')
with open(f'{id}/gene/ids/{mod}','w') as outf:
for labelfile in os.listdir(f'{dd}/{mod}/labels'):
labelfile_path = f'{dd}/{mod}/labels/{labelfile}'
if not os.path.isfile(labelfile_path):
# Skip anything in the labels directory that isn't a regular file.
continue
with open(labelfile_path,'r') as inf:
for line in inf:
x = line.split('\t')[0]
outf.write(f'{x}\n')

def build_gene_ensembl_relationships(ensembl_dir, outfile):
"""Loop over all the ensembl species. Find any protein-coding gene"""
3 changes: 2 additions & 1 deletion src/createcompendia/leftover_umls.py
@@ -6,6 +6,7 @@
from snakemake.logging import Logger
from bmt import Toolkit

from src.babel_utils import make_local_name
from src.node import NodeFactory
from src.datahandlers import umls
from src.prefixes import UMLS
@@ -206,7 +207,7 @@ def umls_type_to_biolink_type(umls_tui):
reportf.write(f"Collected synonyms for {len(synonyms_by_id)} UMLS IDs into the leftover UMLS synonyms file.\n")

# Write out synonyms to synonym file.
node_factory = NodeFactory('babel_downloads/UMLS/labels', biolink_version)
node_factory = NodeFactory(make_local_name(''), biolink_version)
count_synonym_objs = 0
with jsonlines.open(umls_synonyms, 'w') as umls_synonymsf:
for id in synonyms_by_id:
2 changes: 1 addition & 1 deletion src/datahandlers/chembl.py
@@ -1,5 +1,5 @@
from src.prefixes import CHEMBLCOMPOUND
from src.babel_utils import pull_via_ftp, make_local_name
from src.babel_utils import pull_via_ftp
import ftplib
import pyoxigraph

1 change: 0 additions & 1 deletion src/datahandlers/clo.py
@@ -4,7 +4,6 @@
from src.prefixes import CLO
from src.categories import CELL_LINE
from src.babel_utils import pull_via_urllib
from src.babel_utils import make_local_name
from src.util import Text, LoggingUtil
import pyoxigraph

30 changes: 4 additions & 26 deletions src/datahandlers/datacollect.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from src.ubergraph import UberGraph
from src.babel_utils import make_local_name, pull_via_ftp
from src.babel_utils import make_local_name, pull_via_ftp, pull_via_urllib
from collections import defaultdict
import os, gzip
from json import loads,dumps
@@ -8,7 +8,7 @@ def pull_pubchem_labels():
print('LABEL PUBCHEM')
f_name = 'CID-Title.gz'
cname = pull_via_ftp('ftp.ncbi.nlm.nih.gov','/pubchem/Compound/Extras/', f_name, outfilename=f_name)
fname = make_local_name('labels', subpath='PUBCHEM.COMPOUND')
fname = make_local_name('pull_pubchem_labels', subpath='PUBCHEM.COMPOUND/labels')
with open(fname, 'w') as outf, gzip.open(cname,mode='rt',encoding='latin-1') as inf:
for line in inf:
x = line.strip().split('\t')
@@ -17,7 +17,7 @@ def pull_pubchem_synonyms():
def pull_pubchem_synonyms():
f_name = 'CID-Synonym-filtered.gz'
sname = pull_via_ftp('ftp.ncbi.nlm.nih.gov', '/pubchem/Compound/Extras/', f_name, outfilename=f_name)
fname = make_local_name('synonyms', subpath='PUBCHEM.COMPOUND')
fname = make_local_name('pull_pubchem_synonyms', subpath='PUBCHEM.COMPOUND/synonyms')
with open(fname, 'w') as outf, gzip.open(sname,mode='rt',encoding='latin-1') as inf:
for line in inf:
x = line.strip().split('\t')
@@ -31,28 +31,6 @@ def pull_pubchem():
pull_pubchem_labels()
pull_pubchem_synonyms()

def pull_hgnc():
data = pull_via_ftp('ftp.ebi.ac.uk', '/pub/databases/genenames/new/json', 'hgnc_complete_set.json')
hgnc_json = loads(data)
lname = make_local_name('labels', subpath='HGNC')
sname = make_local_name('synonyms', subpath='HGNC')
with open(lname,'w') as lfile, open(sname,'w') as sfile:
for gene in hgnc_json['response']['docs']:
hgnc_id =gene['hgnc_id']
symbol = gene['symbol']
lfile.write(f'{hgnc_id}\t{symbol}\n')
name = gene['name']
sfile.write(f'{hgnc_id}\thttp://www.geneontology.org/formats/oboInOwl#hasExactSynonym\t{name}\n')
if 'alias_symbol' in gene:
alias_symbols = gene['alias_symbol']
for asym in alias_symbols:
sfile.write(f'{hgnc_id}\thttp://www.geneontology.org/formats/oboInOwl#hasRelatedSynonym\t{asym}\n')
if 'alias_name' in gene:
alias_names = gene['alias_name']
for asym in alias_names:
sfile.write(f'{hgnc_id}\thttp://www.geneontology.org/formats/oboInOwl#hasRelatedSynonym\t{asym}\n')


def pull_prot(which,refresh):
#swissname = pull_via_ftplib('ftp.uniprot.org','/pub/databases/uniprot/current_release/knowledgebase/complete/',f'uniprot_{which}.fasta.gz',decompress_data=True,outfilename=f'uniprot_{which}.fasta')
if refresh:
@@ -82,7 +60,7 @@ def pull_prot(which,refresh):

def pull_prots(refresh_swiss=False,refresh_trembl=False):
swiss,labels = pull_prot('sprot',refresh_swiss)
fname = make_local_name('labels', subpath='UNIPROTKB')
fname = make_local_name('pull_prots', subpath='UNIPROTKB/labels')
with open(fname,'w') as synonyms:
for k,v in labels.items():
synonyms.write(f'{k}\t{v}\n')
3 changes: 1 addition & 2 deletions src/datahandlers/ec.py
@@ -1,9 +1,8 @@
from src.prefixes import EC
from src.categories import MOLECULAR_ACTIVITY
from src.babel_utils import pull_via_urllib
from src.babel_utils import make_local_name, pull_via_ftp
from src.babel_utils import make_local_name
import pyoxigraph
from collections import defaultdict


def pull_ec():
3 changes: 3 additions & 0 deletions src/datahandlers/efo.py
@@ -1,4 +1,5 @@
import logging
import os
import re

from src.prefixes import EFO,ORPHANET
@@ -159,6 +160,8 @@ def get_xrefs(self, iri, outfile):

def make_labels(labelfile,synfile):
m = EFOgraph()
os.makedirs(os.path.dirname(labelfile),exist_ok=True)
os.makedirs(os.path.dirname(synfile),exist_ok=True)
m.pull_EFO_labels_and_synonyms(labelfile,synfile)

def make_ids(roots,idfname):
4 changes: 2 additions & 2 deletions src/datahandlers/hgnc.py
@@ -12,8 +12,8 @@ def pull_hgnc():
def pull_hgnc_labels_and_synonyms(infile):
with open(infile,'r') as data:
hgnc_json = json.load(data)
lname = make_local_name('labels', subpath='HGNC')
sname = make_local_name('synonyms', subpath='HGNC')
lname = make_local_name('pull_hgnc_labels_and_synonyms', subpath='HGNC/labels')
sname = make_local_name('pull_hgnc_labels_and_synonyms', subpath='HGNC/synonyms')
with open(lname,'w') as lfile, open(sname,'w') as sfile:
for gene in hgnc_json['response']['docs']:
hgnc_id =gene['hgnc_id']
4 changes: 1 addition & 3 deletions src/datahandlers/hgncfamily.py
@@ -1,6 +1,4 @@
from pronto.utils.io import decompress

from src.babel_utils import make_local_name, pull_via_ftp, pull_via_urllib
from src.babel_utils import pull_via_urllib
from src.prefixes import HGNCFAMILY

def pull_hgncfamily():
4 changes: 2 additions & 2 deletions src/datahandlers/mesh.py
@@ -105,7 +105,7 @@ def pull_mesh_labels(self):
WHERE { ?term rdfs:label ?label }
ORDER BY ?term
"""
ofname = make_local_name('labels', subpath='MESH')
ofname = make_local_name('pull_mesh_labels', subpath='MESH/labels')
qres = self.m.query(s)
with open(ofname, 'w', encoding='utf8') as outf:
for row in list(qres):
@@ -148,7 +148,7 @@ def write_ids(meshmap,outfile,order=['biolink:CellularComponent','biolink:Cell',


# ifname = make_local_name('mesh.nt', subpath='MESH')
# ofname = make_local_name('labels', subpath='MESH')
# ofname = make_local_name('write_ids', subpath='MESH/labels')
# badlines = 0
# with open(ofname, 'w') as outf, open(ifname,'r') as data:
# for line in data:
4 changes: 3 additions & 1 deletion src/datahandlers/mods.py
@@ -22,7 +22,9 @@ def write_labels(dd):
for mod,prefix in modmap.items():
with open(f'{dd}/{prefix}/GENE-DESCRIPTION-JSON_{prefix}.json','r') as inf:
j = json.load(inf)
with open(f'{dd}/{prefix}/labels','w') as outf:

os.makedirs(f'{dd}/{prefix}/labels',exist_ok=True)
with open(f'{dd}/{prefix}/labels/write_labels','w') as outf:
for gene in j['data']:
gid = gene['gene_id'].split(':')[-1]
outf.write(f'{prefix}:{gid}\t{gene["gene_name"]}\n')
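This hunk shows the convention the whole PR moves to: what used to be a single 'labels' file per source becomes a 'labels' directory, with each producing function writing its own file inside, named after itself. Assuming make_local_name simply resolves a name under the configured babel_downloads tree (its usage elsewhere in this diff points that way), the layouts compare roughly as follows — a sketch, not the helper's actual code:

# Before: one flat file per source
#   make_local_name('labels', subpath='MESH')
#     -> babel_downloads/MESH/labels                      (a single file)
#
# After: a directory of per-producer files
#   make_local_name('pull_mesh_labels', subpath='MESH/labels')
#     -> babel_downloads/MESH/labels/pull_mesh_labels

# Consumers then iterate the directory instead of opening one file:
import os
labels_dir = 'babel_downloads/MESH/labels'   # hypothetical path; the real one comes from config
for name in os.listdir(labels_dir):
    path = os.path.join(labels_dir, name)
    if not os.path.isfile(path):
        continue
    with open(path, 'r') as inf:
        for line in inf:
            curie, label = line.rstrip('\n').split('\t')[:2]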
4 changes: 2 additions & 2 deletions src/datahandlers/ncbigene.py
@@ -10,8 +10,8 @@ def pull_ncbigene(filenames):
def pull_ncbigene_labels_synonyms_and_taxa():
# File format described here: https://ftp.ncbi.nlm.nih.gov/gene/DATA/README
ifname = make_local_name('gene_info.gz', subpath='NCBIGene')
labelname = make_local_name('labels', subpath='NCBIGene')
synname = make_local_name('synonyms', subpath='NCBIGene')
labelname = make_local_name('pull_ncbigene_labels_synonyms_and_taxa', subpath='NCBIGene/labels')
synname = make_local_name('pull_ncbigene_labels_synonyms_and_taxa', subpath='NCBIGene/synonyms')
taxaname = make_local_name('taxa', subpath='NCBIGene')
bad_gene_types = {'biological-region', 'other', 'unknown'}
with gzip.open(ifname, 'r') as inf, \
6 changes: 3 additions & 3 deletions src/datahandlers/obo.py
@@ -24,7 +24,7 @@ def pull_uber_labels(expected):
ldict[p].add( ( unit['iri'], unit['label'] ) )
for p in ldict:
if p not in ['http','ro'] and not p.startswith('t') and not '#' in p:
fname = make_local_name('labels',subpath=p)
fname = make_local_name('pull_uber_labels',subpath=p + "/labels")
with open(fname,'w') as outf:
for unit in ldict[p]:
outf.write(f'{unit[0]}\t{unit[1]}\n')
@@ -39,7 +39,7 @@ def pull_uber_descriptions(expected):
ldict[p].add( ( unit['iri'], unit['description'] ) )
for p in ldict:
if p not in ['http','ro'] and not p.startswith('t') and not '#' in p:
fname = make_local_name('descriptions',subpath=p)
fname = make_local_name('pull_uber_descriptions',subpath=p + "/descriptions")
with open(fname,'w') as outf:
for unit in ldict[p]:
outf.write(f'{unit[0]}\t{unit[1]}\n')
@@ -57,7 +57,7 @@ def pull_uber_synonyms(expected):
# we are going to make some zero-length files for it
for p in expected:
if p not in ['http','ro'] and not p.startswith('t') and not '#' in p:
fname = make_local_name('synonyms',subpath=p)
fname = make_local_name('pull_uber_synonyms',subpath=p + "/synonyms")
with open(fname,'w') as outf:
for unit in ldict[p]:
outf.write(f'{unit[0]}\t{unit[1]}\t{unit[2]}\n')
2 changes: 1 addition & 1 deletion src/datahandlers/pantherfamily.py
@@ -1,4 +1,4 @@
from src.babel_utils import make_local_name, pull_via_ftp
from src.babel_utils import pull_via_ftp
from src.prefixes import PANTHERFAMILY

def pull_pantherfamily():