Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,14 +1,15 @@
biopython
bmt
datrie
jsonlines
pandas
more-itertools
pyoxigraph~=0.2.5
#pyoxigraph~=0.2.5
pyoxigraph~=0.4.11
psycopg2-binary
pytest
pytest-cov
python-Levenshtein-wheels
#python-Levenshtein-wheels
python-levenshtein
pyyaml
requests
snakemake
Expand Down
9 changes: 6 additions & 3 deletions src/datahandlers/chembl.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
import os.path
import pathlib

from src.prefixes import CHEMBLCOMPOUND
from src.babel_utils import pull_via_ftp, make_local_name
import ftplib
Expand Down Expand Up @@ -48,11 +51,11 @@ def __init__(self,ifname,ccofile):
from datetime import datetime as dt
print('loading chembl')
start = dt.now()
self.m= pyoxigraph.MemoryStore()
self.m= pyoxigraph.Store()
with open(ccofile,'rb') as inf:
self.m.load(inf,'application/turtle')
self.m.bulk_load(input=inf, format=pyoxigraph.RdfFormat.TURTLE)
with open(ifname,'rb') as inf:
self.m.load(inf,'application/turtle')
self.m.bulk_load(input=inf, format=pyoxigraph.RdfFormat.TURTLE)
end = dt.now()
print('loading complete')
print(f'took {end-start}')
Expand Down
4 changes: 2 additions & 2 deletions src/datahandlers/clo.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,9 @@ def __init__(self,ifname):
from datetime import datetime as dt
print('loading CLO')
start = dt.now()
self.m= pyoxigraph.MemoryStore()
self.m= pyoxigraph.Store()
with open(ifname,'rb') as inf:
self.m.load(inf,'application/rdf+xml',base_iri='http://example.org/')
self.m.bulk_load(input=inf, format=pyoxigraph.RdfFormat.RDF_XML, base_iri='http://example.org/')
end = dt.now()
print('loading complete')
print(f'took {end-start}')
Expand Down
2 changes: 1 addition & 1 deletion src/datahandlers/datacollect.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from src.ubergraph import UberGraph
from src.babel_utils import make_local_name, pull_via_ftp
from src.babel_utils import make_local_name, pull_via_ftp, pull_via_urllib
from collections import defaultdict
import os, gzip
from json import loads,dumps
Expand Down
4 changes: 2 additions & 2 deletions src/datahandlers/ec.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,9 @@ def __init__(self):
from datetime import datetime as dt
print('loading EC')
start = dt.now()
self.m= pyoxigraph.MemoryStore()
self.m= pyoxigraph.Store()
with open(ifname,'rb') as inf:
self.m.load(inf,'application/rdf+xml',base_iri='http://example.org/')
self.m.bulk_load(input=inf,format=pyoxigraph.RdfFormat.RDF_XML,base_iri='http://example.org/')
end = dt.now()
print('loading complete')
print(f'took {end-start}')
Expand Down
5 changes: 3 additions & 2 deletions src/datahandlers/efo.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,9 @@ def __init__(self):
from datetime import datetime as dt
print('loading EFO')
start = dt.now()
self.m= pyoxigraph.MemoryStore()
self.m= pyoxigraph.Store()
with open(ifname,'rb') as inf:
self.m.load(inf,'application/rdf+xml',base_iri='http://example.org/')
self.m.bulk_load(input=inf,format=pyoxigraph.RdfFormat.RDF_XML,base_iri='http://example.org/')
end = dt.now()
print('loading complete')
print(f'took {end-start}')
Expand Down Expand Up @@ -120,6 +120,7 @@ def get_exacts(self, iri, outfile):
outfile.write(f"{iri}\tskos:exactMatch\t{otherid}\n")
nwrite += 1
return nwrite

def get_xrefs(self, iri, outfile):
query = f"""
prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
Expand Down
27 changes: 14 additions & 13 deletions src/datahandlers/hgncfamily.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import csv

from pronto.utils.io import decompress

from src.babel_utils import make_local_name, pull_via_ftp, pull_via_urllib
Expand All @@ -11,19 +13,18 @@ def pull_hgncfamily():
decompress=False,
subpath=HGNCFAMILY)

def pull_labels(infile,outfile, metadata_yaml):
with open(infile,'r') as inf:
data = inf.read().strip()
lines = data.split('\n')
with open(outfile,'w') as outf:
#skip header
for line in lines[1:]:
parts = line.split(',')
if len(parts) < 10:
continue
i = f"{HGNCFAMILY}:{parts[0][1:-1]}"
l = parts[2][1:-1]
outf.write(f'{i}\t{l}\n')
def pull_labels(infile, labelsfile, descriptionsfile, metadata_yaml):
with open(infile, 'r') as inf, open(labelsfile, 'w') as labelsf, open(descriptionsfile, 'w') as descriptionsf:
reader = csv.DictReader(inf)
for row in reader:
curie = f"{HGNCFAMILY}:{row['id']}"
name = row['name']
description = row['desc_comment']
# There is also a 'desc_label' field, but this seems to be pretty similar to 'name'.
labelsf.write(f'{curie}\t{name}\n')

if description and description != "NULL":
descriptionsf.write(f'{curie}\t{description}\n')

write_metadata(
metadata_yaml,
Expand Down
4 changes: 2 additions & 2 deletions src/datahandlers/mesh.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,9 @@ def __init__(self):
from datetime import datetime as dt
print('loading mesh.nt')
start = dt.now()
self.m= pyoxigraph.MemoryStore()
self.m= pyoxigraph.Store()
with open(ifname,'rb') as inf:
self.m.load(inf,'application/n-triples')
self.m.bulk_load(input=inf,format=pyoxigraph.RdfFormat.N_TRIPLES)
end = dt.now()
print('loading complete')
print(f'took {end-start}')
Expand Down
8 changes: 8 additions & 0 deletions src/datahandlers/mods.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,15 @@
def pull_mods():
for mod in mods:
subp = modmap[mod]

# We get the downloads from https://www.alliancegenome.org/downloads#gene-descriptions.
# Versioned copies of these files are also available, for example:
# - https://download.alliancegenome.org/8.1.0/GENE-DESCRIPTION-JSON/SGD/GENE-DESCRIPTION-JSON_SGD_9.json.gz
# which could be downloaded with:
# origname = pull_via_urllib(f"https://download.alliancegenome.org/8.1.0/GENE-DESCRIPTION-JSON/{mod}/",f'GENE-DESCRIPTION-JSON_{mod}_9.json.gz', subpath=subp)
#
# However, the following URL returns the latest version of this file for each model organism, so we use it instead.
origname = pull_via_urllib('https://fms.alliancegenome.org/download/',f'GENE-DESCRIPTION-JSON_{mod}.json.gz',subpath=subp)

# This should be fine, but for the makefile it's nice if the directory in which this goes is the same as the {mod} in the filename,
# and we'd like those directories to be named after the prefixes.
if mod != modmap[mod]:
Expand Down
16 changes: 6 additions & 10 deletions src/datahandlers/pantherfamily.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,24 +9,20 @@ def pull_pantherfamily():
# - http://data.pantherdb.org/ftp/sequence_classifications/current_release/PANTHER_Sequence_Classification_files/

def pull_labels(infile,outfile, metadata_yaml):
with open(infile,'r') as inf:
data = inf.read()
lines = data.strip().split('\n')
SUBFAMILY_COLUMN = 3
MAINFAMILY_NAME_COLUMN = 4
SUBFAMILY_NAME_COLUMN = 5
panther_families=[]
labels = {}
done = set()
with open(outfile,'w') as labelf:
for line in lines[1:]:
with open(infile,'r') as inf, open(outfile,'w') as labelf:
for raw_line in inf:
line = raw_line.strip()
parts = line.split('\t')
if len(parts) < 5:
continue
sf = parts[SUBFAMILY_COLUMN]
mf = sf.split(':')[0]
mfname = parts[MAINFAMILY_NAME_COLUMN]
sfname = parts[SUBFAMILY_NAME_COLUMN]
mf = sf.split(':')[0] # PTHR10845:SF155 -> PTHR10845
mfname = parts[MAINFAMILY_NAME_COLUMN] # REGULATOR OF G PROTEIN SIGNALING
sfname = parts[SUBFAMILY_NAME_COLUMN] # REGULATOR OF G-PROTEIN SIGNALING 18
if mf not in done:
main_family = f'{PANTHERFAMILY}:{mf}'
#panther_families.append(main_family)
Expand Down
4 changes: 2 additions & 2 deletions src/datahandlers/rhea.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,9 @@ def __init__(self):
from datetime import datetime as dt
print('loading rhea')
start = dt.now()
self.m= pyoxigraph.MemoryStore()
self.m= pyoxigraph.Store()
with open(ifname,'rb') as inf:
self.m.load(inf,'application/rdf+xml')
self.m.bulk_load(input=inf,format=pyoxigraph.RdfFormat.RDF_XML)
end = dt.now()
print('loading complete')
print(f'took {end-start}')
Expand Down
1 change: 0 additions & 1 deletion src/datahandlers/unichem.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
data_sources: dict = {'1': CHEMBLCOMPOUND, '2': DRUGBANK, '4': GTOPDB, '6': KEGGCOMPOUND, '7': CHEBI, '14': UNII,
'18': HMDB, '22': PUBCHEMCOMPOUND, '34': DRUGCENTRAL}


def pull_unichem():
""" Download UniChem files. """
pull_via_urllib('http://ftp.ebi.ac.uk/pub/databases/chembl/UniChem/data/table_dumps/', 'structure.tsv.gz', decompress=False, subpath='UNICHEM')
Expand Down
9 changes: 5 additions & 4 deletions src/snakefiles/datacollect.snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -277,12 +277,13 @@ rule get_hgncfamily:

rule get_hgncfamily_labels:
input:
infile=rules.get_hgncfamily.output.outfile
infile=config['download_directory'] + '/HGNC.FAMILY/family.csv'
output:
outfile = config['download_directory'] + '/HGNC.FAMILY/labels',
labelsfile = config['download_directory'] + '/HGNC.FAMILY/labels',
descriptionsfile = config['download_directory'] + '/HGNC.FAMILY/descriptions',
metadata_yaml = config['download_directory'] + '/HGNC.FAMILY/metadata.yaml',
run:
hgncfamily.pull_labels(input.infile,output.outfile, output.metadata_yaml)
hgncfamily.pull_labels(input.infile, output.labelsfile, output.descriptionsfile, output.metadata_yaml)

### PANTHER.FAMILY

Expand All @@ -294,7 +295,7 @@ rule get_pantherfamily:

rule get_pantherfamily_labels:
input:
infile=rules.get_pantherfamily.output.outfile
infile=config['download_directory'] + '/PANTHER.FAMILY/family.csv'
output:
outfile = config['download_directory'] + '/PANTHER.FAMILY/labels',
metadata_yaml = config['download_directory'] + '/PANTHER.FAMILY/metadata.yaml',
Expand Down
54 changes: 29 additions & 25 deletions tests/test_glom.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,54 +3,58 @@

"""glom is a tool that looks at list of sets of values and combines them together if they share members"""


def test_uberon():
uberon=[('UBERON:123',)]
dict={}
glom(dict,uberon,unique_prefixes='UBERON')
uber2 = [set(['UBERON:123','SOME:other'])]
glom(dict,uber2,unique_prefixes='UBERON')
uberon = [("UBERON:123",)]
dict = {}
glom(dict, uberon, unique_prefixes="UBERON")
uber2 = [{"UBERON:123", "SOME:other"}]
glom(dict, uber2, unique_prefixes="UBERON")
print(dict)


def test_simple():
"""Given 3 sets, 2 of which share a member, output 2 sets, with the sharing sets combined"""
d = {}
eqs = [('1','2'), ('2','3'), ('4','5')]
glom(d,eqs)
eqs = [("1", "2"), ("2", "3"), ("4", "5")]
glom(d, eqs)
print(f"{d}")
assert len(d) == 5
assert d['1'] == d['2'] == d['3'] == {'1','2','3'}
assert d['4'] == d['5'] == {'4','5'}
assert d["1"] == d["2"] == d["3"] == {"1", "2", "3"}
assert d["4"] == d["5"] == {"4", "5"}


def test_two_calls():
"""Test using glom iteratively. The first call joins the first two sets, then the second call joins
the next two and the new set."""
d = {}
eqs = [('1','2'), ('2','3'), ('4','5'), ('6','7')]
oeqs = [('5','7')]
glom(d,eqs)
glom(d,oeqs)
assert d['1']==d['2']==d['3']=={'1','2','3'}
assert d['4']==d['5']==d['6']==d['7']=={'4','5','6','7'}
eqs = [("1", "2"), ("2", "3"), ("4", "5"), ("6", "7")]
oeqs = [("5", "7")]
glom(d, eqs)
glom(d, oeqs)
assert d["1"] == d["2"] == d["3"] == {"1", "2", "3"}
assert d["4"] == d["5"] == d["6"] == d["7"] == {"4", "5", "6", "7"}


def test_sets():
"""Test using set() as opposed to {}"""
d = {}
eqs = [{'1','2'}, set(['2','3']), set(['4','5']), set(['6','7'])]
oeqs = [{'5','7'}]
glom(d,eqs)
glom(d,oeqs)
assert d['1']==d['2']==d['3']=={'1','2','3'}
assert d['4']==d['5']==d['6']==d['7']=={'4','5','6','7'}
eqs = [{"1", "2"}, {"2", "3"}, {"4", "5"}, {"6", "7"}]
oeqs = [{"5", "7"}]
glom(d, eqs)
glom(d, oeqs)
assert d["1"] == d["2"] == d["3"] == {"1", "2", "3"}
assert d["4"] == d["5"] == d["6"] == d["7"] == {"4", "5", "6", "7"}


def test_bigger_sets():
"""Test when the sets have more than two members.
As of recent builds, we no longer expect this to work:
glom now only operates on new pairwise sets, so larger sets should raise a ValueError."""
d = {}
eqs = [{'1','2','3'}, {'4','5','6'} ]
eqs = [{"1", "2", "3"}, {"4", "5", "6"}]
try:
glom(d,eqs)
glom(d, eqs)
assert False
except ValueError:
assert True