
Commit fbf5a8d

Pyoxigraph and test improvements (#590)
This PR upgrades [pyoxigraph from v0.2.5 to v0.4.11](https://pyoxigraph.readthedocs.io/en/stable/migration.html#from-0-2-to-0-3), which requires updating the code that uses its interface. It also replaces the outdated [python-Levenshtein-wheels](https://pypi.org/project/python-Levenshtein-wheels/) with the actively maintained [python-Levenshtein](https://pypi.org/project/python-Levenshtein/). Adapted from PR #588.
2 parents 437b59d + 4c9f7bd commit fbf5a8d

File tree

14 files changed: +84 / -70 lines changed

requirements.txt

Lines changed: 4 additions & 3 deletions
@@ -1,14 +1,15 @@
 biopython
 bmt
-datrie
 jsonlines
 pandas
 more-itertools
-pyoxigraph~=0.2.5
+#pyoxigraph~=0.2.5
+pyoxigraph~=0.4.11
 psycopg2-binary
 pytest
 pytest-cov
-python-Levenshtein-wheels
+#python-Levenshtein-wheels
+python-levenshtein
 pyyaml
 requests
 snakemake
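
For context on the dependency swap above: both python-Levenshtein-wheels and its replacement python-Levenshtein install the same Levenshtein module, so existing imports should keep working. A minimal sanity check might look like the following sketch (the example strings are illustrative, not taken from this repo):

import Levenshtein

# Both the old and the new package expose the same module name, so call sites
# that do `import Levenshtein` should not need changes.
assert Levenshtein.distance("kitten", "sitting") == 3   # edit distance
print(Levenshtein.ratio("kitten", "sitting"))           # normalized similarity in [0, 1]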

src/datahandlers/chembl.py

Lines changed: 6 additions & 3 deletions
@@ -1,3 +1,6 @@
+import os.path
+import pathlib
+
 from src.prefixes import CHEMBLCOMPOUND
 from src.babel_utils import pull_via_ftp, make_local_name
 import ftplib
@@ -48,11 +51,11 @@ def __init__(self,ifname,ccofile):
         from datetime import datetime as dt
         print('loading chembl')
         start = dt.now()
-        self.m= pyoxigraph.MemoryStore()
+        self.m= pyoxigraph.Store()
         with open(ccofile,'rb') as inf:
-            self.m.load(inf,'application/turtle')
+            self.m.bulk_load(input=inf, format=pyoxigraph.RdfFormat.TURTLE)
         with open(ifname,'rb') as inf:
-            self.m.load(inf,'application/turtle')
+            self.m.bulk_load(input=inf, format=pyoxigraph.RdfFormat.TURTLE)
         end = dt.now()
         print('loading complete')
         print(f'took {end-start}')
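
The same MemoryStore()/load() to Store()/bulk_load() migration recurs in clo.py, ec.py, efo.py and mesh.py below. As a standalone sketch of the pyoxigraph 0.4 pattern adopted here, not code from this repo (the file name example.ttl is hypothetical):

import pyoxigraph

store = pyoxigraph.Store()   # replaces pyoxigraph.MemoryStore() from the 0.2.x API
with open("example.ttl", "rb") as inf:
    # bulk_load() is the fast, non-transactional loader; the format is now an
    # RdfFormat enum instead of a media-type string such as 'application/turtle'.
    store.bulk_load(input=inf, format=pyoxigraph.RdfFormat.TURTLE)

# SPARQL queries work as before, e.g. counting the loaded triples:
for row in store.query("SELECT (COUNT(*) AS ?n) WHERE { ?s ?p ?o }"):
    print(row["n"])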

src/datahandlers/clo.py

Lines changed: 2 additions & 2 deletions
@@ -24,9 +24,9 @@ def __init__(self,ifname):
         from datetime import datetime as dt
         print('loading CLO')
         start = dt.now()
-        self.m= pyoxigraph.MemoryStore()
+        self.m= pyoxigraph.Store()
         with open(ifname,'rb') as inf:
-            self.m.load(inf,'application/rdf+xml',base_iri='http://example.org/')
+            self.m.bulk_load(input=inf, format=pyoxigraph.RdfFormat.RDF_XML, base_iri='http://example.org/')
         end = dt.now()
         print('loading complete')
         print(f'took {end-start}')

src/datahandlers/datacollect.py

Lines changed: 1 addition & 1 deletion
@@ -1,5 +1,5 @@
 from src.ubergraph import UberGraph
-from src.babel_utils import make_local_name, pull_via_ftp
+from src.babel_utils import make_local_name, pull_via_ftp, pull_via_urllib
 from collections import defaultdict
 import os, gzip
 from json import loads,dumps

src/datahandlers/ec.py

Lines changed: 2 additions & 2 deletions
@@ -23,9 +23,9 @@ def __init__(self):
         from datetime import datetime as dt
         print('loading EC')
         start = dt.now()
-        self.m= pyoxigraph.MemoryStore()
+        self.m= pyoxigraph.Store()
         with open(ifname,'rb') as inf:
-            self.m.load(inf,'application/rdf+xml',base_iri='http://example.org/')
+            self.m.bulk_load(input=inf,format=pyoxigraph.RdfFormat.RDF_XML,base_iri='http://example.org/')
         end = dt.now()
         print('loading complete')
         print(f'took {end-start}')

src/datahandlers/efo.py

Lines changed: 3 additions & 2 deletions
@@ -27,9 +27,9 @@ def __init__(self):
         from datetime import datetime as dt
         print('loading EFO')
         start = dt.now()
-        self.m= pyoxigraph.MemoryStore()
+        self.m= pyoxigraph.Store()
         with open(ifname,'rb') as inf:
-            self.m.load(inf,'application/rdf+xml',base_iri='http://example.org/')
+            self.m.bulk_load(input=inf,format=pyoxigraph.RdfFormat.RDF_XML,base_iri='http://example.org/')
         end = dt.now()
         print('loading complete')
         print(f'took {end-start}')
@@ -120,6 +120,7 @@ def get_exacts(self, iri, outfile):
                 outfile.write(f"{iri}\tskos:exactMatch\t{otherid}\n")
                 nwrite += 1
         return nwrite
+
     def get_xrefs(self, iri, outfile):
         query = f"""
         prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>

src/datahandlers/hgncfamily.py

Lines changed: 14 additions & 13 deletions
@@ -1,3 +1,5 @@
+import csv
+
 from pronto.utils.io import decompress
 
 from src.babel_utils import make_local_name, pull_via_ftp, pull_via_urllib
@@ -11,19 +13,18 @@ def pull_hgncfamily():
                  decompress=False,
                  subpath=HGNCFAMILY)
 
-def pull_labels(infile,outfile, metadata_yaml):
-    with open(infile,'r') as inf:
-        data = inf.read().strip()
-        lines = data.split('\n')
-    with open(outfile,'w') as outf:
-        #skip header
-        for line in lines[1:]:
-            parts = line.split(',')
-            if len(parts) < 10:
-                continue
-            i = f"{HGNCFAMILY}:{parts[0][1:-1]}"
-            l = parts[2][1:-1]
-            outf.write(f'{i}\t{l}\n')
+def pull_labels(infile, labelsfile, descriptionsfile, metadata_yaml):
+    with open(infile, 'r') as inf, open(labelsfile, 'w') as labelsf, open(descriptionsfile, 'w') as descriptionsf:
+        reader = csv.DictReader(inf)
+        for row in reader:
+            curie = f"{HGNCFAMILY}:{row['id']}"
+            name = row['name']
+            description = row['desc_comment']
+            # There is also a 'desc_label' field, but this seems to be pretty similar to 'name'.
+            labelsf.write(f'{curie}\t{name}\n')
+
+            if description and description != "NULL":
+                descriptionsf.write(f'{curie}\t{description}\n')
 
     write_metadata(
         metadata_yaml,
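
The rewrite above drops the hand-rolled line.split(',') parsing, which breaks as soon as a field value contains a comma or quotes, in favour of csv.DictReader, which handles quoting and lets columns be addressed by header name. A small illustration with made-up data (the column names mirror those used above):

import csv, io

# Hypothetical sample mimicking the HGNC family CSV columns referenced above.
sample = 'id,name,desc_comment\n"10","Example family","A description, with a comma"\n'
for row in csv.DictReader(io.StringIO(sample)):
    print(row["id"], row["name"], row["desc_comment"])
# -> 10 Example family A description, with a comma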

src/datahandlers/mesh.py

Lines changed: 2 additions & 2 deletions
@@ -13,9 +13,9 @@ def __init__(self):
         from datetime import datetime as dt
         print('loading mesh.nt')
         start = dt.now()
-        self.m= pyoxigraph.MemoryStore()
+        self.m= pyoxigraph.Store()
         with open(ifname,'rb') as inf:
-            self.m.load(inf,'application/n-triples')
+            self.m.bulk_load(input=inf,format=pyoxigraph.RdfFormat.N_TRIPLES)
         end = dt.now()
         print('loading complete')
         print(f'took {end-start}')

src/datahandlers/mods.py

Lines changed: 8 additions & 0 deletions
@@ -10,7 +10,15 @@
 def pull_mods():
     for mod in mods:
         subp = modmap[mod]
+
+        # We get the downloads from https://www.alliancegenome.org/downloads#gene-descriptions.
+        # They are also available at:
+        # - https://download.alliancegenome.org/8.1.0/GENE-DESCRIPTION-JSON/SGD/GENE-DESCRIPTION-JSON_SGD_9.json.gz
+        # - origname = pull_via_urllib(f"https://download.alliancegenome.org/8.1.0/GENE-DESCRIPTION-JSON/{mod}/",f'GENE-DESCRIPTION-JSON_{mod}_9.json.gz', subpath=subp)
+        #
+        # However, the following URL returns the latest version of this file for each model organism.
         origname = pull_via_urllib('https://fms.alliancegenome.org/download/',f'GENE-DESCRIPTION-JSON_{mod}.json.gz',subpath=subp)
+
         #This should be fine. But for the makefile it's nice if the directory in which this goes is the same as the {mod} in the filename.
         # And we'd like it to be the names of the prefixes
         if mod != modmap[mod]:

src/datahandlers/pantherfamily.py

Lines changed: 6 additions & 10 deletions
@@ -9,24 +9,20 @@ def pull_pantherfamily():
     # - http://data.pantherdb.org/ftp/sequence_classifications/current_release/PANTHER_Sequence_Classification_files/
 
 def pull_labels(infile,outfile, metadata_yaml):
-    with open(infile,'r') as inf:
-        data = inf.read()
-        lines = data.strip().split('\n')
     SUBFAMILY_COLUMN = 3
     MAINFAMILY_NAME_COLUMN = 4
     SUBFAMILY_NAME_COLUMN = 5
-    panther_families=[]
-    labels = {}
     done = set()
-    with open(outfile,'w') as labelf:
-        for line in lines[1:]:
+    with open(infile,'r') as inf, open(outfile,'w') as labelf:
+        for raw_line in inf:
+            line = raw_line.strip()
             parts = line.split('\t')
             if len(parts) < 5:
                 continue
             sf = parts[SUBFAMILY_COLUMN]
-            mf = sf.split(':')[0]
-            mfname = parts[MAINFAMILY_NAME_COLUMN]
-            sfname = parts[SUBFAMILY_NAME_COLUMN]
+            mf = sf.split(':')[0]  # PTHR10845:SF155 -> PTHR10845
+            mfname = parts[MAINFAMILY_NAME_COLUMN]  # REGULATOR OF G PROTEIN SIGNALING
+            sfname = parts[SUBFAMILY_NAME_COLUMN]  # REGULATOR OF G-PROTEIN SIGNALING 18
             if mf not in done:
                 main_family = f'{PANTHERFAMILY}:{mf}'
                 #panther_families.append(main_family)
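
The change above also switches from slurping the whole file with read()/split('\n') to iterating the open file handle line by line, which avoids holding the entire PANTHER classification file in memory. A minimal sketch of that pattern, with a hypothetical filename:

# Streaming pattern adopted above: the file object yields one line at a time.
with open("panther_sequence_classification.tsv", "r") as inf:
    for raw_line in inf:
        parts = raw_line.strip().split("\t")
        if len(parts) < 5:
            continue   # short rows (including any header) are skipped, as above
        print(parts[3])   # subfamily column, matching SUBFAMILY_COLUMN = 3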
