
Commit fbf5a8d

Pyoxigraph and test improvements (#590)
This PR upgrades [pyoxigraph from v0.2.5 to v0.4.11](https://pyoxigraph.readthedocs.io/en/stable/migration.html#from-0-2-to-0-3), which requires updating the code that uses its interface. It also replaces the outdated [python-Levenshtein-wheels](https://pypi.org/project/python-Levenshtein-wheels/) with the actively maintained [python-Levenshtein](https://pypi.org/project/python-Levenshtein/). Adapted from PR #588.
2 parents 437b59d + 4c9f7bd commit fbf5a8d

File tree

14 files changed: +84 / -70 lines changed

requirements.txt

Lines changed: 4 additions & 3 deletions
@@ -1,14 +1,15 @@
 biopython
 bmt
-datrie
 jsonlines
 pandas
 more-itertools
-pyoxigraph~=0.2.5
+#pyoxigraph~=0.2.5
+pyoxigraph~=0.4.11
 psycopg2-binary
 pytest
 pytest-cov
-python-Levenshtein-wheels
+#python-Levenshtein-wheels
+python-levenshtein
 pyyaml
 requests
 snakemake
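
For context on the dependency swap above: both python-Levenshtein-wheels and its replacement python-Levenshtein install the same Levenshtein module, so existing imports should keep working. A minimal sanity check might look like the following sketch (the example strings are illustrative, not taken from this repo):

import Levenshtein

# Both the old and the new package expose the same module name, so call sites
# that do `import Levenshtein` should not need changes.
assert Levenshtein.distance("kitten", "sitting") == 3   # edit distance
print(Levenshtein.ratio("kitten", "sitting"))           # normalized similarity in [0, 1]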

src/datahandlers/chembl.py

Lines changed: 6 additions & 3 deletions
@@ -1,3 +1,6 @@
+import os.path
+import pathlib
+
 from src.prefixes import CHEMBLCOMPOUND
 from src.babel_utils import pull_via_ftp, make_local_name
 import ftplib
@@ -48,11 +51,11 @@ def __init__(self,ifname,ccofile):
         from datetime import datetime as dt
         print('loading chembl')
         start = dt.now()
-        self.m= pyoxigraph.MemoryStore()
+        self.m= pyoxigraph.Store()
         with open(ccofile,'rb') as inf:
-            self.m.load(inf,'application/turtle')
+            self.m.bulk_load(input=inf, format=pyoxigraph.RdfFormat.TURTLE)
         with open(ifname,'rb') as inf:
-            self.m.load(inf,'application/turtle')
+            self.m.bulk_load(input=inf, format=pyoxigraph.RdfFormat.TURTLE)
         end = dt.now()
         print('loading complete')
         print(f'took {end-start}')
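
The same MemoryStore()/load() to Store()/bulk_load() migration recurs in clo.py, ec.py, efo.py and mesh.py below. As a standalone sketch of the pyoxigraph 0.4 pattern adopted here, not code from this repo (the file name example.ttl is hypothetical):

import pyoxigraph

store = pyoxigraph.Store()   # replaces pyoxigraph.MemoryStore() from the 0.2.x API
with open("example.ttl", "rb") as inf:
    # bulk_load() is the fast, non-transactional loader; the format is now an
    # RdfFormat enum instead of a media-type string such as 'application/turtle'.
    store.bulk_load(input=inf, format=pyoxigraph.RdfFormat.TURTLE)

# SPARQL queries work as before, e.g. counting the loaded triples:
for row in store.query("SELECT (COUNT(*) AS ?n) WHERE { ?s ?p ?o }"):
    print(row["n"])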

src/datahandlers/clo.py

Lines changed: 2 additions & 2 deletions
@@ -24,9 +24,9 @@ def __init__(self,ifname):
         from datetime import datetime as dt
         print('loading CLO')
         start = dt.now()
-        self.m= pyoxigraph.MemoryStore()
+        self.m= pyoxigraph.Store()
         with open(ifname,'rb') as inf:
-            self.m.load(inf,'application/rdf+xml',base_iri='http://example.org/')
+            self.m.bulk_load(input=inf, format=pyoxigraph.RdfFormat.RDF_XML, base_iri='http://example.org/')
         end = dt.now()
         print('loading complete')
         print(f'took {end-start}')

src/datahandlers/datacollect.py

Lines changed: 1 addition & 1 deletion
@@ -1,5 +1,5 @@
 from src.ubergraph import UberGraph
-from src.babel_utils import make_local_name, pull_via_ftp
+from src.babel_utils import make_local_name, pull_via_ftp, pull_via_urllib
 from collections import defaultdict
 import os, gzip
 from json import loads,dumps

src/datahandlers/ec.py

Lines changed: 2 additions & 2 deletions
@@ -23,9 +23,9 @@ def __init__(self):
         from datetime import datetime as dt
         print('loading EC')
         start = dt.now()
-        self.m= pyoxigraph.MemoryStore()
+        self.m= pyoxigraph.Store()
         with open(ifname,'rb') as inf:
-            self.m.load(inf,'application/rdf+xml',base_iri='http://example.org/')
+            self.m.bulk_load(input=inf,format=pyoxigraph.RdfFormat.RDF_XML,base_iri='http://example.org/')
         end = dt.now()
         print('loading complete')
         print(f'took {end-start}')

src/datahandlers/efo.py

Lines changed: 3 additions & 2 deletions
@@ -27,9 +27,9 @@ def __init__(self):
         from datetime import datetime as dt
         print('loading EFO')
         start = dt.now()
-        self.m= pyoxigraph.MemoryStore()
+        self.m= pyoxigraph.Store()
         with open(ifname,'rb') as inf:
-            self.m.load(inf,'application/rdf+xml',base_iri='http://example.org/')
+            self.m.bulk_load(input=inf,format=pyoxigraph.RdfFormat.RDF_XML,base_iri='http://example.org/')
         end = dt.now()
         print('loading complete')
         print(f'took {end-start}')
@@ -120,6 +120,7 @@ def get_exacts(self, iri, outfile):
                 outfile.write(f"{iri}\tskos:exactMatch\t{otherid}\n")
                 nwrite += 1
         return nwrite
+
     def get_xrefs(self, iri, outfile):
         query = f"""
         prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>

src/datahandlers/hgncfamily.py

Lines changed: 14 additions & 13 deletions
@@ -1,3 +1,5 @@
+import csv
+
 from pronto.utils.io import decompress
 
 from src.babel_utils import make_local_name, pull_via_ftp, pull_via_urllib
@@ -11,19 +13,18 @@ def pull_hgncfamily():
                  decompress=False,
                  subpath=HGNCFAMILY)
 
-def pull_labels(infile,outfile, metadata_yaml):
-    with open(infile,'r') as inf:
-        data = inf.read().strip()
-        lines = data.split('\n')
-    with open(outfile,'w') as outf:
-        #skip header
-        for line in lines[1:]:
-            parts = line.split(',')
-            if len(parts) < 10:
-                continue
-            i = f"{HGNCFAMILY}:{parts[0][1:-1]}"
-            l = parts[2][1:-1]
-            outf.write(f'{i}\t{l}\n')
+def pull_labels(infile, labelsfile, descriptionsfile, metadata_yaml):
+    with open(infile, 'r') as inf, open(labelsfile, 'w') as labelsf, open(descriptionsfile, 'w') as descriptionsf:
+        reader = csv.DictReader(inf)
+        for row in reader:
+            curie = f"{HGNCFAMILY}:{row['id']}"
+            name = row['name']
+            description = row['desc_comment']
+            # There is also a 'desc_label' field, but this seems to be pretty similar to 'name'.
+            labelsf.write(f'{curie}\t{name}\n')
+
+            if description and description != "NULL":
+                descriptionsf.write(f'{curie}\t{description}\n')
 
     write_metadata(
         metadata_yaml,
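
The rewrite above drops the hand-rolled line.split(',') parsing, which breaks as soon as a field value contains a comma or quotes, in favour of csv.DictReader, which handles quoting and lets columns be addressed by header name. A small illustration with made-up data (the column names mirror those used above):

import csv, io

# Hypothetical sample mimicking the HGNC family CSV columns referenced above.
sample = 'id,name,desc_comment\n"10","Example family","A description, with a comma"\n'
for row in csv.DictReader(io.StringIO(sample)):
    print(row["id"], row["name"], row["desc_comment"])
# -> 10 Example family A description, with a comma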

src/datahandlers/mesh.py

Lines changed: 2 additions & 2 deletions
@@ -13,9 +13,9 @@ def __init__(self):
         from datetime import datetime as dt
         print('loading mesh.nt')
         start = dt.now()
-        self.m= pyoxigraph.MemoryStore()
+        self.m= pyoxigraph.Store()
         with open(ifname,'rb') as inf:
-            self.m.load(inf,'application/n-triples')
+            self.m.bulk_load(input=inf,format=pyoxigraph.RdfFormat.N_TRIPLES)
         end = dt.now()
         print('loading complete')
         print(f'took {end-start}')

src/datahandlers/mods.py

Lines changed: 8 additions & 0 deletions
@@ -10,7 +10,15 @@
 def pull_mods():
     for mod in mods:
         subp = modmap[mod]
+
+        # We get the downloads from https://www.alliancegenome.org/downloads#gene-descriptions.
+        # They are also available at:
+        # - https://download.alliancegenome.org/8.1.0/GENE-DESCRIPTION-JSON/SGD/GENE-DESCRIPTION-JSON_SGD_9.json.gz
+        # - origname = pull_via_urllib(f"https://download.alliancegenome.org/8.1.0/GENE-DESCRIPTION-JSON/{mod}/",f'GENE-DESCRIPTION-JSON_{mod}_9.json.gz', subpath=subp)
+        #
+        # However, the following URL returns the latest version of this file for each model organism.
         origname = pull_via_urllib('https://fms.alliancegenome.org/download/',f'GENE-DESCRIPTION-JSON_{mod}.json.gz',subpath=subp)
+
         #This should be fine. But for the makefile it's nice if the directory in which this goes is the same as the {mod} in the filename.
         # And we'd like it to be the names of the prefixes
         if mod != modmap[mod]:

src/datahandlers/pantherfamily.py

Lines changed: 6 additions & 10 deletions
@@ -9,24 +9,20 @@ def pull_pantherfamily():
     # - http://data.pantherdb.org/ftp/sequence_classifications/current_release/PANTHER_Sequence_Classification_files/
 
 def pull_labels(infile,outfile, metadata_yaml):
-    with open(infile,'r') as inf:
-        data = inf.read()
-        lines = data.strip().split('\n')
     SUBFAMILY_COLUMN = 3
     MAINFAMILY_NAME_COLUMN = 4
     SUBFAMILY_NAME_COLUMN = 5
-    panther_families=[]
-    labels = {}
     done = set()
-    with open(outfile,'w') as labelf:
-        for line in lines[1:]:
+    with open(infile,'r') as inf, open(outfile,'w') as labelf:
+        for raw_line in inf:
+            line = raw_line.strip()
             parts = line.split('\t')
             if len(parts) < 5:
                 continue
             sf = parts[SUBFAMILY_COLUMN]
-            mf = sf.split(':')[0]
-            mfname = parts[MAINFAMILY_NAME_COLUMN]
-            sfname = parts[SUBFAMILY_NAME_COLUMN]
+            mf = sf.split(':')[0]  # PTHR10845:SF155 -> PTHR10845
+            mfname = parts[MAINFAMILY_NAME_COLUMN]  # REGULATOR OF G PROTEIN SIGNALING
+            sfname = parts[SUBFAMILY_NAME_COLUMN]  # REGULATOR OF G-PROTEIN SIGNALING 18
             if mf not in done:
                 main_family = f'{PANTHERFAMILY}:{mf}'
                 #panther_families.append(main_family)
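
The change above also switches from slurping the whole file with read()/split('\n') to iterating the open file handle line by line, which avoids holding the entire PANTHER classification file in memory. A minimal sketch of that pattern, with a hypothetical filename:

# Streaming pattern adopted above: the file object yields one line at a time.
with open("panther_sequence_classification.tsv", "r") as inf:
    for raw_line in inf:
        parts = raw_line.strip().split("\t")
        if len(parts) < 5:
            continue   # short rows (including any header) are skipped, as above
        print(parts[3])   # subfamily column, matching SUBFAMILY_COLUMN = 3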
