From 3bacd045e22c5d0458e6b318e5737b7c546877a3 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Sat, 2 Aug 2025 19:23:00 -0400 Subject: [PATCH 001/167] Updated UMLS and RxNorm. --- config.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/config.yaml b/config.yaml index 76f8c0b4..9ac10e53 100644 --- a/config.yaml +++ b/config.yaml @@ -4,8 +4,8 @@ intermediate_directory: babel_outputs/intermediate output_directory: babel_outputs biolink_version: "4.2.6-rc5" -umls_version: "2024AB" -rxnorm_version: "03032025" +umls_version: "2025AA" +rxnorm_version: "07072025" drugbank_version: "5-1-13" UMLS_UniProtKB_download_raw_url: "https://raw.githubusercontent.com/cbizon/UMLS_UniProtKB/refs/heads/main/outputs/UMLS_UniProtKB.tsv" From 885cc941d4b54e895db860e6f762792e97ed022e Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Thu, 26 Jun 2025 13:18:23 -0400 Subject: [PATCH 002/167] First stab at writing out a metadata file for every compendium. diff --git c/src/babel_utils.py i/src/babel_utils.py index a96120d..5cbab9c 100644 --- c/src/babel_utils.py +++ i/src/babel_utils.py @@ -5,13 +5,15 @@ from enum import Enum from ftplib import FTP from io import BytesIO import gzip -from datetime import datetime as dt +from datetime import datetime as dt, datetime from datetime import timedelta import time import requests import os import urllib import jsonlines +import yaml + from src.node import NodeFactory, SynonymFactory, DescriptionFactory, InformationContentFactory, TaxonFactory from src.util import Text, get_config from src.LabeledID import LabeledID @@ -349,10 +351,11 @@ def get_numerical_curie_suffix(curie): return None -def write_compendium(synonym_list,ofname,node_type,labels={},extra_prefixes=[],icrdf_filename=None): +def write_compendium(metadata_yamls, synonym_list,ofname,node_type,labels={},extra_prefixes=[],icrdf_filename=None): """ + :param metadata_yaml: The YAML files containing the metadata for this compendium. :param synonym_list: - :param ofname: + :param ofname: Output filename. A file with this filename will be created in both the `compendia` and `synonyms` output directories. :param node_type: :param labels: A map of identifiers Not needed if each identifier will have a label in the correct directory (i.e. downloads/PMID/labels for PMID:xxx). @@ -371,6 +374,32 @@ def write_compendium(synonym_list,ofname,node_type,labels={},extra_prefixes=[],i node_factory = NodeFactory(make_local_name(''),biolink_version) synonym_factory = SynonymFactory(make_local_name('')) + # Write out the metadata.yaml file combining information from all the metadata.yaml files. 
+ metadata_dir = os.path.join(cdir,'metadata') + os.makedirs(metadata_dir, exist_ok=True) + with open(os.path.join(cdir, ofname + '.yaml'), 'w') as outf: + metadata = { + 'type': 'compendium', + 'name': ofname, + 'created_at': datetime.now().isoformat(), + 'concords': {} + } + for metadata_yaml in metadata_yamls: + metadata_block = yaml.safe_load(metadata_yaml) + if metadata_block is None: + raise ValueError("Metadata file {metadata_yaml} is empty.") + + metadata_name = metadata_block['name'] + + if metadata_name in metadata['concords']: + logging.error(f"Duplicate metadata block name {metadata_name}!") + logging.error(f"New metadata block from {metadata_yaml}: {metadata_block}!") + logging.error(f"Existing metadata block: {metadata['concords'][metadata_name]}!") + raise ValueError(f"Metadata file {metadata_yaml} is named {metadata_name}, but this has already been loaded.") + metadata['concords'][metadata_name] = metadata_block + + outf.write(yaml.dump(metadata)) + # Load the preferred_name_boost_prefixes -- this tells us which prefixes to boost when # coming up with a preferred label for a particular Biolink class. preferred_name_boost_prefixes = config['preferred_name_boost_prefixes'] --- src/babel_utils.py | 35 ++++++++++++++++++++++++++++++++--- 1 file changed, 32 insertions(+), 3 deletions(-) diff --git a/src/babel_utils.py b/src/babel_utils.py index a96120d1..5cbab9c0 100644 --- a/src/babel_utils.py +++ b/src/babel_utils.py @@ -5,13 +5,15 @@ from ftplib import FTP from io import BytesIO import gzip -from datetime import datetime as dt +from datetime import datetime as dt, datetime from datetime import timedelta import time import requests import os import urllib import jsonlines +import yaml + from src.node import NodeFactory, SynonymFactory, DescriptionFactory, InformationContentFactory, TaxonFactory from src.util import Text, get_config from src.LabeledID import LabeledID @@ -349,10 +351,11 @@ def get_numerical_curie_suffix(curie): return None -def write_compendium(synonym_list,ofname,node_type,labels={},extra_prefixes=[],icrdf_filename=None): +def write_compendium(metadata_yamls, synonym_list,ofname,node_type,labels={},extra_prefixes=[],icrdf_filename=None): """ + :param metadata_yaml: The YAML files containing the metadata for this compendium. :param synonym_list: - :param ofname: + :param ofname: Output filename. A file with this filename will be created in both the `compendia` and `synonyms` output directories. :param node_type: :param labels: A map of identifiers Not needed if each identifier will have a label in the correct directory (i.e. downloads/PMID/labels for PMID:xxx). @@ -371,6 +374,32 @@ def write_compendium(synonym_list,ofname,node_type,labels={},extra_prefixes=[],i node_factory = NodeFactory(make_local_name(''),biolink_version) synonym_factory = SynonymFactory(make_local_name('')) + # Write out the metadata.yaml file combining information from all the metadata.yaml files. 
+ metadata_dir = os.path.join(cdir,'metadata') + os.makedirs(metadata_dir, exist_ok=True) + with open(os.path.join(cdir, ofname + '.yaml'), 'w') as outf: + metadata = { + 'type': 'compendium', + 'name': ofname, + 'created_at': datetime.now().isoformat(), + 'concords': {} + } + for metadata_yaml in metadata_yamls: + metadata_block = yaml.safe_load(metadata_yaml) + if metadata_block is None: + raise ValueError("Metadata file {metadata_yaml} is empty.") + + metadata_name = metadata_block['name'] + + if metadata_name in metadata['concords']: + logging.error(f"Duplicate metadata block name {metadata_name}!") + logging.error(f"New metadata block from {metadata_yaml}: {metadata_block}!") + logging.error(f"Existing metadata block: {metadata['concords'][metadata_name]}!") + raise ValueError(f"Metadata file {metadata_yaml} is named {metadata_name}, but this has already been loaded.") + metadata['concords'][metadata_name] = metadata_block + + outf.write(yaml.dump(metadata)) + # Load the preferred_name_boost_prefixes -- this tells us which prefixes to boost when # coming up with a preferred label for a particular Biolink class. preferred_name_boost_prefixes = config['preferred_name_boost_prefixes'] From e800dfd958aaecf4ee696be00014a5fab968043b Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Thu, 26 Jun 2025 13:21:23 -0400 Subject: [PATCH 003/167] Added counts to metadata. --- src/babel_utils.py | 69 ++++++++++++++++++++++++++++------------------ 1 file changed, 42 insertions(+), 27 deletions(-) diff --git a/src/babel_utils.py b/src/babel_utils.py index 5cbab9c0..bd67de48 100644 --- a/src/babel_utils.py +++ b/src/babel_utils.py @@ -351,7 +351,7 @@ def get_numerical_curie_suffix(curie): return None -def write_compendium(metadata_yamls, synonym_list,ofname,node_type,labels={},extra_prefixes=[],icrdf_filename=None): +def write_compendium(metadata_yamls, synonym_list, ofname, node_type, labels={}, extra_prefixes=[], icrdf_filename=None): """ :param metadata_yaml: The YAML files containing the metadata for this compendium. :param synonym_list: @@ -374,32 +374,6 @@ def write_compendium(metadata_yamls, synonym_list,ofname,node_type,labels={},ext node_factory = NodeFactory(make_local_name(''),biolink_version) synonym_factory = SynonymFactory(make_local_name('')) - # Write out the metadata.yaml file combining information from all the metadata.yaml files. - metadata_dir = os.path.join(cdir,'metadata') - os.makedirs(metadata_dir, exist_ok=True) - with open(os.path.join(cdir, ofname + '.yaml'), 'w') as outf: - metadata = { - 'type': 'compendium', - 'name': ofname, - 'created_at': datetime.now().isoformat(), - 'concords': {} - } - for metadata_yaml in metadata_yamls: - metadata_block = yaml.safe_load(metadata_yaml) - if metadata_block is None: - raise ValueError("Metadata file {metadata_yaml} is empty.") - - metadata_name = metadata_block['name'] - - if metadata_name in metadata['concords']: - logging.error(f"Duplicate metadata block name {metadata_name}!") - logging.error(f"New metadata block from {metadata_yaml}: {metadata_block}!") - logging.error(f"Existing metadata block: {metadata['concords'][metadata_name]}!") - raise ValueError(f"Metadata file {metadata_yaml} is named {metadata_name}, but this has already been loaded.") - metadata['concords'][metadata_name] = metadata_block - - outf.write(yaml.dump(metadata)) - # Load the preferred_name_boost_prefixes -- this tells us which prefixes to boost when # coming up with a preferred label for a particular Biolink class. 
preferred_name_boost_prefixes = config['preferred_name_boost_prefixes'] @@ -418,11 +392,19 @@ def write_compendium(metadata_yamls, synonym_list,ofname,node_type,labels={},ext os.makedirs(os.path.join(cdir, 'compendia'), exist_ok=True) os.makedirs(os.path.join(cdir, 'synonyms'), exist_ok=True) + # Counts. + count_cliques = 0 + count_eq_ids = 0 + count_synonyms = 0 + # Write compendium and synonym files. with jsonlines.open(os.path.join(cdir,'compendia',ofname),'w') as outf, jsonlines.open(os.path.join(cdir,'synonyms',ofname),'w') as sfile: for slist in synonym_list: node = node_factory.create_node(input_identifiers=slist, node_type=node_type,labels = labels, extra_prefixes = extra_prefixes) if node is not None: + count_cliques += 1 + count_eq_ids += len(slist) + nw = {"type": node['type']} ic = ic_factory.get_ic(node) nw['ic'] = ic @@ -527,6 +509,8 @@ def write_compendium(metadata_yamls, synonym_list,ofname,node_type,labels={},ext "names": synonyms_list, "types": [t[8:] for t in types]} # remove biolink: + count_synonyms += len(synonyms_list) + # Write out the preferred name. if preferred_name: document["preferred_name"] = preferred_name @@ -574,6 +558,37 @@ def write_compendium(metadata_yamls, synonym_list,ofname,node_type,labels={},ext traceback.print_exc() exit() + # Write out the metadata.yaml file combining information from all the metadata.yaml files. + metadata_dir = os.path.join(cdir,'metadata') + os.makedirs(metadata_dir, exist_ok=True) + with open(os.path.join(cdir, ofname + '.yaml'), 'w') as outf: + metadata = { + 'type': 'compendium', + 'name': ofname, + 'created_at': datetime.now().isoformat(), + 'counts': { + 'cliques': count_cliques, + 'eq_ids': count_eq_ids, + 'synonyms': count_synonyms, + }, + 'concords': {} + } + for metadata_yaml in metadata_yamls: + metadata_block = yaml.safe_load(metadata_yaml) + if metadata_block is None: + raise ValueError("Metadata file {metadata_yaml} is empty.") + + metadata_name = metadata_block['name'] + + if metadata_name in metadata['concords']: + logging.error(f"Duplicate metadata block name {metadata_name}!") + logging.error(f"New metadata block from {metadata_yaml}: {metadata_block}!") + logging.error(f"Existing metadata block: {metadata['concords'][metadata_name]}!") + raise ValueError(f"Metadata file {metadata_yaml} is named {metadata_name}, but this has already been loaded.") + metadata['concords'][metadata_name] = metadata_block + + outf.write(yaml.dump(metadata)) + def glom(conc_set, newgroups, unique_prefixes=['INCHIKEY'],pref='HP',close={}): """We want to construct sets containing equivalent identifiers. conc_set is a dictionary where the values are these equivalent identifier sets and From 4db83fe86a355007cdc0aee6295806376e235062 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Thu, 26 Jun 2025 13:27:48 -0400 Subject: [PATCH 004/167] Added metadata files to build_* methods. 
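Every build_* entry point picks up the same new parameter, so the calling convention below applies across anatomy, gene, protein, taxon, and the rest of the builders touched here. A minimal sketch of that convention, using names from this series; the function name, output filename, and Biolink type are illustrative placeholders rather than repository code, and concord handling is elided:

    from src.babel_utils import read_identifier_file, glom, write_compendium

    def build_example_compendia(concordances, metadata_yamls, identifiers, icrdf_filename):
        # The per-concord metadata YAML paths travel alongside the concord files
        # and are handed straight to write_compendium(), which folds them into
        # the compendium-level metadata file.
        dicts, types = {}, {}
        for ifile in identifiers:
            new_identifiers, new_types = read_identifier_file(ifile)
            glom(dicts, new_identifiers)
            types.update(new_types)
        idsets = set(frozenset(x) for x in dicts.values())
        write_compendium(metadata_yamls, idsets, 'Example.txt', 'biolink:NamedThing', {},
                         icrdf_filename=icrdf_filename)
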
--- src/createcompendia/anatomy.py | 4 ++-- src/createcompendia/cell_line.py | 4 ++-- src/createcompendia/chemicals.py | 6 +++--- src/createcompendia/diseasephenotype.py | 9 +++++---- src/createcompendia/gene.py | 4 ++-- src/createcompendia/genefamily.py | 4 ++-- src/createcompendia/macromolecular_complex.py | 4 ++-- src/createcompendia/processactivitypathway.py | 4 ++-- src/createcompendia/protein.py | 4 ++-- src/createcompendia/publications.py | 4 ++-- src/createcompendia/taxon.py | 4 ++-- 11 files changed, 26 insertions(+), 25 deletions(-) diff --git a/src/createcompendia/anatomy.py b/src/createcompendia/anatomy.py index dce606d5..e3261d86 100644 --- a/src/createcompendia/anatomy.py +++ b/src/createcompendia/anatomy.py @@ -138,7 +138,7 @@ def build_wikidata_cell_relationships(outdir): def build_anatomy_umls_relationships(mrconso, idfile,outfile): umls.build_sets(mrconso, idfile, outfile, {'SNOMEDCT_US':SNOMEDCT,'MSH': MESH, 'NCI': NCIT, 'GO': GO, 'FMA': FMA}) -def build_compendia(concordances, identifiers, icrdf_filename): +def build_compendia(concordances, metadata_yamls, identifiers, icrdf_filename): """:concordances: a list of files from which to read relationships :identifiers: a list of files from which to read identifiers and optional categories""" dicts = {} @@ -175,7 +175,7 @@ def build_compendia(concordances, identifiers, icrdf_filename): typed_sets = create_typed_sets(set([frozenset(x) for x in dicts.values()]),types) for biotype,sets in typed_sets.items(): baretype = biotype.split(':')[-1] - write_compendium(sets,f'{baretype}.txt',biotype,{}, icrdf_filename=icrdf_filename) + write_compendium(metadata_yamls, concordances, sets,f'{baretype}.txt',biotype,{}, icrdf_filename=icrdf_filename) def create_typed_sets(eqsets,types): """Given a set of sets of equivalent identifiers, we want to type each one into diff --git a/src/createcompendia/cell_line.py b/src/createcompendia/cell_line.py index 89252c1e..d61176ff 100644 --- a/src/createcompendia/cell_line.py +++ b/src/createcompendia/cell_line.py @@ -2,7 +2,7 @@ from src.babel_utils import read_identifier_file,glom,write_compendium -def build_compendia(ifile, icrdf_filename): +def build_compendia(ifile, metadata_yamls, icrdf_filename): """:identifiers: a list of files from which to read identifiers and optional categories""" dicts = {} types = {} @@ -13,5 +13,5 @@ def build_compendia(ifile, icrdf_filename): types.update(new_types) idsets = set([frozenset(x) for x in dicts.values()]) baretype = CELL_LINE.split(':')[-1] - write_compendium(idsets, f'{baretype}.txt', CELL_LINE, {}, icrdf_filename=icrdf_filename) + write_compendium(metadata_yamls, idsets, f'{baretype}.txt', CELL_LINE, {}, icrdf_filename=icrdf_filename) diff --git a/src/createcompendia/chemicals.py b/src/createcompendia/chemicals.py index 0a6dd39f..82af73aa 100644 --- a/src/createcompendia/chemicals.py +++ b/src/createcompendia/chemicals.py @@ -567,7 +567,7 @@ def build_untyped_compendia(concordances, identifiers,unichem_partial, untyped_c for s in untyped_sets: outf.write(f'{set(s)}\n') -def build_compendia(type_file, untyped_compendia_file, icrdf_filename): +def build_compendia(type_file, untyped_compendia_file, metadata_yamls, icrdf_filename): types = {} with open(type_file,'r') as inf: for line in inf: @@ -582,9 +582,9 @@ def build_compendia(type_file, untyped_compendia_file, icrdf_filename): for biotype, sets in typed_sets.items(): baretype = biotype.split(':')[-1] if biotype == DRUG: - write_compendium(sets, f'{baretype}.txt', biotype, {}, extra_prefixes=[MESH,UNII], 
icrdf_filename=icrdf_filename) + write_compendium(metadata_yamls, sets, f'{baretype}.txt', biotype, {}, extra_prefixes=[MESH,UNII], icrdf_filename=icrdf_filename) else: - write_compendium(sets, f'{baretype}.txt', biotype, {}, extra_prefixes=[RXCUI], icrdf_filename=icrdf_filename) + write_compendium(metadata_yamls, sets, f'{baretype}.txt', biotype, {}, extra_prefixes=[RXCUI], icrdf_filename=icrdf_filename) def create_typed_sets(eqsets, types): """ diff --git a/src/createcompendia/diseasephenotype.py b/src/createcompendia/diseasephenotype.py index 65431d63..e4fdd69c 100644 --- a/src/createcompendia/diseasephenotype.py +++ b/src/createcompendia/diseasephenotype.py @@ -126,7 +126,7 @@ def build_disease_doid_relationships(idfile,outfile): 'SNOMEDCT_US_2020_03_01': SNOMEDCT, 'SNOMEDCT_US_2020_09_01': SNOMEDCT, 'UMLS_CUI': UMLS, 'KEGG': KEGGDISEASE}) -def build_compendium(concordances, identifiers, mondoclose, badxrefs, icrdf_filename): +def build_compendium(concordances, metadata_yamls, identifiers, mondoclose, badxrefs, icrdf_filename): """:concordances: a list of files from which to read relationships :identifiers: a list of files from which to read identifiers and optional categories""" dicts = {} @@ -173,7 +173,7 @@ def build_compendium(concordances, identifiers, mondoclose, badxrefs, icrdf_file typed_sets = create_typed_sets(set([frozenset(x) for x in dicts.values()]),types) for biotype,sets in typed_sets.items(): baretype = biotype.split(':')[-1] - write_compendium(sets,f'{baretype}.txt',biotype,{}, icrdf_filename=icrdf_filename) + write_compendium(metadata_yamls, sets,f'{baretype}.txt',biotype,{}, icrdf_filename=icrdf_filename) def create_typed_sets(eqsets,types): """Given a set of sets of equivalent identifiers, we want to type each one into @@ -231,6 +231,7 @@ def read_badxrefs(fn): return morebad def load_diseases_and_phenotypes(concords,idlists,badhpos,badhpoxrefs, icrdf_filename): + metadata_yamls = [] #print('disease/phenotype') #print('get and write hp sets') #bad_mappings = read_bad_hp_mappings(badhpos) @@ -301,8 +302,8 @@ def load_diseases_and_phenotypes(concords,idlists,badhpos,badhpoxrefs, icrdf_fil print('dump it') fs = set([frozenset(x) for x in dicts.values()]) diseases,phenotypes = create_typed_sets(fs) - write_compendium(diseases,'disease.txt','biolink:Disease',labels, icrdf_filename=icrdf_filename) - write_compendium(phenotypes,'phenotypes.txt','biolink:PhenotypicFeature',labels, icrdf_filename=icrdf_filename) + write_compendium(metadata_yamls, diseases,'disease.txt','biolink:Disease',labels, icrdf_filename=icrdf_filename) + write_compendium(metadata_yamls, phenotypes,'phenotypes.txt','biolink:PhenotypicFeature',labels, icrdf_filename=icrdf_filename) if __name__ == '__main__': with open('crapfile','w') as crapfile: diff --git a/src/createcompendia/gene.py b/src/createcompendia/gene.py index 1c89da02..15ef2f19 100644 --- a/src/createcompendia/gene.py +++ b/src/createcompendia/gene.py @@ -251,7 +251,7 @@ def build_gene_umls_hgnc_relationships(mrconso, umls_idfile, outfile): #Could also add MESH, if that were a valid gene prefix umls.build_sets(mrconso, umls_idfile, outfile, {'HGNC':HGNC}) -def build_gene_compendia(concordances, identifiers, icrdf_filename): +def build_gene_compendia(concordances, metadata_yamls, identifiers, icrdf_filename): """:concordances: a list of files from which to read relationships :identifiers: a list of files from which to read identifiers and optional categories""" dicts = {} @@ -273,5 +273,5 @@ def build_gene_compendia(concordances, 
identifiers, icrdf_filename): glom(dicts, pairs, unique_prefixes=uniques) gene_sets = set([frozenset(x) for x in dicts.values()]) baretype = GENE.split(':')[-1] - write_compendium(gene_sets, f'{baretype}.txt', GENE, {}, icrdf_filename=icrdf_filename) + write_compendium(metadata_yamls, gene_sets, f'{baretype}.txt', GENE, {}, icrdf_filename=icrdf_filename) diff --git a/src/createcompendia/genefamily.py b/src/createcompendia/genefamily.py index fb26f9f4..b2163177 100644 --- a/src/createcompendia/genefamily.py +++ b/src/createcompendia/genefamily.py @@ -2,7 +2,7 @@ from src.babel_utils import read_identifier_file,glom,write_compendium -def build_compendia(identifiers, icrdf_filename): +def build_compendia(identifiers, metadata_yamls, icrdf_filename): """:concordances: a list of files from which to read relationships :identifiers: a list of files from which to read identifiers and optional categories""" dicts = {} @@ -15,5 +15,5 @@ def build_compendia(identifiers, icrdf_filename): types.update(new_types) genefam_sets = set([frozenset(x) for x in dicts.values()]) baretype = GENE_FAMILY.split(':')[-1] - write_compendium(genefam_sets, f'{baretype}.txt', GENE_FAMILY, {}, icrdf_filename=icrdf_filename) + write_compendium(metadata_yamls, genefam_sets, f'{baretype}.txt', GENE_FAMILY, {}, icrdf_filename=icrdf_filename) diff --git a/src/createcompendia/macromolecular_complex.py b/src/createcompendia/macromolecular_complex.py index ceb482b2..7ca4dd29 100644 --- a/src/createcompendia/macromolecular_complex.py +++ b/src/createcompendia/macromolecular_complex.py @@ -4,7 +4,7 @@ import src.datahandlers.complexportal as complexportal from src.babel_utils import read_identifier_file, glom, write_compendium -def build_compendia(identifiers, icrdf_filename): +def build_compendia(identifiers, metadata_yamls, icrdf_filename): """:concordances: a list of files from which to read relationships :identifiers: a list of files from which to read identifiers and optional categories""" dicts = {} @@ -16,4 +16,4 @@ def build_compendia(identifiers, icrdf_filename): types.update(new_types) sets = set([frozenset(x) for x in dicts.values()]) type = MACROMOLECULAR_COMPLEX.split(':')[-1] - write_compendium(sets, f'{type}.txt', MACROMOLECULAR_COMPLEX, {}, extra_prefixes=[COMPLEXPORTAL], icrdf_filename=icrdf_filename) + write_compendium(metadata_yamls, sets, f'{type}.txt', MACROMOLECULAR_COMPLEX, {}, extra_prefixes=[COMPLEXPORTAL], icrdf_filename=icrdf_filename) diff --git a/src/createcompendia/processactivitypathway.py b/src/createcompendia/processactivitypathway.py index a303efd8..630fc6b9 100644 --- a/src/createcompendia/processactivitypathway.py +++ b/src/createcompendia/processactivitypathway.py @@ -58,7 +58,7 @@ def build_process_rhea_relationships(outfile): rhea.make_concord(outfile) -def build_compendia(concordances, identifiers, icrdf_filename): +def build_compendia(concordances, metadata_yamls, identifiers, icrdf_filename): """:concordances: a list of files from which to read relationships :identifiers: a list of files from which to read identifiers and optional categories""" #These are concords that cause problems and are being special cased out. 
In disease/process we put these in some @@ -105,7 +105,7 @@ def build_compendia(concordances, identifiers, icrdf_filename): typed_sets = create_typed_sets(set([frozenset(x) for x in dicts.values()]),types) for biotype,sets in typed_sets.items(): baretype = biotype.split(':')[-1] - write_compendium(sets,f'{baretype}.txt',biotype,{}, icrdf_filename=icrdf_filename) + write_compendium(metadata_yamls, sets,f'{baretype}.txt',biotype,{}, icrdf_filename=icrdf_filename) def create_typed_sets(eqsets,types): """Given a set of sets of equivalent identifiers, we want to type each one into diff --git a/src/createcompendia/protein.py b/src/createcompendia/protein.py index 20bf9287..33ebdcda 100644 --- a/src/createcompendia/protein.py +++ b/src/createcompendia/protein.py @@ -147,7 +147,7 @@ def build_ncit_uniprot_relationships(infile,outfile): def build_umls_ncit_relationships(mrconso, idfile, outfile): umls.build_sets(mrconso, idfile, outfile, {'NCI': NCIT}) -def build_protein_compendia(concordances, identifiers, icrdf_filename): +def build_protein_compendia(concordances, metadata_yamls, identifiers, icrdf_filename): """:concordances: a list of files from which to read relationships :identifiers: a list of files from which to read identifiers and optional categories""" dicts = {} @@ -182,5 +182,5 @@ def build_protein_compendia(concordances, identifiers, icrdf_filename): # only then generate the compendium from those input files. baretype = PROTEIN.split(':')[-1] - write_compendium(gene_sets, f'{baretype}.txt', PROTEIN, {}, icrdf_filename=icrdf_filename) + write_compendium(metadata_yamls, gene_sets, f'{baretype}.txt', PROTEIN, {}, icrdf_filename=icrdf_filename) diff --git a/src/createcompendia/publications.py b/src/createcompendia/publications.py index 41c8eafd..86247631 100644 --- a/src/createcompendia/publications.py +++ b/src/createcompendia/publications.py @@ -246,7 +246,7 @@ def parse_pubmed_into_tsvs(baseline_dir, updatefiles_dir, titles_file, status_fi statusf.write(json.dumps({'id': pmid, 'statuses': sorted(statuses)}, sort_keys=True) + '\n') -def generate_compendium(concordances, identifiers, titles, publication_compendium, icrdf_filename): +def generate_compendium(concordances, metadata_yamls, identifiers, titles, publication_compendium, icrdf_filename): """ Generate a Publication compendium using the ID and Concord files provided. @@ -295,5 +295,5 @@ def generate_compendium(concordances, identifiers, titles, publication_compendiu # Write out the compendium. 
publication_sets = set([frozenset(x) for x in dicts.values()]) baretype = PUBLICATION.split(':')[-1] - write_compendium(publication_sets, os.path.basename(publication_compendium), PUBLICATION, labels, + write_compendium(metadata_yamls, publication_sets, os.path.basename(publication_compendium), PUBLICATION, labels, icrdf_filename=icrdf_filename) diff --git a/src/createcompendia/taxon.py b/src/createcompendia/taxon.py index f9a71661..d9bbb52d 100644 --- a/src/createcompendia/taxon.py +++ b/src/createcompendia/taxon.py @@ -82,7 +82,7 @@ def build_relationships(outfile,mesh_ids): -def build_compendia(concordances, identifiers, icrdf_filename): +def build_compendia(concordances, metadata_yamls, identifiers, icrdf_filename): """:concordances: a list of files from which to read relationships :identifiers: a list of files from which to read identifiers and optional categories""" dicts = {} @@ -106,5 +106,5 @@ def build_compendia(concordances, identifiers, icrdf_filename): baretype = ORGANISM_TAXON.split(':')[-1] # We need to use extra_prefixes since UMLS is not listed as an identifier prefix at # https://biolink.github.io/biolink-model/docs/OrganismTaxon.html - write_compendium(gene_sets, f'{baretype}.txt', ORGANISM_TAXON, {}, icrdf_filename=icrdf_filename) + write_compendium(metadata_yamls, gene_sets, f'{baretype}.txt', ORGANISM_TAXON, {}, icrdf_filename=icrdf_filename) From 5d207f277f0af99f4ff79e34ae4e4d4ca73fe595 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Thu, 26 Jun 2025 14:02:12 -0400 Subject: [PATCH 005/167] Added metadata YAMLs to Snakemake dependencies. --- src/snakefiles/anatomy.snakefile | 3 ++- src/snakefiles/cell_line.snakefile | 3 ++- src/snakefiles/chemical.snakefile | 3 ++- src/snakefiles/diseasephenotype.snakefile | 10 +++++++--- src/snakefiles/gene.snakefile | 3 ++- src/snakefiles/genefamily.snakefile | 3 ++- src/snakefiles/macromolecular_complex.snakefile | 3 ++- src/snakefiles/process.snakefile | 3 ++- src/snakefiles/protein.snakefile | 3 ++- src/snakefiles/publications.snakefile | 2 ++ src/snakefiles/taxon.snakefile | 3 ++- 11 files changed, 27 insertions(+), 12 deletions(-) diff --git a/src/snakefiles/anatomy.snakefile b/src/snakefiles/anatomy.snakefile index 6e80a026..ecbfce61 100644 --- a/src/snakefiles/anatomy.snakefile +++ b/src/snakefiles/anatomy.snakefile @@ -72,13 +72,14 @@ rule anatomy_compendia: labels=os.path.join(config["download_directory"], 'common', config["common"]["labels"][0]), synonyms=os.path.join(config["download_directory"], 'common', config["common"]["synonyms"][0]), concords=expand("{dd}/anatomy/concords/{ap}",dd=config['intermediate_directory'],ap=config['anatomy_concords']), + metadata_yamls=expand("{dd}/anatomy/concords/metadata-{ap}.yaml",dd=config['intermediate_directory'],ap=config['anatomy_concords']), idlists=expand("{dd}/anatomy/ids/{ap}",dd=config['intermediate_directory'],ap=config['anatomy_ids']), icrdf_filename=config['download_directory']+'/icRDF.tsv', output: expand("{od}/compendia/{ap}", od = config['output_directory'], ap = config['anatomy_outputs']), temp(expand("{od}/synonyms/{ap}", od = config['output_directory'], ap = config['anatomy_outputs'])) run: - anatomy.build_compendia(input.concords, input.idlists, input.icrdf_filename) + anatomy.build_compendia(input.concords, input.metadata_yamls, input.idlists, input.icrdf_filename) rule check_anatomy_completeness: input: diff --git a/src/snakefiles/cell_line.snakefile b/src/snakefiles/cell_line.snakefile index b8e72965..ce52cc22 100644 --- a/src/snakefiles/cell_line.snakefile +++ 
b/src/snakefiles/cell_line.snakefile @@ -24,12 +24,13 @@ rule cell_line_compendia: ids=config['intermediate_directory']+"/cell_line/ids/CLO", labelfile=config['download_directory'] + '/CLO/labels', synonymfile=config['download_directory'] + '/CLO/synonyms', + metadatafile=config['download_directory'] + '/CLO/metadata.yaml', icrdf_filename=config['download_directory']+'/icRDF.tsv', output: config['output_directory']+"/compendia/CellLine.txt", temp(config['output_directory']+"/synonyms/CellLine.txt") run: - cell_line.build_compendia(input.ids,input.icrdf_filename) + cell_line.build_compendia(input.ids, input.metadatafile, input.icrdf_filename) rule check_cell_line_completeness: input: diff --git a/src/snakefiles/chemical.snakefile b/src/snakefiles/chemical.snakefile index 249fe30c..3f212746 100644 --- a/src/snakefiles/chemical.snakefile +++ b/src/snakefiles/chemical.snakefile @@ -217,12 +217,13 @@ rule chemical_compendia: input: typesfile = config['intermediate_directory'] + '/chemicals/partials/types', untyped_file = config['intermediate_directory'] + '/chemicals/partials/untyped_compendium', + metadata_yamls = config['intermediate_directory'] + '/chemicals/partials/metadata-untyped_compendium.yaml', icrdf_filename = config['download_directory'] + '/icRDF.tsv', output: expand("{od}/compendia/{ap}", od = config['output_directory'], ap = config['chemical_outputs']), temp(expand("{od}/synonyms/{ap}", od = config['output_directory'], ap = config['chemical_outputs'])) run: - chemicals.build_compendia(input.typesfile,input.untyped_file, input.icrdf_filename) + chemicals.build_compendia(input.typesfile, input.untyped_file, [input.metadata_yamls], input.icrdf_filename) rule check_chemical_completeness: input: diff --git a/src/snakefiles/diseasephenotype.snakefile b/src/snakefiles/diseasephenotype.snakefile index 8a77dc11..db2a04d4 100644 --- a/src/snakefiles/diseasephenotype.snakefile +++ b/src/snakefiles/diseasephenotype.snakefile @@ -141,15 +141,19 @@ rule disease_compendia: labels=expand("{dd}/{ap}/labels",dd=config['download_directory'],ap=config['disease_labelsandsynonyms']), synonyms=expand("{dd}/{ap}/synonyms",dd=config['download_directory'],ap=config['disease_labelsandsynonyms']), concords=expand("{dd}/disease/concords/{ap}",dd=config['intermediate_directory'],ap=config['disease_concords']), + metadata_yamls=expand("{dd}/disease/concords/metadata-{ap}.yaml",dd=config['intermediate_directory'],ap=config['disease_concords']), idlists=expand("{dd}/disease/ids/{ap}",dd=config['intermediate_directory'],ap=config['disease_ids']), icrdf_filename = config['download_directory'] + '/icRDF.tsv', output: expand("{od}/compendia/{ap}", od = config['output_directory'], ap = config['disease_outputs']), temp(expand("{od}/synonyms/{ap}", od = config['output_directory'], ap = config['disease_outputs'])) run: - diseasephenotype.build_compendium(input.concords,input.idlists,input.close_matches,{'HP':input.bad_hpo_xrefs, - 'MONDO':input.bad_mondo_xrefs, - 'UMLS':input.bad_umls_xrefs}, input.icrdf_filename ) + diseasephenotype.build_compendium(input.concords, input.metadata_yamls, input.idlists,input.close_matches, + { + 'HP':input.bad_hpo_xrefs, + 'MONDO':input.bad_mondo_xrefs, + 'UMLS':input.bad_umls_xrefs + }, input.icrdf_filename ) rule check_disease_completeness: input: diff --git a/src/snakefiles/gene.snakefile b/src/snakefiles/gene.snakefile index 7bb30832..11fdf7a6 100644 --- a/src/snakefiles/gene.snakefile +++ b/src/snakefiles/gene.snakefile @@ -101,13 +101,14 @@ rule gene_compendia: 
labels=expand("{dd}/{ap}/labels",dd=config['download_directory'],ap=config['gene_labels']), synonyms=expand("{dd}/{ap}/synonyms",dd=config['download_directory'],ap=config['gene_labels']), concords=expand("{dd}/gene/concords/{ap}",dd=config['intermediate_directory'],ap=config['gene_concords']), + metadata_yamls=expand("{dd}/gene/concords/metadata-{ap}.yaml",dd=config['intermediate_directory'],ap=config['gene_concords']), idlists=expand("{dd}/gene/ids/{ap}",dd=config['intermediate_directory'],ap=config['gene_ids']), icrdf_filename=config['download_directory']+'/icRDF.tsv', output: expand("{od}/compendia/{ap}", od = config['output_directory'], ap = config['gene_outputs']), temp(expand("{od}/synonyms/{ap}", od = config['output_directory'], ap = config['gene_outputs'])) run: - gene.build_gene_compendia(input.concords,input.idlists, input.icrdf_filename) + gene.build_gene_compendia(input.concords, input.metadata_yamls, input.idlists, input.icrdf_filename) rule check_gene_completeness: input: diff --git a/src/snakefiles/genefamily.snakefile b/src/snakefiles/genefamily.snakefile index a38d2729..b8eb18cb 100644 --- a/src/snakefiles/genefamily.snakefile +++ b/src/snakefiles/genefamily.snakefile @@ -24,12 +24,13 @@ rule genefamily_compendia: input: labels=expand("{dd}/{ap}/labels",dd=config['download_directory'],ap=config['genefamily_labels']), idlists=expand("{dd}/genefamily/ids/{ap}",dd=config['intermediate_directory'],ap=config['genefamily_ids']), + metadata_yamls=expand("{dd}/{ap}/metadata.yaml",dd=config['download_directory'],ap=config['genefamily_labels']), icrdf_filename=config['download_directory'] + '/icRDF.tsv', output: expand("{od}/compendia/{ap}", od = config['output_directory'], ap = config['genefamily_outputs']), temp(expand("{od}/synonyms/{ap}", od = config['output_directory'], ap = config['genefamily_outputs'])) run: - genefamily.build_compendia(input.idlists, input.icrdf_filename) + genefamily.build_compendia(input.idlists, input.metadata_yamls, input.icrdf_filename) rule check_genefamily_completeness: input: diff --git a/src/snakefiles/macromolecular_complex.snakefile b/src/snakefiles/macromolecular_complex.snakefile index cd8581f5..99df28d9 100644 --- a/src/snakefiles/macromolecular_complex.snakefile +++ b/src/snakefiles/macromolecular_complex.snakefile @@ -14,13 +14,14 @@ rule macromolecular_complex_compendia: input: labels = config['download_directory']+'/ComplexPortal/559292_labels.tsv', synonyms = config['download_directory']+'/ComplexPortal/559292_synonyms.tsv', + metadata_yaml = config['download_directory']+'/ComplexPortal/metadata.yaml', idlists = config['intermediate_directory']+'/macromolecular_complex/ids/ComplexPortal', icrdf_filename = config['download_directory'] + '/icRDF.tsv', output: config['output_directory']+'/compendia/MacromolecularComplex.txt', temp(config['output_directory']+'/synonyms/MacromolecularComplex.txt') run: - macromolecular_complex.build_compendia([input.idlists], icrdf_filename=input.icrdf_filename) + macromolecular_complex.build_compendia([input.idlists], [input.metadata_yaml], icrdf_filename=input.icrdf_filename) rule check_macromolecular_complex_completeness: input: diff --git a/src/snakefiles/process.snakefile b/src/snakefiles/process.snakefile index 79b97ffd..b652102b 100644 --- a/src/snakefiles/process.snakefile +++ b/src/snakefiles/process.snakefile @@ -90,13 +90,14 @@ rule process_compendia: labels=expand("{dd}/{ap}/labels",dd=config['download_directory'],ap=config['process_labels']), 
#synonyms=expand("{dd}/{ap}/synonyms",dd=config['download_directory'],ap=config['process_labelsandsynonyms']), concords=expand("{dd}/process/concords/{ap}",dd=config['intermediate_directory'],ap=config['process_concords']), + metadata_yamls=expand("{dd}/process/concords/metadata-{ap}.yaml",dd=config['intermediate_directory'],ap=config['process_concords']), idlists=expand("{dd}/process/ids/{ap}",dd=config['intermediate_directory'],ap=config['process_ids']), icrdf_filename=config['download_directory']+'/icRDF.tsv', output: expand("{od}/compendia/{ap}", od = config['output_directory'], ap = config['process_outputs']), temp(expand("{od}/synonyms/{ap}", od = config['output_directory'], ap = config['process_outputs'])) run: - pap.build_compendia(input.concords,input.idlists,input.icrdf_filename) + pap.build_compendia(input.concords, input.metadata_yamls, input.idlists, input.icrdf_filename) rule check_process_completeness: input: diff --git a/src/snakefiles/protein.snakefile b/src/snakefiles/protein.snakefile index b550d748..2f83e8d8 100644 --- a/src/snakefiles/protein.snakefile +++ b/src/snakefiles/protein.snakefile @@ -80,6 +80,7 @@ rule protein_compendia: labels=expand("{dd}/{ap}/labels",dd=config['download_directory'],ap=config['protein_labels']), synonyms=expand("{dd}/{ap}/synonyms",dd=config['download_directory'],ap=config['protein_synonyms']), concords=expand("{dd}/protein/concords/{ap}",dd=config['intermediate_directory'],ap=config['protein_concords']), + metadata_yamls=expand("{dd}/protein/concords/metadata-{ap}.yaml",dd=config['intermediate_directory'],ap=config['protein_concords']), idlists=expand("{dd}/protein/ids/{ap}",dd=config['intermediate_directory'],ap=config['protein_ids']), icrdf_filename=config['download_directory'] + '/icRDF.tsv', # Include the taxon information from UniProtKB @@ -88,7 +89,7 @@ rule protein_compendia: expand("{od}/compendia/{ap}", od = config['output_directory'], ap = config['protein_outputs']), temp(expand("{od}/synonyms/{ap}", od = config['output_directory'], ap = config['protein_outputs'])) run: - protein.build_protein_compendia(input.concords,input.idlists, input.icrdf_filename) + protein.build_protein_compendia(input.concords, input.metadata_yamls, input.idlists, input.icrdf_filename) rule check_protein_completeness: input: diff --git a/src/snakefiles/publications.snakefile b/src/snakefiles/publications.snakefile index 04704035..cf69cf29 100644 --- a/src/snakefiles/publications.snakefile +++ b/src/snakefiles/publications.snakefile @@ -52,6 +52,7 @@ rule generate_pubmed_compendia: titles = [ config['download_directory'] + '/PubMed/titles.tsv', ], + metadata_yaml = config['intermediate_directory'] + '/publications/concords/metadata.yaml', icrdf_filename=config['download_directory'] + '/icRDF.tsv', output: publication_compendium = config['output_directory'] + '/compendia/Publication.txt', @@ -60,6 +61,7 @@ rule generate_pubmed_compendia: run: publications.generate_compendium( [input.pmid_doi_concord_file], + [input.metadata_yaml], [input.pmid_id_file], input.titles, output.publication_compendium, diff --git a/src/snakefiles/taxon.snakefile b/src/snakefiles/taxon.snakefile index d3cdd7be..b9d6b744 100644 --- a/src/snakefiles/taxon.snakefile +++ b/src/snakefiles/taxon.snakefile @@ -50,13 +50,14 @@ rule taxon_compendia: labels=expand("{dd}/{ap}/labels",dd=config['download_directory'],ap=config['taxon_labels']), synonyms=expand("{dd}/{ap}/synonyms",dd=config['download_directory'],ap=config['taxon_synonyms']), 
concords=expand("{dd}/taxon/concords/{ap}",dd=config['intermediate_directory'],ap=config['taxon_concords']), + metadata_yamls=expand("{dd}/taxon/concords/metadata-{ap}.yaml",dd=config['intermediate_directory'],ap=config['taxon_concords']), idlists=expand("{dd}/taxon/ids/{ap}",dd=config['intermediate_directory'],ap=config['taxon_ids']), icrdf_filename=config['download_directory'] + '/icRDF.tsv', output: expand("{od}/compendia/{ap}", od = config['output_directory'], ap = config['taxon_outputs']), temp(expand("{od}/synonyms/{ap}", od = config['output_directory'], ap = config['taxon_outputs'])) run: - taxon.build_compendia(input.concords,input.idlists, input.icrdf_filename) + taxon.build_compendia(input.concords, input.metadata_yamls, input.idlists, input.icrdf_filename) rule check_taxon_completeness: input: From 8abbb8be883d7b8c54f6fbe4d7af87368164c7f6 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 30 Jun 2025 12:44:38 -0400 Subject: [PATCH 006/167] First stab at metadata files. --- src/babel_utils.py | 1 + src/createcompendia/anatomy.py | 52 ++++++++++++++++++++++++++++++-- src/metadata/provenance.py | 17 +++++++++++ src/snakefiles/anatomy.snakefile | 15 +++++++-- 4 files changed, 80 insertions(+), 5 deletions(-) create mode 100644 src/metadata/provenance.py diff --git a/src/babel_utils.py b/src/babel_utils.py index bd67de48..a1fd9891 100644 --- a/src/babel_utils.py +++ b/src/babel_utils.py @@ -562,6 +562,7 @@ def write_compendium(metadata_yamls, synonym_list, ofname, node_type, labels={}, metadata_dir = os.path.join(cdir,'metadata') os.makedirs(metadata_dir, exist_ok=True) with open(os.path.join(cdir, ofname + '.yaml'), 'w') as outf: + # TODO: move into metadata/provenance.py metadata = { 'type': 'compendium', 'name': ofname, diff --git a/src/createcompendia/anatomy.py b/src/createcompendia/anatomy.py index e3261d86..c0af5871 100644 --- a/src/createcompendia/anatomy.py +++ b/src/createcompendia/anatomy.py @@ -2,6 +2,7 @@ import requests import src.datahandlers.obo as obo +from src.metadata.provenance import write_concord_metadata from src.util import Text from src.prefixes import MESH, NCIT, CL, GO, UBERON, SNOMEDCT, WIKIDATA, UMLS, FMA @@ -91,14 +92,43 @@ def write_umls_ids(mrsty, outfile): #CL only shows up as an xref once in uberon, and it's a mistake. It doesn't show up in anything else. #GO only shows up as an xref once in uberon, and it's a mistake. It doesn't show up in anything else. #PMID is just wrong -def build_anatomy_obo_relationships(outdir): +def build_anatomy_obo_relationships(outdir, metadata_yamls): ignore_list = ['PMID','BTO','BAMS','FMA','CALOHA','GOC','WIKIPEDIA.EN','CL','GO','NIF_SUBCELLULAR','HTTP','OPENCYC'] #Create the equivalence pairs with open(f'{outdir}/{UBERON}', 'w') as uberon, open(f'{outdir}/{GO}', 'w') as go, open(f'{outdir}/{CL}', 'w') as cl: build_sets(f'{UBERON}:0001062', {UBERON:uberon, GO:go, CL:cl},'xref', ignore_list=ignore_list) build_sets(f'{GO}:0005575', {UBERON:uberon, GO:go, CL:cl},'xref', ignore_list=ignore_list) + # CL is now being handled by Wikidata (build_wikidata_cell_relationships), so we can probably remove it from here. -def build_wikidata_cell_relationships(outdir): + # Write out metadata. 
+ write_concord_metadata(metadata_yamls['UBERON'], + name='build_anatomy_obo_relationships()', + sources=[ + { + 'type': 'UberGraph', + 'name': 'UBERON' + } + ], + description=f'get_subclasses_and_xrefs() of {UBERON}:0001062' + ) + write_concord_metadata(metadata_yamls['GO'], + name='build_anatomy_obo_relationships()', + sources=[ + { + 'type': 'UberGraph', + 'name': 'GO' + } + ], + description=f'get_subclasses_and_xrefs() of {GO}:0005575' + ) + # TODO: delete + write_concord_metadata(metadata_yamls['CL'], + name='build_anatomy_obo_relationships()', + sources=[], + description='' + ) + +def build_wikidata_cell_relationships(outdir, metadata_yaml): #This sparql returns all the wikidata items that have a UMLS identifier and a CL identifier sparql = """PREFIX wdt: PREFIX wdtn: @@ -135,8 +165,26 @@ def build_wikidata_cell_relationships(outdir): else: print(f'Pair {pair} is not unique {counts[pair[0]]} {counts[pair[1]]}') + # Write out metadata + write_concord_metadata(metadata_yaml, { + 'name': 'build_wikidata_cell_relationships()', + 'sources': [{ + 'type': 'Frink', + 'name': 'Frink Direct Normalized Graph via SPARQL' + }], + 'description': 'wd:P7963 ("Cell Ontology ID") and wd:P2892 ("UMLS CUI") from Wikidata', + }) + def build_anatomy_umls_relationships(mrconso, idfile,outfile): umls.build_sets(mrconso, idfile, outfile, {'SNOMEDCT_US':SNOMEDCT,'MSH': MESH, 'NCI': NCIT, 'GO': GO, 'FMA': FMA}) + write_concord_metadata(outfile, { + 'name': 'build_anatomy_umls_relationships()', + 'sources': [{ + 'type': 'UMLS', + 'name': 'MRCONSO' + }], + 'description': 'umls.build_sets() of UMLS MRCONSO with prefixes: SNOMEDCT_US, MSH, NCI, GO, FMA', + }) def build_compendia(concordances, metadata_yamls, identifiers, icrdf_filename): """:concordances: a list of files from which to read relationships diff --git a/src/metadata/provenance.py b/src/metadata/provenance.py new file mode 100644 index 00000000..4b7c3558 --- /dev/null +++ b/src/metadata/provenance.py @@ -0,0 +1,17 @@ +from datetime import datetime + +import yaml + + +def write_concord_metadata(filename, name, description='', sources=[]): + write_metadata(filename, 'concord', name, description, sources) + +def write_metadata(filename, typ, name, description='', sources=[]): + with open(filename, 'w') as fout: + yaml.dump({ + 'created_at': datetime.now().isoformat(), + 'type': typ, + 'name': name, + 'description': description, + 'sources': sources, + }) diff --git a/src/snakefiles/anatomy.snakefile b/src/snakefiles/anatomy.snakefile index ecbfce61..f0070eb6 100644 --- a/src/snakefiles/anatomy.snakefile +++ b/src/snakefiles/anatomy.snakefile @@ -49,14 +49,22 @@ rule get_anatomy_obo_relationships: config['intermediate_directory']+'/anatomy/concords/UBERON', config['intermediate_directory']+'/anatomy/concords/CL', config['intermediate_directory']+'/anatomy/concords/GO', + uberon_metadata=config['intermediate_directory']+'/anatomy/concords/metadata-UberGraph.yaml', + cl_metadata=config['intermediate_directory']+'/anatomy/concords/metadata-CL.yaml', + go_metadata=config['intermediate_directory']+'/anatomy/concords/metadata-GO.yaml', run: - anatomy.build_anatomy_obo_relationships(config['intermediate_directory']+'/anatomy/concords') + anatomy.build_anatomy_obo_relationships(config['intermediate_directory']+'/anatomy/concords', { + 'UBERON': output.uberon_metadata, + 'CL': output.cl_metadata, + 'GO': output.go_metadata, + }) rule get_wikidata_cell_relationships: output: config['intermediate_directory']+'/anatomy/concords/WIKIDATA', + 
wikidata_metadata=config['intermediate_directory']+'/anatomy/concords/metadata-WIKIDATA.yaml', run: - anatomy.build_wikidata_cell_relationships(config['intermediate_directory']+'/anatomy/concords') + anatomy.build_wikidata_cell_relationships(config['intermediate_directory']+'/anatomy/concords', output.wikidata_metadata) rule get_anatomy_umls_relationships: input: @@ -64,8 +72,9 @@ rule get_anatomy_umls_relationships: infile=config['intermediate_directory']+"/anatomy/ids/UMLS" output: outfile=config['intermediate_directory']+'/anatomy/concords/UMLS', + umls_metadata=config['intermediate_directory']+'/anatomy/concords/metadata-UMLS.yaml', run: - anatomy.build_anatomy_umls_relationships(input.mrconso, input.infile, output.outfile) + anatomy.build_anatomy_umls_relationships(input.mrconso, input.infile, output.outfile, output.umls_metadata) rule anatomy_compendia: input: From c8ed59b2d5e6ec2f029b24c8a0f73d5a99e2abe9 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 30 Jun 2025 12:45:42 -0400 Subject: [PATCH 007/167] Fixed typo. --- src/snakefiles/anatomy.snakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/snakefiles/anatomy.snakefile b/src/snakefiles/anatomy.snakefile index f0070eb6..a751f445 100644 --- a/src/snakefiles/anatomy.snakefile +++ b/src/snakefiles/anatomy.snakefile @@ -49,7 +49,7 @@ rule get_anatomy_obo_relationships: config['intermediate_directory']+'/anatomy/concords/UBERON', config['intermediate_directory']+'/anatomy/concords/CL', config['intermediate_directory']+'/anatomy/concords/GO', - uberon_metadata=config['intermediate_directory']+'/anatomy/concords/metadata-UberGraph.yaml', + uberon_metadata=config['intermediate_directory']+'/anatomy/concords/metadata-UBERON.yaml', cl_metadata=config['intermediate_directory']+'/anatomy/concords/metadata-CL.yaml', go_metadata=config['intermediate_directory']+'/anatomy/concords/metadata-GO.yaml', run: From a5af5183232bddaa1e84a3b0abd489ee38451416 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 30 Jun 2025 12:54:25 -0400 Subject: [PATCH 008/167] Fixed build_anatomy_umls_relationships() params. --- src/createcompendia/anatomy.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/createcompendia/anatomy.py b/src/createcompendia/anatomy.py index c0af5871..d252223b 100644 --- a/src/createcompendia/anatomy.py +++ b/src/createcompendia/anatomy.py @@ -175,9 +175,9 @@ def build_wikidata_cell_relationships(outdir, metadata_yaml): 'description': 'wd:P7963 ("Cell Ontology ID") and wd:P2892 ("UMLS CUI") from Wikidata', }) -def build_anatomy_umls_relationships(mrconso, idfile,outfile): +def build_anatomy_umls_relationships(mrconso, idfile, outfile, umls_metadata): umls.build_sets(mrconso, idfile, outfile, {'SNOMEDCT_US':SNOMEDCT,'MSH': MESH, 'NCI': NCIT, 'GO': GO, 'FMA': FMA}) - write_concord_metadata(outfile, { + write_concord_metadata(umls_metadata, { 'name': 'build_anatomy_umls_relationships()', 'sources': [{ 'type': 'UMLS', From 519641b303569d0ea9eb544bf10175355047dc3d Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 30 Jun 2025 12:54:46 -0400 Subject: [PATCH 009/167] Added metadata for CellLine. 
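The new write_download_metadata() helper lets a datahandler leave a provenance record next to the file it downloads. A short sketch of the CLO case, using the helper and URL from this commit; the output path shown is illustrative (in the snakefile it is built from config['download_directory']):

    from src.metadata.provenance import write_download_metadata

    # Recorded right after pull_via_urllib() fetches clo.owl.
    write_download_metadata(
        'babel_downloads/CLO/metadata.yaml',   # actual path comes from config['download_directory']
        name='Cell Line Ontology',
        url='http://purl.obolibrary.org/obo/clo.owl',
    )
    # The YAML written by write_metadata() carries: created_at, type ('download'),
    # name, url, description, and sources.
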
--- src/datahandlers/clo.py | 7 ++++++- src/metadata/provenance.py | 5 ++++- src/snakefiles/cell_line.snakefile | 2 +- src/snakefiles/datacollect.snakefile | 5 +++-- 4 files changed, 14 insertions(+), 5 deletions(-) diff --git a/src/datahandlers/clo.py b/src/datahandlers/clo.py index 018f8d44..625c02e6 100644 --- a/src/datahandlers/clo.py +++ b/src/datahandlers/clo.py @@ -1,6 +1,7 @@ import logging import re +from src.metadata.provenance import write_download_metadata from src.prefixes import CLO from src.categories import CELL_LINE from src.babel_utils import pull_via_urllib @@ -10,8 +11,12 @@ logger = LoggingUtil.init_logging(__name__, level=logging.WARNING) -def pull_clo(): +def pull_clo(metadata_file): _=pull_via_urllib('http://purl.obolibrary.org/obo/','clo.owl', subpath='CLO', decompress=False) + write_download_metadata(metadata_file, + name='Cell Line Ontology', + url='http://purl.obolibrary.org/obo/clo.owl', + ) class CLOgraph: """Load the file for querying""" diff --git a/src/metadata/provenance.py b/src/metadata/provenance.py index 4b7c3558..5d8ecb38 100644 --- a/src/metadata/provenance.py +++ b/src/metadata/provenance.py @@ -2,16 +2,19 @@ import yaml +def write_download_metadata(filename, name, url='', description='', sources=[]): + write_metadata(filename, 'download', name, url, description, sources) def write_concord_metadata(filename, name, description='', sources=[]): write_metadata(filename, 'concord', name, description, sources) -def write_metadata(filename, typ, name, description='', sources=[]): +def write_metadata(filename, typ, name, sources=[], url='', description=''): with open(filename, 'w') as fout: yaml.dump({ 'created_at': datetime.now().isoformat(), 'type': typ, 'name': name, + 'url': url, 'description': description, 'sources': sources, }) diff --git a/src/snakefiles/cell_line.snakefile b/src/snakefiles/cell_line.snakefile index ce52cc22..e40bb8f0 100644 --- a/src/snakefiles/cell_line.snakefile +++ b/src/snakefiles/cell_line.snakefile @@ -11,7 +11,7 @@ rule get_clo_ids: input: infile=config['download_directory']+"/CLO/clo.owl" output: - outfile=config['intermediate_directory']+"/cell_line/ids/CLO" + outfile=config['intermediate_directory']+"/cell_line/ids/CLO", run: clo.write_clo_ids(input.infile, output.outfile) diff --git a/src/snakefiles/datacollect.snakefile b/src/snakefiles/datacollect.snakefile index aa9641f6..16f331b8 100644 --- a/src/snakefiles/datacollect.snakefile +++ b/src/snakefiles/datacollect.snakefile @@ -641,9 +641,10 @@ rule get_chebi: rule get_clo: output: - config['download_directory']+'/CLO/clo.owl' + config['download_directory']+'/CLO/clo.owl', + metadata=config['download_directory']+'/CLO/metadata.yaml', run: - clo.pull_clo() + clo.pull_clo(output.metadata) rule get_CLO_labels: input: From 0c5bc87d81e8c93353f20b13047d73a00815a491 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 30 Jun 2025 14:18:32 -0400 Subject: [PATCH 010/167] Fixed some metadata output. --- src/babel_utils.py | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/src/babel_utils.py b/src/babel_utils.py index a1fd9891..53216870 100644 --- a/src/babel_utils.py +++ b/src/babel_utils.py @@ -561,7 +561,7 @@ def write_compendium(metadata_yamls, synonym_list, ofname, node_type, labels={}, # Write out the metadata.yaml file combining information from all the metadata.yaml files. 
metadata_dir = os.path.join(cdir,'metadata') os.makedirs(metadata_dir, exist_ok=True) - with open(os.path.join(cdir, ofname + '.yaml'), 'w') as outf: + with open(os.path.join(cdir, 'metadata', ofname + '.yaml'), 'w') as outf: # TODO: move into metadata/provenance.py metadata = { 'type': 'compendium', @@ -575,18 +575,22 @@ def write_compendium(metadata_yamls, synonym_list, ofname, node_type, labels={}, 'concords': {} } for metadata_yaml in metadata_yamls: - metadata_block = yaml.safe_load(metadata_yaml) - if metadata_block is None: - raise ValueError("Metadata file {metadata_yaml} is empty.") - - metadata_name = metadata_block['name'] - - if metadata_name in metadata['concords']: - logging.error(f"Duplicate metadata block name {metadata_name}!") - logging.error(f"New metadata block from {metadata_yaml}: {metadata_block}!") - logging.error(f"Existing metadata block: {metadata['concords'][metadata_name]}!") - raise ValueError(f"Metadata file {metadata_yaml} is named {metadata_name}, but this has already been loaded.") - metadata['concords'][metadata_name] = metadata_block + with open(metadata_yaml, 'r') as metaf: + metadata_block = yaml.safe_load(metaf) + if metadata_block is None or metadata_block == {}: + raise ValueError("Metadata file {metadata_yaml} is empty.") + + if 'name' not in metadata_block: + raise ValueError(f"Metadata file {metadata_yaml} is missing a 'name' field: {metadata_block}") + + metadata_name = metadata_block['name'] + + if metadata_name in metadata['concords']: + logging.error(f"Duplicate metadata block name {metadata_name}!") + logging.error(f"New metadata block from {metadata_yaml}: {metadata_block}!") + logging.error(f"Existing metadata block: {metadata['concords'][metadata_name]}!") + raise ValueError(f"Metadata file {metadata_yaml} is named {metadata_name}, but this has already been loaded.") + metadata['concords'][metadata_name] = metadata_block outf.write(yaml.dump(metadata)) From 650ebb474eeb4d9c55633465318e415aa65ce522 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 30 Jun 2025 14:18:57 -0400 Subject: [PATCH 011/167] Fixed some issues. --- src/metadata/provenance.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/metadata/provenance.py b/src/metadata/provenance.py index 5d8ecb38..f292d6e9 100644 --- a/src/metadata/provenance.py +++ b/src/metadata/provenance.py @@ -3,10 +3,10 @@ import yaml def write_download_metadata(filename, name, url='', description='', sources=[]): - write_metadata(filename, 'download', name, url, description, sources) + write_metadata(filename, 'download', name, url=url, description=description, sources=sources) -def write_concord_metadata(filename, name, description='', sources=[]): - write_metadata(filename, 'concord', name, description, sources) +def write_concord_metadata(filename, name, url='', description='', sources=[]): + write_metadata(filename, 'concord', name, url=url, description=description, sources=sources) def write_metadata(filename, typ, name, sources=[], url='', description=''): with open(filename, 'w') as fout: @@ -17,4 +17,4 @@ def write_metadata(filename, typ, name, sources=[], url='', description=''): 'url': url, 'description': description, 'sources': sources, - }) + }, fout) From 2fa76f30cd6196692ac9ed83d039ced03548ddad Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 30 Jun 2025 14:19:15 -0400 Subject: [PATCH 012/167] Fixed some cell_line bugs. 
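Wrapping the single file in a list matters because write_compendium() iterates over metadata_yamls and keys each parsed block by its 'name' field. Roughly the merged file it then writes to babel_outputs/metadata/CellLine.txt.yaml, shown here as the equivalent Python dict; the timestamp is illustrative and the counts come from the counters in write_compendium():

    merged_metadata = {
        'type': 'compendium',
        'name': 'CellLine.txt',
        'created_at': '2025-06-30T14:18:32',    # datetime.now().isoformat()
        'counts': {'cliques': ..., 'eq_ids': ..., 'synonyms': ...},
        'concords': {
            # One entry per input metadata YAML, keyed by its 'name' field;
            # a duplicate name raises ValueError.
            'Cell Line Ontology': {
                'type': 'download',
                'name': 'Cell Line Ontology',
                'url': 'http://purl.obolibrary.org/obo/clo.owl',
                'description': '',
                'sources': [],
            },
        },
    }
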
--- src/snakefiles/cell_line.snakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/snakefiles/cell_line.snakefile b/src/snakefiles/cell_line.snakefile index e40bb8f0..3d032776 100644 --- a/src/snakefiles/cell_line.snakefile +++ b/src/snakefiles/cell_line.snakefile @@ -30,7 +30,7 @@ rule cell_line_compendia: config['output_directory']+"/compendia/CellLine.txt", temp(config['output_directory']+"/synonyms/CellLine.txt") run: - cell_line.build_compendia(input.ids, input.metadatafile, input.icrdf_filename) + cell_line.build_compendia(input.ids, [input.metadatafile], input.icrdf_filename) rule check_cell_line_completeness: input: From baf970b0f8abc88345a9930044450e5d51114ab0 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 30 Jun 2025 15:07:17 -0400 Subject: [PATCH 013/167] First stab at adding metadata to Chemical. --- src/createcompendia/chemicals.py | 20 +++++++++++++++++++- src/snakefiles/chemical.snakefile | 4 +++- 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/src/createcompendia/chemicals.py b/src/createcompendia/chemicals.py index 82af73aa..c63f8158 100644 --- a/src/createcompendia/chemicals.py +++ b/src/createcompendia/chemicals.py @@ -1,11 +1,15 @@ import logging from collections import defaultdict +from datetime import datetime + import jsonlines import requests import ast import gzip from gzip import GzipFile +import yaml + from src.ubergraph import UberGraph from src.prefixes import MESH, CHEBI, UNII, DRUGBANK, INCHIKEY, PUBCHEMCOMPOUND,GTOPDB, KEGGCOMPOUND, DRUGCENTRAL, CHEMBLCOMPOUND, UMLS, RXCUI from src.categories import MOLECULAR_MIXTURE, SMALL_MOLECULE, CHEMICAL_ENTITY, POLYPEPTIDE, COMPLEX_MOLECULAR_MIXTURE, CHEMICAL_MIXTURE, DRUG @@ -521,7 +525,7 @@ def get_wikipedia_relationships(outfile): for m,c in pairs: outf.write(f'{m}\txref\t{c}\n') -def build_untyped_compendia(concordances, identifiers,unichem_partial, untyped_concord, type_file): +def build_untyped_compendia(concordances, identifiers,unichem_partial, untyped_concord, type_file, metadata_yaml, input_metadata_yamls): """:concordances: a list of files from which to read relationships :identifiers: a list of files from which to read identifiers and optional categories""" dicts = read_partial_unichem(unichem_partial) @@ -567,6 +571,20 @@ def build_untyped_compendia(concordances, identifiers,unichem_partial, untyped_c for s in untyped_sets: outf.write(f'{set(s)}\n') + # Build the metadata file by combining the input metadata_yamls. 
+ metadata = { + 'type': 'untyped_compendium', + 'name': 'build_untyped_compendia()', + 'created_at': datetime.now().isoformat(), + 'sources': [] + } + for metadata_yaml in input_metadata_yamls: + with open(metadata_yaml, 'r') as metaf: + metadata_block = yaml.safe_load(metaf) + if metadata_block is None: + raise ValueError("Metadata file {metadata_yaml} is empty.") + metadata['sources'].append(metadata_block) + def build_compendia(type_file, untyped_compendia_file, metadata_yamls, icrdf_filename): types = {} with open(type_file,'r') as inf: diff --git a/src/snakefiles/chemical.snakefile b/src/snakefiles/chemical.snakefile index 3f212746..4b62062b 100644 --- a/src/snakefiles/chemical.snakefile +++ b/src/snakefiles/chemical.snakefile @@ -205,12 +205,14 @@ rule untyped_chemical_compendia: synonyms=expand("{dd}/{ap}/synonyms",dd=config['download_directory'],ap=config['chemical_synonyms']), unichemgroup = config['intermediate_directory']+'/chemicals/partials/UNICHEM', concords = expand('{dd}/chemicals/concords/{cc}',dd=config['intermediate_directory'], cc=config['chemical_concords'] ), + metadata_yamls = expand('{dd}/chemicals/concords/metadata-{cc}.yaml',dd=config['intermediate_directory'], cc=config['chemical_concords'] ), idlists=expand("{dd}/chemicals/ids/{ap}",dd=config['intermediate_directory'],ap=config['chemical_ids']), output: typesfile = config['intermediate_directory'] + '/chemicals/partials/types', untyped_file = config['intermediate_directory'] + '/chemicals/partials/untyped_compendium', + untyped_meta = config['intermediate_directory'] + '/chemicals/partials/metadata-untyped_compendium.yaml' run: - chemicals.build_untyped_compendia(input.concords,input.idlists,input.unichemgroup,output.untyped_file,output.typesfile) + chemicals.build_untyped_compendia(input.concords,input.idlists,input.unichemgroup,output.untyped_file,output.typesfile, output.untyped_meta, input.metadata_yamls) rule chemical_compendia: From 48c906444ac36ade52ca383983c38c7d84f23ab7 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 30 Jun 2025 23:40:53 -0400 Subject: [PATCH 014/167] Turned off DRY_RUN for testing. --- scripts/babel-build.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/babel-build.sh b/scripts/babel-build.sh index 653de19e..72c3c24a 100644 --- a/scripts/babel-build.sh +++ b/scripts/babel-build.sh @@ -11,7 +11,7 @@ export CORES=5 # Dry run: if true, run Snakemake in a dry run. -export DRY_RUN=1 +export DRY_RUN= # Verbose: if set, produce verbose output. export VERBOSE= From e97ac425dd8cdfcc6f5bfd2dde74cdbe2fd7b01f Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 30 Jun 2025 23:41:13 -0400 Subject: [PATCH 015/167] Added metadata outputs for cell_line and anatomy. --- src/babel_utils.py | 2 +- src/snakefiles/anatomy.snakefile | 3 ++- src/snakefiles/cell_line.snakefile | 3 ++- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/src/babel_utils.py b/src/babel_utils.py index 53216870..e94dac3e 100644 --- a/src/babel_utils.py +++ b/src/babel_utils.py @@ -592,7 +592,7 @@ def write_compendium(metadata_yamls, synonym_list, ofname, node_type, labels={}, raise ValueError(f"Metadata file {metadata_yaml} is named {metadata_name}, but this has already been loaded.") metadata['concords'][metadata_name] = metadata_block - outf.write(yaml.dump(metadata)) + yaml.dump(metadata, outf) def glom(conc_set, newgroups, unique_prefixes=['INCHIKEY'],pref='HP',close={}): """We want to construct sets containing equivalent identifiers. 
diff --git a/src/snakefiles/anatomy.snakefile b/src/snakefiles/anatomy.snakefile index a751f445..ed23182a 100644 --- a/src/snakefiles/anatomy.snakefile +++ b/src/snakefiles/anatomy.snakefile @@ -86,7 +86,8 @@ rule anatomy_compendia: icrdf_filename=config['download_directory']+'/icRDF.tsv', output: expand("{od}/compendia/{ap}", od = config['output_directory'], ap = config['anatomy_outputs']), - temp(expand("{od}/synonyms/{ap}", od = config['output_directory'], ap = config['anatomy_outputs'])) + temp(expand("{od}/synonyms/{ap}", od = config['output_directory'], ap = config['anatomy_outputs'])), + expand("{od}/metadata/{ap}", od = config['output_directory'], ap = config['anatomy_outputs']), run: anatomy.build_compendia(input.concords, input.metadata_yamls, input.idlists, input.icrdf_filename) diff --git a/src/snakefiles/cell_line.snakefile b/src/snakefiles/cell_line.snakefile index 3d032776..0330f6eb 100644 --- a/src/snakefiles/cell_line.snakefile +++ b/src/snakefiles/cell_line.snakefile @@ -28,7 +28,8 @@ rule cell_line_compendia: icrdf_filename=config['download_directory']+'/icRDF.tsv', output: config['output_directory']+"/compendia/CellLine.txt", - temp(config['output_directory']+"/synonyms/CellLine.txt") + temp(config['output_directory']+"/synonyms/CellLine.txt"), + config['output_directory']+"/metadata/CellLine.txt.yaml", run: cell_line.build_compendia(input.ids, [input.metadatafile], input.icrdf_filename) From b444e1cd65eceb543f5a59a43583230426684fa9 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Tue, 1 Jul 2025 00:48:53 -0400 Subject: [PATCH 016/167] Fixed sources. --- src/metadata/provenance.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/metadata/provenance.py b/src/metadata/provenance.py index f292d6e9..79d83c01 100644 --- a/src/metadata/provenance.py +++ b/src/metadata/provenance.py @@ -2,13 +2,15 @@ import yaml -def write_download_metadata(filename, name, url='', description='', sources=[]): +def write_download_metadata(filename, name, url='', description='', sources=None): write_metadata(filename, 'download', name, url=url, description=description, sources=sources) -def write_concord_metadata(filename, name, url='', description='', sources=[]): +def write_concord_metadata(filename, name, url='', description='', sources=None): write_metadata(filename, 'concord', name, url=url, description=description, sources=sources) -def write_metadata(filename, typ, name, sources=[], url='', description=''): +def write_metadata(filename, typ, name, sources=None, url='', description=''): + if sources is None: + sources = [] with open(filename, 'w') as fout: yaml.dump({ 'created_at': datetime.now().isoformat(), From fe9ed8111846a452112f5d443c99fbc1542898d3 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Tue, 1 Jul 2025 00:49:14 -0400 Subject: [PATCH 017/167] First stab at Chemical concord metadata. 
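Each chemical concord builder now takes a metadata_yaml path and records its own provenance with write_concord_metadata(); the matching metadata-*.yaml files are declared as rule outputs so they can later be combined into the compendium metadata. Roughly the field set of one such per-concord file, shown as a round trip with invented values rather than output from a real build:

    # Field set written by write_concord_metadata(); the values are invented for
    # illustration and are not taken from an actual Babel run.
    import yaml

    example = {
        "created_at": "2025-07-01T12:00:00",
        "type": "concord",
        "name": "build_drugcentral_relations()",
        "url": "",
        "description": "Build xrefs from DrugCentral to other chemical prefixes.",
        "sources": [{"name": "DrugCentral xrefs"}],
    }
    text = yaml.dump(example)           # roughly what lands in metadata-DrugCentral.yaml
    assert yaml.safe_load(text) == example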
--- src/createcompendia/chemicals.py | 99 +++++++++++++++++++++++++++---- src/snakefiles/chemical.snakefile | 42 ++++++++----- 2 files changed, 115 insertions(+), 26 deletions(-) diff --git a/src/createcompendia/chemicals.py b/src/createcompendia/chemicals.py index c63f8158..7043d393 100644 --- a/src/createcompendia/chemicals.py +++ b/src/createcompendia/chemicals.py @@ -10,6 +10,7 @@ import yaml +from src.metadata.provenance import write_concord_metadata from src.ubergraph import UberGraph from src.prefixes import MESH, CHEBI, UNII, DRUGBANK, INCHIKEY, PUBCHEMCOMPOUND,GTOPDB, KEGGCOMPOUND, DRUGCENTRAL, CHEMBLCOMPOUND, UMLS, RXCUI from src.categories import MOLECULAR_MIXTURE, SMALL_MOLECULE, CHEMICAL_ENTITY, POLYPEPTIDE, COMPLEX_MOLECULAR_MIXTURE, CHEMICAL_MIXTURE, DRUG @@ -74,12 +75,32 @@ def write_rxnorm_ids(infile, outfile): umlsmap ["A1.4.1.1.1"] = DRUG umls.write_rxnorm_ids(umlsmap, filter_types, infile, outfile, prefix=RXCUI, styfile="RXNSTY.RRF") -def build_chemical_umls_relationships(mrconso, idfile,outfile): +def build_chemical_umls_relationships(mrconso, idfile,outfile, metadata_yaml): umls.build_sets(mrconso, idfile, outfile, {'MSH': MESH, 'DRUGBANK': DRUGBANK, 'RXNORM': RXCUI }) -def build_chemical_rxnorm_relationships(conso, idfile,outfile): + write_concord_metadata( + metadata_yaml, + name='build_chemical_umls_relationships()', + sources=[{ + 'type': 'UMLS', + 'name': 'MRCONSO' + }], + description='umls.build_sets() of UMLS MRCONSO with prefixes: MSH, DRUGBANK, RXNORM', + ) + +def build_chemical_rxnorm_relationships(conso, idfile,outfile, metadata_yaml): umls.build_sets(conso, idfile, outfile, {'MSH': MESH, 'DRUGBANK': DRUGBANK}, cui_prefix=RXCUI) + write_concord_metadata( + metadata_yaml, + name='build_chemical_rxnorm_relationships()', + sources=[{ + 'type': 'UMLS', + 'name': 'MRCONSO' + }], + description=f'umls.build_sets() of {RXNORM} MRCONSO with prefixes: {MESH}, {DRUGBANK}', + ) + def write_pubchem_ids(labelfile,smilesfile,outfile): #Trying to be memory efficient here. We could just ingest the whole smilesfile which would make this code easier # but since they're already sorted, let's give it a shot @@ -362,14 +383,20 @@ def is_cas(thing): return False return True -def make_pubchem_cas_concord(pubchemsynonyms, outfile): +def make_pubchem_cas_concord(pubchemsynonyms, outfile, metadata_yaml): with open(pubchemsynonyms,'r') as inf, open(outfile,'w') as outf: for line in inf: x = line.strip().split('\t') if is_cas(x[1]): outf.write(f'{x[0]}\txref\tCAS:{x[1]}\n') -def make_pubchem_mesh_concord(pubcheminput,meshlabels,outfile): + write_concord_metadata( + metadata_yaml, + name='make_pubchem_cas_concord()', + description=f'make_pubchem_cas_concord() creates xrefs from PUBCHEM identifiers in the PubChem synonyms file ({pubchemsynonyms}) to Chemical Abstracts Service (CAS) identifiers.', + ) + +def make_pubchem_mesh_concord(pubcheminput,meshlabels,outfile, metadata_yaml): mesh_label_to_id={} #Meshlabels has all kinds of stuff. e.g. 
these are both in there: #MESH:D014867 Water @@ -397,7 +424,13 @@ def make_pubchem_mesh_concord(pubcheminput,meshlabels,outfile): outf.write(f'{PUBCHEMCOMPOUND}:{x[0]}\txref\t{mesh_id}\n') used_pubchem.add(x[0]) -def build_drugcentral_relations(infile,outfile): + write_concord_metadata( + metadata_yaml, + name='make_pubchem_mesh_concord()', + description=f'make_pubchem_mesh_concord() loads MeSH labels from {meshlabels}, then creates xrefs from PubChem identifiers in the PubChem input file ({pubcheminput}) to those MeSH identifiers using the labels as keys.', + ) + +def build_drugcentral_relations(infile,outfile, metadata_yaml): prefixmap = { 'CHEBI': CHEBI, 'ChEMBL_ID': CHEMBLCOMPOUND, 'DRUGBANK_ID': DRUGBANK, @@ -421,8 +454,13 @@ def build_drugcentral_relations(infile,outfile): #print('ok') outf.write(f'{DRUGCENTRAL}:{parts[drugcentral_id_col]}\txref\t{prefixmap[external_ns]}:{parts[external_id_col]}\n') + write_concord_metadata( + metadata_yaml, + name='build_drugcentral_relations()', + description=f'Build xrefs from DrugCentral ({infile}) to {DRUGCENTRAL} using the prefix map {prefixmap}.', + ) -def make_gtopdb_relations(infile,outfile): +def make_gtopdb_relations(infile,outfile, metadata_yaml): with open(infile,'r') as inf, open(outfile,'w') as outf: h = inf.readline() # We might have a header/version line. If so, skip to the next line. @@ -439,7 +477,13 @@ def make_gtopdb_relations(infile,outfile): inchi = f'{INCHIKEY}:{x[inchi_index][1:-1]}' outf.write(f'{gid}\txref\t{inchi}\n') -def make_chebi_relations(sdf,dbx,outfile): + write_concord_metadata( + metadata_yaml, + name='make_gtopdb_relations()', + description=f'Transform Ligand ID/InChIKey mappings from {infile} into a concord.' + ) + +def make_chebi_relations(sdf,dbx,outfile,metadata_yaml): """CHEBI contains relations both about chemicals with and without inchikeys. You might think that because everything is based on unichem, we could avoid the with structures part, but history has shown that we lose links in that case, so we will use both the structured and unstructured chemical entries.""" @@ -483,10 +527,14 @@ def make_chebi_relations(sdf,dbx,outfile): if x[3] == 'Pubchem accession': outf.write(f'{cid}\txref\t{PUBCHEMCOMPOUND}:{x[4]}\n') + write_concord_metadata( + metadata_yaml, + name='make_chebi_relations()', + description=f'make_chebi_relations() creates xrefs from the ChEBI database ({sdf}) to {PUBCHEMCOMPOUND} and {KEGGCOMPOUND}.', + ) - -def get_mesh_relationships(mesh_id_file,cas_out, unii_out): +def get_mesh_relationships(mesh_id_file,cas_out, unii_out, cas_metadata, unii_metadata): meshes = set() with open(mesh_id_file,'r') as inf: for line in inf: @@ -508,7 +556,29 @@ def get_mesh_relationships(mesh_id_file,cas_out, unii_out): #is a unii? uniiout.write(f'{meshid}\txref\tUNII:{reg}\n') -def get_wikipedia_relationships(outfile): + write_concord_metadata( + cas_metadata, + name='get_mesh_relationships()', + sources=[{ + 'type': 'MeSH Registry', + 'name': 'MeSH Registry', + }], + description=f'get_mesh_relationships() iterates through the MeSH registry, filters it to the MeSH IDs ' + f'in {mesh_id_file}, then writes out CAS mappings to {cas_out}' + ) + + write_concord_metadata( + unii_metadata, + name='get_mesh_relationships()', + sources=[{ + 'type': 'MeSH Registry', + 'name': 'MeSH Registry', + }], + description=f'get_mesh_relationships() iterates through the MeSH registry, filters it to the MeSH IDs ' + f'in {mesh_id_file}, then writes out non-CAS mappings (i.e. 
UNII mappings) to {unii_out}' + ) + +def get_wikipedia_relationships(outfile, metadata_yaml): url = 'https://query.wikidata.org/sparql?format=json&query=SELECT ?chebi ?mesh WHERE { ?compound wdt:P683 ?chebi . ?compound wdt:P486 ?mesh. }' results = requests.get(url).json() pairs = [(f'{MESH}:{r["mesh"]["value"]}', f'{CHEBI}:{r["chebi"]["value"]}') @@ -525,6 +595,15 @@ def get_wikipedia_relationships(outfile): for m,c in pairs: outf.write(f'{m}\txref\t{c}\n') + write_concord_metadata( + metadata_yaml, + sources=[{ + 'type': 'Wikidata', + 'name': 'Wikidata SPARQL query', + }], + description='Wikidata SPARQL query to find Wikidata entities with both CHEBI and MESH IDs, and build a concordance between them.', + ) + def build_untyped_compendia(concordances, identifiers,unichem_partial, untyped_concord, type_file, metadata_yaml, input_metadata_yamls): """:concordances: a list of files from which to read relationships :identifiers: a list of files from which to read identifiers and optional categories""" diff --git a/src/snakefiles/chemical.snakefile b/src/snakefiles/chemical.snakefile index 4b62062b..4f8d4dc8 100644 --- a/src/snakefiles/chemical.snakefile +++ b/src/snakefiles/chemical.snakefile @@ -109,9 +109,10 @@ rule get_chemical_drugcentral_relationships: input: xreffile=config['download_directory']+"/DrugCentral/xrefs" output: - outfile=config['intermediate_directory']+'/chemicals/concords/DrugCentral' + outfile=config['intermediate_directory']+'/chemicals/concords/DrugCentral', + metadata_yaml=config['intermediate_directory']+'/chemicals/concords/metadata-DrugCentral.yaml', run: - chemicals.build_drugcentral_relations(input.xreffile,output.outfile) + chemicals.build_drugcentral_relations(input.xreffile,output.outfile, output.metadata_yaml) rule get_chemical_umls_relationships: input: @@ -119,8 +120,9 @@ rule get_chemical_umls_relationships: infile=config['intermediate_directory']+"/chemicals/ids/UMLS", output: outfile=config['intermediate_directory']+'/chemicals/concords/UMLS', + metadata_yaml=config['intermediate_directory']+'/chemicals/concords/metadata-UMLS.yaml', run: - chemicals.build_chemical_umls_relationships(input.mrconso, input.infile, output.outfile) + chemicals.build_chemical_umls_relationships(input.mrconso, input.infile, output.outfile, output.metadata_yaml) rule get_chemical_rxnorm_relationships: input: @@ -128,23 +130,27 @@ rule get_chemical_rxnorm_relationships: conso=config['download_directory'] + "/RxNorm/RXNCONSO.RRF" output: outfile=config['intermediate_directory']+'/chemicals/concords/RXNORM', + metadata_yaml=config['intermediate_directory']+'/chemicals/concords/metadata-RXNORM.yaml', run: - chemicals.build_chemical_rxnorm_relationships(input.conso, input.infile,output.outfile) + chemicals.build_chemical_rxnorm_relationships(input.conso, input.infile,output.outfile, output.metadata_yaml) rule get_chemical_wikipedia_relationships: output: - outfile = config['intermediate_directory'] + '/chemicals/concords/wikipedia_mesh_chebi' + outfile = config['intermediate_directory'] + '/chemicals/concords/wikipedia_mesh_chebi', + metadata_yaml = config['intermediate_directory'] + '/chemicals/concords/metadata-wikipedia_mesh_chebi.yaml' run: - chemicals.get_wikipedia_relationships(output.outfile) + chemicals.get_wikipedia_relationships(output.outfile, output.metadata_yaml) rule get_chemical_mesh_relationships: input: infile = config['intermediate_directory'] + '/chemicals/ids/MESH' output: casout = config['intermediate_directory'] + '/chemicals/concords/mesh_cas', - uniout = 
config['intermediate_directory'] + '/chemicals/concords/mesh_unii' + uniout = config['intermediate_directory'] + '/chemicals/concords/mesh_unii', + casout_metadata_yaml = config['intermediate_directory'] + '/chemicals/concords/metadata-mesh_cas.yaml', + uniout_metadata_yaml = config['intermediate_directory'] + '/chemicals/concords/metadata-mesh_unii.yaml', run: - chemicals.get_mesh_relationships(input.infile,output.casout,output.uniout) + chemicals.get_mesh_relationships(input.infile,output.casout,output.uniout,output.casout_metadata_yaml,output.uniout_metadata_yaml) #This is about a 2 hour step and requires something more than 256G of RAM. 512G works. rule get_chemical_unichem_relationships: @@ -161,35 +167,39 @@ rule get_chemical_pubchem_mesh_concord: pubchemfile=config['download_directory'] + '/PUBCHEM.COMPOUND/CID-MeSH', meshlabels=config['download_directory'] + '/MESH/labels' output: - outfile = config['intermediate_directory'] + '/chemicals/concords/PUBCHEM_MESH' + outfile = config['intermediate_directory'] + '/chemicals/concords/PUBCHEM_MESH', + metadata_yaml = config['intermediate_directory'] + '/chemicals/concords/metadata-PUBCHEM_MESH.yaml' run: - chemicals.make_pubchem_mesh_concord(input.pubchemfile,input.meshlabels,output.outfile) + chemicals.make_pubchem_mesh_concord(input.pubchemfile,input.meshlabels,output.outfile, output.metadata_yaml) rule get_chemical_pubchem_cas_concord: input: pubchemsynonyms=config['download_directory'] + '/PUBCHEM.COMPOUND/synonyms' output: - outfile = config['intermediate_directory'] + '/chemicals/concords/PUBCHEM_CAS' + outfile = config['intermediate_directory'] + '/chemicals/concords/PUBCHEM_CAS', + metadata_yaml = config['intermediate_directory'] + '/chemicals/concords/metadata-PUBCHEM_CAS.yaml' run: - chemicals.make_pubchem_cas_concord(input.pubchemsynonyms, output.outfile) + chemicals.make_pubchem_cas_concord(input.pubchemsynonyms, output.outfile, output.metadata_yaml) # There are some gtopdb inchikey relations that for some reason are not in unichem rule get_gtopdb_inchikey_concord: input: infile=config['download_directory']+'/GTOPDB/ligands.tsv' output: - outfile=config['intermediate_directory'] + '/chemicals/concords/GTOPDB' + outfile=config['intermediate_directory'] + '/chemicals/concords/GTOPDB', + metadata_yaml=config['intermediate_directory'] + '/chemicals/concords/metadata-GTOPDB.yaml', run: - chemicals.make_gtopdb_relations(input.infile,output.outfile) + chemicals.make_gtopdb_relations(input.infile,output.outfile, output.metadata_yaml) rule get_chebi_concord: input: sdf=config['download_directory']+'/CHEBI/ChEBI_complete.sdf', dbx=config['download_directory']+'/CHEBI/database_accession.tsv' output: - outfile=config['intermediate_directory']+'/chemicals/concords/CHEBI' + outfile=config['intermediate_directory']+'/chemicals/concords/CHEBI', + metadata_yaml=config['intermediate_directory']+'/chemicals/concords/metadata-CHEBI.yaml' run: - chemicals.make_chebi_relations(input.sdf,input.dbx,output.outfile) + chemicals.make_chebi_relations(input.sdf,input.dbx,output.outfile, output.metadata_yaml) rule chemical_unichem_concordia: input: From 9adc3ffa5a8c4ad55a80f44ae70cd4f5864721a1 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Tue, 1 Jul 2025 14:03:37 -0400 Subject: [PATCH 018/167] Added check for type of name (in case people try passing in an object). 
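Earlier call sites built a whole dict and passed it where the name string belongs; yaml.dump() serialized that without complaint, and the problem only surfaced when the combined metadata was read back. The new check fails fast instead. A stand-in sketch of the guard (not the real src.metadata.provenance module), just to show the failure mode:

    # Stand-in for the check added to write_metadata(); illustrative only.
    def check_name(name):
        if type(name) != str:
            raise ValueError(f"Metadata entry name must be a string, not {type(name)}: '{name}'")
        return name

    check_name("build_anatomy_umls_relationships()")                    # fine
    try:
        check_name({"name": "build_anatomy_umls_relationships()"})      # old dict-style argument
    except ValueError as err:
        print(err)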
--- src/metadata/provenance.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/metadata/provenance.py b/src/metadata/provenance.py index 79d83c01..6b5d0261 100644 --- a/src/metadata/provenance.py +++ b/src/metadata/provenance.py @@ -9,6 +9,8 @@ def write_concord_metadata(filename, name, url='', description='', sources=None) write_metadata(filename, 'concord', name, url=url, description=description, sources=sources) def write_metadata(filename, typ, name, sources=None, url='', description=''): + if type(name) != str: + raise ValueError(f"Metadata entry name must be a string, not {type(name)}: '{name}'") if sources is None: sources = [] with open(filename, 'w') as fout: From 963197cbd018d9150cefcb7ee8064f7e8fa4a6be Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Tue, 1 Jul 2025 14:04:12 -0400 Subject: [PATCH 019/167] Fixed metadata.yaml output. --- src/snakefiles/anatomy.snakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/snakefiles/anatomy.snakefile b/src/snakefiles/anatomy.snakefile index ed23182a..3974ad20 100644 --- a/src/snakefiles/anatomy.snakefile +++ b/src/snakefiles/anatomy.snakefile @@ -87,7 +87,7 @@ rule anatomy_compendia: output: expand("{od}/compendia/{ap}", od = config['output_directory'], ap = config['anatomy_outputs']), temp(expand("{od}/synonyms/{ap}", od = config['output_directory'], ap = config['anatomy_outputs'])), - expand("{od}/metadata/{ap}", od = config['output_directory'], ap = config['anatomy_outputs']), + expand("{od}/metadata/{ap}.yaml", od = config['output_directory'], ap = config['anatomy_outputs']), run: anatomy.build_compendia(input.concords, input.metadata_yamls, input.idlists, input.icrdf_filename) From b49384bdf1e369e6011f42bbc0ec0698f23dee21 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Tue, 1 Jul 2025 14:04:34 -0400 Subject: [PATCH 020/167] Updated behavior of concord combinations. --- src/babel_utils.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/src/babel_utils.py b/src/babel_utils.py index e94dac3e..f9733374 100644 --- a/src/babel_utils.py +++ b/src/babel_utils.py @@ -585,12 +585,16 @@ def write_compendium(metadata_yamls, synonym_list, ofname, node_type, labels={}, metadata_name = metadata_block['name'] + if type(metadata_name) != str: + raise ValueError(f"Metadata file {metadata_yaml} has a 'name' field that is not a string: {metadata_block}") + if metadata_name in metadata['concords']: - logging.error(f"Duplicate metadata block name {metadata_name}!") - logging.error(f"New metadata block from {metadata_yaml}: {metadata_block}!") - logging.error(f"Existing metadata block: {metadata['concords'][metadata_name]}!") - raise ValueError(f"Metadata file {metadata_yaml} is named {metadata_name}, but this has already been loaded.") - metadata['concords'][metadata_name] = metadata_block + # If it's not already a list, then make it into a list. + if type(metadata['concords'][metadata_name]) != list: + metadata['concords'][metadata_name] = [metadata['concords'][metadata_name]] + metadata['concords'][metadata_name].append(metadata_block) + else: + metadata['concords'][metadata_name] = metadata_block yaml.dump(metadata, outf) From 640fbb773b743804d33aefb783ad20cbee166982 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Tue, 1 Jul 2025 14:06:05 -0400 Subject: [PATCH 021/167] Fixed write_concord_metadata() calls. 
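anatomy.py was still calling write_concord_metadata() with a single dict argument; the function expects the output filename plus keyword arguments. The call shape it takes after this change, with path and field values shown only as an example:

    # Example call shape for write_concord_metadata(); the path and values here
    # are illustrative, not actual pipeline outputs.
    from src.metadata.provenance import write_concord_metadata

    write_concord_metadata(
        "metadata-UMLS.yaml",                       # where to write the YAML
        name="build_anatomy_umls_relationships()",  # must be a plain string
        sources=[{"type": "UMLS", "name": "MRCONSO"}],
        description="umls.build_sets() of UMLS MRCONSO with prefixes: SNOMEDCT_US, MSH, NCI, GO, FMA",
    )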
--- src/createcompendia/anatomy.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/src/createcompendia/anatomy.py b/src/createcompendia/anatomy.py index d252223b..2106075d 100644 --- a/src/createcompendia/anatomy.py +++ b/src/createcompendia/anatomy.py @@ -166,25 +166,25 @@ def build_wikidata_cell_relationships(outdir, metadata_yaml): print(f'Pair {pair} is not unique {counts[pair[0]]} {counts[pair[1]]}') # Write out metadata - write_concord_metadata(metadata_yaml, { - 'name': 'build_wikidata_cell_relationships()', - 'sources': [{ + write_concord_metadata(metadata_yaml, + name='build_wikidata_cell_relationships()', + sources=[{ 'type': 'Frink', 'name': 'Frink Direct Normalized Graph via SPARQL' }], - 'description': 'wd:P7963 ("Cell Ontology ID") and wd:P2892 ("UMLS CUI") from Wikidata', - }) + description='wd:P7963 ("Cell Ontology ID") and wd:P2892 ("UMLS CUI") from Wikidata', + ) def build_anatomy_umls_relationships(mrconso, idfile, outfile, umls_metadata): umls.build_sets(mrconso, idfile, outfile, {'SNOMEDCT_US':SNOMEDCT,'MSH': MESH, 'NCI': NCIT, 'GO': GO, 'FMA': FMA}) - write_concord_metadata(umls_metadata, { - 'name': 'build_anatomy_umls_relationships()', - 'sources': [{ + write_concord_metadata(umls_metadata, + name='build_anatomy_umls_relationships()', + sources=[{ 'type': 'UMLS', 'name': 'MRCONSO' }], - 'description': 'umls.build_sets() of UMLS MRCONSO with prefixes: SNOMEDCT_US, MSH, NCI, GO, FMA', - }) + description='umls.build_sets() of UMLS MRCONSO with prefixes: SNOMEDCT_US, MSH, NCI, GO, FMA', + ) def build_compendia(concordances, metadata_yamls, identifiers, icrdf_filename): """:concordances: a list of files from which to read relationships @@ -223,7 +223,7 @@ def build_compendia(concordances, metadata_yamls, identifiers, icrdf_filename): typed_sets = create_typed_sets(set([frozenset(x) for x in dicts.values()]),types) for biotype,sets in typed_sets.items(): baretype = biotype.split(':')[-1] - write_compendium(metadata_yamls, concordances, sets,f'{baretype}.txt',biotype,{}, icrdf_filename=icrdf_filename) + write_compendium(metadata_yamls, sets,f'{baretype}.txt',biotype,{}, icrdf_filename=icrdf_filename) def create_typed_sets(eqsets,types): """Given a set of sets of equivalent identifiers, we want to type each one into From edd78abd25844432028386b65f890bfb978d0c63 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Tue, 1 Jul 2025 15:01:47 -0400 Subject: [PATCH 022/167] Added metadata to module targets. 
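Listing the metadata YAMLs in the per-module aggregate rules keeps them in the Snakemake DAG, so an anatomy or cell_line build now fails if a metadata file was never produced. A sketch of what the added expand() input resolves to, using invented config values rather than the real config.yaml:

    # Sketch only: the config values below are made up to show how expand()
    # builds the metadata paths requested by the aggregate rule.
    from snakemake.io import expand

    config = {
        "output_directory": "babel_outputs",
        "anatomy_outputs": ["AnatomicalEntity.txt", "Cell.txt"],
    }
    paths = expand("{od}/metadata/{ap}.yaml",
                   od=config["output_directory"], ap=config["anatomy_outputs"])
    assert sorted(paths) == [
        "babel_outputs/metadata/AnatomicalEntity.txt.yaml",
        "babel_outputs/metadata/Cell.txt.yaml",
    ]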
--- src/snakefiles/anatomy.snakefile | 1 + src/snakefiles/cell_line.snakefile | 1 + 2 files changed, 2 insertions(+) diff --git a/src/snakefiles/anatomy.snakefile b/src/snakefiles/anatomy.snakefile index 3974ad20..062a3b03 100644 --- a/src/snakefiles/anatomy.snakefile +++ b/src/snakefiles/anatomy.snakefile @@ -135,6 +135,7 @@ rule anatomy: input: config['output_directory']+'/reports/anatomy_completeness.txt', synonyms=expand("{od}/synonyms/{ap}", od = config['output_directory'], ap = config['anatomy_outputs']), + metadata=expand("{od}/metadata/{ap}.yaml", od = config['output_directory'], ap = config['anatomy_outputs']), reports = expand("{od}/reports/{ap}",od=config['output_directory'], ap = config['anatomy_outputs']) output: synonyms_gzipped=expand("{od}/synonyms/{ap}.gz", od = config['output_directory'], ap = config['anatomy_outputs']), diff --git a/src/snakefiles/cell_line.snakefile b/src/snakefiles/cell_line.snakefile index 0330f6eb..62cb17fc 100644 --- a/src/snakefiles/cell_line.snakefile +++ b/src/snakefiles/cell_line.snakefile @@ -54,6 +54,7 @@ rule cell_line: config['output_directory']+'/reports/cell_line_completeness.txt', config['output_directory'] + "/reports/CellLine.txt", cell_line_synonyms=config['output_directory'] + "/synonyms/CellLine.txt", + metadata=config['output_directory']+"/metadata/CellLine.txt.yaml", output: config['output_directory'] + "/synonyms/CellLine.txt.gz", x=config['output_directory']+'/reports/cell_line_done' From 54802a70d3e50b779e5da6350d83330b0b2c062c Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Tue, 1 Jul 2025 15:03:37 -0400 Subject: [PATCH 023/167] Added target to chemical. --- src/snakefiles/chemical.snakefile | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/snakefiles/chemical.snakefile b/src/snakefiles/chemical.snakefile index 4f8d4dc8..28b015a5 100644 --- a/src/snakefiles/chemical.snakefile +++ b/src/snakefiles/chemical.snakefile @@ -233,7 +233,8 @@ rule chemical_compendia: icrdf_filename = config['download_directory'] + '/icRDF.tsv', output: expand("{od}/compendia/{ap}", od = config['output_directory'], ap = config['chemical_outputs']), - temp(expand("{od}/synonyms/{ap}", od = config['output_directory'], ap = config['chemical_outputs'])) + temp(expand("{od}/synonyms/{ap}", od = config['output_directory'], ap = config['chemical_outputs'])), + expand("{od}/metadata/{ap}.yaml", od = config['output_directory'], ap = config['chemical_outputs']), run: chemicals.build_compendia(input.typesfile, input.untyped_file, [input.metadata_yamls], input.icrdf_filename) @@ -307,7 +308,8 @@ rule chemical: input: config['output_directory']+'/reports/chemical_completeness.txt', synonyms = expand("{od}/synonyms/{ap}", od = config['output_directory'], ap = config['chemical_outputs']), - reports = expand("{od}/reports/{ap}",od=config['output_directory'], ap = config['chemical_outputs']) + reports = expand("{od}/reports/{ap}",od=config['output_directory'], ap = config['chemical_outputs']), + metadata = expand("{od}/metadata/{ap}.yaml", od = config['output_directory'], ap = config['chemical_outputs']), output: synonyms_gzipped = expand("{od}/synonyms/{ap}.gz", od = config['output_directory'], ap = config['chemical_outputs']), x=config['output_directory']+'/reports/chemicals_done' From 4e9e1e79350ba7dba8c67b0c8276cfccde13bcac Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Tue, 1 Jul 2025 15:18:32 -0400 Subject: [PATCH 024/167] Centralized UMLS build_sets() metadata generation. 
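Rather than having every caller hand-write the same UMLS provenance block, umls.build_sets() now takes an optional provenance_metadata_yaml path and records the concord metadata itself, including the prefix map and cui_prefix it was run with. The resulting call shape, with placeholder paths instead of real pipeline files:

    # Placeholder paths; the argument order matches build_sets() after this change.
    from src.datahandlers import umls
    from src.prefixes import MESH, DRUGBANK, RXCUI

    umls.build_sets(
        "downloads/UMLS/MRCONSO.RRF",               # MRCONSO-style source file
        "intermediate/chemicals/ids/UMLS",          # identifiers to keep
        "intermediate/chemicals/concords/UMLS",     # concord output
        {"MSH": MESH, "DRUGBANK": DRUGBANK, "RXNORM": RXCUI},
        provenance_metadata_yaml="intermediate/chemicals/concords/metadata-UMLS.yaml",
    )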
--- src/createcompendia/anatomy.py | 10 +--------- src/createcompendia/chemicals.py | 12 +----------- src/datahandlers/umls.py | 16 ++++++++++++++-- 3 files changed, 16 insertions(+), 22 deletions(-) diff --git a/src/createcompendia/anatomy.py b/src/createcompendia/anatomy.py index 2106075d..a435293f 100644 --- a/src/createcompendia/anatomy.py +++ b/src/createcompendia/anatomy.py @@ -176,15 +176,7 @@ def build_wikidata_cell_relationships(outdir, metadata_yaml): ) def build_anatomy_umls_relationships(mrconso, idfile, outfile, umls_metadata): - umls.build_sets(mrconso, idfile, outfile, {'SNOMEDCT_US':SNOMEDCT,'MSH': MESH, 'NCI': NCIT, 'GO': GO, 'FMA': FMA}) - write_concord_metadata(umls_metadata, - name='build_anatomy_umls_relationships()', - sources=[{ - 'type': 'UMLS', - 'name': 'MRCONSO' - }], - description='umls.build_sets() of UMLS MRCONSO with prefixes: SNOMEDCT_US, MSH, NCI, GO, FMA', - ) + umls.build_sets(mrconso, idfile, outfile, {'SNOMEDCT_US':SNOMEDCT,'MSH': MESH, 'NCI': NCIT, 'GO': GO, 'FMA': FMA}, provenance_metadata_yaml=umls_metadata) def build_compendia(concordances, metadata_yamls, identifiers, icrdf_filename): """:concordances: a list of files from which to read relationships diff --git a/src/createcompendia/chemicals.py b/src/createcompendia/chemicals.py index 7043d393..2e845ca1 100644 --- a/src/createcompendia/chemicals.py +++ b/src/createcompendia/chemicals.py @@ -76,17 +76,7 @@ def write_rxnorm_ids(infile, outfile): umls.write_rxnorm_ids(umlsmap, filter_types, infile, outfile, prefix=RXCUI, styfile="RXNSTY.RRF") def build_chemical_umls_relationships(mrconso, idfile,outfile, metadata_yaml): - umls.build_sets(mrconso, idfile, outfile, {'MSH': MESH, 'DRUGBANK': DRUGBANK, 'RXNORM': RXCUI }) - - write_concord_metadata( - metadata_yaml, - name='build_chemical_umls_relationships()', - sources=[{ - 'type': 'UMLS', - 'name': 'MRCONSO' - }], - description='umls.build_sets() of UMLS MRCONSO with prefixes: MSH, DRUGBANK, RXNORM', - ) + umls.build_sets(mrconso, idfile, outfile, {'MSH': MESH, 'DRUGBANK': DRUGBANK, 'RXNORM': RXCUI }, provenance_metadata_yaml=metadata_yaml) def build_chemical_rxnorm_relationships(conso, idfile,outfile, metadata_yaml): umls.build_sets(conso, idfile, outfile, {'MSH': MESH, 'DRUGBANK': DRUGBANK}, cui_prefix=RXCUI) diff --git a/src/datahandlers/umls.py b/src/datahandlers/umls.py index c840c896..72af0ea7 100644 --- a/src/datahandlers/umls.py +++ b/src/datahandlers/umls.py @@ -1,3 +1,4 @@ +from src.metadata.provenance import write_concord_metadata from src.prefixes import UMLS, RXCUI from src.babel_utils import make_local_name from src.categories import DRUG, CHEMICAL_ENTITY, MOLECULAR_MIXTURE @@ -200,8 +201,8 @@ def write_rxnorm_ids(category_map, bad_categories, infile, outfile,prefix=RXCUI, # One is to keep from having to pass through the umls file more than once, but that's a bad reason # The second is because I want to use the UMLS as a source for some terminologies (SNOMED) even if there's another # way. I'm going to modify this to do one thing at a time, and if it takes a little longer, then so be it. 
-def build_sets(mrconso,umls_input, umls_output , other_prefixes, bad_mappings=defaultdict(set), acceptable_identifiers={}, - cui_prefix = UMLS): +def build_sets(mrconso, umls_input, umls_output , other_prefixes, bad_mappings=defaultdict(set), acceptable_identifiers={}, + cui_prefix = UMLS, provenance_metadata_yaml=None): """Given a list of umls identifiers we want to generate all the concordances between UMLS and that other entity""" # On UMLS / MESH: we have been getting all UMLS / MESH relationships. This has led to some clear mistakes @@ -259,6 +260,17 @@ def build_sets(mrconso,umls_input, umls_output , other_prefixes, bad_mappings=de concordfile.write(f'{tup[0]}\teq\t{tup[1]}\n') pairs.add(tup) + # Write provenance for this build_sets() call. + if provenance_metadata_yaml is not None: + write_concord_metadata(provenance_metadata_yaml, + name='umls.build_sets()', + sources=[{ + 'type': 'UMLS', + 'name': 'MRCONSO' + }], + description=f'umls.build_sets() using UMLS MRCONSO with prefixes: {other_prefixes} with cui_prefix set to {cui_prefix}', + ) + def read_umls_priority(): mrp = os.path.join('input_data', 'umls_precedence.txt') pris = [] From 2fc7053806f2f6dfbc5d8b83bde1329b70827534 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Tue, 1 Jul 2025 15:55:59 -0400 Subject: [PATCH 025/167] Added provenance metadata to diseasephenotype module. --- src/createcompendia/diseasephenotype.py | 67 +++++++++++++++++++---- src/datahandlers/efo.py | 14 ++++- src/metadata/provenance.py | 13 +++-- src/snakefiles/diseasephenotype.snakefile | 42 +++++++++++--- src/ubergraph.py | 2 +- 5 files changed, 113 insertions(+), 25 deletions(-) diff --git a/src/createcompendia/diseasephenotype.py b/src/createcompendia/diseasephenotype.py index e4fdd69c..19469e55 100644 --- a/src/createcompendia/diseasephenotype.py +++ b/src/createcompendia/diseasephenotype.py @@ -2,6 +2,7 @@ from collections import defaultdict import src.datahandlers.obo as obo +from src.metadata.provenance import write_concord_metadata from src.prefixes import MESH, NCIT, MONDO, OMIM, HP, SNOMEDCT, MEDDRA, EFO, ORPHANET, ICD0, ICD9, ICD10, UMLS, KEGGDISEASE from src.categories import DISEASE, PHENOTYPIC_FEATURE @@ -87,28 +88,66 @@ def write_umls_ids(mrsty, outfile,badumlsfile): umls.write_umls_ids(mrsty, umlsmap, outfile, blocklist_umls_ids=badumls) -def build_disease_obo_relationships(outdir): +def build_disease_obo_relationships(outdir, metadata_yamls): #Create the equivalence pairs with open(f'{outdir}/{HP}', 'w') as outfile: + other_prefixes = {'MSH':MESH,'SNOMEDCT_US':SNOMEDCT,'SNOMED_CT': SNOMEDCT, 'ORPHANET':ORPHANET, 'ICD-9':ICD9, 'ICD-10':ICD10, 'ICD-0':ICD0, 'ICD-O':ICD0 } build_sets(f'{HP}:0000118', {HP:outfile}, ignore_list=['ICD'], - other_prefixes={'MSH':MESH,'SNOMEDCT_US':SNOMEDCT,'SNOMED_CT': SNOMEDCT, 'ORPHANET':ORPHANET, 'ICD-9':ICD9, 'ICD-10':ICD10, 'ICD-0':ICD0, 'ICD-O':ICD0 }, + other_prefixes=other_prefixes, set_type='xref') + + write_concord_metadata( + metadata_yamls['HP'], + name='build_disease_obo_relationships()', + sources=[ + { + 'type': 'UberGraph', + 'name': 'HP' + } + ], + description=f'ubergraph.build_sets() of {HP}:0000118 with other_prefixes {other_prefixes}' + ) + with open(f'{outdir}/{MONDO}', 'w') as outfile: #Orphanet here is confusing. In mondo it comes out mixed case like "Orphanet" and we want to cap it. We have a normer # in build sets, but it is based on the UPPERCASED prefix. So we're passing in that we want to change uppercase orphanet to uppercase # orphanet. 
In actuality that matching key will pick up any case orphanet, including the one that actually occurs. build_sets('MONDO:0000001', {MONDO:outfile}, set_type='exact', other_prefixes={'ORPHANET':ORPHANET}) build_sets('MONDO:0042489', {MONDO:outfile}, set_type='exact', other_prefixes={'ORPHANET':ORPHANET}) + + write_concord_metadata(metadata_yamls['MONDO'], + name='build_disease_obo_relationships()', + sources=[ + { + 'type': 'UberGraph', + 'name': 'MONDO' + } + ], + description=f'ubergraph.build_sets() (exact) of {MONDO}:0000001 and {MONDO}:0042489, including ORPHANET prefixes' + ) + with open(f'{outdir}/{MONDO}_close', 'w') as outfile: build_sets('MONDO:0000001', {MONDO:outfile}, set_type='close', other_prefixes={'ORPHANET':ORPHANET}) build_sets('MONDO:0042489', {MONDO:outfile}, set_type='close', other_prefixes={'ORPHANET':ORPHANET}) -def build_disease_efo_relationships(idfile,outfile): - efo.make_concords(idfile, outfile) + write_concord_metadata( + metadata_yamls['MONDO_close'], + name='build_disease_obo_relationships()', + sources=[ + { + 'type': 'UberGraph', + 'name': 'MONDO' + } + ], + description=f'ubergraph.build_sets() (close matches) of {MONDO}:0000001 and {MONDO}:0042489, including ORPHANET prefixes' + ) + +def build_disease_efo_relationships(idfile,outfile, metadata_yaml): + efo.make_concords(idfile, outfile, provenance_metadata=metadata_yaml) -def build_disease_umls_relationships(mrconso, idfile, outfile, omimfile, ncitfile): +def build_disease_umls_relationships(mrconso, idfile, outfile, omimfile, ncitfile, metadata_yaml): #UMLS contains xrefs between a disease UMLS and a gene OMIM. So here we are saying: if you are going to link to # an omim identifier, make sure it's a disease omim, not some other thing. good_ids = {} @@ -118,13 +157,19 @@ def build_disease_umls_relationships(mrconso, idfile, outfile, omimfile, ncitfil for line in inf: x = line.split()[0] good_ids[prefix].add(x) - umls.build_sets(mrconso, idfile, outfile, {'SNOMEDCT_US':SNOMEDCT,'MSH': MESH, 'NCI': NCIT, 'HPO': HP, 'MDR':MEDDRA, 'OMIM': OMIM},acceptable_identifiers=good_ids) + umls.build_sets(mrconso, idfile, outfile, {'SNOMEDCT_US':SNOMEDCT,'MSH': MESH, 'NCI': NCIT, 'HPO': HP, 'MDR':MEDDRA, 'OMIM': OMIM},acceptable_identifiers=good_ids, provenance_metadata_yaml=metadata_yaml) -def build_disease_doid_relationships(idfile,outfile): - doid.build_xrefs(idfile, outfile, other_prefixes={'ICD10CM':ICD10, 'ICD9CM':ICD9, 'ICDO': ICD0, 'NCI': NCIT, - 'SNOMEDCT_US_2018_03_01': SNOMEDCT, 'SNOMEDCT_US_2019_09_01': SNOMEDCT, - 'SNOMEDCT_US_2020_03_01': SNOMEDCT, 'SNOMEDCT_US_2020_09_01': SNOMEDCT, - 'UMLS_CUI': UMLS, 'KEGG': KEGGDISEASE}) +def build_disease_doid_relationships(idfile,outfile, metadata_yaml): + other_prefixes = {'ICD10CM':ICD10, 'ICD9CM':ICD9, 'ICDO': ICD0, 'NCI': NCIT, + 'SNOMEDCT_US_2018_03_01': SNOMEDCT, 'SNOMEDCT_US_2019_09_01': SNOMEDCT, + 'SNOMEDCT_US_2020_03_01': SNOMEDCT, 'SNOMEDCT_US_2020_09_01': SNOMEDCT, + 'UMLS_CUI': UMLS, 'KEGG': KEGGDISEASE} + doid.build_xrefs(idfile, outfile, other_prefixes=other_prefixes) + write_concord_metadata( + metadata_yaml, + name='build_disease_doid_relationships()', + description=f'build_disease_doid_relationships() using the DOID ID file {idfile} and other_prefixes {other_prefixes}' + ) def build_compendium(concordances, metadata_yamls, identifiers, mondoclose, badxrefs, icrdf_filename): """:concordances: a list of files from which to read relationships diff --git a/src/datahandlers/efo.py b/src/datahandlers/efo.py index 03fd59f1..913053ce 100644 --- a/src/datahandlers/efo.py +++ 
b/src/datahandlers/efo.py @@ -1,6 +1,7 @@ import logging import re +from src.metadata.provenance import write_concord_metadata from src.prefixes import EFO,ORPHANET from src.babel_utils import pull_via_urllib from src.babel_utils import make_local_name @@ -165,7 +166,7 @@ def make_ids(roots,idfname): m = EFOgraph() m.pull_EFO_ids(roots,idfname) -def make_concords(idfilename, outfilename): +def make_concords(idfilename, outfilename, provenance_metadata=None): """Given a list of identifiers, find out all of the equivalent identifiers from the owl""" m = EFOgraph() with open(idfilename,"r") as inf, open(outfilename,"w") as concfile: @@ -174,3 +175,14 @@ def make_concords(idfilename, outfilename): nexacts = m.get_exacts(efo_id,concfile) if nexacts == 0: m.get_xrefs(efo_id,concfile) + + if provenance_metadata is not None: + write_concord_metadata( + provenance_metadata, + name='Experimental Factor Ontology (EFO) cross-references', + description=f'Cross-references from the Experimental Factor Ontology (EFO) for the EFO IDs in {idfilename}', + sources=[{ + 'name': 'Experimental Factor Ontology', + 'url': 'http://www.ebi.ac.uk/efo/efo.owl', + }], + ) diff --git a/src/metadata/provenance.py b/src/metadata/provenance.py index 6b5d0261..54bc50e3 100644 --- a/src/metadata/provenance.py +++ b/src/metadata/provenance.py @@ -2,17 +2,19 @@ import yaml -def write_download_metadata(filename, name, url='', description='', sources=None): - write_metadata(filename, 'download', name, url=url, description=description, sources=sources) +def write_download_metadata(filename, name, url='', description='', sources=None, counts=None): + write_metadata(filename, 'download', name, url=url, description=description, sources=sources, counts=None) -def write_concord_metadata(filename, name, url='', description='', sources=None): - write_metadata(filename, 'concord', name, url=url, description=description, sources=sources) +def write_concord_metadata(filename, name, url='', description='', sources=None, counts=None): + write_metadata(filename, 'concord', name, url=url, description=description, sources=sources, counts=None) -def write_metadata(filename, typ, name, sources=None, url='', description=''): +def write_metadata(filename, typ, name, sources=None, url='', description='', counts=None): if type(name) != str: raise ValueError(f"Metadata entry name must be a string, not {type(name)}: '{name}'") if sources is None: sources = [] + if counts is None: + counts = [] with open(filename, 'w') as fout: yaml.dump({ 'created_at': datetime.now().isoformat(), @@ -21,4 +23,5 @@ def write_metadata(filename, typ, name, sources=None, url='', description=''): 'url': url, 'description': description, 'sources': sources, + 'counts': counts, }, fout) diff --git a/src/snakefiles/diseasephenotype.snakefile b/src/snakefiles/diseasephenotype.snakefile index db2a04d4..283502a3 100644 --- a/src/snakefiles/diseasephenotype.snakefile +++ b/src/snakefiles/diseasephenotype.snakefile @@ -1,6 +1,7 @@ import src.createcompendia.diseasephenotype as diseasephenotype import src.assess_compendia as assessments import src.snakefiles.util as util +from src.metadata.provenance import write_concord_metadata ### Disease / Phenotypic Feature @@ -84,16 +85,24 @@ rule get_disease_obo_relationships: config['intermediate_directory']+'/disease/concords/MONDO', config['intermediate_directory']+'/disease/concords/MONDO_close', config['intermediate_directory']+'/disease/concords/HP', + 
mondo_metadata_yaml=config['intermediate_directory']+'/disease/concords/metadata-MONDO.yaml', + mondo_close_metadata_yaml=config['intermediate_directory']+'/disease/concords/metadata-MONDO_close.yaml', + hp_metadata_yaml=config['intermediate_directory']+'/disease/concords/metadata-HP.yaml', run: - diseasephenotype.build_disease_obo_relationships(config['intermediate_directory']+'/disease/concords') + diseasephenotype.build_disease_obo_relationships(config['intermediate_directory']+'/disease/concords', { + 'MONDO': output.mondo_metadata_yaml, + 'MONDO_close': output.mondo_close_metadata_yaml, + 'HP': output.hp_metadata_yaml, + }) rule get_disease_efo_relationships: input: infile=config['intermediate_directory']+"/disease/ids/EFO", output: - outfile=config['intermediate_directory']+'/disease/concords/EFO' + outfile=config['intermediate_directory']+'/disease/concords/EFO', + metadata_yaml=config['intermediate_directory']+'/disease/concords/metadata-EFO.yaml', run: - diseasephenotype.build_disease_efo_relationships(input.infile,output.outfile) + diseasephenotype.build_disease_efo_relationships(input.infile,output.outfile, output.metadata_yaml) rule get_disease_umls_relationships: input: @@ -103,23 +112,27 @@ rule get_disease_umls_relationships: ncit=config['intermediate_directory'] + '/disease/ids/NCIT' output: outfile=config['intermediate_directory']+'/disease/concords/UMLS', + metadata_yaml=config['intermediate_directory']+'/disease/concords/metadata-UMLS.yaml', run: - diseasephenotype.build_disease_umls_relationships(input.mrconso, input.infile,output.outfile,input.omim,input.ncit) + diseasephenotype.build_disease_umls_relationships(input.mrconso, input.infile,output.outfile,input.omim,input.ncit, output.metadata_yaml) rule get_disease_doid_relationships: input: infile = config['download_directory']+'/DOID/doid.json' output: outfile=config['intermediate_directory']+'/disease/concords/DOID', + metadata_yaml=config['intermediate_directory']+'/disease/concords/metadata-DOID.yaml', run: - diseasephenotype.build_disease_doid_relationships(input.infile,output.outfile) + diseasephenotype.build_disease_doid_relationships(input.infile,output.outfile,output.metadata_yaml) rule disease_manual_concord: input: infile = 'input_data/manual_concords/disease.txt' output: - outfile = config['intermediate_directory']+'/disease/concords/Manual' + outfile = config['intermediate_directory']+'/disease/concords/Manual', + metadata_yaml = config['intermediate_directory']+'/disease/concords/metadata-Manual.yaml' run: + count_manual_concords = 0 with open(input.infile, 'r') as inp, open(output.outfile, 'w') as outp: for line in inp: # Remove any lines starting with '#', which we treat as comments. 
@@ -131,6 +144,21 @@ rule disease_manual_concord: if len(elements) != 3: raise RuntimeError(f"Found {len(elements)} elements on line {lstripped_line}, expected 3: {elements}") outp.writelines(["\t".join(elements)]) + count_manual_concords += 1 + + write_concord_metadata( + output.metadata_yaml, + name='Manual Disease/Phenotype Concords', + description='Manually curated Disease/Phenotype cross-references from the Babel repository', + sources=[{ + 'name': 'Babel repository', + 'url': 'https://github.com/TranslatorSRI/Babel', + }], + url='https://github.com/TranslatorSRI/Babel/blob/master/input_data/manual_concords/disease.txt', + counts={ + 'concords': count_manual_concords, + }, + ) rule disease_compendia: input: @@ -189,4 +217,4 @@ rule disease: x=config['output_directory']+'/reports/disease_done' run: util.gzip_files(input.synonyms) - util.write_done(output.x) \ No newline at end of file + util.write_done(output.x) diff --git a/src/ubergraph.py b/src/ubergraph.py index b2db9ef8..917237fa 100644 --- a/src/ubergraph.py +++ b/src/ubergraph.py @@ -477,7 +477,7 @@ def write_normalized_information_content(self, filename): print(f"Wrote {write_count} information content values into {filename}.") return write_count -def build_sets(iri, concordfiles, set_type, ignore_list = [], other_prefixes={}, hop_ontologies=False ): +def build_sets(iri, concordfiles, set_type, ignore_list = [], other_prefixes={}, hop_ontologies=False): """Given an IRI create a list of sets. Each set is a set of equivalent LabeledIDs, and there is a set for each subclass of the input iri. Write these lists to concord files, indexed by the prefix""" prefix = Text.get_curie(iri) From b42423520a8dc8826f352f24b5fb75194697b506 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Tue, 1 Jul 2025 17:46:46 -0400 Subject: [PATCH 026/167] Added metadata to DrugChemical conflations. diff --git c/src/babel_utils.py i/src/babel_utils.py index f973337..59a5360 100644 --- c/src/babel_utils.py +++ i/src/babel_utils.py @@ -14,6 +14,7 @@ import urllib import jsonlines import yaml +from src.metadata.provenance import write_combined_metadata from src.node import NodeFactory, SynonymFactory, DescriptionFactory, InformationContentFactory, TaxonFactory from src.util import Text, get_config from src.LabeledID import LabeledID @@ -559,44 +560,17 @@ def write_compendium(metadata_yamls, synonym_list, ofname, node_type, labels={}, exit() # Write out the metadata.yaml file combining information from all the metadata.yaml files. 
- metadata_dir = os.path.join(cdir,'metadata') - os.makedirs(metadata_dir, exist_ok=True) - with open(os.path.join(cdir, 'metadata', ofname + '.yaml'), 'w') as outf: - # TODO: move into metadata/provenance.py - metadata = { - 'type': 'compendium', - 'name': ofname, - 'created_at': datetime.now().isoformat(), - 'counts': { - 'cliques': count_cliques, - 'eq_ids': count_eq_ids, - 'synonyms': count_synonyms, - }, - 'concords': {} - } - for metadata_yaml in metadata_yamls: - with open(metadata_yaml, 'r') as metaf: - metadata_block = yaml.safe_load(metaf) - if metadata_block is None or metadata_block == {}: - raise ValueError("Metadata file {metadata_yaml} is empty.") - - if 'name' not in metadata_block: - raise ValueError(f"Metadata file {metadata_yaml} is missing a 'name' field: {metadata_block}") - - metadata_name = metadata_block['name'] - - if type(metadata_name) != str: - raise ValueError(f"Metadata file {metadata_yaml} has a 'name' field that is not a string: {metadata_block}") - - if metadata_name in metadata['concords']: - # If it's not already a list, then make it into a list. - if type(metadata['concords'][metadata_name]) != list: - metadata['concords'][metadata_name] = [metadata['concords'][metadata_name]] - metadata['concords'][metadata_name].append(metadata_block) - else: - metadata['concords'][metadata_name] = metadata_block - - yaml.dump(metadata, outf) + write_combined_metadata( + os.path.join(cdir, 'metadata', ofname + '.yaml'), + typ='compendium', + name=ofname, + counts={ + 'cliques': count_cliques, + 'eq_ids': count_eq_ids, + 'synonyms': count_synonyms, + }, + combined_from_filenames=metadata_yamls, + ) def glom(conc_set, newgroups, unique_prefixes=['INCHIKEY'],pref='HP',close={}): """We want to construct sets containing equivalent identifiers. diff --git c/src/createcompendia/drugchemical.py i/src/createcompendia/drugchemical.py index 2de4804..8dee460 100644 --- c/src/createcompendia/drugchemical.py +++ i/src/createcompendia/drugchemical.py @@ -1,5 +1,6 @@ import csv +from src.metadata.provenance import write_combined_metadata, write_concord_metadata from src.node import NodeFactory, InformationContentFactory from src.prefixes import RXCUI, PUBCHEMCOMPOUND, UMLS from src.categories import (CHEMICAL_ENTITY, DRUG, MOLECULAR_MIXTURE, FOOD, COMPLEX_MOLECULAR_MIXTURE, @@ -139,7 +140,7 @@ def get_cui(x,indicator_column,cui_column,aui_column,aui_to_cui,sdui_to_cui): print(x) exit() -def build_rxnorm_relationships(conso, relfile, outfile): +def build_rxnorm_relationships(conso, relfile, outfile, metadata_yaml): """RXNREL is a lousy file. The subject and object can sometimes be a CUI and sometimes an AUI and you have to use CONSO to figure out how to go back and forth. @@ -167,8 +168,32 @@ def build_rxnorm_relationships(conso, relfile, outfile): #This is maybe relying on convention a bit too much. 
if outfile == "UMLS": prefix = UMLS + sources = [ + { + 'type': 'UMLS', + 'name': 'MRCONSO', + 'filename': conso + }, + { + 'type': 'UMLS', + 'name': 'MRREL', + 'filename': relfile + } + ] else: prefix = RXCUI + sources = [ + { + 'type': 'RXNORM', + 'name': 'RXNCONSO', + 'filename': conso + }, + { + 'type': 'RXNORM', + 'name': 'RXNREL', + 'filename': relfile + } + ] aui_to_cui, sdui_to_cui = get_aui_to_cui(conso) # relfile = os.path.join('input_data', 'private', "RXNREL.RRF") single_use_relations = {"has_active_ingredient": defaultdict(set), @@ -214,6 +239,13 @@ def build_rxnorm_relationships(conso, relfile, outfile): continue outf.write(f"{prefix}:{subject}\t{predicate}\t{prefix}:{next(iter(objects))}\n") + write_concord_metadata( + metadata_yaml, + name='build_rxnorm_relationships()', + description=f'Builds relationships between RxCUI and other identifiers from a CONSO ({conso}) and a REL ({relfile}).', + sources=sources + ) + def load_cliques(compendium): rx_to_clique = {} @@ -228,7 +260,7 @@ def load_cliques(compendium): rx_to_clique[terms["i"]] = clique return rx_to_clique -def build_pubchem_relationships(infile,outfile): +def build_pubchem_relationships(infile,outfile, metadata_yaml): with open(infile,"r") as inf: document = json.load(inf) with open(outfile,"w") as outf: @@ -238,7 +270,19 @@ def build_pubchem_relationships(infile,outfile): for cid in cids: outf.write(f"{RXCUI}:{rxnid}\tlinked\t{PUBCHEMCOMPOUND}:{cid}\n") -def build_conflation(manual_concord_filename, rxn_concord, umls_concord, pubchem_rxn_concord, drug_compendium, chemical_compendia, icrdf_filename, outfilename): + write_concord_metadata( + metadata_yaml, + name='build_pubchem_relationships()', + description=f'Builds relationships between RxCUI and PubChem Compound identifiers from a PubChem annotations file ({infile}).', + sources=[{ + 'type': 'PubChem', + 'name': 'PubChem RxNorm annotations', + 'description': 'PubChem RxNorm mappings generated by pubchem.pull_rxnorm_annotations()', + 'filename': infile + }] + ) + +def build_conflation(manual_concord_filename, rxn_concord, umls_concord, pubchem_rxn_concord, drug_compendium, chemical_compendia, icrdf_filename, outfilename, input_metadata_yamls, output_metadata_yaml): """RXN_concord contains relationships between rxcuis that can be used to conflate Now we don't want all of them. We want the ones that are between drugs and chemicals, and the ones between drugs and drugs. 
@@ -556,6 +600,15 @@ def build_conflation(manual_concord_filename, rxn_concord, umls_concord, pubchem outfile.write(f"{json.dumps(final_conflation_id_list)}\n") written.add(fs) + # Write out metadata.yaml + write_combined_metadata( + output_metadata_yaml, + typ='conflation', + name='drugchemical.build_conflation()', + description='Build DrugChemical conflation.', + combined_from_filenames=input_metadata_yamls + ) + def sort_by_curie_suffix(curie): """ diff --git c/src/metadata/provenance.py i/src/metadata/provenance.py index 54bc50e..5a8f703 100644 --- c/src/metadata/provenance.py +++ i/src/metadata/provenance.py @@ -1,3 +1,4 @@ +import os.path from datetime import datetime import yaml @@ -8,13 +9,56 @@ def write_download_metadata(filename, name, url='', description='', sources=None def write_concord_metadata(filename, name, url='', description='', sources=None, counts=None): write_metadata(filename, 'concord', name, url=url, description=description, sources=sources, counts=None) -def write_metadata(filename, typ, name, sources=None, url='', description='', counts=None): - if type(name) != str: +def write_combined_metadata(filename, typ, name, sources=None, url='', description='', counts=None, combined_from_filenames=None): + combined_from = {} + if combined_from_filenames is not None: + for metadata_yaml in combined_from_filenames: + with open(metadata_yaml, 'r') as metaf: + metadata_block = yaml.safe_load(metaf) + if metadata_block is None or metadata_block == {}: + raise ValueError("Metadata file {metadata_yaml} is empty.") + + if 'name' not in metadata_block: + raise ValueError(f"Metadata file {metadata_yaml} is missing a 'name' field: {metadata_block}") + + metadata_name = metadata_block['name'] + + if type(metadata_name) is not str: + raise ValueError(f"Metadata file {metadata_yaml} has a 'name' field that is not a string: {metadata_block}") + + if metadata_name in combined_from: + # If it's not already a list, then make it into a list. 
+ if type(combined_from[metadata_name]) is not list: + combined_from[metadata_name] = [combined_from[metadata_name]] + combined_from[metadata_name].append(metadata_block) + else: + combined_from[metadata_name] = metadata_block + + write_metadata( + filename, + typ=typ, + name=name, + sources=sources, + url=url, + description=description, + counts=counts, + combined_from=combined_from + ) + +def write_metadata(filename, typ, name, sources=None, url='', description='', counts=None, combined_from=None): + if type(typ) is not str: + raise ValueError(f"Metadata entry type must be a string, not {type(typ)}: '{typ}'") + if type(name) is not str: raise ValueError(f"Metadata entry name must be a string, not {type(name)}: '{name}'") if sources is None: sources = [] if counts is None: counts = [] + if combined_from is None: + combined_from = [] + + metadata_dir = os.path.dirname(filename) + os.makedirs(metadata_dir, exist_ok=True) with open(filename, 'w') as fout: yaml.dump({ 'created_at': datetime.now().isoformat(), @@ -24,4 +68,5 @@ def write_metadata(filename, typ, name, sources=None, url='', description='', co 'description': description, 'sources': sources, 'counts': counts, + 'combined_from': combined_from, }, fout) diff --git c/src/snakefiles/drugchemical.snakefile i/src/snakefiles/drugchemical.snakefile index 9640c13..3f6a8d3 100644 --- c/src/snakefiles/drugchemical.snakefile +++ i/src/snakefiles/drugchemical.snakefile @@ -1,6 +1,7 @@ import src.createcompendia.drugchemical as drugchemical import src.synonyms.synonymconflation as synonymconflation import src.snakefiles.util as util +from src.metadata.provenance import write_concord_metadata ### Drug / Chemical @@ -9,39 +10,56 @@ rule rxnorm_relationships: rxnconso = config['download_directory'] + "/RxNorm/RXNCONSO.RRF", rxnrel = config['download_directory'] + "/RxNorm/RXNREL.RRF", output: - outfile_concords = config['intermediate_directory'] + '/drugchemical/concords/RXNORM' + outfile_concords = config['intermediate_directory'] + '/drugchemical/concords/RXNORM', + metadata_yaml = config['intermediate_directory'] + '/drugchemical/concords/metadata-RXNORM.yaml' run: - drugchemical.build_rxnorm_relationships(input.rxnconso, input.rxnrel, output.outfile_concords) + drugchemical.build_rxnorm_relationships(input.rxnconso, input.rxnrel, output.outfile_concords, output.metadata_yaml) rule umls_relationships: input: umlsconso = config['download_directory'] + "/UMLS/MRCONSO.RRF", umlsrel = config['download_directory'] + "/UMLS/MRREL.RRF", output: - outfile_concords = config['intermediate_directory'] + '/drugchemical/concords/UMLS' + outfile_concords = config['intermediate_directory'] + '/drugchemical/concords/UMLS', + metadata_yaml = config['intermediate_directory'] + '/drugchemical/concords/metadata-UMLS.yaml' run: - drugchemical.build_rxnorm_relationships(input.umlsconso, input.umlsrel, output.outfile_concords) + drugchemical.build_rxnorm_relationships(input.umlsconso, input.umlsrel, output.outfile_concords, output.metadata_yaml) rule pubchem_rxnorm_relationships: input: infile = config['download_directory'] + '/PUBCHEM.COMPOUND/RXNORM.json', output: - outfile_concords = config['intermediate_directory'] + '/drugchemical/concords/PUBCHEM_RXNORM' + outfile_concords = config['intermediate_directory'] + '/drugchemical/concords/PUBCHEM_RXNORM', + metadata_yaml = config['intermediate_directory'] + '/drugchemical/concords/metadata-PUBCHEM_RXNORM.yaml' run: - drugchemical.build_pubchem_relationships(input.infile,output.outfile_concords) + 
drugchemical.build_pubchem_relationships(input.infile,output.outfile_concords, output.metadata_yaml) rule drugchemical_conflation: input: drug_compendium=config['output_directory']+'/compendia/'+'Drug.txt', chemical_compendia=expand("{do}/compendia/{co}", do=config['output_directory'], co=config['chemical_outputs']), rxnorm_concord=config['intermediate_directory']+'/drugchemical/concords/RXNORM', + rxnorm_metadata=config['intermediate_directory']+'/drugchemical/concords/metadata-RXNORM.yaml', umls_concord=config['intermediate_directory']+'/drugchemical/concords/UMLS', + umls_metadata=config['intermediate_directory']+'/drugchemical/concords/metadata-UMLS.yaml', pubchem_concord=config['intermediate_directory']+'/drugchemical/concords/PUBCHEM_RXNORM', + pubchem_metadata=config['intermediate_directory']+'/drugchemical/concords/metadata-PUBCHEM_RXNORM.yaml', drugchemical_manual_concord=config['input_directory']+'/manual_concords/drugchemical.tsv', icrdf_filename=config['download_directory']+'/icRDF.tsv', output: - outfile=config['output_directory']+'/conflation/DrugChemical.txt' + outfile=config['output_directory']+'/conflation/DrugChemical.txt', + metadata_yaml=config['output_directory']+'/conflation/metadata.yaml', + drugchemical_manual_metadata=config['intermediate_directory']+'/drugchemical/concords/metadata-Manual.yaml', run: + write_concord_metadata(input.drugchemical_manual_metadata, + name='Manual DrugChemical Concords', + description='Manually curated DrugChemical conflation cross-references from the Babel repository', + sources=[{ + 'name': 'Babel repository', + 'url': 'https://github.com/TranslatorSRI/Babel', + }], + url='https://github.com/TranslatorSRI/Babel/blob/master/input_data/manual_concords/drugchemical.tsv', + ) drugchemical.build_conflation( input.drugchemical_manual_concord, input.rxnorm_concord, @@ -50,7 +68,13 @@ rule drugchemical_conflation: input.drug_compendium, input.chemical_compendia, input.icrdf_filename, - output.outfile) + output.outfile, + input_metadata_yamls={ + 'RXNORM': input.rxnorm_metadata, + 'UMLS': input.umls_metadata, + 'PUBCHEM_RXNORM': input.pubchem_metadata, + 'Manual': input.drugchemical_manual_metadata, + }, output_metadata_yaml=output.metadata_yaml) rule drugchemical_conflated_synonyms: input: --- src/babel_utils.py | 50 ++++++----------------- src/createcompendia/drugchemical.py | 59 +++++++++++++++++++++++++-- src/metadata/provenance.py | 49 +++++++++++++++++++++- src/snakefiles/drugchemical.snakefile | 40 ++++++++++++++---- 4 files changed, 147 insertions(+), 51 deletions(-) diff --git a/src/babel_utils.py b/src/babel_utils.py index f9733374..59a53609 100644 --- a/src/babel_utils.py +++ b/src/babel_utils.py @@ -14,6 +14,7 @@ import jsonlines import yaml +from src.metadata.provenance import write_combined_metadata from src.node import NodeFactory, SynonymFactory, DescriptionFactory, InformationContentFactory, TaxonFactory from src.util import Text, get_config from src.LabeledID import LabeledID @@ -559,44 +560,17 @@ def write_compendium(metadata_yamls, synonym_list, ofname, node_type, labels={}, exit() # Write out the metadata.yaml file combining information from all the metadata.yaml files. 
- metadata_dir = os.path.join(cdir,'metadata') - os.makedirs(metadata_dir, exist_ok=True) - with open(os.path.join(cdir, 'metadata', ofname + '.yaml'), 'w') as outf: - # TODO: move into metadata/provenance.py - metadata = { - 'type': 'compendium', - 'name': ofname, - 'created_at': datetime.now().isoformat(), - 'counts': { - 'cliques': count_cliques, - 'eq_ids': count_eq_ids, - 'synonyms': count_synonyms, - }, - 'concords': {} - } - for metadata_yaml in metadata_yamls: - with open(metadata_yaml, 'r') as metaf: - metadata_block = yaml.safe_load(metaf) - if metadata_block is None or metadata_block == {}: - raise ValueError("Metadata file {metadata_yaml} is empty.") - - if 'name' not in metadata_block: - raise ValueError(f"Metadata file {metadata_yaml} is missing a 'name' field: {metadata_block}") - - metadata_name = metadata_block['name'] - - if type(metadata_name) != str: - raise ValueError(f"Metadata file {metadata_yaml} has a 'name' field that is not a string: {metadata_block}") - - if metadata_name in metadata['concords']: - # If it's not already a list, then make it into a list. - if type(metadata['concords'][metadata_name]) != list: - metadata['concords'][metadata_name] = [metadata['concords'][metadata_name]] - metadata['concords'][metadata_name].append(metadata_block) - else: - metadata['concords'][metadata_name] = metadata_block - - yaml.dump(metadata, outf) + write_combined_metadata( + os.path.join(cdir, 'metadata', ofname + '.yaml'), + typ='compendium', + name=ofname, + counts={ + 'cliques': count_cliques, + 'eq_ids': count_eq_ids, + 'synonyms': count_synonyms, + }, + combined_from_filenames=metadata_yamls, + ) def glom(conc_set, newgroups, unique_prefixes=['INCHIKEY'],pref='HP',close={}): """We want to construct sets containing equivalent identifiers. diff --git a/src/createcompendia/drugchemical.py b/src/createcompendia/drugchemical.py index 2de48047..8dee460f 100644 --- a/src/createcompendia/drugchemical.py +++ b/src/createcompendia/drugchemical.py @@ -1,5 +1,6 @@ import csv +from src.metadata.provenance import write_combined_metadata, write_concord_metadata from src.node import NodeFactory, InformationContentFactory from src.prefixes import RXCUI, PUBCHEMCOMPOUND, UMLS from src.categories import (CHEMICAL_ENTITY, DRUG, MOLECULAR_MIXTURE, FOOD, COMPLEX_MOLECULAR_MIXTURE, @@ -139,7 +140,7 @@ def get_cui(x,indicator_column,cui_column,aui_column,aui_to_cui,sdui_to_cui): print(x) exit() -def build_rxnorm_relationships(conso, relfile, outfile): +def build_rxnorm_relationships(conso, relfile, outfile, metadata_yaml): """RXNREL is a lousy file. The subject and object can sometimes be a CUI and sometimes an AUI and you have to use CONSO to figure out how to go back and forth. @@ -167,8 +168,32 @@ def build_rxnorm_relationships(conso, relfile, outfile): #This is maybe relying on convention a bit too much. 
if outfile == "UMLS": prefix = UMLS + sources = [ + { + 'type': 'UMLS', + 'name': 'MRCONSO', + 'filename': conso + }, + { + 'type': 'UMLS', + 'name': 'MRREL', + 'filename': relfile + } + ] else: prefix = RXCUI + sources = [ + { + 'type': 'RXNORM', + 'name': 'RXNCONSO', + 'filename': conso + }, + { + 'type': 'RXNOM', + 'name': 'RXNREL', + 'filename': relfile + } + ] aui_to_cui, sdui_to_cui = get_aui_to_cui(conso) # relfile = os.path.join('input_data', 'private', "RXNREL.RRF") single_use_relations = {"has_active_ingredient": defaultdict(set), @@ -214,6 +239,13 @@ def build_rxnorm_relationships(conso, relfile, outfile): continue outf.write(f"{prefix}:{subject}\t{predicate}\t{prefix}:{next(iter(objects))}\n") + write_concord_metadata( + metadata_yaml, + name='build_rxnorm_relationships()', + description=f'Builds relationships between RxCUI and other identifiers from a CONSO ({conso}) and a REL ({relfile}).', + sources=sources + ) + def load_cliques(compendium): rx_to_clique = {} @@ -228,7 +260,7 @@ def load_cliques(compendium): rx_to_clique[terms["i"]] = clique return rx_to_clique -def build_pubchem_relationships(infile,outfile): +def build_pubchem_relationships(infile,outfile, metadata_yaml): with open(infile,"r") as inf: document = json.load(inf) with open(outfile,"w") as outf: @@ -238,7 +270,19 @@ def build_pubchem_relationships(infile,outfile): for cid in cids: outf.write(f"{RXCUI}:{rxnid}\tlinked\t{PUBCHEMCOMPOUND}:{cid}\n") -def build_conflation(manual_concord_filename, rxn_concord, umls_concord, pubchem_rxn_concord, drug_compendium, chemical_compendia, icrdf_filename, outfilename): + write_concord_metadata( + metadata_yaml, + name='build_pubchem_relationships()', + description=f'Builds relationships between RxCUI and PubChem Compound identifiers from a PubChem annotations file ({infile}.', + sources=[{ + 'type': 'PubChem', + 'name': 'PubChem RxNorm annotations', + 'description': 'PubChem RxNorm mappings generated by pubchem.pull_rxnorm_annotations()', + 'filename': infile + }] + ) + +def build_conflation(manual_concord_filename, rxn_concord, umls_concord, pubchem_rxn_concord, drug_compendium, chemical_compendia, icrdf_filename, outfilename, input_metadata_yamls, output_metadata_yaml): """RXN_concord contains relationshps between rxcuis that can be used to conflate Now we don't want all of them. We want the ones that are between drugs and chemicals, and the ones between drugs and drugs. 
@@ -556,6 +600,15 @@ def build_conflation(manual_concord_filename, rxn_concord, umls_concord, pubchem outfile.write(f"{json.dumps(final_conflation_id_list)}\n") written.add(fs) + # Write out metadata.yaml + write_combined_metadata( + output_metadata_yaml, + typ='conflation', + name='drugchemical.build_conflation()', + description='Build DrugChemical conflation.', + combined_from_filenames=input_metadata_yamls + ) + def sort_by_curie_suffix(curie): """ diff --git a/src/metadata/provenance.py b/src/metadata/provenance.py index 54bc50e3..5a8f7034 100644 --- a/src/metadata/provenance.py +++ b/src/metadata/provenance.py @@ -1,3 +1,4 @@ +import os.path from datetime import datetime import yaml @@ -8,13 +9,56 @@ def write_download_metadata(filename, name, url='', description='', sources=None def write_concord_metadata(filename, name, url='', description='', sources=None, counts=None): write_metadata(filename, 'concord', name, url=url, description=description, sources=sources, counts=None) -def write_metadata(filename, typ, name, sources=None, url='', description='', counts=None): - if type(name) != str: +def write_combined_metadata(filename, typ, name, sources=None, url='', description='', counts=None, combined_from_filenames=None): + combined_from = {} + if combined_from_filenames is not None: + for metadata_yaml in combined_from_filenames: + with open(metadata_yaml, 'r') as metaf: + metadata_block = yaml.safe_load(metaf) + if metadata_block is None or metadata_block == {}: + raise ValueError("Metadata file {metadata_yaml} is empty.") + + if 'name' not in metadata_block: + raise ValueError(f"Metadata file {metadata_yaml} is missing a 'name' field: {metadata_block}") + + metadata_name = metadata_block['name'] + + if type(metadata_name) is not str: + raise ValueError(f"Metadata file {metadata_yaml} has a 'name' field that is not a string: {metadata_block}") + + if metadata_name in combined_from: + # If it's not already a list, then make it into a list. 
+ if type(combined_from[metadata_name]) is not list: + combined_from[metadata_name] = [combined_from[metadata_name]] + combined_from[metadata_name].append(metadata_block) + else: + combined_from[metadata_name] = metadata_block + + write_metadata( + filename, + typ=typ, + name=name, + sources=sources, + url=url, + description=description, + counts=counts, + combined_from=combined_from + ) + +def write_metadata(filename, typ, name, sources=None, url='', description='', counts=None, combined_from=None): + if type(typ) is not str: + raise ValueError(f"Metadata entry type must be a string, not {type(typ)}: '{typ}'") + if type(name) is not str: raise ValueError(f"Metadata entry name must be a string, not {type(name)}: '{name}'") if sources is None: sources = [] if counts is None: counts = [] + if combined_from is None: + combined_from = [] + + metadata_dir = os.path.dirname(filename) + os.makedirs(metadata_dir, exist_ok=True) with open(filename, 'w') as fout: yaml.dump({ 'created_at': datetime.now().isoformat(), @@ -24,4 +68,5 @@ def write_metadata(filename, typ, name, sources=None, url='', description='', co 'description': description, 'sources': sources, 'counts': counts, + 'combined_from': combined_from, }, fout) diff --git a/src/snakefiles/drugchemical.snakefile b/src/snakefiles/drugchemical.snakefile index 9640c13b..3f6a8d36 100644 --- a/src/snakefiles/drugchemical.snakefile +++ b/src/snakefiles/drugchemical.snakefile @@ -1,6 +1,7 @@ import src.createcompendia.drugchemical as drugchemical import src.synonyms.synonymconflation as synonymconflation import src.snakefiles.util as util +from src.metadata.provenance import write_concord_metadata ### Drug / Chemical @@ -9,39 +10,56 @@ rule rxnorm_relationships: rxnconso = config['download_directory'] + "/RxNorm/RXNCONSO.RRF", rxnrel = config['download_directory'] + "/RxNorm/RXNREL.RRF", output: - outfile_concords = config['intermediate_directory'] + '/drugchemical/concords/RXNORM' + outfile_concords = config['intermediate_directory'] + '/drugchemical/concords/RXNORM', + metadata_yaml = config['intermediate_directory'] + '/drugchemical/concords/metadata-RXNORM.yaml' run: - drugchemical.build_rxnorm_relationships(input.rxnconso, input.rxnrel, output.outfile_concords) + drugchemical.build_rxnorm_relationships(input.rxnconso, input.rxnrel, output.outfile_concords, output.metadata_yaml) rule umls_relationships: input: umlsconso = config['download_directory'] + "/UMLS/MRCONSO.RRF", umlsrel = config['download_directory'] + "/UMLS/MRREL.RRF", output: - outfile_concords = config['intermediate_directory'] + '/drugchemical/concords/UMLS' + outfile_concords = config['intermediate_directory'] + '/drugchemical/concords/UMLS', + metadata_yaml = config['intermediate_directory'] + '/drugchemical/concords/metadata-UMLS.yaml' run: - drugchemical.build_rxnorm_relationships(input.umlsconso, input.umlsrel, output.outfile_concords) + drugchemical.build_rxnorm_relationships(input.umlsconso, input.umlsrel, output.outfile_concords, output.metadata_yaml) rule pubchem_rxnorm_relationships: input: infile = config['download_directory'] + '/PUBCHEM.COMPOUND/RXNORM.json', output: - outfile_concords = config['intermediate_directory'] + '/drugchemical/concords/PUBCHEM_RXNORM' + outfile_concords = config['intermediate_directory'] + '/drugchemical/concords/PUBCHEM_RXNORM', + metadata_yaml = config['intermediate_directory'] + '/drugchemical/concords/metadata-PUBCHEM_RXNORM.yaml' run: - drugchemical.build_pubchem_relationships(input.infile,output.outfile_concords) + 
drugchemical.build_pubchem_relationships(input.infile,output.outfile_concords, output.metadata_yaml) rule drugchemical_conflation: input: drug_compendium=config['output_directory']+'/compendia/'+'Drug.txt', chemical_compendia=expand("{do}/compendia/{co}", do=config['output_directory'], co=config['chemical_outputs']), rxnorm_concord=config['intermediate_directory']+'/drugchemical/concords/RXNORM', + rxnorm_metadata=config['intermediate_directory']+'/drugchemical/concords/metadata-RXNORM.yaml', umls_concord=config['intermediate_directory']+'/drugchemical/concords/UMLS', + umls_metadata=config['intermediate_directory']+'/drugchemical/concords/metadata-UMLS.yaml', pubchem_concord=config['intermediate_directory']+'/drugchemical/concords/PUBCHEM_RXNORM', + pubchem_metadata=config['intermediate_directory']+'/drugchemical/concords/metadata-PUBCHEM_RXNORM.yaml', drugchemical_manual_concord=config['input_directory']+'/manual_concords/drugchemical.tsv', icrdf_filename=config['download_directory']+'/icRDF.tsv', output: - outfile=config['output_directory']+'/conflation/DrugChemical.txt' + outfile=config['output_directory']+'/conflation/DrugChemical.txt', + metadata_yaml=config['output_directory']+'/conflation/metadata.yaml', + drugchemical_manual_metadata=config['intermediate_directory']+'/drugchemical/concords/metadata-Manual.yaml', run: + write_concord_metadata(input.drugchemical_manual_metadata, + name='Manual DrugChemical Concords', + description='Manually curated DrugChemical conflation cross-references from the Babel repository', + sources=[{ + 'name': 'Babel repository', + 'url': 'https://github.com/TranslatorSRI/Babel', + }], + url='https://github.com/TranslatorSRI/Babel/blob/master/input_data/manual_concords/drugchemical.tsv', + ) drugchemical.build_conflation( input.drugchemical_manual_concord, input.rxnorm_concord, @@ -50,7 +68,13 @@ rule drugchemical_conflation: input.drug_compendium, input.chemical_compendia, input.icrdf_filename, - output.outfile) + output.outfile, + input_metadata_yamls={ + 'RXNORM': input.rxnorm_metadata, + 'UMLS': input.umls_metadata, + 'PUBCHEM_RXNORM': input.pubchem_metadata, + 'Manual': input.drugchemical_manual_metadata, + }, output_metadata_yaml=output.metadata_yaml) rule drugchemical_conflated_synonyms: input: From da2dc1dac2fee1f1892c8f43b0a135d6703d4ee4 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Tue, 1 Jul 2025 18:46:35 -0400 Subject: [PATCH 027/167] Added concords to the Gene module. --- src/createcompendia/gene.py | 75 ++++++++++++++++++++++------ src/snakefiles/datacollect.snakefile | 13 ----- src/snakefiles/gene.snakefile | 52 +++++++++++++++---- 3 files changed, 103 insertions(+), 37 deletions(-) diff --git a/src/createcompendia/gene.py b/src/createcompendia/gene.py index 15ef2f19..bfe0c06d 100644 --- a/src/createcompendia/gene.py +++ b/src/createcompendia/gene.py @@ -1,6 +1,7 @@ import re from src import babel_utils +from src.metadata.provenance import write_concord_metadata from src.prefixes import OMIM,ENSEMBL,NCBIGENE,WORMBASE, MGI, ZFIN, DICTYBASE, FLYBASE, RGD, SGD, HGNC, UMLS from src.categories import GENE @@ -23,8 +24,18 @@ def write_mods_ids(dd,id,modlist): x = line.split('\t')[0] outf.write(f'{x}\n') -def build_gene_ensembl_relationships(ensembl_dir, outfile): +def build_gene_ensembl_relationships(ensembl_dir, outfile, metadata_yaml): """Loop over all the ensembl species. Find any protein-coding gene""" + # Identifiers to extract. 
+ column_to_prefix = { 'NCBI gene (formerly Entrezgene) ID': {NCBIGENE}, + 'ZFIN ID': {ZFIN}, + 'SGD gene name ID': {SGD}, + 'WormBase Gene ID': {WORMBASE}, + 'FlyBase ID': {FLYBASE}, + 'MGI ID': {MGI}, + 'RGD ID': {RGD} + } + with open(outfile,'w') as outf: #find all the ensembl directories dirlisting = os.listdir(ensembl_dir) @@ -39,14 +50,6 @@ def build_gene_ensembl_relationships(ensembl_dir, outfile): h = inf.readline() x = h[:-1].split('\t') gene_column = x.index('Gene stable ID') - column_to_prefix = { 'NCBI gene (formerly Entrezgene) ID': {NCBIGENE}, - 'ZFIN ID': {ZFIN}, - 'SGD gene name ID': {SGD}, - 'WormBase Gene ID': {WORMBASE}, - 'FlyBase ID': {FLYBASE}, - 'MGI ID': {MGI}, - 'RGD ID': {RGD} - } protein_column = x.index('Protein stable ID') columnno_to_prefix = {} for i,v in enumerate(x): @@ -74,6 +77,16 @@ def build_gene_ensembl_relationships(ensembl_dir, outfile): ensembl_id_without_version = res.group(1) outf.write(f'{ENSEMBL}:{ensembl_id_without_version}\teq\t{gid}\n') + write_concord_metadata( + metadata_yaml, + name='build_gene_ensembl_relationships()', + description=f'Extracts gene-ensembl relationships from the ensembl files ({ensembl_dir}) for prefixes: {column_to_prefix.values()}', + sources=[{ + 'name': 'ENSEMBL', + 'filename': ensembl_dir, + }] + ) + def write_zfin_ids(infile,outfile): with open(infile,'r') as inf, open(outfile,'w') as outf: for line in inf: @@ -155,7 +168,7 @@ def read_ncbi_idfile(ncbi_idfile): ncbi_ids.add(x) return ncbi_ids -def build_gene_ncbi_ensembl_relationships(infile,ncbi_idfile,outfile): +def build_gene_ncbi_ensembl_relationships(infile,ncbi_idfile,outfile, metadata_yaml): ncbi_ids = read_ncbi_idfile(ncbi_idfile) with gzip.open(infile,'r') as inf, open(outfile,'w') as outf: h = inf.readline() @@ -181,7 +194,19 @@ def build_gene_ncbi_ensembl_relationships(infile,ncbi_idfile,outfile): ensembl_id_without_version = res.group(1) outf.write(f'{ncbigene_id}\teq\t{ENSEMBL}:{ensembl_id_without_version}\n') -def build_gene_ncbigene_xrefs(infile,ncbi_idfile,outfile): + write_concord_metadata( + metadata_yaml, + name='build_gene_ncbi_ensembl_relationships()', + description=f'Extracts gene-ensembl relationships from the NCBIGene gene2ensembl.gz file ({infile}), filtering to ' + + f'NCBIGene IDs in {ncbi_idfile}', + sources=[{ + 'type': 'NCBIGENE', + 'name': 'NCBIGene gene2ensembl.gz', + 'filename': infile, + }] + ) + +def build_gene_ncbigene_xrefs(infile,ncbi_idfile,outfile, metadata_yaml): mappings = {'WormBase': WORMBASE, 'FLYBASE': FLYBASE, 'ZFIN': ZFIN, 'HGNC': HGNC, 'MGI': MGI, 'RGD': RGD, 'dictyBase': DICTYBASE, 'SGD': SGD } @@ -202,7 +227,19 @@ def build_gene_ncbigene_xrefs(infile,ncbi_idfile,outfile): if found_prefix in mappings: outf.write(f'{ncbigene_id}\txref\t{mappings[found_prefix]}:{xref_parts[-1]}\n') -def build_gene_medgen_relationships(infile,outfile): + write_concord_metadata( + metadata_yaml, + name='build_gene_ncbigene_xrefs()', + description=f'Extracts gene-xref relationships from the NCBIGene gene_info.gz file ({infile}), filtering to ' + + f'NCBIGene IDs in {ncbi_idfile} and extracting mappings for prefixes {mappings.values()}', + sources=[{ + 'type': 'NCBIGENE', + 'name': 'NCBIGene gene_info.gz', + 'filename': infile, + }] + ) + +def build_gene_medgen_relationships(infile,outfile, metadata_yaml): with open(infile, 'r') as inf, open(outfile, 'w') as outf: h = inf.readline() for line in inf: @@ -217,6 +254,16 @@ def build_gene_medgen_relationships(infile,outfile): umls_id = f'{UMLS}:{x[4]}' 
outf.write(f'{ncbigene_id}\teq\t{umls_id}\n') + write_concord_metadata( + metadata_yaml, + name='build_gene_medgen_relationships()', + description=f'Extracts gene-OMIM relationships from the mim2gene_medgen file ({infile})', + sources=[{ + 'name': 'MIM2Gene MEDGEN', + 'filename': infile, + }] + ) + def write_ensembl_ids(ensembl_dir, outfile): """Loop over all the ensembl species. Find any protein-coding gene""" with open(outfile,'w') as outf: @@ -247,9 +294,9 @@ def write_ensembl_ids(ensembl_dir, outfile): outf.write(f'{gid}\n') -def build_gene_umls_hgnc_relationships(mrconso, umls_idfile, outfile): +def build_gene_umls_hgnc_relationships(mrconso, umls_idfile, outfile, metadata_yaml): #Could also add MESH, if that were a valid gene prefix - umls.build_sets(mrconso, umls_idfile, outfile, {'HGNC':HGNC}) + umls.build_sets(mrconso, umls_idfile, outfile, {'HGNC':HGNC}, provenance_metadata_yaml=metadata_yaml) def build_gene_compendia(concordances, metadata_yamls, identifiers, icrdf_filename): """:concordances: a list of files from which to read relationships diff --git a/src/snakefiles/datacollect.snakefile b/src/snakefiles/datacollect.snakefile index 16f331b8..34e7645c 100644 --- a/src/snakefiles/datacollect.snakefile +++ b/src/snakefiles/datacollect.snakefile @@ -122,19 +122,6 @@ rule get_uniprotkb_labels: run: uniprotkb.pull_uniprot_labels(input.sprot_input,input.trembl_input,output.outfile) -rule get_umls_gene_protein_mappings: - output: - umls_uniprotkb_filename=config['download_directory']+'/UMLS_UniProtKB/UMLS_UniProtKB.tsv', - umls_gene_concords=config['output_directory']+'/intermediate/gene/concords/UMLS_NCBIGene', - umls_protein_concords=config['output_directory']+'/intermediate/protein/concords/UMLS_UniProtKB', - run: - uniprotkb.download_umls_gene_protein_mappings( - config['UMLS_UniProtKB_download_raw_url'], - output.umls_uniprotkb_filename, - output.umls_gene_concords, - output.umls_protein_concords, - ) - ### MESH rule get_mesh: diff --git a/src/snakefiles/gene.snakefile b/src/snakefiles/gene.snakefile index 11fdf7a6..0010903e 100644 --- a/src/snakefiles/gene.snakefile +++ b/src/snakefiles/gene.snakefile @@ -1,6 +1,8 @@ import src.createcompendia.gene as gene import src.assess_compendia as assessments import src.snakefiles.util as util +from src.datahandlers import uniprotkb +from src.metadata.provenance import write_concord_metadata rule gene_mods_ids: input: @@ -57,44 +59,74 @@ rule get_gene_ncbigene_ensembl_relationships: infile=config['download_directory']+"/NCBIGene/gene2ensembl.gz", idfile=config['intermediate_directory'] + "/gene/ids/NCBIGene" output: - outfile=config['intermediate_directory']+'/gene/concords/NCBIGeneENSEMBL' + outfile=config['intermediate_directory']+'/gene/concords/NCBIGeneENSEMBL', + metadata_yaml=config['intermediate_directory']+'/gene/concords/metadata-NCBIGeneENSEMBL.yaml' run: - gene.build_gene_ncbi_ensembl_relationships(input.infile,input.idfile,output.outfile) + gene.build_gene_ncbi_ensembl_relationships(input.infile,input.idfile,output.outfile, output.metadata_yaml) rule get_gene_ncbigene_relationships: input: infile=config['download_directory']+"/NCBIGene/gene_info.gz", idfile=config['intermediate_directory']+"/gene/ids/NCBIGene" output: - outfile=config['intermediate_directory']+'/gene/concords/NCBIGene' + outfile=config['intermediate_directory']+'/gene/concords/NCBIGene', + metadata_yaml=config['intermediate_directory']+'/gene/concords/metadata-NCBIGene.yaml' run: - gene.build_gene_ncbigene_xrefs(input.infile,input.idfile,output.outfile) + 
gene.build_gene_ncbigene_xrefs(input.infile,input.idfile,output.outfile, output.metadata_yaml) rule get_gene_ensembl_relationships: input: infile =config['download_directory'] + '/ENSEMBL/BioMartDownloadComplete' output: - outfile=config['intermediate_directory']+'/gene/concords/ENSEMBL' + outfile=config['intermediate_directory']+'/gene/concords/ENSEMBL', + metadata_yaml=config['intermediate_directory']+'/gene/concords/metadata-ENSEMBL.yaml' run: - gene.build_gene_ensembl_relationships(config['download_directory']+'/ENSEMBL',output.outfile) + gene.build_gene_ensembl_relationships(config['download_directory']+'/ENSEMBL',output.outfile, output.metadata_yaml) rule get_gene_medgen_relationships: input: infile=config['download_directory']+'/NCBIGene/mim2gene_medgen' output: - outfile=config['intermediate_directory']+'/gene/concords/medgen' + outfile=config['intermediate_directory']+'/gene/concords/medgen', + metadata_yaml=config['intermediate_directory']+'/gene/concords/metadata-medgen.yaml', run: - gene.build_gene_medgen_relationships(input.infile, output.outfile) + gene.build_gene_medgen_relationships(input.infile, output.outfile, output.metadata_yaml) rule get_gene_umls_relationships: input: mrconso=config['download_directory']+"/UMLS/MRCONSO.RRF", infile=config['intermediate_directory']+'/gene/ids/UMLS' output: - outfile=config['intermediate_directory']+'/gene/concords/UMLS' + outfile=config['intermediate_directory']+'/gene/concords/UMLS', + metadata_yaml=config['intermediate_directory']+'/gene/concords/metadata-UMLS.yaml', run: - gene.build_gene_umls_hgnc_relationships(input.mrconso, input.infile, output.outfile) + gene.build_gene_umls_hgnc_relationships(input.mrconso, input.infile, output.outfile, output.metadata_yaml) + +rule get_umls_gene_protein_mappings: + output: + umls_uniprotkb_filename=config['download_directory']+'/UMLS_UniProtKB/UMLS_UniProtKB.tsv', + umls_gene_concords=config['output_directory']+'/intermediate/gene/concords/UMLS_NCBIGene', + umls_protein_concords=config['output_directory']+'/intermediate/protein/concords/UMLS_UniProtKB', + metadata_yaml=config['output_directory']+'/intermediate/gene/concords/metadata-UMLS_NCBIGene.yaml' + run: + uniprotkb.download_umls_gene_protein_mappings( + config['UMLS_UniProtKB_download_raw_url'], + output.umls_uniprotkb_filename, + output.umls_gene_concords, + output.umls_protein_concords, + ) + + write_concord_metadata( + output.metadata_yaml, + name='get_umls_gene_protein_mappings', + description="Download UMLS-UniProtKB mappings from {config['UMLS_UniProtKB_download_raw_url']}", + sources=[{ + 'type': 'download', + 'name': 'UMLS-UniProtKB mappings', + 'url': config['UMLS_UniProtKB_download_raw_url'], + }], + ) rule gene_compendia: input: From 20097780c54c168d7cf12c3ad283e26888dac644 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Wed, 2 Jul 2025 02:53:54 -0400 Subject: [PATCH 028/167] Added metadata for genefamilies. 
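As a quick sanity check once these rules have run, the per-source metadata can be read back like this (sketch only: the path stands in for config['download_directory'], and the key list is inferred from write_metadata(), so treat both as assumptions):

    import yaml

    # Placeholder path; the real location is the download_directory from config.yaml.
    with open('<download_directory>/HGNC.FAMILY/metadata.yaml') as f:
        meta = yaml.safe_load(f)

    # write_metadata() records created_at, the entry type and name, url, description,
    # sources, counts and (after the provenance.py changes above) combined_from.
    print(meta.get('type'), '-', meta.get('name'))
    for source in meta.get('sources', []):
        print('  source:', source.get('name'), source.get('url', ''))
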
--- src/datahandlers/hgncfamily.py | 15 ++++++++++++++- src/datahandlers/pantherfamily.py | 15 ++++++++++++++- src/snakefiles/datacollect.snakefile | 6 ++++-- src/snakefiles/genefamily.snakefile | 2 +- 4 files changed, 33 insertions(+), 5 deletions(-) diff --git a/src/datahandlers/hgncfamily.py b/src/datahandlers/hgncfamily.py index cc6f8c13..161a9d93 100644 --- a/src/datahandlers/hgncfamily.py +++ b/src/datahandlers/hgncfamily.py @@ -1,6 +1,7 @@ from pronto.utils.io import decompress from src.babel_utils import make_local_name, pull_via_ftp, pull_via_urllib +from src.metadata.provenance import write_metadata from src.prefixes import HGNCFAMILY def pull_hgncfamily(): @@ -10,7 +11,7 @@ def pull_hgncfamily(): decompress=False, subpath=HGNCFAMILY) -def pull_labels(infile,outfile): +def pull_labels(infile,outfile, metadata_yaml): with open(infile,'r') as inf: data = inf.read().strip() lines = data.split('\n') @@ -24,3 +25,15 @@ def pull_labels(infile,outfile): l = parts[2][1:-1] outf.write(f'{i}\t{l}\n') + write_metadata( + metadata_yaml, + typ='transform', + name='HGNC Gene Family labels', + description='Labels extracted from HGNC GeneFamily CSV download', + sources=[{ + 'type': 'download', + 'name': 'HGNC Gene Family', + 'url': 'https://storage.googleapis.com/public-download-files/hgnc/csv/csv/genefamily_db_tables/family.csv', + 'description': 'HGNC GeneFamily CSV download' + }] + ) diff --git a/src/datahandlers/pantherfamily.py b/src/datahandlers/pantherfamily.py index f4a0c596..51c5562d 100644 --- a/src/datahandlers/pantherfamily.py +++ b/src/datahandlers/pantherfamily.py @@ -1,4 +1,5 @@ from src.babel_utils import make_local_name, pull_via_ftp +from src.metadata.provenance import write_metadata from src.prefixes import PANTHERFAMILY def pull_pantherfamily(): @@ -7,7 +8,7 @@ def pull_pantherfamily(): # If you need to check this quickly, it's also available on HTTP at: # - http://data.pantherdb.org/ftp/sequence_classifications/current_release/PANTHER_Sequence_Classification_files/ -def pull_labels(infile,outfile): +def pull_labels(infile,outfile, metadata_yaml): with open(infile,'r') as inf: data = inf.read() lines = data.strip().split('\n') @@ -38,3 +39,15 @@ def pull_labels(infile,outfile): #labels[sub_family]=sfname labelf.write(f'{sub_family}\t{sfname}\n') done.add(sf) + + write_metadata( + metadata_yaml, + typ='transform', + name='HGNC Gene Family labels', + description='Main families and subfamily labels extracted from PANTHER Sequence Classification human.', + sources=[{ + 'type': 'download', + 'name': 'PANTHER Sequence Classification: Human', + 'url': 'ftp://ftp.pantherdb.org/sequence_classifications/current_release/PANTHER_Sequence_Classification_files/PTHR19.0_human', + }] + ) diff --git a/src/snakefiles/datacollect.snakefile b/src/snakefiles/datacollect.snakefile index 34e7645c..3bf7aaab 100644 --- a/src/snakefiles/datacollect.snakefile +++ b/src/snakefiles/datacollect.snakefile @@ -279,8 +279,9 @@ rule get_hgncfamily_labels: infile=rules.get_hgncfamily.output.outfile output: outfile = config['download_directory'] + '/HGNC.FAMILY/labels', + metadata_yaml = config['download_directory'] + '/HGNC.FAMILY/metadata.yaml', run: - hgncfamily.pull_labels(input.infile,output.outfile) + hgncfamily.pull_labels(input.infile,output.outfile, output.metadata_yaml) ### PANTHER.FAMILY @@ -295,8 +296,9 @@ rule get_pantherfamily_labels: infile=rules.get_pantherfamily.output.outfile output: outfile = config['download_directory'] + '/PANTHER.FAMILY/labels', + metadata_yaml = 
config['download_directory'] + '/PANTHER.FAMILY/metadata.yaml', run: - pantherfamily.pull_labels(input.infile,output.outfile) + pantherfamily.pull_labels(input.infile,output.outfile, output.metadata_yaml) ### OMIM diff --git a/src/snakefiles/genefamily.snakefile b/src/snakefiles/genefamily.snakefile index b8eb18cb..0b675be8 100644 --- a/src/snakefiles/genefamily.snakefile +++ b/src/snakefiles/genefamily.snakefile @@ -15,7 +15,7 @@ rule genefamily_hgncfamily_ids: input: infile=config['download_directory']+'/HGNC.FAMILY/labels' output: - outfile=config['intermediate_directory']+"/genefamily/ids/HGNC.FAMILY" + outfile=config['intermediate_directory']+"/genefamily/ids/HGNC.FAMILY", shell: #This one is a simple enough transform to do with awk "awk '{{print $1\"\tbiolink:GeneFamily\"}}' {input.infile} > {output.outfile}" From a44eda1b7c6da0223e171111558d04cc9da51e54 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Wed, 2 Jul 2025 02:57:33 -0400 Subject: [PATCH 029/167] Added metadata to a method we probably don't use any more. --- src/createcompendia/geneprotein.py | 14 +++++++++++++- src/snakefiles/geneprotein.snakefile | 3 ++- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/src/createcompendia/geneprotein.py b/src/createcompendia/geneprotein.py index 0655b86f..ade3cbd7 100644 --- a/src/createcompendia/geneprotein.py +++ b/src/createcompendia/geneprotein.py @@ -1,3 +1,4 @@ +from src.metadata.provenance import write_concord_metadata from src.prefixes import UNIPROTKB, NCBIGENE from src.babel_utils import glom from collections import defaultdict @@ -8,7 +9,7 @@ from src.util import LoggingUtil logger = LoggingUtil.init_logging(__name__, level=logging.ERROR) -def build_uniprotkb_ncbigene_relationships(infile,outfile): +def build_uniprotkb_ncbigene_relationships(infile,outfile, metadata_yaml): #The trick is that the uniprot mapping file can have more than one gene per protein. # Our model is 1 gene, many proteins, so this causes trouble. # For the moment, we will not include that have more than one gene per protein @@ -26,6 +27,17 @@ def build_uniprotkb_ncbigene_relationships(infile,outfile): ncbigene_id = ncbigene_ids[0] outf.write(f'{uniprot_id}\trelated_to\t{ncbigene_id}\n') + write_concord_metadata( + metadata_yaml, + name='build_uniprotkb_ncbigene_relationships()', + description='Extract NCBIGene-UniProtKB relationships from UniProtKB id-mapping file {infile}', + sources=[{ + 'type': 'UniProtKB', + 'name': 'UniProtKB idmapping file', + 'filename': infile, + }] + ) + def merge(geneproteinlist): """We have a gene and one or more proteins. 
We want to create a combined something.""" diff --git a/src/snakefiles/geneprotein.snakefile b/src/snakefiles/geneprotein.snakefile index 199cc6fc..ad797c3a 100644 --- a/src/snakefiles/geneprotein.snakefile +++ b/src/snakefiles/geneprotein.snakefile @@ -7,7 +7,8 @@ rule geneprotein_uniprot_relationships: input: infile = config['download_directory'] + '/UniProtKB/idmapping.dat' output: - outfile_concords = config['intermediate_directory'] + '/geneprotein/concords/UniProtNCBI' + outfile_concords = config['intermediate_directory'] + '/geneprotein/concords/UniProtNCBI', + metadata_yaml = config['intermediate_directory'] + '/geneprotein/concords/metadata-UniProtNCBI.yaml' run: geneprotein.build_uniprotkb_ncbigene_relationships(input.infile,output.outfile_concords) From 188cdd0a7625178762e9386523479fbfa474293a Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Wed, 2 Jul 2025 02:59:39 -0400 Subject: [PATCH 030/167] Added MacromolecularComplex metadata requirement. --- src/snakefiles/macromolecular_complex.snakefile | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/snakefiles/macromolecular_complex.snakefile b/src/snakefiles/macromolecular_complex.snakefile index 99df28d9..2accdd10 100644 --- a/src/snakefiles/macromolecular_complex.snakefile +++ b/src/snakefiles/macromolecular_complex.snakefile @@ -19,7 +19,8 @@ rule macromolecular_complex_compendia: icrdf_filename = config['download_directory'] + '/icRDF.tsv', output: config['output_directory']+'/compendia/MacromolecularComplex.txt', - temp(config['output_directory']+'/synonyms/MacromolecularComplex.txt') + temp(config['output_directory']+'/synonyms/MacromolecularComplex.txt'), + output_metadata_yaml = config['output_directory']+'/metadata/MacromolecularComplex.txt.yaml', run: macromolecular_complex.build_compendia([input.idlists], [input.metadata_yaml], icrdf_filename=input.icrdf_filename) @@ -42,6 +43,7 @@ rule check_macromolecular_complex: rule macromolecular_complex: input: synonym=config['output_directory']+'/synonyms/MacromolecularComplex.txt', + output_metadata_yaml = config['output_directory']+'/metadata/MacromolecularComplex.txt.yaml', completeness=config['output_directory']+'/reports/macromolecular_complex_completeness.txt', reports = config['output_directory']+'/reports/MacromolecularComplex.txt' output: From 79a081afaf0ae26c8c142de766dfc5e8c5d00120 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Wed, 2 Jul 2025 03:06:59 -0400 Subject: [PATCH 031/167] Fixed metadata for MacromolecularComplex. 
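The corresponding call now takes the metadata path as its last argument; roughly (the paths below are placeholders for the configured download_directory):

    from src.datahandlers import complexportal

    complexportal.make_labels_and_synonyms(
        '<download_directory>/ComplexPortal/559292.tsv',           # infile: raw ComplexPortal TSV
        '<download_directory>/ComplexPortal/559292_labels.tsv',    # labelfile
        '<download_directory>/ComplexPortal/559292_synonyms.tsv',  # synfile
        '<download_directory>/ComplexPortal/metadata.yaml',        # metadata_yaml, written at the end
    )
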
--- src/datahandlers/complexportal.py | 15 ++++++++++++++- src/snakefiles/datacollect.snakefile | 5 +++-- src/snakefiles/macromolecular_complex.snakefile | 4 ++-- 3 files changed, 19 insertions(+), 5 deletions(-) diff --git a/src/datahandlers/complexportal.py b/src/datahandlers/complexportal.py index 30dc3c7c..f7b679a9 100644 --- a/src/datahandlers/complexportal.py +++ b/src/datahandlers/complexportal.py @@ -1,10 +1,11 @@ from src.babel_utils import pull_via_urllib, make_local_name +from src.metadata.provenance import write_metadata from src.prefixes import COMPLEXPORTAL def pull_complexportal(): pull_via_urllib('http://ftp.ebi.ac.uk/pub/databases/intact/complex/current/complextab/',f'559292.tsv', decompress=False, subpath=COMPLEXPORTAL) -def make_labels_and_synonyms(infile, labelfile, synfile): +def make_labels_and_synonyms(infile, labelfile, synfile, metadata_yaml): usedsyns = set() with open(infile, 'r') as inf, open(labelfile, 'w') as outl, open(synfile, 'w') as outsyn: next(inf) # skip header @@ -20,3 +21,15 @@ def make_labels_and_synonyms(infile, labelfile, synfile): if not syn in usedsyns: outsyn.write(f'{COMPLEXPORTAL}:{id}\t{syn}\n') usedsyns.add(syn) + + write_metadata( + metadata_yaml, + typ='transform', + name='ComplexPortal', + description='Labels and synonyms extracted from ComplexPortal download of 559292 (Saccharomyces cerevisiae)', + sources=[{ + 'type': 'download', + 'name': 'ComplexPortal for organism 559292 (Saccharomyces cerevisiae)', + 'url': 'http://ftp.ebi.ac.uk/pub/databases/intact/complex/current/complextab/559292.tsv' + }] + ) diff --git a/src/snakefiles/datacollect.snakefile b/src/snakefiles/datacollect.snakefile index 3bf7aaab..81704f30 100644 --- a/src/snakefiles/datacollect.snakefile +++ b/src/snakefiles/datacollect.snakefile @@ -73,9 +73,10 @@ rule get_complexportal_labels_and_synonyms: infile = config['download_directory']+'/ComplexPortal'+'/559292.tsv' output: lfile = config['download_directory']+'/ComplexPortal'+'/559292_labels.tsv', - sfile = config['download_directory']+'/ComplexPortal'+'/559292_synonyms.tsv' + sfile = config['download_directory']+'/ComplexPortal'+'/559292_synonyms.tsv', + metadata_yaml = config['download_directory']+'/ComplexPortal/metadata.yaml' run: - complexportal.make_labels_and_synonyms(input.infile, output.lfile, output.sfile) + complexportal.make_labels_and_synonyms(input.infile, output.lfile, output.sfile, output.metadata_yaml) ### MODS diff --git a/src/snakefiles/macromolecular_complex.snakefile b/src/snakefiles/macromolecular_complex.snakefile index 2accdd10..5494e65d 100644 --- a/src/snakefiles/macromolecular_complex.snakefile +++ b/src/snakefiles/macromolecular_complex.snakefile @@ -6,7 +6,7 @@ rule macromolecular_complex_ids: input: infile = config['download_directory']+'/ComplexPortal/559292_labels.tsv' output: - outfile = config['intermediate_directory']+'/macromolecular_complex/ids/ComplexPortal' + outfile = config['intermediate_directory']+'/macromolecular_complex/ids/ComplexPortal', shell: "awk '{{print $1\"\tbiolink:MacromolecularComplex\"}}' {input.infile} > {output.outfile}" @@ -14,8 +14,8 @@ rule macromolecular_complex_compendia: input: labels = config['download_directory']+'/ComplexPortal/559292_labels.tsv', synonyms = config['download_directory']+'/ComplexPortal/559292_synonyms.tsv', - metadata_yaml = config['download_directory']+'/ComplexPortal/metadata.yaml', idlists = config['intermediate_directory']+'/macromolecular_complex/ids/ComplexPortal', + metadata_yaml = 
config['download_directory']+'/ComplexPortal/metadata.yaml', icrdf_filename = config['download_directory'] + '/icRDF.tsv', output: config['output_directory']+'/compendia/MacromolecularComplex.txt', From 4682c4b5dd2e4e5cd3d2c7dc7d94bbe3de85f4b5 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Wed, 2 Jul 2025 03:17:01 -0400 Subject: [PATCH 032/167] Added provenance metadata to process. --- src/createcompendia/processactivitypathway.py | 22 +++++++++++++---- src/datahandlers/rhea.py | 24 ++++++++++++++++--- src/snakefiles/process.snakefile | 11 +++++---- 3 files changed, 45 insertions(+), 12 deletions(-) diff --git a/src/createcompendia/processactivitypathway.py b/src/createcompendia/processactivitypathway.py index 630fc6b9..0e46d258 100644 --- a/src/createcompendia/processactivitypathway.py +++ b/src/createcompendia/processactivitypathway.py @@ -6,6 +6,7 @@ import src.datahandlers.rhea as rhea import src.datahandlers.ec as ec import src.datahandlers.umls as umls +from src.metadata.provenance import write_concord_metadata from src.prefixes import GO, REACT, WIKIPATHWAYS, RHEA, SMPDB, EC, PANTHERPATHWAY, TCDB from src.categories import BIOLOGICAL_PROCESS, MOLECULAR_ACTIVITY, PATHWAY @@ -42,10 +43,10 @@ def write_umls_ids(mrsty, outfile): } umls.write_umls_ids(mrsty, umlsmap, outfile) -def build_process_umls_relationships(mrconso, idfile,outfile): - umls.build_sets(mrconso, idfile, outfile, {'GO': GO}) +def build_process_umls_relationships(mrconso, idfile,outfile, metadata_yaml): + umls.build_sets(mrconso, idfile, outfile, {'GO': GO}, provenance_metadata_yaml=metadata_yaml) -def build_process_obo_relationships(outdir): +def build_process_obo_relationships(outdir, metadata_yaml): #Create the equivalence pairs #op={'MSH':MESH,'SNOMEDCT_US':SNOMEDCT,'SNOMED_CT': SNOMEDCT, 'ORPHANET':ORPHANET, 'ICD-9':ICD9, 'ICD-10':ICD10, 'ICD-0':ICD0, 'ICD-O':ICD0 } op={'WIKIPEDIA': WIKIPATHWAYS, 'REACTOME':REACT, 'TC':TCDB } @@ -54,8 +55,19 @@ def build_process_obo_relationships(outdir): build_sets(f'{GO}:0008150', {GO:outfile}, set_type='xref', other_prefixes=op ) build_sets(f'{GO}:0003674', {GO:outfile}, set_type='xref', other_prefixes=op ) -def build_process_rhea_relationships(outfile): - rhea.make_concord(outfile) + write_concord_metadata( + metadata_yaml, + name='build_process_obo_relationships()', + description=f"Extract GO-GO relationships from UberGraph with get_subclasses_and_xrefs() from {GO}:0007165, {GO}:0008150 and {GO}:0003674," + f"with other_prefixes {op.values()}", + sources=[{ + 'type': 'UberGraph', + 'name': 'GO-GO relationships from UberGraph', + }], + ) + +def build_process_rhea_relationships(outfile, metadata_yaml): + rhea.make_concord(outfile, metadata_yaml) def build_compendia(concordances, metadata_yamls, identifiers, icrdf_filename): diff --git a/src/datahandlers/rhea.py b/src/datahandlers/rhea.py index 6546e8ff..a0610236 100644 --- a/src/datahandlers/rhea.py +++ b/src/datahandlers/rhea.py @@ -1,3 +1,4 @@ +from src.metadata.provenance import write_concord_metadata from src.prefixes import RHEA,EC from src.babel_utils import pull_via_urllib from src.babel_utils import make_local_name, pull_via_ftp @@ -12,6 +13,7 @@ class Rhea: """Load the mesh rdf file for querying""" def __init__(self): ifname = make_local_name('rhea.rdf', subpath='RHEA') + self.filename = ifname from datetime import datetime as dt print('loading rhea') start = dt.now() @@ -37,7 +39,7 @@ def pull_rhea_labels(self,ofname): #The rhea ids in the rdf use the currently approved prefix, but just to be sure... 
rheaid = iterm.split(':')[-1] outf.write(f'{RHEA}:{rheaid}\t{label}\n') - def pull_rhea_ec_concs(self,ofname): + def pull_rhea_ec_concs(self,ofname, metadata_yaml): s=""" PREFIX rdfs: PREFIX rh: @@ -55,12 +57,28 @@ def pull_rhea_ec_concs(self,ofname): rheaid = iterm.split(':')[-1] outf.write(f'{RHEA}:{rheaid}\toio:equivalent\t{ec}\n') + write_concord_metadata( + metadata_yaml, + name='Rhea.pull_rhea_ec_concs()', + description=f'pull_rhea_ec_concs() extracts the EC number/accession number mappings from the Rhea RDF file ({self.filename}).', + sources=[{ + 'type': 'rdf', + 'name': 'rhea.rdf', + 'filename': self.filename, + 'sources': [{ + 'type': 'download', + 'name': 'rhea.rdf', + 'url': 'https://ftp.expasy.org/databases/rhea/rdf/rhea.rdf.gz', + }] + }] + ) + #Ids are handled by just getting everything from the labels def make_labels(labelfile): m = Rhea() m.pull_rhea_labels(labelfile) -def make_concord(concfile): +def make_concord(concfile, metadata_yaml): m = Rhea() - m.pull_rhea_ec_concs(concfile) \ No newline at end of file + m.pull_rhea_ec_concs(concfile, metadata_yaml) diff --git a/src/snakefiles/process.snakefile b/src/snakefiles/process.snakefile index b652102b..fd3a16ab 100644 --- a/src/snakefiles/process.snakefile +++ b/src/snakefiles/process.snakefile @@ -64,16 +64,18 @@ rule process_umls_ids: rule get_process_go_relationships: output: config['intermediate_directory']+'/process/concords/GO', + metadata_yaml = config['intermediate_directory']+'/process/concords/metadata-GO.yaml' run: - pap.build_process_obo_relationships(config['intermediate_directory']+'/process/concords') + pap.build_process_obo_relationships(config['intermediate_directory']+'/process/concords', output.metadata_yaml) rule get_process_rhea_relationships: input: infile=config['download_directory']+"/RHEA/rhea.rdf", output: outfile=config['intermediate_directory']+'/process/concords/RHEA', + metadata_yaml=config['intermediate_directory']+'/process/concords/metadata-RHEA.yaml', run: - pap.build_process_rhea_relationships(output.outfile) + pap.build_process_rhea_relationships(output.outfile, output.metadata_yaml) rule get_process_umls_relationships: @@ -82,8 +84,9 @@ rule get_process_umls_relationships: infile=config['intermediate_directory']+"/process/ids/UMLS", output: outfile=config['intermediate_directory']+'/process/concords/UMLS', + metadata_yaml=config['intermediate_directory']+'/process/concords/metadata-UMLS.yaml' run: - pap.build_process_umls_relationships(input.mrconso, input.infile, output.outfile) + pap.build_process_umls_relationships(input.mrconso, input.infile, output.outfile, output.metadata_yaml) rule process_compendia: input: @@ -141,4 +144,4 @@ rule process: x=config['output_directory']+'/reports/process_done' run: util.gzip_files(input.synonyms) - util.write_done(output.x) \ No newline at end of file + util.write_done(output.x) From 62370f06c453cd0d6ca06df38a5b9e7ed2df28eb Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Wed, 2 Jul 2025 03:20:44 -0400 Subject: [PATCH 033/167] Added metadata to taxon. 
--- src/createcompendia/taxon.py | 16 +++++++++++++--- src/snakefiles/taxon.snakefile | 8 +++++--- 2 files changed, 18 insertions(+), 6 deletions(-) diff --git a/src/createcompendia/taxon.py b/src/createcompendia/taxon.py index d9bbb52d..848dd547 100644 --- a/src/createcompendia/taxon.py +++ b/src/createcompendia/taxon.py @@ -1,3 +1,4 @@ +from src.metadata.provenance import write_concord_metadata from src.prefixes import NCBITAXON,MESH,UMLS from src.categories import ORGANISM_TAXON @@ -61,10 +62,10 @@ def write_umls_ids(mrsty, outfile): ]} umls.write_umls_ids(mrsty, umlsmap,outfile) -def build_taxon_umls_relationships(mrconso, idfile, outfile): - umls.build_sets(mrconso, idfile, outfile, {'MSH': MESH, 'NCBITaxon': NCBITAXON}) +def build_taxon_umls_relationships(mrconso, idfile, outfile, metadata_yaml): + umls.build_sets(mrconso, idfile, outfile, {'MSH': MESH, 'NCBITaxon': NCBITAXON}, provenance_metadata_yaml=metadata_yaml) -def build_relationships(outfile,mesh_ids): +def build_relationships(outfile,mesh_ids, metadata_yaml): regis = mesh.pull_mesh_registry() with open(mesh_ids,'r') as inf: lines = inf.read().strip().split('\n') @@ -80,6 +81,15 @@ def build_relationships(outfile,mesh_ids): #left = list(all_mesh_taxa.difference( set([x[0] for x in regis]) )) #eutil.lookup(left) + write_concord_metadata( + metadata_yaml, + name='build_relationships()', + description=f'Builds relationships between MeSH and NCBI Taxon from the MeSH registry.', + sources=[{ + 'type': 'MeSH', + 'name': 'MeSH Registry', + }] + ) def build_compendia(concordances, metadata_yamls, identifiers, icrdf_filename): diff --git a/src/snakefiles/taxon.snakefile b/src/snakefiles/taxon.snakefile index b9d6b744..5652a7fd 100644 --- a/src/snakefiles/taxon.snakefile +++ b/src/snakefiles/taxon.snakefile @@ -33,17 +33,19 @@ rule get_taxon_umls_relationships: infile=config['intermediate_directory']+"/taxon/ids/UMLS" output: outfile=config['intermediate_directory']+'/taxon/concords/UMLS', + metadata_yaml=config['intermediate_directory']+'/taxon/concords/metadata-UMLS.yaml', run: - taxon.build_taxon_umls_relationships(input.mrconso, input.infile, output.outfile) + taxon.build_taxon_umls_relationships(input.mrconso, input.infile, output.outfile, output.metadata_yaml) rule get_taxon_relationships: input: meshfile=config['download_directory']+"/MESH/mesh.nt", meshids=config['intermediate_directory']+"/taxon/ids/MESH", output: - outfile=config['intermediate_directory']+'/taxon/concords/NCBI_MESH' + outfile=config['intermediate_directory']+'/taxon/concords/NCBI_MESH', + metadata_yaml=config['intermediate_directory']+'/taxon/concords/metadata-NCBI_MESH.yaml', run: - taxon.build_relationships(output.outfile,input.meshids) + taxon.build_relationships(output.outfile,input.meshids, output.metadata_yaml) rule taxon_compendia: input: From 142357e8cf647e6bc101680501ad18f9d3525b02 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Wed, 2 Jul 2025 03:24:13 -0400 Subject: [PATCH 034/167] Added publication metadata. 
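For orientation, the PMID-DOI concord written here follows the `<PMID>\teq\t<DOI>` shape described in the docstring; an illustrative write (made-up identifiers, with the prefix constants coming from src.prefixes):

    # Illustrative only -- the identifiers are invented and the exact CURIE
    # rendering is whatever the PMID and DOI prefix constants expand to.
    outf.write(f'{PMID}:12345678\teq\t{DOI}:10.1000/example\n')
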
--- src/createcompendia/publications.py | 17 ++++++++++++++++- src/snakefiles/publications.snakefile | 4 +++- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/src/createcompendia/publications.py b/src/createcompendia/publications.py index 86247631..832e2bc8 100644 --- a/src/createcompendia/publications.py +++ b/src/createcompendia/publications.py @@ -11,6 +11,7 @@ from src.babel_utils import pull_via_wget, WgetRecursionOptions, glom, read_identifier_file, write_compendium from src.categories import JOURNAL_ARTICLE, PUBLICATION +from src.metadata.provenance import write_concord_metadata from src.prefixes import PMID, DOI, PMC @@ -136,7 +137,7 @@ def verify_pubmed_downloads(pubmed_directories, def parse_pubmed_into_tsvs(baseline_dir, updatefiles_dir, titles_file, status_file, pmid_id_file, - pmid_doi_concord_file): + pmid_doi_concord_file, metadata_yaml): """ Read through the PubMed files in the baseline_dir and updatefiles_dir, and writes out label and status information. @@ -145,6 +146,7 @@ def parse_pubmed_into_tsvs(baseline_dir, updatefiles_dir, titles_file, status_fi :param titles_file: An output TSV file in the format `\t`. :param status_file: A JSON file containing publication status information. :param pmid_doi_concord_file: A concord file in the format `<PMID>\teq\t<DOI>` and other identifiers. + :param metadata_yaml: The metadata YAML file to write. """ # We can write labels and concords as we go. @@ -245,6 +247,19 @@ def parse_pubmed_into_tsvs(baseline_dir, updatefiles_dir, titles_file, status_fi for pmid, statuses in pmid_status.items(): statusf.write(json.dumps({'id': pmid, 'statuses': sorted(statuses)}, sort_keys=True) + '\n') + write_concord_metadata( + metadata_yaml, + name='parse_pubmed_into_tsvs()', + description="Parse PubMed files into TSVs and JSONL status files.", + sources=[{ + 'type': 'download', + 'name': 'PubMed Baseline and updates' + }, { + 'type': 'download', + 'name': 'PubMed PMC-ids.csv.gz', + 'url': 'https://ftp.ncbi.nlm.nih.gov/pub/pmc/PMC-ids.csv.gz', + }] + ) def generate_compendium(concordances, metadata_yamls, identifiers, titles, publication_compendium, icrdf_filename): """ diff --git a/src/snakefiles/publications.snakefile b/src/snakefiles/publications.snakefile index cf69cf29..687c6b2b 100644 --- a/src/snakefiles/publications.snakefile +++ b/src/snakefiles/publications.snakefile @@ -36,6 +36,7 @@ rule generate_pubmed_concords: status_file = config['download_directory'] + '/PubMed/statuses.jsonl.gz', pmid_id_file = config['intermediate_directory'] + '/publications/ids/PMID', pmid_doi_concord_file = config['intermediate_directory'] + '/publications/concords/PMID_DOI', + metadata_yaml = config['intermediate_directory'] + '/publications/concords/metadata.yaml', run: publications.parse_pubmed_into_tsvs( input.baseline_dir, @@ -43,7 +44,8 @@ rule generate_pubmed_concords: output.titles_file, output.status_file, output.pmid_id_file, - output.pmid_doi_concord_file) + output.pmid_doi_concord_file, + output.metadata_yaml) rule generate_pubmed_compendia: input: From 402951970ac66105a26fe680a18a7022709a3715 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Wed, 2 Jul 2025 03:39:11 -0400 Subject: [PATCH 035/167] Added metadata for module protein. 
--- src/createcompendia/protein.py | 42 ++++++++++++++++++++++++++++---- src/snakefiles/gene.snakefile | 17 ++++++++++--- src/snakefiles/protein.snakefile | 18 ++++++++------ 3 files changed, 62 insertions(+), 15 deletions(-) diff --git a/src/createcompendia/protein.py b/src/createcompendia/protein.py index 33ebdcda..d7c16625 100644 --- a/src/createcompendia/protein.py +++ b/src/createcompendia/protein.py @@ -1,5 +1,6 @@ import re +from src.metadata.provenance import write_concord_metadata from src.prefixes import ENSEMBL, UMLS, PR, UNIPROTKB, NCIT, NCBITAXON from src.categories import PROTEIN @@ -97,7 +98,7 @@ def write_ensembl_ids(ensembl_dir, outfile): wrote.add(pid) outf.write(f'{pid}\n') -def build_pr_uniprot_relationships(outfile, ignore_list = []): +def build_pr_uniprot_relationships(outfile, ignore_list = [], metadata_yaml = None): """Given an IRI create a list of sets. Each set is a set of equivalent LabeledIDs, and there is a set for each subclass of the input iri. Write these lists to concord files, indexed by the prefix""" iri = 'PR:000000001' @@ -110,7 +111,18 @@ def build_pr_uniprot_relationships(outfile, ignore_list = []): if k.startswith('PR'): concfile.write(f'{k}\txref\t{x}\n') -def build_protein_uniprotkb_ensemble_relationships(infile,outfile): + if metadata_yaml: + write_concord_metadata( + metadata_yaml, + name='build_pr_uniprot_relationships()', + description=f"Extracts {PR} xrefs from UberGraph after getting subclasses and xrefs of {iri}", + sources=[{ + 'type': 'UberGraph', + 'name': 'UberGraph', + }] + ) + +def build_protein_uniprotkb_ensemble_relationships(infile,outfile, metadata_yaml): with open(infile,'r') as inf, open(outfile,'w') as outf: for line in inf: x = line.strip().split() @@ -128,8 +140,18 @@ def build_protein_uniprotkb_ensemble_relationships(infile,outfile): ensembl_id_without_version = res.group(1) outf.write(f'{ensembl_id}\teq\t{ENSEMBL}:{ensembl_id_without_version}\n') + write_concord_metadata( + metadata_yaml, + name='build_protein_uniprotkb_ensemble_relationships()', + description=f'Extracts {UNIPROTKB}-to-{ENSEMBL} relationships from the ENSEMBL id-mapping file ({infile}) file.', + sources=[{ + 'name': 'ENSEMBL', + 'filename': infile, + }] + ) -def build_ncit_uniprot_relationships(infile,outfile): + +def build_ncit_uniprot_relationships(infile,outfile, metadata_yaml): with open(infile,'r') as inf, open(outfile,'w') as outf: for line in inf: # These lines are sometimes empty (I think because the @@ -144,8 +166,18 @@ def build_ncit_uniprot_relationships(infile,outfile): uniprot_id = f'{UNIPROTKB}:{x[1]}' outf.write(f'{ncit_id}\teq\t{uniprot_id}\n') -def build_umls_ncit_relationships(mrconso, idfile, outfile): - umls.build_sets(mrconso, idfile, outfile, {'NCI': NCIT}) + write_concord_metadata( + metadata_yaml, + name='build_ncit_uniprot_relationships()', + description=f'Extracts {NCIT}-to-{UNIPROTKB} relationships from the NCIt-SwissProt_Mapping file ({infile}).', + sources=[{ + 'name': 'NCIt-SwissProt Mapping file', + 'filename': infile, + }] + ) + +def build_umls_ncit_relationships(mrconso, idfile, outfile, metadata_yaml): + umls.build_sets(mrconso, idfile, outfile, {'NCI': NCIT}, provenance_metadata_yaml=metadata_yaml) def build_protein_compendia(concordances, metadata_yamls, identifiers, icrdf_filename): """:concordances: a list of files from which to read relationships diff --git a/src/snakefiles/gene.snakefile b/src/snakefiles/gene.snakefile index 0010903e..b21ccc87 100644 --- a/src/snakefiles/gene.snakefile +++ 
b/src/snakefiles/gene.snakefile @@ -107,8 +107,9 @@ rule get_umls_gene_protein_mappings: output: umls_uniprotkb_filename=config['download_directory']+'/UMLS_UniProtKB/UMLS_UniProtKB.tsv', umls_gene_concords=config['output_directory']+'/intermediate/gene/concords/UMLS_NCBIGene', + umls_ncbigene_metadata_yaml=config['output_directory']+'/intermediate/gene/concords/metadata-UMLS_NCBIGene.yaml', umls_protein_concords=config['output_directory']+'/intermediate/protein/concords/UMLS_UniProtKB', - metadata_yaml=config['output_directory']+'/intermediate/gene/concords/metadata-UMLS_NCBIGene.yaml' + umls_protein_metadata_yaml=config['output_directory']+'/intermediate/protein/concords/metadata-UMLS_UniProtKB.yaml', run: uniprotkb.download_umls_gene_protein_mappings( config['UMLS_UniProtKB_download_raw_url'], @@ -118,9 +119,19 @@ rule get_umls_gene_protein_mappings: ) write_concord_metadata( - output.metadata_yaml, + output.umls_ncbigene_metadata_yaml, name='get_umls_gene_protein_mappings', - description="Download UMLS-UniProtKB mappings from {config['UMLS_UniProtKB_download_raw_url']}", + description=f"Download UMLS-UniProtKB gene mappings from {config['UMLS_UniProtKB_download_raw_url']}", + sources=[{ + 'type': 'download', + 'name': 'UMLS-UniProtKB mappings', + 'url': config['UMLS_UniProtKB_download_raw_url'], + }], + ) + write_concord_metadata( + output.umls_protein_metadata_yaml, + name='get_umls_gene_protein_mappings', + description=f"Download UMLS-UniProtKB protein mappings from {config['UMLS_UniProtKB_download_raw_url']}", sources=[{ 'type': 'download', 'name': 'UMLS-UniProtKB mappings', diff --git a/src/snakefiles/protein.snakefile b/src/snakefiles/protein.snakefile index 2f83e8d8..9acb9b3c 100644 --- a/src/snakefiles/protein.snakefile +++ b/src/snakefiles/protein.snakefile @@ -48,23 +48,26 @@ rule get_protein_uniprotkb_ensembl_relationships: input: infile = config['download_directory'] + '/UniProtKB/idmapping.dat' output: - outfile = config['intermediate_directory'] + '/protein/concords/UniProtKB' + outfile = config['intermediate_directory'] + '/protein/concords/UniProtKB', + metadata_yaml = config['intermediate_directory'] + '/protein/concords/metadata-UniProtKB.yaml', run: - protein.build_protein_uniprotkb_ensemble_relationships(input.infile,output.outfile) + protein.build_protein_uniprotkb_ensemble_relationships(input.infile,output.outfile, output.metadata_yaml) rule get_protein_pr_uniprotkb_relationships: output: - outfile = config['intermediate_directory'] + '/protein/concords/PR' + outfile = config['intermediate_directory'] + '/protein/concords/PR', + metadata_yaml = config['intermediate_directory'] + '/protein/concords/metadata-PR.yaml' run: - protein.build_pr_uniprot_relationships(output.outfile) + protein.build_pr_uniprot_relationships(output.outfile, output.metadata_yaml) rule get_protein_ncit_uniprotkb_relationships: input: infile = config['download_directory'] + '/NCIT/NCIt-SwissProt_Mapping.txt' output: - outfile = config['intermediate_directory'] + '/protein/concords/NCIT_UniProtKB' + outfile = config['intermediate_directory'] + '/protein/concords/NCIT_UniProtKB', + metadata_yaml = config['intermediate_directory'] + '/protein/concords/metadata-NCIT_UniProtKB.yaml', run: - protein.build_ncit_uniprot_relationships(input.infile, output.outfile) + protein.build_ncit_uniprot_relationships(input.infile, output.outfile, output.metadata_yaml) rule get_protein_ncit_umls_relationships: input: @@ -72,8 +75,9 @@ rule get_protein_ncit_umls_relationships: 
infile=config['intermediate_directory']+"/protein/ids/UMLS" output: outfile=config['intermediate_directory']+'/protein/concords/NCIT_UMLS', + metadata_yaml=config['intermediate_directory']+'/protein/concords/metadata-NCIT_UMLS.yaml' run: - protein.build_umls_ncit_relationships(input.mrconso, input.infile, output.outfile) + protein.build_umls_ncit_relationships(input.mrconso, input.infile, output.outfile, output.metadata_yaml) rule protein_compendia: input: From febac7797b1d3b8a43a2c4dca9b71f380c733e21 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Wed, 2 Jul 2025 03:42:24 -0400 Subject: [PATCH 036/167] Turned DRY_RUN back on. --- scripts/babel-build.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/babel-build.sh b/scripts/babel-build.sh index 72c3c24a..653de19e 100644 --- a/scripts/babel-build.sh +++ b/scripts/babel-build.sh @@ -11,7 +11,7 @@ export CORES=5 # Dry run: if true, run Snakemake in a dry run. -export DRY_RUN= +export DRY_RUN=1 # Verbose: if set, produce verbose output. export VERBOSE= From e500b1dd407435f1ad5dbd66d1c73e4a2d6d2ec3 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Wed, 2 Jul 2025 03:43:51 -0400 Subject: [PATCH 037/167] Oops, left off metadata.yaml file. --- src/snakefiles/geneprotein.snakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/snakefiles/geneprotein.snakefile b/src/snakefiles/geneprotein.snakefile index ad797c3a..7e540bd9 100644 --- a/src/snakefiles/geneprotein.snakefile +++ b/src/snakefiles/geneprotein.snakefile @@ -10,7 +10,7 @@ rule geneprotein_uniprot_relationships: outfile_concords = config['intermediate_directory'] + '/geneprotein/concords/UniProtNCBI', metadata_yaml = config['intermediate_directory'] + '/geneprotein/concords/metadata-UniProtNCBI.yaml' run: - geneprotein.build_uniprotkb_ncbigene_relationships(input.infile,output.outfile_concords) + geneprotein.build_uniprotkb_ncbigene_relationships(input.infile,output.outfile_concords, output.metadata_yaml) rule geneprotein_conflation: input: From aa4f819fcf66356336c540d3187311e134599305 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Mon, 7 Jul 2025 13:21:13 -0400 Subject: [PATCH 038/167] Fixed metadata.yaml for get_protein_pr_uniprotkb_relationships. 
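The failure here was argument binding, not YAML: the rule called
build_pr_uniprot_relationships(output.outfile, output.metadata_yaml), so the
YAML path was bound to ignore_list and metadata_yaml stayed None, meaning no
metadata file would be written and Snakemake would then fail on the missing
output. Passing it by keyword fixes the binding. A minimal, self-contained
sketch (the stand-in function below only mirrors the real signature in
src/createcompendia/protein.py):

    def build_pr_uniprot_relationships(outfile, ignore_list=[], metadata_yaml=None):
        # stand-in: just report how the arguments were bound
        return ignore_list, metadata_yaml

    # old call: the YAML path is silently consumed as ignore_list
    assert build_pr_uniprot_relationships('PR', 'metadata-PR.yaml') == ('metadata-PR.yaml', None)

    # new call: the YAML path reaches metadata_yaml as intended
    assert build_pr_uniprot_relationships('PR', metadata_yaml='metadata-PR.yaml') == ([], 'metadata-PR.yaml')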
--- src/createcompendia/protein.py | 2 +- src/snakefiles/protein.snakefile | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/createcompendia/protein.py b/src/createcompendia/protein.py index d7c16625..6a1bf5f1 100644 --- a/src/createcompendia/protein.py +++ b/src/createcompendia/protein.py @@ -115,7 +115,7 @@ def build_pr_uniprot_relationships(outfile, ignore_list = [], metadata_yaml = No write_concord_metadata( metadata_yaml, name='build_pr_uniprot_relationships()', - description=f"Extracts {PR} xrefs from UberGraph after getting subclasses and xrefs of {iri}", + description=f"Extracts {PR} xrefs from UberGraph after getting subclasses and xrefs of {iri}, ignoring {ignore_list}.", sources=[{ 'type': 'UberGraph', 'name': 'UberGraph', diff --git a/src/snakefiles/protein.snakefile b/src/snakefiles/protein.snakefile index 9acb9b3c..b3c898e3 100644 --- a/src/snakefiles/protein.snakefile +++ b/src/snakefiles/protein.snakefile @@ -58,7 +58,7 @@ rule get_protein_pr_uniprotkb_relationships: outfile = config['intermediate_directory'] + '/protein/concords/PR', metadata_yaml = config['intermediate_directory'] + '/protein/concords/metadata-PR.yaml' run: - protein.build_pr_uniprot_relationships(output.outfile, output.metadata_yaml) + protein.build_pr_uniprot_relationships(output.outfile, metadata_yaml=output.metadata_yaml) rule get_protein_ncit_uniprotkb_relationships: input: From 97772b94dc9376fd00f41de7e45e2da0aed2ae79 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Mon, 7 Jul 2025 13:23:12 -0400 Subject: [PATCH 039/167] Fixed YAML. --- src/createcompendia/diseasephenotype.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/createcompendia/diseasephenotype.py b/src/createcompendia/diseasephenotype.py index 19469e55..98ffb7ba 100644 --- a/src/createcompendia/diseasephenotype.py +++ b/src/createcompendia/diseasephenotype.py @@ -132,7 +132,7 @@ def build_disease_obo_relationships(outdir, metadata_yamls): build_sets('MONDO:0042489', {MONDO:outfile}, set_type='close', other_prefixes={'ORPHANET':ORPHANET}) write_concord_metadata( - metadata_yamls['MONDO'], + metadata_yamls['MONDO_close'], name='build_disease_obo_relationships()', sources=[ { From 8c219cf9ee9788b564d785c5267e900afb1c2eb3 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Mon, 7 Jul 2025 13:25:18 -0400 Subject: [PATCH 040/167] Fixed typo. 
--- src/createcompendia/chemicals.py | 4 ++-- src/snakefiles/chemical.snakefile | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/createcompendia/chemicals.py b/src/createcompendia/chemicals.py index 2e845ca1..5ca7d0ff 100644 --- a/src/createcompendia/chemicals.py +++ b/src/createcompendia/chemicals.py @@ -78,7 +78,7 @@ def write_rxnorm_ids(infile, outfile): def build_chemical_umls_relationships(mrconso, idfile,outfile, metadata_yaml): umls.build_sets(mrconso, idfile, outfile, {'MSH': MESH, 'DRUGBANK': DRUGBANK, 'RXNORM': RXCUI }, provenance_metadata_yaml=metadata_yaml) -def build_chemical_rxnorm_relationships(conso, idfile,outfile, metadata_yaml): +def build_chemical_rxnorm_relationships(conso, idfile, outfile, metadata_yaml): umls.build_sets(conso, idfile, outfile, {'MSH': MESH, 'DRUGBANK': DRUGBANK}, cui_prefix=RXCUI) write_concord_metadata( @@ -88,7 +88,7 @@ def build_chemical_rxnorm_relationships(conso, idfile,outfile, metadata_yaml): 'type': 'UMLS', 'name': 'MRCONSO' }], - description=f'umls.build_sets() of {RXNORM} MRCONSO with prefixes: {MESH}, {DRUGBANK}', + description=f'umls.build_sets() of {RXCUI} MRCONSO with prefixes: {MESH}, {DRUGBANK}', ) def write_pubchem_ids(labelfile,smilesfile,outfile): diff --git a/src/snakefiles/chemical.snakefile b/src/snakefiles/chemical.snakefile index 28b015a5..f2c7ffac 100644 --- a/src/snakefiles/chemical.snakefile +++ b/src/snakefiles/chemical.snakefile @@ -132,7 +132,7 @@ rule get_chemical_rxnorm_relationships: outfile=config['intermediate_directory']+'/chemicals/concords/RXNORM', metadata_yaml=config['intermediate_directory']+'/chemicals/concords/metadata-RXNORM.yaml', run: - chemicals.build_chemical_rxnorm_relationships(input.conso, input.infile,output.outfile, output.metadata_yaml) + chemicals.build_chemical_rxnorm_relationships(input.conso, input.infile, output.outfile, output.metadata_yaml) rule get_chemical_wikipedia_relationships: output: From 20caac43a63d588d113c5e1afaf45f42420d0ff7 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Tue, 8 Jul 2025 13:36:35 -0400 Subject: [PATCH 041/167] Added syntax to ensure that most prov methods are keyword-based. 
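The bare '*' in these signatures makes everything after it keyword-only, so a
positional call like the one fixed in PATCH 038 now fails loudly instead of
silently binding to the wrong parameter. A small stand-in example (not the
real function, just the calling convention):

    def write_example_metadata(filename, name, *, url='', description=''):
        return filename, name, url, description

    # keyword call: fine
    write_example_metadata('out.yaml', 'my concord', url='https://example.org')

    # positional call: rejected
    try:
        write_example_metadata('out.yaml', 'my concord', 'https://example.org')
    except TypeError as err:
        print(err)  # takes 2 positional arguments but 3 were given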
--- src/metadata/provenance.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/metadata/provenance.py b/src/metadata/provenance.py index 5a8f7034..73947a0e 100644 --- a/src/metadata/provenance.py +++ b/src/metadata/provenance.py @@ -3,13 +3,13 @@ import yaml -def write_download_metadata(filename, name, url='', description='', sources=None, counts=None): +def write_download_metadata(filename, name, *, url='', description='', sources=None, counts=None): write_metadata(filename, 'download', name, url=url, description=description, sources=sources, counts=None) -def write_concord_metadata(filename, name, url='', description='', sources=None, counts=None): +def write_concord_metadata(filename, name, *, url='', description='', sources=None, counts=None): write_metadata(filename, 'concord', name, url=url, description=description, sources=sources, counts=None) -def write_combined_metadata(filename, typ, name, sources=None, url='', description='', counts=None, combined_from_filenames=None): +def write_combined_metadata(filename, typ, name, *, sources=None, url='', description='', counts=None, combined_from_filenames=None): combined_from = {} if combined_from_filenames is not None: for metadata_yaml in combined_from_filenames: @@ -45,7 +45,7 @@ def write_combined_metadata(filename, typ, name, sources=None, url='', descripti combined_from=combined_from ) -def write_metadata(filename, typ, name, sources=None, url='', description='', counts=None, combined_from=None): +def write_metadata(filename, typ, name, *, sources=None, url='', description='', counts=None, combined_from=None): if type(typ) is not str: raise ValueError(f"Metadata entry type must be a string, not {type(typ)}: '{typ}'") if type(name) is not str: From 83d129d7a3c28e670e6b610736e0a75bf208c924 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Wed, 9 Jul 2025 14:07:04 -0400 Subject: [PATCH 042/167] Added concord_filename everywhere. 
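Besides threading concord_filename through every call site,
write_concord_metadata() now opens that file and derives counts from it.
Concord files are expected to be three tab-separated columns,
<curie>\t<predicate>\t<curie>, one mapping per line. A toy, self-contained
version of the counting pass (the two concord lines are illustrative, not
real mappings):

    from collections import defaultdict

    lines = [
        'PR:000000001\txref\tUniProtKB:P12345',
        'NCIT:C17021\teq\tUniProtKB:P04637',
    ]

    count_concords = 0
    distinct_curies = set()
    predicate_counts = defaultdict(int)
    prefix_counts = defaultdict(int)
    for line in lines:
        curie1, predicate, curie2 = line.split('\t')
        count_concords += 1
        predicate_counts[predicate] += 1
        distinct_curies.update((curie1, curie2))
        p1, p2 = sorted([curie1.split(':')[0], curie2.split(':')[0]])
        prefix_counts[f'{predicate}({p1}, {p2})'] += 1

    # count_concords == 2, len(distinct_curies) == 4
    # predicate_counts == {'xref': 1, 'eq': 1}
    # prefix_counts == {'xref(PR, UniProtKB)': 1, 'eq(NCIT, UniProtKB)': 1}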
--- src/createcompendia/anatomy.py | 37 +++++---- src/createcompendia/chemicals.py | 23 ++++-- src/createcompendia/diseasephenotype.py | 76 +++++++++++-------- src/createcompendia/drugchemical.py | 6 +- src/createcompendia/gene.py | 12 ++- src/createcompendia/geneprotein.py | 3 +- src/createcompendia/processactivitypathway.py | 1 + src/createcompendia/protein.py | 9 ++- src/createcompendia/publications.py | 3 +- src/createcompendia/taxon.py | 3 +- src/datahandlers/efo.py | 1 + src/datahandlers/rhea.py | 3 +- src/datahandlers/umls.py | 1 + src/metadata/provenance.py | 49 +++++++++++- src/snakefiles/diseasephenotype.snakefile | 1 + src/snakefiles/drugchemical.snakefile | 1 + src/snakefiles/gene.snakefile | 2 + src/snakefiles/genefamily.snakefile | 3 +- src/snakefiles/taxon.snakefile | 3 +- 19 files changed, 166 insertions(+), 71 deletions(-) diff --git a/src/createcompendia/anatomy.py b/src/createcompendia/anatomy.py index a435293f..1f8ee04b 100644 --- a/src/createcompendia/anatomy.py +++ b/src/createcompendia/anatomy.py @@ -109,24 +109,32 @@ def build_anatomy_obo_relationships(outdir, metadata_yamls): 'name': 'UBERON' } ], - description=f'get_subclasses_and_xrefs() of {UBERON}:0001062' + description=f'get_subclasses_and_xrefs() of {UBERON}:0001062', + concord_filename=f'{outdir}/{UBERON}', ) write_concord_metadata(metadata_yamls['GO'], - name='build_anatomy_obo_relationships()', - sources=[ - { - 'type': 'UberGraph', - 'name': 'GO' - } - ], - description=f'get_subclasses_and_xrefs() of {GO}:0005575' - ) + name='build_anatomy_obo_relationships()', + sources=[ + { + 'type': 'UberGraph', + 'name': 'GO' + } + ], + description=f'get_subclasses_and_xrefs() of {GO}:0005575', + concord_filename=f'{outdir}/{GO}', + ) # TODO: delete write_concord_metadata(metadata_yamls['CL'], - name='build_anatomy_obo_relationships()', - sources=[], - description='' - ) + name='build_anatomy_obo_relationships()', + sources=[ + { + 'type': 'UberGraph', + 'name': 'CL' + } + ], + description='CL relationships (not used?)', + concord_filename=f'{outdir}/{CL}', + ) def build_wikidata_cell_relationships(outdir, metadata_yaml): #This sparql returns all the wikidata items that have a UMLS identifier and a CL identifier @@ -173,6 +181,7 @@ def build_wikidata_cell_relationships(outdir, metadata_yaml): 'name': 'Frink Direct Normalized Graph via SPARQL' }], description='wd:P7963 ("Cell Ontology ID") and wd:P2892 ("UMLS CUI") from Wikidata', + concord_filename=f'{outdir}/{WIKIDATA}', ) def build_anatomy_umls_relationships(mrconso, idfile, outfile, umls_metadata): diff --git a/src/createcompendia/chemicals.py b/src/createcompendia/chemicals.py index 5ca7d0ff..2fef8708 100644 --- a/src/createcompendia/chemicals.py +++ b/src/createcompendia/chemicals.py @@ -89,6 +89,7 @@ def build_chemical_rxnorm_relationships(conso, idfile, outfile, metadata_yaml): 'name': 'MRCONSO' }], description=f'umls.build_sets() of {RXCUI} MRCONSO with prefixes: {MESH}, {DRUGBANK}', + concord_filename=outfile, ) def write_pubchem_ids(labelfile,smilesfile,outfile): @@ -383,7 +384,9 @@ def make_pubchem_cas_concord(pubchemsynonyms, outfile, metadata_yaml): write_concord_metadata( metadata_yaml, name='make_pubchem_cas_concord()', - description=f'make_pubchem_cas_concord() creates xrefs from PUBCHEM identifiers in the PubChem synonyms file ({pubchemsynonyms}) to Chemical Abstracts Service (CAS) identifiers.', + description='make_pubchem_cas_concord() creates xrefs from PUBCHEM identifiers in the PubChem synonyms file ' + + f'({pubchemsynonyms}) to Chemical 
Abstracts Service (CAS) identifiers.', + concord_filename=outfile, ) def make_pubchem_mesh_concord(pubcheminput,meshlabels,outfile, metadata_yaml): @@ -416,8 +419,10 @@ def make_pubchem_mesh_concord(pubcheminput,meshlabels,outfile, metadata_yaml): write_concord_metadata( metadata_yaml, - name='make_pubchem_mesh_concord()', - description=f'make_pubchem_mesh_concord() loads MeSH labels from {meshlabels}, then creates xrefs from PubChem identifiers in the PubChem input file ({pubcheminput}) to those MeSH identifiers using the labels as keys.', + name='make_pubchem_mesh_concord()', + description=f'make_pubchem_mesh_concord() loads MeSH labels from {meshlabels}, then creates xrefs from PubChem ' + + f'identifiers in the PubChem input file ({pubcheminput}) to those MeSH identifiers using the labels as keys.', + concord_filename=outfile, ) def build_drugcentral_relations(infile,outfile, metadata_yaml): @@ -448,6 +453,7 @@ def build_drugcentral_relations(infile,outfile, metadata_yaml): metadata_yaml, name='build_drugcentral_relations()', description=f'Build xrefs from DrugCentral ({infile}) to {DRUGCENTRAL} using the prefix map {prefixmap}.', + concord_filename=outfile, ) def make_gtopdb_relations(infile,outfile, metadata_yaml): @@ -470,7 +476,8 @@ def make_gtopdb_relations(infile,outfile, metadata_yaml): write_concord_metadata( metadata_yaml, name='make_gtopdb_relations()', - description=f'Transform Ligand ID/InChIKey mappings from {infile} into a concord.' + description=f'Transform Ligand ID/InChIKey mappings from {infile} into a concord.', + concord_filename=outfile, ) def make_chebi_relations(sdf,dbx,outfile,metadata_yaml): @@ -521,6 +528,7 @@ def make_chebi_relations(sdf,dbx,outfile,metadata_yaml): metadata_yaml, name='make_chebi_relations()', description=f'make_chebi_relations() creates xrefs from the ChEBI database ({sdf}) to {PUBCHEMCOMPOUND} and {KEGGCOMPOUND}.', + concord_filename=outfile, ) @@ -554,7 +562,8 @@ def get_mesh_relationships(mesh_id_file,cas_out, unii_out, cas_metadata, unii_me 'name': 'MeSH Registry', }], description=f'get_mesh_relationships() iterates through the MeSH registry, filters it to the MeSH IDs ' - f'in {mesh_id_file}, then writes out CAS mappings to {cas_out}' + f'in {mesh_id_file}, then writes out CAS mappings to {cas_out}', + concord_filename=cas_out, ) write_concord_metadata( @@ -565,7 +574,8 @@ def get_mesh_relationships(mesh_id_file,cas_out, unii_out, cas_metadata, unii_me 'name': 'MeSH Registry', }], description=f'get_mesh_relationships() iterates through the MeSH registry, filters it to the MeSH IDs ' - f'in {mesh_id_file}, then writes out non-CAS mappings (i.e. UNII mappings) to {unii_out}' + f'in {mesh_id_file}, then writes out non-CAS mappings (i.e. 
UNII mappings) to {unii_out}', + concord_filename=unii_out, ) def get_wikipedia_relationships(outfile, metadata_yaml): @@ -592,6 +602,7 @@ def get_wikipedia_relationships(outfile, metadata_yaml): 'name': 'Wikidata SPARQL query', }], description='Wikidata SPARQL query to find Wikidata entities with both CHEBI and MESH IDs, and build a concordance between them.', + concord_filename=outfile, ) def build_untyped_compendia(concordances, identifiers,unichem_partial, untyped_concord, type_file, metadata_yaml, input_metadata_yamls): diff --git a/src/createcompendia/diseasephenotype.py b/src/createcompendia/diseasephenotype.py index 98ffb7ba..ae693b29 100644 --- a/src/createcompendia/diseasephenotype.py +++ b/src/createcompendia/diseasephenotype.py @@ -97,17 +97,18 @@ def build_disease_obo_relationships(outdir, metadata_yamls): other_prefixes=other_prefixes, set_type='xref') - write_concord_metadata( - metadata_yamls['HP'], - name='build_disease_obo_relationships()', - sources=[ - { - 'type': 'UberGraph', - 'name': 'HP' - } - ], - description=f'ubergraph.build_sets() of {HP}:0000118 with other_prefixes {other_prefixes}' - ) + write_concord_metadata( + metadata_yamls['HP'], + name='build_disease_obo_relationships()', + sources=[ + { + 'type': 'UberGraph', + 'name': 'HP' + } + ], + description=f'ubergraph.build_sets() of {HP}:0000118 with other_prefixes {other_prefixes}', + concord_filename=f'{outdir}/{HP}' + ) with open(f'{outdir}/{MONDO}', 'w') as outfile: #Orphanet here is confusing. In mondo it comes out mixed case like "Orphanet" and we want to cap it. We have a normer @@ -116,32 +117,34 @@ def build_disease_obo_relationships(outdir, metadata_yamls): build_sets('MONDO:0000001', {MONDO:outfile}, set_type='exact', other_prefixes={'ORPHANET':ORPHANET}) build_sets('MONDO:0042489', {MONDO:outfile}, set_type='exact', other_prefixes={'ORPHANET':ORPHANET}) - write_concord_metadata(metadata_yamls['MONDO'], - name='build_disease_obo_relationships()', - sources=[ - { - 'type': 'UberGraph', - 'name': 'MONDO' - } - ], - description=f'ubergraph.build_sets() (exact) of {MONDO}:0000001 and {MONDO}:0042489, including ORPHANET prefixes' - ) + write_concord_metadata(metadata_yamls['MONDO'], + name='build_disease_obo_relationships()', + sources=[ + { + 'type': 'UberGraph', + 'name': 'MONDO' + } + ], + description=f'ubergraph.build_sets() (exact) of {MONDO}:0000001 and {MONDO}:0042489, including ORPHANET prefixes', + concord_filename=f'{outdir}/{MONDO}' + ) with open(f'{outdir}/{MONDO}_close', 'w') as outfile: build_sets('MONDO:0000001', {MONDO:outfile}, set_type='close', other_prefixes={'ORPHANET':ORPHANET}) build_sets('MONDO:0042489', {MONDO:outfile}, set_type='close', other_prefixes={'ORPHANET':ORPHANET}) - write_concord_metadata( - metadata_yamls['MONDO_close'], - name='build_disease_obo_relationships()', - sources=[ - { - 'type': 'UberGraph', - 'name': 'MONDO' - } - ], - description=f'ubergraph.build_sets() (close matches) of {MONDO}:0000001 and {MONDO}:0042489, including ORPHANET prefixes' - ) + write_concord_metadata( + metadata_yamls['MONDO_close'], + name='build_disease_obo_relationships()', + sources=[ + { + 'type': 'UberGraph', + 'name': 'MONDO' + } + ], + description=f'ubergraph.build_sets() (close matches) of {MONDO}:0000001 and {MONDO}:0042489, including ORPHANET prefixes', + concord_filename=f'{outdir}/{MONDO}_close' + ) def build_disease_efo_relationships(idfile,outfile, metadata_yaml): efo.make_concords(idfile, outfile, provenance_metadata=metadata_yaml) @@ -168,7 +171,14 @@ def 
build_disease_doid_relationships(idfile,outfile, metadata_yaml): write_concord_metadata( metadata_yaml, name='build_disease_doid_relationships()', - description=f'build_disease_doid_relationships() using the DOID ID file {idfile} and other_prefixes {other_prefixes}' + description=f'build_disease_doid_relationships() using the DOID ID file {idfile} and other_prefixes {other_prefixes}', + concord_filename=outfile, + sources=[ + { + 'type': 'DOID', + 'name': 'doid.build_xrefs' + } + ] ) def build_compendium(concordances, metadata_yamls, identifiers, mondoclose, badxrefs, icrdf_filename): diff --git a/src/createcompendia/drugchemical.py b/src/createcompendia/drugchemical.py index 8dee460f..0a014f69 100644 --- a/src/createcompendia/drugchemical.py +++ b/src/createcompendia/drugchemical.py @@ -243,7 +243,8 @@ def build_rxnorm_relationships(conso, relfile, outfile, metadata_yaml): metadata_yaml, name='build_rxnorm_relationships()', description=f'Builds relationships between RxCUI and other identifiers from a CONSO ({conso}) and a REL ({relfile}).', - sources=sources + sources=sources, + concord_filename=outfile, ) @@ -279,7 +280,8 @@ def build_pubchem_relationships(infile,outfile, metadata_yaml): 'name': 'PubChem RxNorm annotations', 'description': 'PubChem RxNorm mappings generated by pubchem.pull_rxnorm_annotations()', 'filename': infile - }] + }], + concord_filename=outfile, ) def build_conflation(manual_concord_filename, rxn_concord, umls_concord, pubchem_rxn_concord, drug_compendium, chemical_compendia, icrdf_filename, outfilename, input_metadata_yamls, output_metadata_yaml): diff --git a/src/createcompendia/gene.py b/src/createcompendia/gene.py index bfe0c06d..d18b1e4d 100644 --- a/src/createcompendia/gene.py +++ b/src/createcompendia/gene.py @@ -84,7 +84,8 @@ def build_gene_ensembl_relationships(ensembl_dir, outfile, metadata_yaml): sources=[{ 'name': 'ENSEMBL', 'filename': ensembl_dir, - }] + }], + concord_filename=outfile, ) def write_zfin_ids(infile,outfile): @@ -203,7 +204,8 @@ def build_gene_ncbi_ensembl_relationships(infile,ncbi_idfile,outfile, metadata_y 'type': 'NCBIGENE', 'name': 'NCBIGene gene2ensembl.gz', 'filename': infile, - }] + }], + concord_filename=outfile, ) def build_gene_ncbigene_xrefs(infile,ncbi_idfile,outfile, metadata_yaml): @@ -236,7 +238,8 @@ def build_gene_ncbigene_xrefs(infile,ncbi_idfile,outfile, metadata_yaml): 'type': 'NCBIGENE', 'name': 'NCBIGene gene_info.gz', 'filename': infile, - }] + }], + concord_filename=outfile, ) def build_gene_medgen_relationships(infile,outfile, metadata_yaml): @@ -261,7 +264,8 @@ def build_gene_medgen_relationships(infile,outfile, metadata_yaml): sources=[{ 'name': 'MIM2Gene MEDGEN', 'filename': infile, - }] + }], + concord_filename=outfile, ) def write_ensembl_ids(ensembl_dir, outfile): diff --git a/src/createcompendia/geneprotein.py b/src/createcompendia/geneprotein.py index ade3cbd7..c73d2bb5 100644 --- a/src/createcompendia/geneprotein.py +++ b/src/createcompendia/geneprotein.py @@ -35,7 +35,8 @@ def build_uniprotkb_ncbigene_relationships(infile,outfile, metadata_yaml): 'type': 'UniProtKB', 'name': 'UniProtKB idmapping file', 'filename': infile, - }] + }], + concord_filename=outfile, ) diff --git a/src/createcompendia/processactivitypathway.py b/src/createcompendia/processactivitypathway.py index 0e46d258..1d4b71ee 100644 --- a/src/createcompendia/processactivitypathway.py +++ b/src/createcompendia/processactivitypathway.py @@ -64,6 +64,7 @@ def build_process_obo_relationships(outdir, metadata_yaml): 'type': 'UberGraph', 
'name': 'GO-GO relationships from UberGraph', }], + concord_filename=f'{outdir}/{GO}' ) def build_process_rhea_relationships(outfile, metadata_yaml): diff --git a/src/createcompendia/protein.py b/src/createcompendia/protein.py index 6a1bf5f1..c8851bf9 100644 --- a/src/createcompendia/protein.py +++ b/src/createcompendia/protein.py @@ -119,7 +119,8 @@ def build_pr_uniprot_relationships(outfile, ignore_list = [], metadata_yaml = No sources=[{ 'type': 'UberGraph', 'name': 'UberGraph', - }] + }], + concord_filename=outfile, ) def build_protein_uniprotkb_ensemble_relationships(infile,outfile, metadata_yaml): @@ -147,7 +148,8 @@ def build_protein_uniprotkb_ensemble_relationships(infile,outfile, metadata_yaml sources=[{ 'name': 'ENSEMBL', 'filename': infile, - }] + }], + concord_filename=outfile, ) @@ -173,7 +175,8 @@ def build_ncit_uniprot_relationships(infile,outfile, metadata_yaml): sources=[{ 'name': 'NCIt-SwissProt Mapping file', 'filename': infile, - }] + }], + concord_filename=outfile, ) def build_umls_ncit_relationships(mrconso, idfile, outfile, metadata_yaml): diff --git a/src/createcompendia/publications.py b/src/createcompendia/publications.py index 832e2bc8..ac644c86 100644 --- a/src/createcompendia/publications.py +++ b/src/createcompendia/publications.py @@ -258,7 +258,8 @@ def parse_pubmed_into_tsvs(baseline_dir, updatefiles_dir, titles_file, status_fi 'type': 'download', 'name': 'PubMed PMC-ids.csv.gz', 'url': 'https://ftp.ncbi.nlm.nih.gov/pub/pmc/PMC-ids.csv.gz', - }] + }], + concord_filename=pmid_doi_concord_file, ) def generate_compendium(concordances, metadata_yamls, identifiers, titles, publication_compendium, icrdf_filename): diff --git a/src/createcompendia/taxon.py b/src/createcompendia/taxon.py index 848dd547..970e58aa 100644 --- a/src/createcompendia/taxon.py +++ b/src/createcompendia/taxon.py @@ -88,7 +88,8 @@ def build_relationships(outfile,mesh_ids, metadata_yaml): sources=[{ 'type': 'MeSH', 'name': 'MeSH Registry', - }] + }], + concord_filename=outfile, ) diff --git a/src/datahandlers/efo.py b/src/datahandlers/efo.py index 913053ce..f3270e7d 100644 --- a/src/datahandlers/efo.py +++ b/src/datahandlers/efo.py @@ -185,4 +185,5 @@ def make_concords(idfilename, outfilename, provenance_metadata=None): 'name': 'Experimental Factor Ontology', 'url': 'http://www.ebi.ac.uk/efo/efo.owl', }], + concord_filename=outfilename, ) diff --git a/src/datahandlers/rhea.py b/src/datahandlers/rhea.py index a0610236..6691b79a 100644 --- a/src/datahandlers/rhea.py +++ b/src/datahandlers/rhea.py @@ -70,7 +70,8 @@ def pull_rhea_ec_concs(self,ofname, metadata_yaml): 'name': 'rhea.rdf', 'url': 'https://ftp.expasy.org/databases/rhea/rdf/rhea.rdf.gz', }] - }] + }], + concord_filename=ofname, ) diff --git a/src/datahandlers/umls.py b/src/datahandlers/umls.py index 72af0ea7..32c5d580 100644 --- a/src/datahandlers/umls.py +++ b/src/datahandlers/umls.py @@ -269,6 +269,7 @@ def build_sets(mrconso, umls_input, umls_output , other_prefixes, bad_mappings=d 'name': 'MRCONSO' }], description=f'umls.build_sets() using UMLS MRCONSO with prefixes: {other_prefixes} with cui_prefix set to {cui_prefix}', + concord_filename=umls_output, ) def read_umls_priority(): diff --git a/src/metadata/provenance.py b/src/metadata/provenance.py index 73947a0e..829f981c 100644 --- a/src/metadata/provenance.py +++ b/src/metadata/provenance.py @@ -1,13 +1,56 @@ import os.path +from collections import defaultdict from datetime import datetime import yaml -def write_download_metadata(filename, name, *, url='', description='', 
sources=None, counts=None): +def write_download_metadata(filename, *, name, url='', description='', sources=None, counts=None): write_metadata(filename, 'download', name, url=url, description=description, sources=sources, counts=None) -def write_concord_metadata(filename, name, *, url='', description='', sources=None, counts=None): - write_metadata(filename, 'concord', name, url=url, description=description, sources=sources, counts=None) +def write_concord_metadata(filename, *, name, concord_filename, url='', description='', sources=None, counts=None): + # Concord files should all be in the format: + # <curie>\t<predicate>\t<curie> + # From this, we extract three counts: + # 'count_concords': Number of lines in this file. + # 'count_distinct_curies': Number of distinct CURIEs. + # 'predicates': A dictionary of counts per predicate. + # 'prefix_counts': A dictionary of prefix pairs along with the predicate + count_concords = 0 + distinct_curies = set() + predicate_counts = defaultdict(int) + curie_prefix_counts = defaultdict(int) + with open(concord_filename, 'r') as concordf: + for line in concordf: + row = line.split('\t') + if len(row) != 3: + raise ValueError(f"Concord file {concord_filename} has a line with {len(row)} columns, not 3: {line}") + curie1 = row[0] + predicate = row[1] + curie2 = row[2] + + count_concords += 1 + predicate_counts[predicate] += 1 + distinct_curies.add(curie1) + distinct_curies.add(curie2) + + prefixes = [curie1.split(':')[0], curie2.split(':')[0]] + sorted_prefixes = sorted(prefixes) + curie_prefix_counts[f"{predicate}({sorted_prefixes[0]}, {sorted_prefixes[1]})"] += 1 + + if counts is None: + counts = {} + + if 'concords' in counts: + raise ValueError(f"Cannot add counts to concord metadata for {name} because it already has counts: {counts}") + + counts['concords'] = { + 'count_concords': count_concords, + 'count_distinct_curies': len(distinct_curies), + 'predicates': dict(predicate_counts), + 'prefix_counts': dict(curie_prefix_counts), + } + + write_metadata(filename, 'concord', name, url=url, description=description, sources=sources, counts=counts) def write_combined_metadata(filename, typ, name, *, sources=None, url='', description='', counts=None, combined_from_filenames=None): combined_from = {} diff --git a/src/snakefiles/diseasephenotype.snakefile b/src/snakefiles/diseasephenotype.snakefile index 283502a3..01e505e2 100644 --- a/src/snakefiles/diseasephenotype.snakefile +++ b/src/snakefiles/diseasephenotype.snakefile @@ -158,6 +158,7 @@ rule disease_manual_concord: counts={ 'concords': count_manual_concords, }, + concord_filename=output.outfile, ) rule disease_compendia: diff --git a/src/snakefiles/drugchemical.snakefile b/src/snakefiles/drugchemical.snakefile index 3f6a8d36..a2459468 100644 --- a/src/snakefiles/drugchemical.snakefile +++ b/src/snakefiles/drugchemical.snakefile @@ -59,6 +59,7 @@ rule drugchemical_conflation: 'url': 'https://github.com/TranslatorSRI/Babel', }], url='https://github.com/TranslatorSRI/Babel/blob/master/input_data/manual_concords/drugchemical.tsv', + concord_filename=input.drugchemical_manual_concord, ) drugchemical.build_conflation( input.drugchemical_manual_concord, diff --git a/src/snakefiles/gene.snakefile b/src/snakefiles/gene.snakefile index b21ccc87..69f656ff 100644 --- a/src/snakefiles/gene.snakefile +++ b/src/snakefiles/gene.snakefile @@ -127,6 +127,7 @@ rule get_umls_gene_protein_mappings: 'name': 'UMLS-UniProtKB mappings', 'url': config['UMLS_UniProtKB_download_raw_url'], }], + 
concord_filename=output.umls_gene_concords, ) write_concord_metadata( output.umls_protein_metadata_yaml, @@ -137,6 +138,7 @@ rule get_umls_gene_protein_mappings: 'name': 'UMLS-UniProtKB mappings', 'url': config['UMLS_UniProtKB_download_raw_url'], }], + concord_filename=output.umls_protein_concords, ) rule gene_compendia: diff --git a/src/snakefiles/genefamily.snakefile b/src/snakefiles/genefamily.snakefile index 0b675be8..4bddff1c 100644 --- a/src/snakefiles/genefamily.snakefile +++ b/src/snakefiles/genefamily.snakefile @@ -28,7 +28,8 @@ rule genefamily_compendia: icrdf_filename=config['download_directory'] + '/icRDF.tsv', output: expand("{od}/compendia/{ap}", od = config['output_directory'], ap = config['genefamily_outputs']), - temp(expand("{od}/synonyms/{ap}", od = config['output_directory'], ap = config['genefamily_outputs'])) + temp(expand("{od}/synonyms/{ap}", od = config['output_directory'], ap = config['genefamily_outputs'])), + metadata_yaml=config['output_directory']+'/metadata/GeneFamily.txt.yaml', run: genefamily.build_compendia(input.idlists, input.metadata_yamls, input.icrdf_filename) diff --git a/src/snakefiles/taxon.snakefile b/src/snakefiles/taxon.snakefile index 5652a7fd..59d7c772 100644 --- a/src/snakefiles/taxon.snakefile +++ b/src/snakefiles/taxon.snakefile @@ -57,7 +57,8 @@ rule taxon_compendia: icrdf_filename=config['download_directory'] + '/icRDF.tsv', output: expand("{od}/compendia/{ap}", od = config['output_directory'], ap = config['taxon_outputs']), - temp(expand("{od}/synonyms/{ap}", od = config['output_directory'], ap = config['taxon_outputs'])) + temp(expand("{od}/synonyms/{ap}", od = config['output_directory'], ap = config['taxon_outputs'])), + output_metadata=expand("{od}/metadata/{ap}.yaml", od = config['output_directory'], ap = config['taxon_outputs']), run: taxon.build_compendia(input.concords, input.metadata_yamls, input.idlists, input.icrdf_filename) From ac908b379f9d54be03de7135d8adb71af7fb1339 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@renci.org> Date: Wed, 9 Jul 2025 15:11:27 -0400 Subject: [PATCH 043/167] Fixed UMLS provenance metadata arguments. --- src/createcompendia/chemicals.py | 13 +------------ src/createcompendia/diseasephenotype.py | 5 ++++- 2 files changed, 5 insertions(+), 13 deletions(-) diff --git a/src/createcompendia/chemicals.py b/src/createcompendia/chemicals.py index 2fef8708..70e4c050 100644 --- a/src/createcompendia/chemicals.py +++ b/src/createcompendia/chemicals.py @@ -79,18 +79,7 @@ def build_chemical_umls_relationships(mrconso, idfile,outfile, metadata_yaml): umls.build_sets(mrconso, idfile, outfile, {'MSH': MESH, 'DRUGBANK': DRUGBANK, 'RXNORM': RXCUI }, provenance_metadata_yaml=metadata_yaml) def build_chemical_rxnorm_relationships(conso, idfile, outfile, metadata_yaml): - umls.build_sets(conso, idfile, outfile, {'MSH': MESH, 'DRUGBANK': DRUGBANK}, cui_prefix=RXCUI) - - write_concord_metadata( - metadata_yaml, - name='build_chemical_rxnorm_relationships()', - sources=[{ - 'type': 'UMLS', - 'name': 'MRCONSO' - }], - description=f'umls.build_sets() of {RXCUI} MRCONSO with prefixes: {MESH}, {DRUGBANK}', - concord_filename=outfile, - ) + umls.build_sets(conso, idfile, outfile, {'MSH': MESH, 'DRUGBANK': DRUGBANK}, cui_prefix=RXCUI, provenance_metadata_yaml=metadata_yaml) def write_pubchem_ids(labelfile,smilesfile,outfile): #Trying to be memory efficient here. 
We could just ingest the whole smilesfile which would make this code easier diff --git a/src/createcompendia/diseasephenotype.py b/src/createcompendia/diseasephenotype.py index ae693b29..d341a67f 100644 --- a/src/createcompendia/diseasephenotype.py +++ b/src/createcompendia/diseasephenotype.py @@ -160,7 +160,10 @@ def build_disease_umls_relationships(mrconso, idfile, outfile, omimfile, ncitfil for line in inf: x = line.split()[0] good_ids[prefix].add(x) - umls.build_sets(mrconso, idfile, outfile, {'SNOMEDCT_US':SNOMEDCT,'MSH': MESH, 'NCI': NCIT, 'HPO': HP, 'MDR':MEDDRA, 'OMIM': OMIM},acceptable_identifiers=good_ids, metadata_yaml=metadata_yaml) + umls.build_sets(mrconso, idfile, outfile, + {'SNOMEDCT_US':SNOMEDCT,'MSH': MESH, 'NCI': NCIT, 'HPO': HP, 'MDR':MEDDRA, 'OMIM': OMIM}, + acceptable_identifiers=good_ids, + provenance_metadata_yaml=metadata_yaml) def build_disease_doid_relationships(idfile,outfile, metadata_yaml): other_prefixes = {'ICD10CM':ICD10, 'ICD9CM':ICD9, 'ICDO': ICD0, 'NCI': NCIT, From 3e42f31f9ed8fd0e3e3657fea42b4e6a16c2806f Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@renci.org> Date: Wed, 9 Jul 2025 15:13:29 -0400 Subject: [PATCH 044/167] Added missing name. --- src/createcompendia/chemicals.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/createcompendia/chemicals.py b/src/createcompendia/chemicals.py index 70e4c050..3b6a7648 100644 --- a/src/createcompendia/chemicals.py +++ b/src/createcompendia/chemicals.py @@ -586,6 +586,7 @@ def get_wikipedia_relationships(outfile, metadata_yaml): write_concord_metadata( metadata_yaml, + name="get_wikipedia_relationships()", sources=[{ 'type': 'Wikidata', 'name': 'Wikidata SPARQL query', From b4ca2deee9c2f14bc011405096ff8792430ddbc2 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Wed, 9 Jul 2025 23:37:04 -0400 Subject: [PATCH 045/167] Fixed untyped compendium metadata for chemicals. --- src/createcompendia/chemicals.py | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/src/createcompendia/chemicals.py b/src/createcompendia/chemicals.py index 3b6a7648..574fd1b7 100644 --- a/src/createcompendia/chemicals.py +++ b/src/createcompendia/chemicals.py @@ -10,7 +10,7 @@ import yaml -from src.metadata.provenance import write_concord_metadata +from src.metadata.provenance import write_concord_metadata, write_combined_metadata from src.ubergraph import UberGraph from src.prefixes import MESH, CHEBI, UNII, DRUGBANK, INCHIKEY, PUBCHEMCOMPOUND,GTOPDB, KEGGCOMPOUND, DRUGCENTRAL, CHEMBLCOMPOUND, UMLS, RXCUI from src.categories import MOLECULAR_MIXTURE, SMALL_MOLECULE, CHEMICAL_ENTITY, POLYPEPTIDE, COMPLEX_MOLECULAR_MIXTURE, CHEMICAL_MIXTURE, DRUG @@ -642,18 +642,15 @@ def build_untyped_compendia(concordances, identifiers,unichem_partial, untyped_c outf.write(f'{set(s)}\n') # Build the metadata file by combining the input metadata_yamls. 
- metadata = { - 'type': 'untyped_compendium', - 'name': 'build_untyped_compendia()', - 'created_at': datetime.now().isoformat(), - 'sources': [] - } - for metadata_yaml in input_metadata_yamls: - with open(metadata_yaml, 'r') as metaf: - metadata_block = yaml.safe_load(metaf) - if metadata_block is None: - raise ValueError("Metadata file {metadata_yaml} is empty.") - metadata['sources'].append(metadata_block) + write_combined_metadata( + filename=metadata_yaml, + typ='untyped_compendium', + name='chemicals.build_untyped_compendia()', + description=f'Generate an untyped compendium from concordances {concordances}, identifiers {identifiers}, " +' + f'unichem_partial {unichem_partial}, untyped_concord {untyped_concord}, and type file {type_file}.', + # sources=None, url='', counts=None, + combined_from_filenames=input_metadata_yamls, + ) def build_compendia(type_file, untyped_compendia_file, metadata_yamls, icrdf_filename): types = {} From 4f7fc3a2d10ce78c103b727db1cf991bb6715f52 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Fri, 11 Jul 2025 13:40:53 -0400 Subject: [PATCH 046/167] Removed unnecessary counts. --- src/snakefiles/diseasephenotype.snakefile | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/snakefiles/diseasephenotype.snakefile b/src/snakefiles/diseasephenotype.snakefile index 01e505e2..6a51803c 100644 --- a/src/snakefiles/diseasephenotype.snakefile +++ b/src/snakefiles/diseasephenotype.snakefile @@ -155,9 +155,6 @@ rule disease_manual_concord: 'url': 'https://github.com/TranslatorSRI/Babel', }], url='https://github.com/TranslatorSRI/Babel/blob/master/input_data/manual_concords/disease.txt', - counts={ - 'concords': count_manual_concords, - }, concord_filename=output.outfile, ) From b255b12b7855ba835d5ca9a6b76b1eceb55c9ecf Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Tue, 15 Jul 2025 10:56:28 -0400 Subject: [PATCH 047/167] Fixed metadata file. --- src/snakefiles/drugchemical.snakefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/snakefiles/drugchemical.snakefile b/src/snakefiles/drugchemical.snakefile index a2459468..91bc4596 100644 --- a/src/snakefiles/drugchemical.snakefile +++ b/src/snakefiles/drugchemical.snakefile @@ -51,7 +51,7 @@ rule drugchemical_conflation: metadata_yaml=config['output_directory']+'/conflation/metadata.yaml', drugchemical_manual_metadata=config['intermediate_directory']+'/drugchemical/concords/metadata-Manual.yaml', run: - write_concord_metadata(input.drugchemical_manual_metadata, + write_concord_metadata(output.drugchemical_manual_metadata, name='Manual DrugChemical Concords', description='Manually curated DrugChemical conflation cross-references from the Babel repository', sources=[{ @@ -74,7 +74,7 @@ rule drugchemical_conflation: 'RXNORM': input.rxnorm_metadata, 'UMLS': input.umls_metadata, 'PUBCHEM_RXNORM': input.pubchem_metadata, - 'Manual': input.drugchemical_manual_metadata, + 'Manual': output.drugchemical_manual_metadata, }, output_metadata_yaml=output.metadata_yaml) rule drugchemical_conflated_synonyms: From 45534344bff7c140b83defbd1e8689f57d3a7bbf Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Tue, 15 Jul 2025 11:07:40 -0400 Subject: [PATCH 048/167] Added manual concord for DrugChemical. 
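Rather than the snakefile emitting a separate metadata-Manual.yaml (which
PATCH 047 had to fix because it was being written to an input path),
build_conflation() now summarises the manual concord itself and injects that
block into the combined metadata via the new also_combined_from argument.
The manual file is read with csv.DictReader, i.e. a headed TSV with
subject / predicate / object columns. A self-contained sketch of the
summarising pass (the data row is illustrative, not a real entry):

    import csv, io
    from collections import defaultdict

    tsv = 'subject\tpredicate\tobject\nRXCUI:123456\teq\tCHEBI:000000\n'

    curies = set()
    predicate_counts = defaultdict(int)
    for row in csv.DictReader(io.StringIO(tsv), dialect=csv.excel_tab):
        predicate_counts[row['predicate']] += 1
        curies.update((row['subject'], row['object']))

    # 1 concord, 2 distinct CURIEs, predicate_counts == {'eq': 1}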
--- src/createcompendia/drugchemical.py | 26 +++++++++++++++++++++++++- src/metadata/provenance.py | 4 +++- src/snakefiles/drugchemical.snakefile | 12 ------------ 3 files changed, 28 insertions(+), 14 deletions(-) diff --git a/src/createcompendia/drugchemical.py b/src/createcompendia/drugchemical.py index 0a014f69..bca544a1 100644 --- a/src/createcompendia/drugchemical.py +++ b/src/createcompendia/drugchemical.py @@ -297,6 +297,9 @@ def build_conflation(manual_concord_filename, rxn_concord, umls_concord, pubchem print("Loading manual concords ...") manual_concords = [] + manual_concords_curies = set() + manual_concords_predicate_counts = defaultdict(int) + manual_concords_curie_prefix_counts = defaultdict(int) with open(manual_concord_filename,"r") as manualf: csv_reader = csv.DictReader(manualf, dialect=csv.excel_tab) for row in csv_reader: @@ -306,6 +309,15 @@ def build_conflation(manual_concord_filename, rxn_concord, umls_concord, pubchem if row['subject'].strip() == '' or row['object'].strip() == '': raise RuntimeError(f"Empty subject or object fields in {manual_concord_filename}: {row}") manual_concords.append((row['subject'], row['object'])) + manual_concords_predicate_counts[row['predicate']] += 1 + manual_concords_curies.add(row['subject']) + manual_concords_curies.add(row['object']) + + subject_prefix = row['subject'].split(':')[0] + manual_concords_curie_prefix_counts[subject_prefix] += 1 + + object_prefix = row['object'].split(':')[0] + manual_concords_curie_prefix_counts[object_prefix] += 1 print(f"{len(manual_concords)} manual concords loaded.") print("load all chemical conflations so we can normalize identifiers") @@ -608,7 +620,19 @@ def build_conflation(manual_concord_filename, rxn_concord, umls_concord, pubchem typ='conflation', name='drugchemical.build_conflation()', description='Build DrugChemical conflation.', - combined_from_filenames=input_metadata_yamls + combined_from_filenames=input_metadata_yamls, + also_combined_from={ + 'Manual': { + 'name': 'DrugChemical Manual', + 'filename': manual_concord_filename, + 'counts': { + 'count_concords': len(manual_concords), + 'count_distinct_curies': len(manual_concords_curies), + 'predicates': dict(manual_concords_predicate_counts), + 'prefix_counts': dict(manual_concords_curie_prefix_counts), + } + } + } ) diff --git a/src/metadata/provenance.py b/src/metadata/provenance.py index 829f981c..4cd87f8c 100644 --- a/src/metadata/provenance.py +++ b/src/metadata/provenance.py @@ -52,7 +52,7 @@ def write_concord_metadata(filename, *, name, concord_filename, url='', descript write_metadata(filename, 'concord', name, url=url, description=description, sources=sources, counts=counts) -def write_combined_metadata(filename, typ, name, *, sources=None, url='', description='', counts=None, combined_from_filenames=None): +def write_combined_metadata(filename, typ, name, *, sources=None, url='', description='', counts=None, combined_from_filenames=None, also_combined_from=None): combined_from = {} if combined_from_filenames is not None: for metadata_yaml in combined_from_filenames: @@ -76,6 +76,8 @@ def write_combined_metadata(filename, typ, name, *, sources=None, url='', descri combined_from[metadata_name].append(metadata_block) else: combined_from[metadata_name] = metadata_block + if also_combined_from is not None: + combined_from.update(also_combined_from) write_metadata( filename, diff --git a/src/snakefiles/drugchemical.snakefile b/src/snakefiles/drugchemical.snakefile index 91bc4596..f9748aaa 100644 --- 
a/src/snakefiles/drugchemical.snakefile +++ b/src/snakefiles/drugchemical.snakefile @@ -49,18 +49,7 @@ rule drugchemical_conflation: output: outfile=config['output_directory']+'/conflation/DrugChemical.txt', metadata_yaml=config['output_directory']+'/conflation/metadata.yaml', - drugchemical_manual_metadata=config['intermediate_directory']+'/drugchemical/concords/metadata-Manual.yaml', run: - write_concord_metadata(output.drugchemical_manual_metadata, - name='Manual DrugChemical Concords', - description='Manually curated DrugChemical conflation cross-references from the Babel repository', - sources=[{ - 'name': 'Babel repository', - 'url': 'https://github.com/TranslatorSRI/Babel', - }], - url='https://github.com/TranslatorSRI/Babel/blob/master/input_data/manual_concords/drugchemical.tsv', - concord_filename=input.drugchemical_manual_concord, - ) drugchemical.build_conflation( input.drugchemical_manual_concord, input.rxnorm_concord, @@ -74,7 +63,6 @@ rule drugchemical_conflation: 'RXNORM': input.rxnorm_metadata, 'UMLS': input.umls_metadata, 'PUBCHEM_RXNORM': input.pubchem_metadata, - 'Manual': output.drugchemical_manual_metadata, }, output_metadata_yaml=output.metadata_yaml) rule drugchemical_conflated_synonyms: From 8f4d5deeff440f370725151bf0cccdb95416607b Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Tue, 15 Jul 2025 14:54:02 -0400 Subject: [PATCH 049/167] Update manual concord predicate count so it's in the right format. --- src/createcompendia/drugchemical.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src/createcompendia/drugchemical.py b/src/createcompendia/drugchemical.py index bca544a1..3f9410c7 100644 --- a/src/createcompendia/drugchemical.py +++ b/src/createcompendia/drugchemical.py @@ -313,11 +313,9 @@ def build_conflation(manual_concord_filename, rxn_concord, umls_concord, pubchem manual_concords_curies.add(row['subject']) manual_concords_curies.add(row['object']) - subject_prefix = row['subject'].split(':')[0] - manual_concords_curie_prefix_counts[subject_prefix] += 1 - - object_prefix = row['object'].split(':')[0] - manual_concords_curie_prefix_counts[object_prefix] += 1 + sorted_curies = sorted([row['subject'], row['object']]) + prefix_count_label = row['predicate'] + '(' + (' ,'.join(sorted_curies)) + ')' + manual_concords_curie_prefix_counts[prefix_count_label] += 1 print(f"{len(manual_concords)} manual concords loaded.") print("load all chemical conflations so we can normalize identifiers") From 3360e821eae1d7076df42692e48c0f8a358c3765 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Sat, 2 Aug 2025 21:46:17 -0400 Subject: [PATCH 050/167] Improve build_anatomy_obo_relationships() metadata generation. --- src/createcompendia/anatomy.py | 54 +++++++++++++--------------------- 1 file changed, 20 insertions(+), 34 deletions(-) diff --git a/src/createcompendia/anatomy.py b/src/createcompendia/anatomy.py index 1f8ee04b..75920f28 100644 --- a/src/createcompendia/anatomy.py +++ b/src/createcompendia/anatomy.py @@ -101,40 +101,26 @@ def build_anatomy_obo_relationships(outdir, metadata_yamls): # CL is now being handled by Wikidata (build_wikidata_cell_relationships), so we can probably remove it from here. # Write out metadata. 
- write_concord_metadata(metadata_yamls['UBERON'], - name='build_anatomy_obo_relationships()', - sources=[ - { - 'type': 'UberGraph', - 'name': 'UBERON' - } - ], - description=f'get_subclasses_and_xrefs() of {UBERON}:0001062', - concord_filename=f'{outdir}/{UBERON}', - ) - write_concord_metadata(metadata_yamls['GO'], - name='build_anatomy_obo_relationships()', - sources=[ - { - 'type': 'UberGraph', - 'name': 'GO' - } - ], - description=f'get_subclasses_and_xrefs() of {GO}:0005575', - concord_filename=f'{outdir}/{GO}', - ) - # TODO: delete - write_concord_metadata(metadata_yamls['CL'], - name='build_anatomy_obo_relationships()', - sources=[ - { - 'type': 'UberGraph', - 'name': 'CL' - } - ], - description='CL relationships (not used?)', - concord_filename=f'{outdir}/{CL}', - ) + for metadata_name in [UBERON, GO, CL]: + write_concord_metadata(metadata_yamls[metadata_name], + name='build_anatomy_obo_relationships()', + sources=[ + { + 'type': 'UberGraph', + 'name': 'UBERON' + }, + { + 'type': 'UberGraph', + 'name': 'GO' + }, + { + 'type': 'UberGraph', + 'name': 'CL' + } + ], + description=f'get_subclasses_and_xrefs() of {UBERON}:0001062 and {GO}:0005575', + concord_filename=f'{outdir}/{metadata_name}', + ) def build_wikidata_cell_relationships(outdir, metadata_yaml): #This sparql returns all the wikidata items that have a UMLS identifier and a CL identifier From 54bc8dd9d306542702270a82511deef7f880cc5f Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Sat, 2 Aug 2025 21:50:17 -0400 Subject: [PATCH 051/167] Fixed name for metadata. --- src/datahandlers/pantherfamily.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/datahandlers/pantherfamily.py b/src/datahandlers/pantherfamily.py index 51c5562d..0947b403 100644 --- a/src/datahandlers/pantherfamily.py +++ b/src/datahandlers/pantherfamily.py @@ -43,7 +43,7 @@ def pull_labels(infile,outfile, metadata_yaml): write_metadata( metadata_yaml, typ='transform', - name='HGNC Gene Family labels', + name='pantherfamily.pull_labels()', description='Main families and subfamily labels extracted from PANTHER Sequence Classification human.', sources=[{ 'type': 'download', From 57bfcf379408e6096b80663200c98bf9ceb810a0 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Sat, 2 Aug 2025 22:04:42 -0400 Subject: [PATCH 052/167] Improved MeSH metadata. --- src/createcompendia/taxon.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/createcompendia/taxon.py b/src/createcompendia/taxon.py index 970e58aa..bfa40099 100644 --- a/src/createcompendia/taxon.py +++ b/src/createcompendia/taxon.py @@ -84,10 +84,11 @@ def build_relationships(outfile,mesh_ids, metadata_yaml): write_concord_metadata( metadata_yaml, name='build_relationships()', - description=f'Builds relationships between MeSH and NCBI Taxon from the MeSH registry.', + description='Builds relationships between MeSH and NCBI Taxon from the MeSH registry.', sources=[{ 'type': 'MeSH', 'name': 'MeSH Registry', + 'url': 'ftp://ftp.nlm.nih.gov/online/mesh/rdf/mesh.nt.gz', }], concord_filename=outfile, ) From 7020b7f651be4fe850c08ca669214af62b0694ff Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Sat, 2 Aug 2025 22:16:54 -0400 Subject: [PATCH 053/167] Improved PubMed metadata. 
--- src/createcompendia/publications.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/src/createcompendia/publications.py b/src/createcompendia/publications.py index ac644c86..01d95c44 100644 --- a/src/createcompendia/publications.py +++ b/src/createcompendia/publications.py @@ -51,7 +51,8 @@ def download_pubmed(download_file, timestamping=True) # Step 3. Download the PMC/PMID mapping file from PMC. - pull_via_wget(pmc_base, 'PMC-ids.csv.gz', decompress=True, subpath='PubMed') + # We don't actually use this file -- we currently only use the PMC IDs already included in the PubMed XML files. + # pull_via_wget(pmc_base, 'PMC-ids.csv.gz', decompress=True, subpath='PubMed') # We're all done! Path.touch(download_file) @@ -253,12 +254,16 @@ def parse_pubmed_into_tsvs(baseline_dir, updatefiles_dir, titles_file, status_fi description="Parse PubMed files into TSVs and JSONL status files.", sources=[{ 'type': 'download', - 'name': 'PubMed Baseline and updates' + 'name': 'PubMed Baseline', + 'url': 'ftp://ftp.ncbi.nlm.nih.gov/pubmed/baseline/' }, { 'type': 'download', - 'name': 'PubMed PMC-ids.csv.gz', - 'url': 'https://ftp.ncbi.nlm.nih.gov/pub/pmc/PMC-ids.csv.gz', + 'name': 'PubMed Updates', + 'url': 'ftp://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/' }], + counts={ + 'pmid_count': len(pmid_status.keys()), + }, concord_filename=pmid_doi_concord_file, ) From 21e220b4803129e10e5b5c91ab7b594365323a1c Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Sat, 2 Aug 2025 22:17:55 -0400 Subject: [PATCH 054/167] Removed unnecessary change. --- src/ubergraph.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ubergraph.py b/src/ubergraph.py index 917237fa..b2db9ef8 100644 --- a/src/ubergraph.py +++ b/src/ubergraph.py @@ -477,7 +477,7 @@ def write_normalized_information_content(self, filename): print(f"Wrote {write_count} information content values into {filename}.") return write_count -def build_sets(iri, concordfiles, set_type, ignore_list = [], other_prefixes={}, hop_ontologies=False): +def build_sets(iri, concordfiles, set_type, ignore_list = [], other_prefixes={}, hop_ontologies=False ): """Given an IRI create a list of sets. Each set is a set of equivalent LabeledIDs, and there is a set for each subclass of the input iri. Write these lists to concord files, indexed by the prefix""" prefix = Text.get_curie(iri) From c682498b2e0db5367b3048e57cd29b518da9441e Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Sat, 2 Aug 2025 22:33:37 -0400 Subject: [PATCH 055/167] Renamed duplicate functions so their purpose is clearer. --- src/createcompendia/protein.py | 4 ++-- src/snakefiles/protein.snakefile | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/createcompendia/protein.py b/src/createcompendia/protein.py index 20bf9287..2f22608e 100644 --- a/src/createcompendia/protein.py +++ b/src/createcompendia/protein.py @@ -30,7 +30,7 @@ def extract_taxon_ids_from_uniprotkb(idmapping_filename, uniprotkb_taxa_filename outf.write(f'{UNIPROTKB}:{x[0]}\t{NCBITAXON}:{x[2]}\n') -def write_ensembl_ids(ensembl_dir, outfile): +def write_ensembl_gene_ids(ensembl_dir, outfile): """Loop over all the ensembl species. 
Find any protein-coding gene""" with open(outfile,'w') as outf: #find all the ensembl directories @@ -69,7 +69,7 @@ def write_pr_ids(outfile): obo.write_obo_ids([(protein_id, PROTEIN)], outfile, [PROTEIN]) -def write_ensembl_ids(ensembl_dir, outfile): +def write_ensembl_protein_ids(ensembl_dir, outfile): """Loop over all the ensembl species. Find any protein-coding gene""" with open(outfile, 'w') as outf: # find all the ensembl directories diff --git a/src/snakefiles/protein.snakefile b/src/snakefiles/protein.snakefile index b550d748..1fc9be39 100644 --- a/src/snakefiles/protein.snakefile +++ b/src/snakefiles/protein.snakefile @@ -42,7 +42,7 @@ rule protein_ensembl_ids: output: outfile=config['intermediate_directory']+"/protein/ids/ENSEMBL" run: - protein.write_ensembl_ids(config['download_directory'] + '/ENSEMBL',output.outfile) + protein.write_ensembl_protein_ids(config['download_directory'] + '/ENSEMBL',output.outfile) rule get_protein_uniprotkb_ensembl_relationships: input: From 62629cedcc0e7fb974d87f02fca7e524a1c6b22e Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Sat, 2 Aug 2025 22:37:34 -0400 Subject: [PATCH 056/167] Renamed write_ensembl_ids() to clarify what's going on. --- src/createcompendia/gene.py | 2 +- src/snakefiles/gene.snakefile | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/createcompendia/gene.py b/src/createcompendia/gene.py index 1c89da02..99fa22d5 100644 --- a/src/createcompendia/gene.py +++ b/src/createcompendia/gene.py @@ -217,7 +217,7 @@ def build_gene_medgen_relationships(infile,outfile): umls_id = f'{UMLS}:{x[4]}' outf.write(f'{ncbigene_id}\teq\t{umls_id}\n') -def write_ensembl_ids(ensembl_dir, outfile): +def write_ensembl_gene_ids(ensembl_dir, outfile): """Loop over all the ensembl species. Find any protein-coding gene""" with open(outfile,'w') as outf: #find all the ensembl directories diff --git a/src/snakefiles/gene.snakefile b/src/snakefiles/gene.snakefile index 7bb30832..d22bbaa8 100644 --- a/src/snakefiles/gene.snakefile +++ b/src/snakefiles/gene.snakefile @@ -33,7 +33,7 @@ rule gene_ensembl_ids: output: outfile=config['intermediate_directory']+"/gene/ids/ENSEMBL" run: - gene.write_ensembl_ids(config['download_directory'] + '/ENSEMBL',output.outfile) + gene.write_ensembl_gene_ids(config['download_directory'] + '/ENSEMBL',output.outfile) rule gene_hgnc_ids: input: From e0f64ad7ed5dd9ff5fb340e36fc199d89334a80b Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Sat, 2 Aug 2025 22:38:48 -0400 Subject: [PATCH 057/167] Removed redundant function. --- src/createcompendia/protein.py | 29 ----------------------------- 1 file changed, 29 deletions(-) diff --git a/src/createcompendia/protein.py b/src/createcompendia/protein.py index 2f22608e..74a3d3c4 100644 --- a/src/createcompendia/protein.py +++ b/src/createcompendia/protein.py @@ -30,35 +30,6 @@ def extract_taxon_ids_from_uniprotkb(idmapping_filename, uniprotkb_taxa_filename outf.write(f'{UNIPROTKB}:{x[0]}\t{NCBITAXON}:{x[2]}\n') -def write_ensembl_gene_ids(ensembl_dir, outfile): - """Loop over all the ensembl species. 
Find any protein-coding gene""" - with open(outfile,'w') as outf: - #find all the ensembl directories - dirlisting = os.listdir(ensembl_dir) - for dl in dirlisting: - dlpath = os.path.join(ensembl_dir,dl) - if os.path.isdir(dlpath): - infname = os.path.join(dlpath,'BioMart.tsv') - if os.path.exists(infname): - #open each ensembl file, find the id column, and put it in the output - with open(infname,'r') as inf: - wrote=set() - h = inf.readline() - x = h[:-1].split('\t') - gene_column = x.index('Gene stable ID') - protein_column = x.index('Protein stable ID') - for line in inf: - x = line[:-1].split('\t') - #Is it protein coding? - if x[protein_column] == '': - continue - gid = f'{ENSEMBL}:{x[gene_column]}' - #The gid is not unique, so don't write the same one over again - if gid in wrote: - continue - wrote.add(gid) - outf.write(f'{gid}\n') - def write_umls_ids(mrsty, outfile): umlsmap = {} umlsmap['A1.4.1.2.1.7'] = PROTEIN From e873cbe5b30b59bf69113c9c2e1b1f057253884b Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Sun, 3 Aug 2025 00:02:21 -0400 Subject: [PATCH 058/167] Added a properties file for CHEBI. --- src/createcompendia/chemicals.py | 14 ++++++++++++-- src/snakefiles/chemical.snakefile | 5 +++-- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/src/createcompendia/chemicals.py b/src/createcompendia/chemicals.py index 0a6dd39f..27644718 100644 --- a/src/createcompendia/chemicals.py +++ b/src/createcompendia/chemicals.py @@ -1,3 +1,4 @@ +import json import logging from collections import defaultdict import jsonlines @@ -435,7 +436,7 @@ def make_gtopdb_relations(infile,outfile): inchi = f'{INCHIKEY}:{x[inchi_index][1:-1]}' outf.write(f'{gid}\txref\t{inchi}\n') -def make_chebi_relations(sdf,dbx,outfile): +def make_chebi_relations(sdf,dbx,outfile,propfile): """CHEBI contains relations both about chemicals with and without inchikeys. 
You might think that because everything is based on unichem, we could avoid the with structures part, but history has shown that we lose links in that case, so we will use both the structured and unstructured chemical entries.""" @@ -451,9 +452,17 @@ def make_chebi_relations(sdf,dbx,outfile): dbxdata = inf.read() kk = 'keggcompounddatabaselinks' pk = 'pubchemdatabaselinks' - with open(outfile,'w') as outf: + secondary_chebi_id = 'secondarychebiid' + with open(outfile,'w') as outf, open(propfile,'w') as propf: #Write SDF structured things for cid,props in chebi_sdf_dat.items(): + if secondary_chebi_id in props: + propf.write(json.dumps({ + 'curie': cid, + 'property': 'OIO:hasAlternativeId', + 'value': props[secondary_chebi_id], + 'description': 'Listed as a CHEBI secondard ID in the ChEBI SDF file' + })) if kk in props: outf.write(f'{cid}\txref\t{KEGGCOMPOUND}:{props[kk]}\n') if pk in props: @@ -482,6 +491,7 @@ def make_chebi_relations(sdf,dbx,outfile): + def get_mesh_relationships(mesh_id_file,cas_out, unii_out): meshes = set() with open(mesh_id_file,'r') as inf: diff --git a/src/snakefiles/chemical.snakefile b/src/snakefiles/chemical.snakefile index 249fe30c..34086295 100644 --- a/src/snakefiles/chemical.snakefile +++ b/src/snakefiles/chemical.snakefile @@ -187,9 +187,10 @@ rule get_chebi_concord: sdf=config['download_directory']+'/CHEBI/ChEBI_complete.sdf', dbx=config['download_directory']+'/CHEBI/database_accession.tsv' output: - outfile=config['intermediate_directory']+'/chemicals/concords/CHEBI' + outfile=config['intermediate_directory']+'/chemicals/concords/CHEBI', + propfile=config['intermediate_directory']+'/chemicals/properties/CHEBI' run: - chemicals.make_chebi_relations(input.sdf,input.dbx,output.outfile) + chemicals.make_chebi_relations(input.sdf,input.dbx,output.outfile,output.propfile) rule chemical_unichem_concordia: input: From 1f3491e9663de37d422d36fa04b50cd0b32127b4 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Sun, 3 Aug 2025 00:42:30 -0400 Subject: [PATCH 059/167] First stab at a PropertyStore. 
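The design this patch introduces, as the diff below shows: properties go into a small SQLite table with a UNIQUE index on (curie, property, value), and inserts use INSERT OR IGNORE, so the same property asserted twice (or again from another source) silently collapses into one row. A minimal standalone sketch of that pattern, using an in-memory database and made-up values rather than the actual Babel API:

    import sqlite3

    conn = sqlite3.connect(":memory:")
    cur = conn.cursor()
    cur.execute("CREATE TABLE IF NOT EXISTS properties "
                "(curie TEXT, property TEXT, value TEXT, description TEXT)")
    # The UNIQUE index is what makes re-asserting the same property a no-op.
    cur.execute("CREATE UNIQUE INDEX IF NOT EXISTS properties_propvalues "
                "ON properties (curie, property, value)")

    rows = [
        # The same (curie, property, value) asserted twice with different descriptions:
        {"curie": "CHEBI:1234", "property": "hasAdditionalId",
         "value": "CHEBI:99999", "description": "from the SDF file"},
        {"curie": "CHEBI:1234", "property": "hasAdditionalId",
         "value": "CHEBI:99999", "description": "from database_accession.tsv"},
    ]
    cur.executemany("INSERT OR IGNORE INTO properties "
                    "VALUES (:curie, :property, :value, :description)", rows)
    conn.commit()

    # Only one row survives; the duplicate was dropped by the UNIQUE index.
    print(cur.execute("SELECT COUNT(*) FROM properties").fetchone()[0])  # -> 1
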
--- src/createcompendia/chemicals.py | 20 +++++--- src/properties.py | 83 +++++++++++++++++++++++++++++++ src/snakefiles/chemical.snakefile | 2 +- 3 files changed, 97 insertions(+), 8 deletions(-) create mode 100644 src/properties.py diff --git a/src/createcompendia/chemicals.py b/src/createcompendia/chemicals.py index 27644718..b5ac0a13 100644 --- a/src/createcompendia/chemicals.py +++ b/src/createcompendia/chemicals.py @@ -7,6 +7,7 @@ import gzip from gzip import GzipFile +from src.properties import PropertyStore, PropertyValue, HAS_ADDITIONAL_ID from src.ubergraph import UberGraph from src.prefixes import MESH, CHEBI, UNII, DRUGBANK, INCHIKEY, PUBCHEMCOMPOUND,GTOPDB, KEGGCOMPOUND, DRUGCENTRAL, CHEMBLCOMPOUND, UMLS, RXCUI from src.categories import MOLECULAR_MIXTURE, SMALL_MOLECULE, CHEMICAL_ENTITY, POLYPEPTIDE, COMPLEX_MOLECULAR_MIXTURE, CHEMICAL_MIXTURE, DRUG @@ -453,16 +454,18 @@ def make_chebi_relations(sdf,dbx,outfile,propfile): kk = 'keggcompounddatabaselinks' pk = 'pubchemdatabaselinks' secondary_chebi_id = 'secondarychebiid' - with open(outfile,'w') as outf, open(propfile,'w') as propf: + with open(outfile,'w') as outf, PropertyStore(propfile) as propstore: + properties = [] + #Write SDF structured things for cid,props in chebi_sdf_dat.items(): if secondary_chebi_id in props: - propf.write(json.dumps({ - 'curie': cid, - 'property': 'OIO:hasAlternativeId', - 'value': props[secondary_chebi_id], - 'description': 'Listed as a CHEBI secondard ID in the ChEBI SDF file' - })) + properties.append(PropertyValue( + curie = cid, + property = HAS_ADDITIONAL_ID, + value = props[secondary_chebi_id], + description = 'Listed as a CHEBI secondard ID in the ChEBI SDF file' + )) if kk in props: outf.write(f'{cid}\txref\t{KEGGCOMPOUND}:{props[kk]}\n') if pk in props: @@ -488,6 +491,9 @@ def make_chebi_relations(sdf,dbx,outfile,propfile): if x[3] == 'Pubchem accession': outf.write(f'{cid}\txref\t{PUBCHEMCOMPOUND}:{x[4]}\n') + # Write out the properties. + propstore.insert_all(properties) + diff --git a/src/properties.py b/src/properties.py new file mode 100644 index 00000000..4e9d45e6 --- /dev/null +++ b/src/properties.py @@ -0,0 +1,83 @@ +# +# properties.py: handle node- and clique-level properties for Babel. +# +# It would be great if we could get all Babel properties (labels, synonyms, etc.) stored in the same database +# store, but that appears to be impractical given how long it takes to write into a database. So we'll leave +# labels, synonyms and descriptions working in the current system, and start putting new properties (starting with +# hasAdditionalId) into this database. I'd love to get descriptions moved in here as well. +# +import os +from contextlib import AbstractContextManager +from dataclasses import dataclass + +import sqlite3 + +from src.babel_utils import make_local_name + +# Properties currently supported in the property store: +supported_properties = { + 'hasAdditionalId': 'http://www.geneontology.org/formats/oboInOwl#hasAlternativeId', +} + +HAS_ADDITIONAL_ID = 'hasAdditionalId' + +# A single property value. +@dataclass +class PropertyValue: + curie: str + property: str + value: str + description: str + + +# A property store for a properties file. +class PropertyStore(AbstractContextManager): + def __init__(self, db3file_path, validate_properties=True, autocommit=True): + self.validate_properties = validate_properties + self.autocommit = autocommit + + # Make the prefix directory if it doesn't exist. 
+ os.makedirs(os.path.dirname(db3file_path), exist_ok=True) + + self.connection = sqlite3.connect(db3file_path) + cur = self.connection.cursor() + cur.execute("CREATE TABLE IF NOT EXISTS properties (curie TEXT, property TEXT, value TEXT, description TEXT) ;") + # Create a UNIQUE index on the property values -- this means that if someone tries to set the same value for + # a property either duplicatively or from another source, we simply ignore it. + cur.execute("CREATE UNIQUE INDEX IF NOT EXISTS properties_propvalues ON properties (curie, property, value);") + self.connection.commit() + + def __exit__(self, exc_type, exc_val, exc_tb): + self.connection.close() + + def commit(self): + self.connection.commit() + + def query(self, sql, params=None): + cursor = self.connection.cursor() + return cursor.execute(sql, params) + + def get_by_curie(self, curie) -> list[PropertyValue]: + results = self.query("SELECT curie, property, value, source FROM properties WHERE curie=:curie", params={ + "curie": curie, + }) + return [PropertyValue(result[0], result[1], result[2], result[3]) for result in results] + + def get_all(self) -> list[PropertyValue]: + results = self.query("SELECT curie, property, value, source FROM properties") + return [PropertyValue(result[0], result[1], result[2], result[3]) for result in results] + + def insert_all(self, pvs): + cursor = self.connection.cursor() + data = [] + for pv in pvs: + if self.validate_properties and pv.property not in supported_properties: + raise ValueError(f"Unable to insert_all({pvs}): unsupported property {pv.property} in {pv}.") + data.append({ + "curie": pv.curie, + "property": supported_properties[pv.property], + "value": pv.value, + "source": pv.source, + }) + cursor.executemany("INSERT OR IGNORE INTO properties VALUES (:curie, :property, :value, :source)", data) + self.connection.commit() diff --git a/src/snakefiles/chemical.snakefile b/src/snakefiles/chemical.snakefile index 34086295..89754cb8 100644 --- a/src/snakefiles/chemical.snakefile +++ b/src/snakefiles/chemical.snakefile @@ -188,7 +188,7 @@ rule get_chebi_concord: dbx=config['download_directory']+'/CHEBI/database_accession.tsv' output: outfile=config['intermediate_directory']+'/chemicals/concords/CHEBI', - propfile=config['intermediate_directory']+'/chemicals/properties/CHEBI' + propfile=config['intermediate_directory']+'/chemicals/properties.sqlite' run: chemicals.make_chebi_relations(input.sdf,input.dbx,output.outfile,output.propfile) From 255ee4667e8f4245da31e3f8893bf344f3ce3479 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Sun, 3 Aug 2025 01:00:39 -0400 Subject: [PATCH 060/167] Got property store working. 
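The underlying fix, per the diffs below: an SDF entry can carry a tag more than once (e.g. several secondary ChEBI IDs), so the reader now collects a list of values per key instead of concatenating them into one string, and callers iterate over those lists. Roughly the idea, as a simplified sketch rather than the actual sdfreader code:

    from collections import defaultdict

    def parse_sdf_block(lines):
        """Collect repeated SDF tag values into per-key lists."""
        values = defaultdict(list)
        current_key = None
        for line in lines:
            line = line.strip()
            if line.startswith('>'):
                # '> <Secondary ChEBI ID>' -> 'secondarychebiid'
                current_key = line.strip('<> ').lower().replace(' ', '')
            elif line and current_key:
                values[current_key].append(line)
        return dict(values)

    block = [
        "> <Secondary ChEBI ID>", "CHEBI:11111", "",
        "> <Secondary ChEBI ID>", "CHEBI:22222", "",
    ]
    print(parse_sdf_block(block))
    # {'secondarychebiid': ['CHEBI:11111', 'CHEBI:22222']}
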
--- src/createcompendia/chemicals.py | 18 +++++++++++------- src/properties.py | 8 ++++---- src/sdfreader.py | 6 +++--- 3 files changed, 18 insertions(+), 14 deletions(-) diff --git a/src/createcompendia/chemicals.py b/src/createcompendia/chemicals.py index b5ac0a13..3a33231a 100644 --- a/src/createcompendia/chemicals.py +++ b/src/createcompendia/chemicals.py @@ -460,17 +460,21 @@ def make_chebi_relations(sdf,dbx,outfile,propfile): #Write SDF structured things for cid,props in chebi_sdf_dat.items(): if secondary_chebi_id in props: - properties.append(PropertyValue( - curie = cid, - property = HAS_ADDITIONAL_ID, - value = props[secondary_chebi_id], - description = 'Listed as a CHEBI secondard ID in the ChEBI SDF file' - )) + secondary_ids = props[secondary_chebi_id] + for secondary_id in secondary_ids: + properties.append(PropertyValue( + curie = cid, + property = HAS_ADDITIONAL_ID, + value = secondary_id, + description = 'Listed as a CHEBI secondary ID in the ChEBI SDF file' + )) if kk in props: outf.write(f'{cid}\txref\t{KEGGCOMPOUND}:{props[kk]}\n') if pk in props: #Apparently there's a lot of structure here? - v = props[pk] + database_links = props[pk] + # To simulate previous behavior, I'll concatenate previous values together. + v = "".join(database_links) parts = v.split('SID: ') for p in parts: if 'CID' in p: diff --git a/src/properties.py b/src/properties.py index 4e9d45e6..56ed6f55 100644 --- a/src/properties.py +++ b/src/properties.py @@ -58,13 +58,13 @@ def query(self, sql, params=None): return cursor.execute(sql, params) def get_by_curie(self, curie) -> list[PropertyValue]: - results = self.query("SELECT curie, property, value, source FROM properties WHERE curie=:curie", params={ + results = self.query("SELECT curie, property, value, description FROM properties WHERE curie=:curie", params={ "curie": curie, }) return [PropertyValue(result[0], result[1], result[2], result[3]) for result in results] def get_all(self) -> list[PropertyValue]: - results = self.query("SELECT curie, property, value, source FROM properties") + results = self.query("SELECT curie, property, value, description FROM properties") return [PropertyValue(result[0], result[1], result[2], result[3]) for result in results] def insert_all(self, pvs): @@ -77,7 +77,7 @@ def insert_all(self, pvs): "curie": pv.curie, "property": supported_properties[pv.property], "value": pv.value, - "source": pv.source, + "description": pv.description, }) - cursor.executemany("INSERT OR IGNORE INTO properties VALUES (:curie, :property, :value, :source)", data) + cursor.executemany("INSERT OR IGNORE INTO properties VALUES (:curie, :property, :value, :description)", data) self.connection.commit() diff --git a/src/sdfreader.py b/src/sdfreader.py index c56b8592..8836ff10 100644 --- a/src/sdfreader.py +++ b/src/sdfreader.py @@ -30,10 +30,10 @@ def chebi_sdf_entry_to_dict(sdf_chunk, interesting_keys = {}): current_key = line.replace('>','').replace('<','').strip().replace(' ', '').lower() current_key = 'formula' if current_key == 'formulae' else current_key if current_key in interesting_keys: - final_dict[interesting_keys[current_key]] = '' + final_dict[interesting_keys[current_key]] = [] continue if current_key == 'chebiid': chebi_id = line if current_key in interesting_keys: - final_dict[interesting_keys[current_key]] += line - return (chebi_id, final_dict) \ No newline at end of file + final_dict[interesting_keys[current_key]].append(line) + return (chebi_id, final_dict) From 83c0a7ca30be32062817f2f7c2c195f9397f08d0 Mon Sep 17 00:00:00 
2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Mon, 4 Aug 2025 02:39:35 -0400 Subject: [PATCH 061/167] Turned exception into warning, strip() line before splitting. --- src/metadata/provenance.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/metadata/provenance.py b/src/metadata/provenance.py index 4cd87f8c..4e83db98 100644 --- a/src/metadata/provenance.py +++ b/src/metadata/provenance.py @@ -1,3 +1,4 @@ +import logging import os.path from collections import defaultdict from datetime import datetime @@ -21,9 +22,11 @@ def write_concord_metadata(filename, *, name, concord_filename, url='', descript curie_prefix_counts = defaultdict(int) with open(concord_filename, 'r') as concordf: for line in concordf: - row = line.split('\t') + row = line.strip().split('\t') if len(row) != 3: - raise ValueError(f"Concord file {concord_filename} has a line with {len(row)} columns, not 3: {line}") + logging.warning(f"Concord file {concord_filename} has a line with {len(row)} columns, not 3 -- skipping: '{line}'") + # raise ValueError(f"Concord file {concord_filename} has a line with {len(row)} columns, not 3: {line}") + continue curie1 = row[0] predicate = row[1] curie2 = row[2] From 2afcbf77e4d8735cf0d6eed7d7b520b8a715150a Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Mon, 4 Aug 2025 02:44:19 -0400 Subject: [PATCH 062/167] Fixed bug in Snakemake file: dir() -> directory(). --- src/snakefiles/datacollect.snakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/snakefiles/datacollect.snakefile b/src/snakefiles/datacollect.snakefile index 81704f30..1340086f 100644 --- a/src/snakefiles/datacollect.snakefile +++ b/src/snakefiles/datacollect.snakefile @@ -245,7 +245,7 @@ rule get_ncbigene_labels_synonyms_and_taxa: rule get_ensembl: output: - ensembl_dir=dir(config['download_directory']+'/ENSEMBL'), + ensembl_dir=directory(config['download_directory']+'/ENSEMBL'), complete_file=config['download_directory']+'/ENSEMBL/BioMartDownloadComplete' run: ensembl.pull_ensembl(output.ensembl_dir, output.complete_file) From ce263ec194a92052c028557fae0ab69c228704e7 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Mon, 4 Aug 2025 02:55:28 -0400 Subject: [PATCH 063/167] Setting PubChem Input encoding to ISO-8859. This was indicated by `file` and by an encoding error we ran into loading this file: > UnicodeDecodeError in file "/code/babel/src/snakefiles/chemical.snakefile", line 173: > 'utf-8' codec can't decode byte 0xe8 in position 2252: invalid continuation byte > File "/code/babel/src/snakefiles/chemical.snakefile", line 173, in __rule_get_chemical_pubchem_mesh_concord > File "/code/babel/src/createcompendia/chemicals.py", line 397, in make_pubchem_mesh_concord > File "<frozen codecs>", line 322, in decode --- src/createcompendia/chemicals.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/createcompendia/chemicals.py b/src/createcompendia/chemicals.py index 574fd1b7..0943ff61 100644 --- a/src/createcompendia/chemicals.py +++ b/src/createcompendia/chemicals.py @@ -393,7 +393,7 @@ def make_pubchem_mesh_concord(pubcheminput,meshlabels,outfile, metadata_yaml): # first mapping is the 'best' i.e. the one most frequently reported. 
# We will only use the first one used_pubchem = set() - with open(pubcheminput,'r') as inf, open(outfile,'w') as outf: + with open(pubcheminput,'r', encoding='ISO-8859') as inf, open(outfile,'w') as outf: for line in inf: x = line.strip().split('\t') # x[0] = puchemid (no prefix), x[1] = mesh label if x[0] in used_pubchem: From 686c215a846cfb8c340d2bc885a61b81dd52b1c0 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@renci.org> Date: Mon, 4 Aug 2025 18:46:51 -0400 Subject: [PATCH 064/167] Corrected encoding name. --- src/createcompendia/chemicals.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/createcompendia/chemicals.py b/src/createcompendia/chemicals.py index 0943ff61..cac5e986 100644 --- a/src/createcompendia/chemicals.py +++ b/src/createcompendia/chemicals.py @@ -393,7 +393,7 @@ def make_pubchem_mesh_concord(pubcheminput,meshlabels,outfile, metadata_yaml): # first mapping is the 'best' i.e. the one most frequently reported. # We will only use the first one used_pubchem = set() - with open(pubcheminput,'r', encoding='ISO-8859') as inf, open(outfile,'w') as outf: + with open(pubcheminput,'r', encoding='ISO-8859-1') as inf, open(outfile,'w') as outf: for line in inf: x = line.strip().split('\t') # x[0] = puchemid (no prefix), x[1] = mesh label if x[0] in used_pubchem: From b9a55d92309b88c4a09892aeb413acba63793411 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Thu, 7 Aug 2025 02:14:37 -0400 Subject: [PATCH 065/167] Rewrote properties in a way that made more sense. --- src/properties.py | 208 +++++++++++++++++++++++++++++++--------------- 1 file changed, 141 insertions(+), 67 deletions(-) diff --git a/src/properties.py b/src/properties.py index 56ed6f55..623e745e 100644 --- a/src/properties.py +++ b/src/properties.py @@ -1,83 +1,157 @@ # # properties.py: handle node- and clique-level properties for Babel. # -# It would be great if we could get all Babel properties (labels, synonyms, etc.) stored in the same database -# store, but that appears to be impractical given how long it takes to write into a database. So we'll leave -# labels, synonyms and descriptions working in the current system, and start putting new properties (starting with -# hasAdditionalId) into this database. I'd love to get descriptions moved in here as well. +# Property files are JSONL files that can be read into and out of the Property dataclass. +# So writing them is easy: you just add each property on its own line, and if you go through +# Property(...).to_json_line() we can even validate it for you (eventually). # -import os -from contextlib import AbstractContextManager +# We generally need to read multiple properties files so you can run queries over all of them, which you can do by +# using the PropertyList class. +# +import gzip +import json +from collections import defaultdict from dataclasses import dataclass -import sqlite3 +# +# SUPPORTED PROPERTIES +# -from src.babel_utils import make_local_name +# HAS_ADDITIONAL_ID indicates +# - Used by write_compendia() to +HAS_ADDITIONAL_ID = 'http://www.geneontology.org/formats/oboInOwl#hasAlternativeId' -# Properties currently supported in the property store: +# Properties currently supported in the property store in one set for validation. supported_properties = { - 'hasAdditionalId': 'http://www.geneontology.org/formats/oboInOwl#hasAlternativeId', + HAS_ADDITIONAL_ID, } -HAS_ADDITIONAL_ID = 'hasAdditionalId' +# +# The Property dataclass can be used to encapsulate a property for a CURIE. 
It has helper code to read +# and write these properties. +# -# A single property value. @dataclass -class PropertyValue: +class Property: + """ + A property value for a CURIE. + """ + curie: str property: str value: str - description: str - - -# A property store for a properties file. -class PropertyStore(AbstractContextManager): - def __init__(self, db3file_path, validate_properties=True, autocommit=True): - self.validate_properties = validate_properties - self.autocommit = autocommit - - # Make the prefix directory if it doesn't exist. - os.makedirs(os.path.dirname(db3file_path), exist_ok=True) - - self.connection = sqlite3.connect(db3file_path) - cur = self.connection.cursor() - cur.execute("CREATE TABLE IF NOT EXISTS properties (curie TEXT, property TEXT, value TEXT, description TEXT) ;") - # Create a UNIQUE index on the property values -- this means that if someone tries to set the same value for - # a property either duplicatively or from another source, we simply ignore it. - cur.execute("CREATE UNIQUE INDEX IF NOT EXISTS properties_propvalues ON properties (curie, property, value);") - self.connection.commit() - - def __exit__(self, exc_type, exc_val, exc_tb): - self.connection.close() - - def commit(self): - self.connection.commit() - - def query(self, sql, params=None): - cursor = self.connection.cursor() - return cursor.execute(sql, params) - - def get_by_curie(self, curie) -> list[PropertyValue]: - results = self.query("SELECT curie, property, value, description FROM properties WHERE curie=:curie", params={ - "curie": curie, - }) - return [PropertyValue(result[0], result[1], result[2], result[3]) for result in results] - - def get_all(self) -> list[PropertyValue]: - results = self.query("SELECT curie, property, value, description FROM properties") - return [PropertyValue(result[0], result[1], result[2], result[3]) for result in results] - - def insert_all(self, pvs): - cursor = self.connection.cursor() - data = [] - for pv in pvs: - if self.validate_properties and pv.property not in supported_properties: - raise ValueError(f"Unable to insert_all({pvs}): unsupported property {pv.property} in {pv}.") - data.append({ - "curie": pv.curie, - "property": supported_properties[pv.property], - "value": pv.value, - "description": pv.description, - }) - cursor.executemany("INSERT OR IGNORE INTO properties VALUES (:curie, :property, :value, :description)", data) - self.connection.commit() + source: str + + @staticmethod + def valid_keys(): + return ['curie', 'property', 'value', 'source'] + + def __post_init__(self): + """ + Make sure this Property makes sense. + """ + if self.property not in supported_properties: + raise ValueError(f'Property {self.property} is not supported (supported properties: {supported_properties})') + + @staticmethod + def from_dict(prop): + """ + Read this dictionary into a Property. + + :return: A Property version of this JSON line. + """ + + # Check if this dictionary includes keys that aren't valid in a Property. + unexpected_keys = prop.keys() - Property.valid_keys() + if len(unexpected_keys) > 0: + raise ValueError(f'Unexpected keys in dictionary to be converted to Property ({unexpected_keys}): {json.dumps(prop, sort_keys=True, indent=2)}') + + return Property(**prop) + + # TODO: we should have some validation code in here so people don't make nonsense properties, which means + # validating both the property and the value. 
+ + def to_json_line(self): + """ + Returns this property as a JSONL line, including the final newline (so you can write it directly to a file). + + :return: A string containing the JSONL line of this property. + """ + return json.dumps({ + 'curie': self.curie, + 'property': self.property, + 'value': self.value, + 'source': self.source, + }) + '\n' + +# +# The PropertyList object can be used to load and query properties from multiple sources. +# +# We could write them into a DuckDB file as we load them so they can overflow onto disk as needed, but that's overkill +# for right now, so we'll just load them all into memory. +# + +class PropertyList: + """ + This class can be used to load multiple property files for simultaneous querying. + + In order to support the existing property files, we will additionally support the two main alternate formats we use: + - A three column TSV file, with columns: CURIE, property, value + - A four column TSV file, with columns: CURIE, property, value, source + + But eventually all of those files will be subsumed into JSONL files. + """ + + def __init__(self): + """ + Create a new PropertyList object. + + Since most of our queries will be CURIE-based, we'll index properties by CURIE, but we'll also keep + a set of all properties. + """ + self._properties = set[Property]() + self._properties_by_curie = defaultdict(set[Property]) + + @property + def properties(self) -> set[Property]: + return self._properties + + def __getitem__(self, curie: str) -> set[Property]: + """ + Get all properties for a given CURIE. + + :param curie: The CURIE to look up properties. + :return: The set of properties for this CURIE. + """ + return self._properties_by_curie[curie] + + def add_properties(self, props: set[Property]): + """ + Add a set of Property values to the list. + + :param props: A set of Property values. + :return: The number of unique properties added. + """ + + props_to_be_added = (props - self._properties) + + self._properties.update(props) + for prop in props: + self._properties_by_curie[prop.curie].add(prop) + + return len(props_to_be_added) + + def add_properties_jsonl_gz(self, filename_gz: str): + """ + Add all the properties in a JSONL Gzipped file. + + :param filename_gz: The properties JSONL Gzipped filename to load. + :return: The number of unique properties loaded. + """ + + props_to_add = set[Property]() + with gzip.open(filename_gz, 'rt') as f: + for line in f: + props_to_add.add(Property.from_dict(json.loads(line))) + + return self.add_properties(props_to_add) From fb50c2bfb5a364316cd8c1394b704c474f67ed04 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Thu, 7 Aug 2025 02:44:20 -0400 Subject: [PATCH 066/167] Tweaked the data model further. --- src/properties.py | 46 ++++++++++++++++++++++++++++++---------------- 1 file changed, 30 insertions(+), 16 deletions(-) diff --git a/src/properties.py b/src/properties.py index 623e745e..15fd2be3 100644 --- a/src/properties.py +++ b/src/properties.py @@ -11,7 +11,7 @@ import gzip import json from collections import defaultdict -from dataclasses import dataclass +from dataclasses import dataclass, field # # SUPPORTED PROPERTIES @@ -22,7 +22,7 @@ HAS_ADDITIONAL_ID = 'http://www.geneontology.org/formats/oboInOwl#hasAlternativeId' # Properties currently supported in the property store in one set for validation. 
-supported_properties = { +supported_predicates = { HAS_ADDITIONAL_ID, } @@ -38,35 +38,42 @@ class Property: """ curie: str - property: str + predicate: str value: str - source: str + sources: list[str] = field(default_factory=list[str]) @staticmethod def valid_keys(): - return ['curie', 'property', 'value', 'source'] + return ['curie', 'predicate', 'value', 'sources'] def __post_init__(self): """ Make sure this Property makes sense. """ - if self.property not in supported_properties: - raise ValueError(f'Property {self.property} is not supported (supported properties: {supported_properties})') + if self.predicate not in supported_predicates: + raise ValueError(f'Predicate {self.predicate} is not supported (supported predicates: {supported_predicates})') @staticmethod - def from_dict(prop): + def from_dict(prop_dict, source=None): """ Read this dictionary into a Property. + :param prop_dict: A dictionary containing the property values. + :param source: The source of this property, if any. :return: A Property version of this JSON line. """ # Check if this dictionary includes keys that aren't valid in a Property. - unexpected_keys = prop.keys() - Property.valid_keys() + unexpected_keys = prop_dict.keys() - Property.valid_keys() if len(unexpected_keys) > 0: - raise ValueError(f'Unexpected keys in dictionary to be converted to Property ({unexpected_keys}): {json.dumps(prop, sort_keys=True, indent=2)}') + raise ValueError(f'Unexpected keys in dictionary to be converted to Property ({unexpected_keys}): {json.dumps(prop_dict, sort_keys=True, indent=2)}') - return Property(**prop) + prop = Property(**prop_dict) + if source is not None: + # Add the source to the end of the sources list. + prop.sources.append(source) + + return prop # TODO: we should have some validation code in here so people don't make nonsense properties, which means # validating both the property and the value. @@ -79,9 +86,9 @@ def to_json_line(self): """ return json.dumps({ 'curie': self.curie, - 'property': self.property, + 'predicate': self.predicate, 'value': self.value, - 'source': self.source, + 'sources': self.sources, }) + '\n' # @@ -116,20 +123,27 @@ def __init__(self): def properties(self) -> set[Property]: return self._properties - def __getitem__(self, curie: str) -> set[Property]: + def get_all(self, curie: str, predicate: str = None) -> set[Property]: """ Get all properties for a given CURIE. :param curie: The CURIE to look up properties. + :param predicate: If specified, only return properties with this predicate. :return: The set of properties for this CURIE. """ - return self._properties_by_curie[curie] + props = self._properties_by_curie[curie] + + if predicate not in supported_predicates: + raise ValueError(f'Predicate {predicate} is not supported (supported predicates: {supported_predicates})') + + return set(filter(lambda p: p.predicate == predicate, props)) def add_properties(self, props: set[Property]): """ Add a set of Property values to the list. :param props: A set of Property values. + :param source: The source of these properties, if any. :return: The number of unique properties added. 
""" @@ -152,6 +166,6 @@ def add_properties_jsonl_gz(self, filename_gz: str): props_to_add = set[Property]() with gzip.open(filename_gz, 'rt') as f: for line in f: - props_to_add.add(Property.from_dict(json.loads(line))) + props_to_add.add(Property.from_dict(json.loads(line), source=filename_gz)) return self.add_properties(props_to_add) From 29c3fd28045a4945b299c23a073365de8eaedf3d Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Thu, 7 Aug 2025 02:45:27 -0400 Subject: [PATCH 067/167] Updated usage of chemical properties to match the new format. --- src/createcompendia/chemicals.py | 31 ++++++++++--------------------- src/snakefiles/chemical.snakefile | 8 ++++++-- 2 files changed, 16 insertions(+), 23 deletions(-) diff --git a/src/createcompendia/chemicals.py b/src/createcompendia/chemicals.py index 5bfb302c..120f19ec 100644 --- a/src/createcompendia/chemicals.py +++ b/src/createcompendia/chemicals.py @@ -1,17 +1,12 @@ -import json import logging from collections import defaultdict -from datetime import datetime import jsonlines import requests import ast import gzip -from gzip import GzipFile - -from src.properties import PropertyStore, PropertyValue, HAS_ADDITIONAL_ID -import yaml +from src.properties import Property, HAS_ADDITIONAL_ID from src.metadata.provenance import write_concord_metadata, write_combined_metadata from src.ubergraph import UberGraph from src.prefixes import MESH, CHEBI, UNII, DRUGBANK, INCHIKEY, PUBCHEMCOMPOUND,GTOPDB, KEGGCOMPOUND, DRUGCENTRAL, CHEMBLCOMPOUND, UMLS, RXCUI @@ -471,7 +466,7 @@ def make_gtopdb_relations(infile,outfile, metadata_yaml): concord_filename=outfile, ) -def make_chebi_relations(sdf,dbx,outfile,metadata_yaml): +def make_chebi_relations(sdf,dbx,outfile,propfile_gz,metadata_yaml): """CHEBI contains relations both about chemicals with and without inchikeys. You might think that because everything is based on unichem, we could avoid the with structures part, but history has shown that we lose links in that case, so we will use both the structured and unstructured chemical entries.""" @@ -488,20 +483,18 @@ def make_chebi_relations(sdf,dbx,outfile,metadata_yaml): kk = 'keggcompounddatabaselinks' pk = 'pubchemdatabaselinks' secondary_chebi_id = 'secondarychebiid' - with open(outfile,'w') as outf, PropertyStore(propfile) as propstore: - properties = [] - + with open(outfile,'w') as outf, gzip.open(propfile_gz, 'wt') as propf: #Write SDF structured things for cid,props in chebi_sdf_dat.items(): if secondary_chebi_id in props: secondary_ids = props[secondary_chebi_id] for secondary_id in secondary_ids: - properties.append(PropertyValue( + propf.write(Property( curie = cid, - property = HAS_ADDITIONAL_ID, + predicate = HAS_ADDITIONAL_ID, value = secondary_id, - description = 'Listed as a CHEBI secondary ID in the ChEBI SDF file' - )) + source = f'Listed as a CHEBI secondary ID in the ChEBI SDF file ({sdf})' + ).to_json_line()) if kk in props: outf.write(f'{cid}\txref\t{KEGGCOMPOUND}:{props[kk]}\n') if pk in props: @@ -529,10 +522,6 @@ def make_chebi_relations(sdf,dbx,outfile,metadata_yaml): if x[3] == 'Pubchem accession': outf.write(f'{cid}\txref\t{PUBCHEMCOMPOUND}:{x[4]}\n') - # Write out the properties. 
- propstore.insert_all(properties) - - write_concord_metadata( metadata_yaml, name='make_chebi_relations()', @@ -672,7 +661,7 @@ def build_untyped_compendia(concordances, identifiers,unichem_partial, untyped_c combined_from_filenames=input_metadata_yamls, ) -def build_compendia(type_file, untyped_compendia_file, metadata_yamls, icrdf_filename): +def build_compendia(type_file, untyped_compendia_file, properties_jsonl_gz_files, metadata_yamls, icrdf_filename): types = {} with open(type_file,'r') as inf: for line in inf: @@ -687,9 +676,9 @@ def build_compendia(type_file, untyped_compendia_file, metadata_yamls, icrdf_fil for biotype, sets in typed_sets.items(): baretype = biotype.split(':')[-1] if biotype == DRUG: - write_compendium(metadata_yamls, sets, f'{baretype}.txt', biotype, {}, extra_prefixes=[MESH,UNII], icrdf_filename=icrdf_filename) + write_compendium(metadata_yamls, sets, f'{baretype}.txt', biotype, {}, extra_prefixes=[MESH,UNII], icrdf_filename=icrdf_filename, properties_jsonl_gz_files=properties_jsonl_gz_files) else: - write_compendium(metadata_yamls, sets, f'{baretype}.txt', biotype, {}, extra_prefixes=[RXCUI], icrdf_filename=icrdf_filename) + write_compendium(metadata_yamls, sets, f'{baretype}.txt', biotype, {}, extra_prefixes=[RXCUI], icrdf_filename=icrdf_filename, properties_jsonl_gz_files=properties_jsonl_gz_files) def create_typed_sets(eqsets, types): """ diff --git a/src/snakefiles/chemical.snakefile b/src/snakefiles/chemical.snakefile index f2c7ffac..2ad48b70 100644 --- a/src/snakefiles/chemical.snakefile +++ b/src/snakefiles/chemical.snakefile @@ -197,9 +197,10 @@ rule get_chebi_concord: dbx=config['download_directory']+'/CHEBI/database_accession.tsv' output: outfile=config['intermediate_directory']+'/chemicals/concords/CHEBI', + propfile=config['intermediate_directory']+'/chemicals/properties/get_chebi_concord.json.gz', metadata_yaml=config['intermediate_directory']+'/chemicals/concords/metadata-CHEBI.yaml' run: - chemicals.make_chebi_relations(input.sdf,input.dbx,output.outfile, output.metadata_yaml) + chemicals.make_chebi_relations(input.sdf,input.dbx,output.outfile,propfile_gz=output.propfile,metadata_yaml=output.metadata_yaml) rule chemical_unichem_concordia: input: @@ -230,13 +231,16 @@ rule chemical_compendia: typesfile = config['intermediate_directory'] + '/chemicals/partials/types', untyped_file = config['intermediate_directory'] + '/chemicals/partials/untyped_compendium', metadata_yamls = config['intermediate_directory'] + '/chemicals/partials/metadata-untyped_compendium.yaml', + properties_jsonl_gz = [ + config['intermediate_directory'] + '/chemicals/properties/get_chebi_concord.jsonl.gz' + ], icrdf_filename = config['download_directory'] + '/icRDF.tsv', output: expand("{od}/compendia/{ap}", od = config['output_directory'], ap = config['chemical_outputs']), temp(expand("{od}/synonyms/{ap}", od = config['output_directory'], ap = config['chemical_outputs'])), expand("{od}/metadata/{ap}.yaml", od = config['output_directory'], ap = config['chemical_outputs']), run: - chemicals.build_compendia(input.typesfile, input.untyped_file, [input.metadata_yamls], input.icrdf_filename) + chemicals.build_compendia(input.typesfile, input.untyped_file, input.properties_jsonl_gz, input.icrdf_filename) rule check_chemical_completeness: input: From b921ec0211161c4a1e961a2b048d5140822b8866 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Thu, 7 Aug 2025 02:46:14 -0400 Subject: [PATCH 068/167] First stab at incorporating HAS_ADDITIONAL_ID into 
write_compendium(). --- src/babel_utils.py | 31 ++++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/src/babel_utils.py b/src/babel_utils.py index 59a53609..e22dc2d3 100644 --- a/src/babel_utils.py +++ b/src/babel_utils.py @@ -16,6 +16,7 @@ from src.metadata.provenance import write_combined_metadata from src.node import NodeFactory, SynonymFactory, DescriptionFactory, InformationContentFactory, TaxonFactory +from src.properties import PropertyList, HAS_ADDITIONAL_ID from src.util import Text, get_config from src.LabeledID import LabeledID from collections import defaultdict @@ -352,7 +353,7 @@ def get_numerical_curie_suffix(curie): return None -def write_compendium(metadata_yamls, synonym_list, ofname, node_type, labels={}, extra_prefixes=[], icrdf_filename=None): +def write_compendium(metadata_yamls, synonym_list, ofname, node_type, labels=None, extra_prefixes=None, icrdf_filename=None, properties_jsonl_gz_files=None): """ :param metadata_yaml: The YAML files containing the metadata for this compendium. :param synonym_list: @@ -367,8 +368,13 @@ def write_compendium(metadata_yamls, synonym_list, ofname, node_type, labels={}, write_compendium() will throw a RuntimeError if it is not specified. This is to ensure that it has been properly specified as a prerequisite in a Snakemake file, so that write_compendium() is not run until after icRDF.tsv has been generated. + :param properties_files: (OPTIONAL) A list of SQLite3 files containing properties to be added to the output. :return: """ + if extra_prefixes is None: + extra_prefixes = [] + if labels is None: + labels = {} config = get_config() cdir = config['output_directory'] biolink_version = config['biolink_version'] @@ -393,6 +399,14 @@ def write_compendium(metadata_yamls, synonym_list, ofname, node_type, labels={}, os.makedirs(os.path.join(cdir, 'compendia'), exist_ok=True) os.makedirs(os.path.join(cdir, 'synonyms'), exist_ok=True) + # Load all the properties. + property_list = PropertyList() + if properties_jsonl_gz_files: + for properties_jsonl_gz_file in properties_jsonl_gz_files: + property_list.add_properties_jsonl_gz(properties_jsonl_gz_file) + + property_source_count = defaultdict(int) + # Counts. count_cliques = 0 count_eq_ids = 0 @@ -401,6 +415,20 @@ def write_compendium(metadata_yamls, synonym_list, ofname, node_type, labels={}, # Write compendium and synonym files. with jsonlines.open(os.path.join(cdir,'compendia',ofname),'w') as outf, jsonlines.open(os.path.join(cdir,'synonyms',ofname),'w') as sfile: for slist in synonym_list: + # At this point, we insert any HAS_ADDITIONAL_ID properties we have. + # The logic we use is: we insert all additional IDs for a CURIE *AFTER* that CURIE, in a random order, as long + # as the additional CURIE is not already in the list of CURIEs. 
+ identifier_list = [] + for iid in slist: + identifier_list.append(iid) + additional_curies = property_list.get_all(iid, HAS_ADDITIONAL_ID) + if additional_curies: + for ac in additional_curies: + if ac.curie not in slist: + identifier_list.append(ac.curie) + for source in ac.sources: + property_source_count[source] += 1 + node = node_factory.create_node(input_identifiers=slist, node_type=node_type,labels = labels, extra_prefixes = extra_prefixes) if node is not None: count_cliques += 1 @@ -568,6 +596,7 @@ def write_compendium(metadata_yamls, synonym_list, ofname, node_type, labels={}, 'cliques': count_cliques, 'eq_ids': count_eq_ids, 'synonyms': count_synonyms, + 'property_sources': property_source_count, }, combined_from_filenames=metadata_yamls, ) From f322cb2066be94a488561b261c2da308d1d395ef Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Thu, 7 Aug 2025 02:46:55 -0400 Subject: [PATCH 069/167] General code improvements. --- config.yaml | 6 ++++++ src/node.py | 7 +++---- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/config.yaml b/config.yaml index 9ac10e53..1c6621b3 100644 --- a/config.yaml +++ b/config.yaml @@ -1,13 +1,19 @@ +# Overall inputs and outputs. input_directory: input_data download_directory: babel_downloads intermediate_directory: babel_outputs/intermediate output_directory: babel_outputs +# Versions that need to be updated on every release. biolink_version: "4.2.6-rc5" umls_version: "2025AA" rxnorm_version: "07072025" drugbank_version: "5-1-13" +# +# The rest of these configs need to be cleaned up. +# + UMLS_UniProtKB_download_raw_url: "https://raw.githubusercontent.com/cbizon/UMLS_UniProtKB/refs/heads/main/outputs/UMLS_UniProtKB.tsv" ncbi_files: diff --git a/src/node.py b/src/node.py index 25f483a8..9c40e147 100644 --- a/src/node.py +++ b/src/node.py @@ -412,8 +412,7 @@ def apply_labels(self, input_identifiers, labels): labeled_list = [] for iid in input_identifiers: if isinstance(iid,LabeledID): - print('LabeledID dont belong here, pass in labels seperately',iid) - exit() + raise ValueError(f"LabeledID don't belong here ({iid}), pass in labels separately.") if iid in labels: labeled_list.append( LabeledID(identifier=iid, label = labels[iid])) else: @@ -442,7 +441,7 @@ def create_node(self,input_identifiers,node_type,labels={},extra_prefixes=[]): if len(input_identifiers) == 0: return None if len(input_identifiers) > 1000: - print(f'this seems like a lot of input_identifiers in node.create_node() [{len(input_identifiers)}]: {input_identifiers}') + logging.warning(f'this seems like a lot of input_identifiers in node.create_node() [{len(input_identifiers)}]: {input_identifiers}') cleaned = self.apply_labels(input_identifiers,labels) try: idmap = defaultdict(list) @@ -457,7 +456,7 @@ def create_node(self,input_identifiers,node_type,labels={},extra_prefixes=[]): print(type(i)) print(Text.get_curie(i)) print(Text.get_curie(i).upper()) - exit() + raise ValueError('something very bad') identifiers = [] accepted_ids = set() #Converting identifiers from LabeledID to dicts From ba0f4ff2b028fec5bb0a01591318e2dd81ba14c2 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Thu, 7 Aug 2025 02:48:42 -0400 Subject: [PATCH 070/167] Fixed calling ChEBI identifiers. 
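The one-keyword bug being fixed below: Property (src/properties.py) declares a sources field (a list), so the earlier call with source= fails as soon as a ChEBI secondary ID is written out. A simplified stand-in that shows the failure mode, not the real class:

    from dataclasses import dataclass, field

    @dataclass
    class Property:          # stand-in only; the real class also validates the predicate
        curie: str
        predicate: str
        value: str
        sources: list[str] = field(default_factory=list)

    try:
        Property(curie="CHEBI:1", predicate="p", value="v", source="oops")
    except TypeError as e:
        print(e)  # unexpected keyword argument 'source' -- hence sources=[...] in the fix
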
--- src/createcompendia/chemicals.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/createcompendia/chemicals.py b/src/createcompendia/chemicals.py index 120f19ec..e1507ed1 100644 --- a/src/createcompendia/chemicals.py +++ b/src/createcompendia/chemicals.py @@ -493,7 +493,7 @@ def make_chebi_relations(sdf,dbx,outfile,propfile_gz,metadata_yaml): curie = cid, predicate = HAS_ADDITIONAL_ID, value = secondary_id, - source = f'Listed as a CHEBI secondary ID in the ChEBI SDF file ({sdf})' + sources = [f'Listed as a CHEBI secondary ID in the ChEBI SDF file ({sdf})'] ).to_json_line()) if kk in props: outf.write(f'{cid}\txref\t{KEGGCOMPOUND}:{props[kk]}\n') From 22ea646d899983d4bde4b4f5d501cacb4ae9e8e7 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Thu, 7 Aug 2025 02:51:48 -0400 Subject: [PATCH 071/167] Added support for creating the properties directory. --- src/createcompendia/chemicals.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/createcompendia/chemicals.py b/src/createcompendia/chemicals.py index e1507ed1..ea422990 100644 --- a/src/createcompendia/chemicals.py +++ b/src/createcompendia/chemicals.py @@ -1,5 +1,7 @@ import logging +import os from collections import defaultdict +from os.path import dirname import jsonlines import requests @@ -483,6 +485,10 @@ def make_chebi_relations(sdf,dbx,outfile,propfile_gz,metadata_yaml): kk = 'keggcompounddatabaselinks' pk = 'pubchemdatabaselinks' secondary_chebi_id = 'secondarychebiid' + + # What if we don't have a propfile directory? + os.makedirs(dirname(propfile_gz), exist_ok=True) + with open(outfile,'w') as outf, gzip.open(propfile_gz, 'wt') as propf: #Write SDF structured things for cid,props in chebi_sdf_dat.items(): From 57b9bd28791e9aa026beacace573b93fb4442de7 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Thu, 7 Aug 2025 02:52:32 -0400 Subject: [PATCH 072/167] Fixed typo in filename. --- src/snakefiles/chemical.snakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/snakefiles/chemical.snakefile b/src/snakefiles/chemical.snakefile index 2ad48b70..4cecbcd4 100644 --- a/src/snakefiles/chemical.snakefile +++ b/src/snakefiles/chemical.snakefile @@ -197,7 +197,7 @@ rule get_chebi_concord: dbx=config['download_directory']+'/CHEBI/database_accession.tsv' output: outfile=config['intermediate_directory']+'/chemicals/concords/CHEBI', - propfile=config['intermediate_directory']+'/chemicals/properties/get_chebi_concord.json.gz', + propfile=config['intermediate_directory']+'/chemicals/properties/get_chebi_concord.jsonl.gz', metadata_yaml=config['intermediate_directory']+'/chemicals/concords/metadata-CHEBI.yaml' run: chemicals.make_chebi_relations(input.sdf,input.dbx,output.outfile,propfile_gz=output.propfile,metadata_yaml=output.metadata_yaml) From 6d24ae88c3be953f1b8ca2cb8ddb893bf48bf708 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Sat, 9 Aug 2025 23:43:39 -0400 Subject: [PATCH 073/167] Slightly improved error case. 
--- src/babel_utils.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/babel_utils.py b/src/babel_utils.py index e22dc2d3..1b0fbf5d 100644 --- a/src/babel_utils.py +++ b/src/babel_utils.py @@ -430,7 +430,9 @@ def write_compendium(metadata_yamls, synonym_list, ofname, node_type, labels=Non property_source_count[source] += 1 node = node_factory.create_node(input_identifiers=slist, node_type=node_type,labels = labels, extra_prefixes = extra_prefixes) - if node is not None: + if node is None: + raise RuntimeError(f"Could not create node for ({slist}, {node_type}, {labels}, {extra_prefixes}): returned None.") + else: count_cliques += 1 count_eq_ids += len(slist) From 9b4738c1b11f8e357498e2f766a4ea3c5d5ebd16 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Sat, 9 Aug 2025 23:46:13 -0400 Subject: [PATCH 074/167] Removed some redundant code, improved an exception. --- src/node.py | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/src/node.py b/src/node.py index 9c40e147..ff89ed0b 100644 --- a/src/node.py +++ b/src/node.py @@ -456,7 +456,7 @@ def create_node(self,input_identifiers,node_type,labels={},extra_prefixes=[]): print(type(i)) print(Text.get_curie(i)) print(Text.get_curie(i).upper()) - raise ValueError('something very bad') + raise RuntimeError('something very bad') identifiers = [] accepted_ids = set() #Converting identifiers from LabeledID to dicts @@ -489,19 +489,10 @@ def create_node(self,input_identifiers,node_type,labels={},extra_prefixes=[]): self.ignored_prefixes.add( (k,node_type) ) if len(identifiers) == 0: return None - best_id = identifiers[0]['identifier'] - # identifiers is in preferred order, so choose the first non-empty label to be the node label - labels = list(filter(lambda x:len(x) > 0, [ l['label'] for l in identifiers if 'label' in l ])) - label = None - if len(labels) > 0: - label = labels[0] - node = { 'identifiers': identifiers, 'type': node_type } - #if label is not None: - # node['id']['label'] = label return node def pubchemsort(pc_ids, labeled_ids): From d62a9f0b008590ce84922c4d906d4bc615173fac Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Sun, 10 Aug 2025 13:15:02 -0400 Subject: [PATCH 075/167] Added some memory tracking code into the protein compendium. Also a rate/estimated time calculator. --- requirements.txt | 3 +++ src/babel_utils.py | 25 ++++++++++++++++++++++++- src/createcompendia/protein.py | 25 +++++++++++++++---------- src/util.py | 21 +++++++++++++++++---- 4 files changed, 59 insertions(+), 15 deletions(-) diff --git a/requirements.txt b/requirements.txt index 4422caef..a8c7230c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -31,3 +31,6 @@ duckdb # on checking if it's online via http://httpstat.us/200, which is often offline. My branch of this # https://github.com/gaurav/apybiomart/tree/change-check-url and changes that to https://example.org. git+https://github.com/gaurav/apybiomart.git@change-check-url + +# Added by Gaurav, Aug 2025 to check for memory information while Babel is running. 
+psutil diff --git a/src/babel_utils.py b/src/babel_utils.py index 1b0fbf5d..c2e67a01 100644 --- a/src/babel_utils.py +++ b/src/babel_utils.py @@ -17,7 +17,7 @@ from src.metadata.provenance import write_combined_metadata from src.node import NodeFactory, SynonymFactory, DescriptionFactory, InformationContentFactory, TaxonFactory from src.properties import PropertyList, HAS_ADDITIONAL_ID -from src.util import Text, get_config +from src.util import Text, get_config, get_memory_usage_summary from src.LabeledID import LabeledID from collections import defaultdict import sqlite3 @@ -371,6 +371,9 @@ def write_compendium(metadata_yamls, synonym_list, ofname, node_type, labels=Non :param properties_files: (OPTIONAL) A list of SQLite3 files containing properties to be added to the output. :return: """ + logging.info(f"Starting write_compendium({metadata_yamls}, {synonym_list}, {ofname}, {node_type}, {labels}, {extra_prefixes}, {icrdf_filename}, {properties_jsonl_gz_files})") + logging.info(f" - Memory usage: {get_memory_usage_summary()}") + if extra_prefixes is None: extra_prefixes = [] if labels is None: @@ -414,7 +417,27 @@ def write_compendium(metadata_yamls, synonym_list, ofname, node_type, labels=Non # Write compendium and synonym files. with jsonlines.open(os.path.join(cdir,'compendia',ofname),'w') as outf, jsonlines.open(os.path.join(cdir,'synonyms',ofname),'w') as sfile: + # Calculate an estimated time to completion. + start_time = time.time_ns() + count_slist = 0 + total_slist = len(synonym_list) + for slist in synonym_list: + # Before we get started, let's estimate where we're at. + count_slist += 1 + if count_slist % 1000000 == 0: + time_elapsed_seconds = (time.time_ns() - start_time) / 1E9 + remaining_slist = total_slist - count_slist + # count_slist --> time_elapsed_seconds + # remaining_slist --> remaining_slist/count_slit*time_elapsed_seconds + logging.info(f"Generated compendia and synonyms for {count_slist:,} out of {total_slist:,} ({count_slist/total_slist*100:.2f}%).") + logging.info(f" - Current rate: {count_slist/time_elapsed_seconds:.2f} cliques/second or {time_elapsed_seconds/count_slist:.2f} seconds/clique.") + + time_remaining_seconds = (time_elapsed_seconds / count_slist * remaining_slist) + hours, remainder = divmod(time_remaining_seconds, 3600) + minutes, seconds = divmod(remainder, 60) + logging.info(f" - Estimated time remaining: {time_remaining_seconds:.2f} seconds ({hours:} hours, {minutes:02} minutes, {seconds:02} seconds)") + # At this point, we insert any HAS_ADDITIONAL_ID properties we have. # The logic we use is: we insert all additional IDs for a CURIE *AFTER* that CURIE, in a random order, as long # as the additional CURIE is not already in the list of CURIEs. 
diff --git a/src/createcompendia/protein.py b/src/createcompendia/protein.py index dafa406a..0d4cc33d 100644 --- a/src/createcompendia/protein.py +++ b/src/createcompendia/protein.py @@ -15,7 +15,8 @@ import gzip import logging -from src.util import LoggingUtil +from src.util import LoggingUtil, get_memory_usage_summary + logger = LoggingUtil.init_logging(__name__, level=logging.WARNING) @@ -159,27 +160,30 @@ def build_protein_compendia(concordances, metadata_yamls, identifiers, icrdf_fil dicts = {} types = {} uniques = [UNIPROTKB,PR] + logging.info(f"Started building protein comendium ({concordances}, {metadata_yamls}, {identifiers}, {icrdf_filename}) with uniques {uniques}") for ifile in identifiers: - print(ifile) + logging.info(f"Loading identifier file {ifile}") new_identifiers, new_types = read_identifier_file(ifile) glom(dicts, new_identifiers, unique_prefixes= uniques) types.update(new_types) + logging.info(f"Loaded identifier file {ifile}") + logging.info(f"Finished loading identifiers, memory usage: {get_memory_usage_summary()}") for infile in concordances: - print(infile) - print('loading', infile) + logging.info(f"Loading concordance file {infile}") pairs = [] with open(infile, 'r') as inf: for line_index, line in enumerate(inf): - # if line_index % 10000 == 0: - # print("Loaded line count", line_index) + if line_index % 100000 == 0: + logging.info(f"Loading concordance file {infile}: line {line_index:,}") x = line.strip().split('\t') pairs.append(set([x[0], x[2]])) # print("glomming", infile) # This takes a while, but doesn't add much to the memory glom(dicts, pairs, unique_prefixes=uniques) - print("glommed", infile) - # print("merging dicts") # This seems to increase memory usage slightly. + logging.info(f"Loaded concordance file {infile}") + logging.info(f"Finished loading concordances, memory usage: {get_memory_usage_summary()}") + logging.info(f"Building gene sets") gene_sets = set([frozenset(x) for x in dicts.values()]) - print("merged dicts", infile) + logging.info(f"Gene sets built, memory usage: {get_memory_usage_summary()}") #Try to preserve some memory here. dicts.clear() @@ -188,5 +192,6 @@ def build_protein_compendia(concordances, metadata_yamls, identifiers, icrdf_fil # only then generate the compendium from those input files. baretype = PROTEIN.split(':')[-1] + logging.info(f"Writing compendium for {baretype}, memory usage: {get_memory_usage_summary()}") write_compendium(metadata_yamls, gene_sets, f'{baretype}.txt', PROTEIN, {}, icrdf_filename=icrdf_filename) - + logging.info(f"Wrote compendium for {baretype}, memory usage: {get_memory_usage_summary()}") diff --git a/src/util.py b/src/util.py index 24eed6e2..512e8f31 100644 --- a/src/util.py +++ b/src/util.py @@ -4,6 +4,7 @@ import curies import yaml +import psutil from collections import namedtuple import copy from logging.handlers import RotatingFileHandler @@ -72,7 +73,7 @@ class Munge(object): @staticmethod def gene (gene): return gene.split ("/")[-1:][0] if gene.startswith ("http://") else gene - + class Text: """ Utilities for processing text. 
""" prefixmap = { x.lower(): x for k,x in vars(prefixes).items() if not k.startswith("__")} @@ -114,7 +115,7 @@ def recurie(cls,text,new_prefix=None): @staticmethod def un_curie (text): return ':'.join(text.split (':', 1)[1:]) if ':' in text else text - + @staticmethod def short (obj, limit=80): text = str(obj) if obj else None @@ -171,7 +172,7 @@ def opt_to_curie (text): return Text.recurie(r) else: raise ValueError(f"Unable to opt_to_curie({text}): output calculated as {r}, which has no colon.") - + return r @staticmethod @@ -217,7 +218,7 @@ def load_yaml (path): with open (path, 'r') as stream: result = yaml.load (stream.read ()) return result - + def get_resource_obj (resource_name, format='json'): result = None path = Resource.get_resource_path (resource_name) @@ -330,3 +331,15 @@ def get_biolink_prefix_map(): f'https://raw.githubusercontent.com/biolink/biolink-model/v' + biolink_version + '/project/prefixmap/biolink_model_prefix_map.json' ) + +def get_memory_usage_summary(): + """ + Provide a short summary of current memory usage to write into logs. + + :return: A string summarizing current memory usage. + """ + process = psutil.Process() + process.memory_percent() + mem_info = process.memory_info() + + return f"Using {process.memory_percent():.2f}% of available memory (RSS: {mem_info.rss / 1024 ** 2:.2f} MB, VMS: {mem_info.vms / 1024 ** 2:.2f} MB)" From 8ece61a6d1f9e442a03fc25858ed1024f331d92a Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Sun, 10 Aug 2025 13:28:33 -0400 Subject: [PATCH 076/167] Fixed output so we don't just write out every single frozenset. --- src/babel_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/babel_utils.py b/src/babel_utils.py index c2e67a01..6d9f6a0a 100644 --- a/src/babel_utils.py +++ b/src/babel_utils.py @@ -371,7 +371,7 @@ def write_compendium(metadata_yamls, synonym_list, ofname, node_type, labels=Non :param properties_files: (OPTIONAL) A list of SQLite3 files containing properties to be added to the output. :return: """ - logging.info(f"Starting write_compendium({metadata_yamls}, {synonym_list}, {ofname}, {node_type}, {labels}, {extra_prefixes}, {icrdf_filename}, {properties_jsonl_gz_files})") + logging.info(f"Starting write_compendium({metadata_yamls}, {len(synonym_list)} slists, {ofname}, {node_type}, {len(labels)} labels, {extra_prefixes}, {icrdf_filename}, {properties_jsonl_gz_files})") logging.info(f" - Memory usage: {get_memory_usage_summary()}") if extra_prefixes is None: From d3b5de4c20873e1634fdae02ed38fe3dfb321a6f Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Sun, 10 Aug 2025 13:34:28 -0400 Subject: [PATCH 077/167] Updated memory usage to GB. --- src/util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/util.py b/src/util.py index 512e8f31..c054c9c1 100644 --- a/src/util.py +++ b/src/util.py @@ -342,4 +342,4 @@ def get_memory_usage_summary(): process.memory_percent() mem_info = process.memory_info() - return f"Using {process.memory_percent():.2f}% of available memory (RSS: {mem_info.rss / 1024 ** 2:.2f} MB, VMS: {mem_info.vms / 1024 ** 2:.2f} MB)" + return f"Using {process.memory_percent():.2f}% of available memory (RSS: {mem_info.rss / 1024 ** 3:.2f} GB, VMS: {mem_info.vms / 1024 ** 3:.2f} GB)" From 3a8f196d9e2c99276bdd83c9cb7a3647f42caa8d Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Sun, 10 Aug 2025 13:38:09 -0400 Subject: [PATCH 078/167] Added humanfriendly. 
--- requirements.txt | 1 + src/babel_utils.py | 2 ++ src/util.py | 3 ++- 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index a8c7230c..d7e473e3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -34,3 +34,4 @@ git+https://github.com/gaurav/apybiomart.git@change-check-url # Added by Gaurav, Aug 2025 to check for memory information while Babel is running. psutil +humanfriendly diff --git a/src/babel_utils.py b/src/babel_utils.py index 6d9f6a0a..2bb21d7d 100644 --- a/src/babel_utils.py +++ b/src/babel_utils.py @@ -13,6 +13,7 @@ import urllib import jsonlines import yaml +from humanfriendly import format_timespan from src.metadata.provenance import write_combined_metadata from src.node import NodeFactory, SynonymFactory, DescriptionFactory, InformationContentFactory, TaxonFactory @@ -437,6 +438,7 @@ def write_compendium(metadata_yamls, synonym_list, ofname, node_type, labels=Non hours, remainder = divmod(time_remaining_seconds, 3600) minutes, seconds = divmod(remainder, 60) logging.info(f" - Estimated time remaining: {time_remaining_seconds:.2f} seconds ({hours:} hours, {minutes:02} minutes, {seconds:02} seconds)") + logging.info(f" - Estimated time remaining: {format_timespan(time_remaining_seconds)}") # At this point, we insert any HAS_ADDITIONAL_ID properties we have. # The logic we use is: we insert all additional IDs for a CURIE *AFTER* that CURIE, in a random order, as long diff --git a/src/util.py b/src/util.py index c054c9c1..2a836c6d 100644 --- a/src/util.py +++ b/src/util.py @@ -10,6 +10,7 @@ from logging.handlers import RotatingFileHandler from bmt import Toolkit +from humanfriendly import format_size from src.LabeledID import LabeledID from src.prefixes import OMIM, OMIMPS, UMLS, SNOMEDCT, KEGGPATHWAY, KEGGREACTION, NCIT, ICD10, ICD10CM, ICD11FOUNDATION @@ -342,4 +343,4 @@ def get_memory_usage_summary(): process.memory_percent() mem_info = process.memory_info() - return f"Using {process.memory_percent():.2f}% of available memory (RSS: {mem_info.rss / 1024 ** 3:.2f} GB, VMS: {mem_info.vms / 1024 ** 3:.2f} GB)" + return f"Using {process.memory_percent():.2f}% of available memory (RSS: {format_size(mem_info.rss, binary=True)}, VMS: {format_size(mem_info.vms, binary=True)})" From 17f8e2b7ed5d055546c5ff2d20278444e95c5dd2 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Sun, 10 Aug 2025 13:38:33 -0400 Subject: [PATCH 079/167] Updated requirements.lock. 
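What the new dependency buys, per the diff below: format_timespan() collapses the manual divmod bookkeeping into one readable string for the remaining-time estimate, and format_size() is what get_memory_usage_summary() now uses for the RSS/VMS figures. A quick illustration (outputs approximate):

    from humanfriendly import format_timespan, format_size

    # Roughly what the progress log now prints for the remaining-time estimate:
    print(format_timespan(5400))     # '1 hour and 30 minutes'
    print(format_timespan(93784))    # '1 day, 2 hours, 3 minutes and 4 seconds'

    # And the style used for memory sizes in get_memory_usage_summary():
    print(format_size(3 * 1024 ** 3, binary=True))   # '3 GiB'
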
--- requirements.lock | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/requirements.lock b/requirements.lock index 9bc7b588..a812926e 100644 --- a/requirements.lock +++ b/requirements.lock @@ -92,7 +92,7 @@ pronto==2.7.0 propcache==0.3.1 psutil==7.0.0 psycopg2-binary==2.9.10 -PuLP==3.1.1 +PuLP==2.7.0 pydantic==2.11.4 pydantic_core==2.33.2 PyJSG==0.11.10 @@ -128,7 +128,7 @@ ShExJSG==0.8.2 six==1.17.0 smart-open==7.1.0 smmap==5.0.2 -snakemake==9.3.3 +snakemake==7.32.4 snakemake-interface-common==1.17.4 snakemake-interface-executor-plugins==9.3.5 snakemake-interface-logger-plugins==1.2.3 @@ -142,10 +142,12 @@ SQLAlchemy==2.0.40 SQLAlchemy-Utils==0.38.3 sssom==0.4.15 sssom-schema==1.0.0 +stopit==1.1.2 stringcase==1.2.0 tabulate==0.9.0 tenacity==8.5.0 throttler==1.2.2 +toposort==1.10 tqdm==4.67.1 traitlets==5.14.3 types-python-dateutil==2.9.0.20241206 From ce3542bc928edd644919cfa9558132f6b01406bc Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Sun, 10 Aug 2025 13:59:18 -0400 Subject: [PATCH 080/167] Created get_logger() function to replace LoggingUtil. --- src/babel_utils.py | 30 ++++++++++++++++-------------- src/util.py | 27 +++++++++++++++++++++++++++ 2 files changed, 43 insertions(+), 14 deletions(-) diff --git a/src/babel_utils.py b/src/babel_utils.py index 2bb21d7d..87205c14 100644 --- a/src/babel_utils.py +++ b/src/babel_utils.py @@ -1,4 +1,3 @@ -import logging import subprocess import traceback from enum import Enum @@ -18,12 +17,15 @@ from src.metadata.provenance import write_combined_metadata from src.node import NodeFactory, SynonymFactory, DescriptionFactory, InformationContentFactory, TaxonFactory from src.properties import PropertyList, HAS_ADDITIONAL_ID -from src.util import Text, get_config, get_memory_usage_summary +from src.util import Text, get_config, get_memory_usage_summary, get_logger from src.LabeledID import LabeledID from collections import defaultdict import sqlite3 from typing import List, Tuple +# Set up logger. +logger = get_logger() + def make_local_name(fname,subpath=None): config = get_config() if subpath is None: @@ -284,7 +286,7 @@ def pull_via_wget( wget_command_line.extend(['--recursive', '--no-parent', '--no-directories', '--level=1', '--directory-prefix=' + dl_file_name]) # Execute wget. 
- logging.info(f"Downloading {dl_file_name} using wget: {wget_command_line}") + logger.info(f"Downloading {dl_file_name} using wget: {wget_command_line}") process = subprocess.run(wget_command_line) if process.returncode != 0: raise RuntimeError(f"Could not execute wget {wget_command_line}: {process.stderr}") @@ -302,17 +304,17 @@ def pull_via_wget( if os.path.isfile(uncompressed_filename): file_size = os.path.getsize(uncompressed_filename) - logging.info(f"Downloaded {uncompressed_filename} from {url}, file size {file_size} bytes.") + logger.info(f"Downloaded {uncompressed_filename} from {url}, file size {file_size} bytes.") else: raise RuntimeError(f'Expected uncompressed file {uncompressed_filename} does not exist.') else: if os.path.isfile(dl_file_name): file_size = os.path.getsize(dl_file_name) - logging.info(f"Downloaded {dl_file_name} from {url}, file size {file_size} bytes.") + logger.info(f"Downloaded {dl_file_name} from {url}, file size {file_size} bytes.") elif os.path.isdir(dl_file_name): # Count the number of files in directory dl_file_name dir_size = sum(os.path.getsize(os.path.join(dl_file_name, f)) for f in os.listdir(dl_file_name) if os.path.isfile(os.path.join(dl_file_name, f))) - logging.info(f"Downloaded {dir_size} files from {url} to {dl_file_name}.") + logger.info(f"Downloaded {dir_size} files from {url} to {dl_file_name}.") else: raise RuntimeError(f'Unknown file type {dl_file_name}') @@ -372,8 +374,8 @@ def write_compendium(metadata_yamls, synonym_list, ofname, node_type, labels=Non :param properties_files: (OPTIONAL) A list of SQLite3 files containing properties to be added to the output. :return: """ - logging.info(f"Starting write_compendium({metadata_yamls}, {len(synonym_list)} slists, {ofname}, {node_type}, {len(labels)} labels, {extra_prefixes}, {icrdf_filename}, {properties_jsonl_gz_files})") - logging.info(f" - Memory usage: {get_memory_usage_summary()}") + logger.info(f"Starting write_compendium({metadata_yamls}, {len(synonym_list)} slists, {ofname}, {node_type}, {len(labels)} labels, {extra_prefixes}, {icrdf_filename}, {properties_jsonl_gz_files})") + logger.info(f" - Memory usage: {get_memory_usage_summary()}") if extra_prefixes is None: extra_prefixes = [] @@ -431,14 +433,14 @@ def write_compendium(metadata_yamls, synonym_list, ofname, node_type, labels=Non remaining_slist = total_slist - count_slist # count_slist --> time_elapsed_seconds # remaining_slist --> remaining_slist/count_slit*time_elapsed_seconds - logging.info(f"Generated compendia and synonyms for {count_slist:,} out of {total_slist:,} ({count_slist/total_slist*100:.2f}%).") - logging.info(f" - Current rate: {count_slist/time_elapsed_seconds:.2f} cliques/second or {time_elapsed_seconds/count_slist:.2f} seconds/clique.") + logger.info(f"Generated compendia and synonyms for {count_slist:,} out of {total_slist:,} ({count_slist/total_slist*100:.2f}%).") + logger.info(f" - Current rate: {count_slist/time_elapsed_seconds:.2f} cliques/second or {time_elapsed_seconds/count_slist:.2f} seconds/clique.") time_remaining_seconds = (time_elapsed_seconds / count_slist * remaining_slist) hours, remainder = divmod(time_remaining_seconds, 3600) minutes, seconds = divmod(remainder, 60) - logging.info(f" - Estimated time remaining: {time_remaining_seconds:.2f} seconds ({hours:} hours, {minutes:02} minutes, {seconds:02} seconds)") - logging.info(f" - Estimated time remaining: {format_timespan(time_remaining_seconds)}") + logger.info(f" - Estimated time remaining: {time_remaining_seconds:.2f} seconds ({hours:} 
hours, {minutes:02} minutes, {seconds:02} seconds)") + logger.info(f" - Estimated time remaining: {format_timespan(time_remaining_seconds)}") # At this point, we insert any HAS_ADDITIONAL_ID properties we have. # The logic we use is: we insert all additional IDs for a CURIE *AFTER* that CURIE, in a random order, as long @@ -571,7 +573,7 @@ def write_compendium(metadata_yamls, synonym_list, ofname, node_type, labels=Non if preferred_name: document["preferred_name"] = preferred_name else: - logging.debug( + logger.debug( f"No preferred name for {node}, probably because all names were filtered out, skipping." ) continue @@ -584,7 +586,7 @@ def write_compendium(metadata_yamls, synonym_list, ofname, node_type, labels=Non # Since synonyms_list is sorted, we can use the length of the first term as the synonym. if len(synonyms_list) == 0: - logging.debug(f"Synonym list for {node} is empty: no valid name. Skipping.") + logger.debug(f"Synonym list for {node} is empty: no valid name. Skipping.") continue else: document["shortest_name_length"] = len(synonyms_list[0]) diff --git a/src/util.py b/src/util.py index 2a836c6d..d4625600 100644 --- a/src/util.py +++ b/src/util.py @@ -1,6 +1,8 @@ import logging import json import os +import sys +from time import gmtime import curies import yaml @@ -16,6 +18,31 @@ from src.prefixes import OMIM, OMIMPS, UMLS, SNOMEDCT, KEGGPATHWAY, KEGGREACTION, NCIT, ICD10, ICD10CM, ICD11FOUNDATION import src.prefixes as prefixes +def get_logger(name=None, level=logging.INFO): + """ + Get a logger with the specified name. + + The LoggingUtil is inconsistently used, and we don't want rolling logs anyway -- just logging everything to STDOUT + so that Snakemake can capture it is fine. However, we + """ + if name is None: + name = f"{__name__} ({__file__})" + + # Set up a logger. + logger = logging.getLogger(name) + logger.setLevel(level) + + # Set up a formatter. We want to use UTC time. + formatter = logging.Formatter('%(levelname)s %(name)s [%(asctime)s]: %(message)s') + formatter.converter = gmtime + + # Set up a stream handler for STDERR. + stream_handler = logging.StreamHandler(sys.stderr) + stream_handler.setFormatter(formatter) + logger.addHandler(stream_handler) + + return logger + #loggers = {} class LoggingUtil(object): """ Logging utility controlling format and setting initial logging level """ From abc3bddc20237f80bfc1da2694c5498ec6754e55 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Sun, 10 Aug 2025 14:12:59 -0400 Subject: [PATCH 081/167] Standardized logging in node.py. 
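The convention being rolled out here, module by module, is a single module-level logger obtained from get_logger()
instead of ad-hoc print() calls or the root logging module. A minimal sketch of that pattern (the function body is
illustrative only):

    from src.util import get_logger

    # One module-level logger per file, configured centrally by get_logger().
    logger = get_logger(__name__)

    def load_labels(prefix):
        logger.info(f"Loading labels for {prefix}")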
--- src/node.py | 60 +++++++++++++++++++++++++++++------------------------ 1 file changed, 33 insertions(+), 27 deletions(-) diff --git a/src/node.py b/src/node.py index ff89ed0b..a7236859 100644 --- a/src/node.py +++ b/src/node.py @@ -1,15 +1,21 @@ import json -import logging import os from collections import defaultdict from urllib.parse import urlparse import curies -from src.util import Text, get_config, get_biolink_model_toolkit, get_biolink_prefix_map +from src.util import ( + Text, + get_config, + get_biolink_model_toolkit, + get_biolink_prefix_map, + get_logger, +) from src.LabeledID import LabeledID from src.prefixes import PUBCHEMCOMPOUND +logger = get_logger() class SynonymFactory: """ @@ -35,12 +41,12 @@ def __init__(self,syndir): self.synonym_dir = syndir self.synonyms = {} self.common_synonyms = None - print(f"Created SynonymFactory for directory {syndir}") + logger.info(f"Created SynonymFactory for directory {syndir}") def load_synonyms(self,prefix): lbs = defaultdict(set) labelfname = os.path.join(self.synonym_dir, prefix, 'labels') - print(f'Loading synonyms for {prefix} from {labelfname}') + logger.info(f'Loading synonyms for {prefix} from {labelfname}') count_labels = 0 count_synonyms = 0 if os.path.exists(labelfname): @@ -59,7 +65,7 @@ def load_synonyms(self,prefix): lbs[x[0]].add( (x[1], x[2]) ) count_synonyms += 1 self.synonyms[prefix] = lbs - print(f'Loaded {count_labels} labels and {count_synonyms} synonyms for {prefix} from {labelfname}') + logger.info(f'Loaded {count_labels:,} labels and {count_synonyms:,} synonyms for {prefix} from {labelfname}') def get_synonyms(self,node): config = get_config() @@ -77,7 +83,7 @@ def get_synonyms(self,node): row = json.loads(line) self.common_synonyms[row['curie']].add((row['predicate'], row['synonym'])) count_common_file_synonyms += 1 - logging.info(f"Loaded {count_common_file_synonyms} common synonyms from {common_synonyms_path}") + logger.info(f"Loaded {count_common_file_synonyms:,} common synonyms from {common_synonyms_path}") node_synonyms = set() for ident in node['identifiers']: @@ -99,10 +105,10 @@ def __init__(self,rootdir): self.root_dir = rootdir self.descriptions = {} self.common_descriptions = None - print(f"Created DescriptionFactory for directory {rootdir}") + logger.info(f"Created DescriptionFactory for directory {rootdir}") def load_descriptions(self,prefix): - print(f'Loading descriptions for {prefix}') + logger.info(f'Loading descriptions for {prefix}') descs = defaultdict(set) descfname = os.path.join(self.root_dir, prefix, 'descriptions') desc_count = 0 @@ -113,7 +119,7 @@ def load_descriptions(self,prefix): descs[x[0]].add("\t".join(x[1:])) desc_count += 1 self.descriptions[prefix] = descs - print(f'Loaded {desc_count} descriptions for {prefix}') + logger.info(f'Loaded {desc_count:,} descriptions for {prefix}') def get_descriptions(self,node): config = get_config() @@ -131,7 +137,7 @@ def get_descriptions(self,node): row = json.loads(line) self.common_descriptions[row['curie']].extend(row['descriptions']) count_common_file_descriptions += 1 - logging.info(f"Loaded {count_common_file_descriptions} common descriptions from {common_descriptions_path}") + logger.info(f"Loaded {count_common_file_descriptions} common descriptions from {common_descriptions_path}") node_descriptions = defaultdict(set) @@ -154,7 +160,7 @@ def __init__(self,rootdir): self.taxa = {} def load_taxa(self, prefix): - print(f'Loading taxa for {prefix}') + logger.info(f'Loading taxa for {prefix}') taxa_per_prefix = defaultdict(set) 
taxafilename = os.path.join(self.root_dir, prefix, 'taxa') taxon_count = 0 @@ -165,7 +171,7 @@ def load_taxa(self, prefix): taxa_per_prefix[x[0]].add("\t".join(x[1:])) taxon_count += 1 self.taxa[prefix] = taxa_per_prefix - print(f'Loaded {taxon_count} taxon-CURIE mappings for {prefix}') + logger.info(f'Loaded {taxon_count} taxon-CURIE mappings for {prefix}') def get_taxa(self, node): node_taxa = defaultdict(set) @@ -244,10 +250,10 @@ def __init__(self, ic_file): # Sort the dictionary items by value in descending order sorted_by_prefix = sorted(count_by_prefix.items(), key=lambda item: item[1], reverse=True) - print(f"Loaded {len(self.ic)} InformationContent values from {len(count_by_prefix.keys())} prefixes:") + logger.info(f"Loaded {len(self.ic)} InformationContent values from {len(count_by_prefix.keys())} prefixes:") # Now you can print the sorted items for key, value in sorted_by_prefix: - print(f'- {key}: {value}') + logger.info(f'- {key}: {value}') # We see a number of URLs being mapped to None (250,871 at present). Let's optionally raise an error if that # happens. @@ -259,12 +265,12 @@ def __init__(self, ic_file): unmapped_urls_by_netloc[netloc].append(url) # Print them in reverse count order. - print(f"Found {len(unmapped_urls)} unmapped URLs:") + logger.info(f"Found {len(unmapped_urls)} unmapped URLs:") netlocs_by_count = sorted(unmapped_urls_by_netloc.items(), key=lambda item: len(item[1]), reverse=True) for netloc, urls in netlocs_by_count: - print(f" - {netloc} [{len(urls)}]") + logger.info(f" - {netloc} [{len(urls)}]") for url in sorted(urls): - print(f" - {url}") + logger.info(f" - {url}") assert None not in sorted_by_prefix, ("Found invalid CURIEs in information content values, probably " "because they couldn't be mapped from URLs to CURIEs.") @@ -285,6 +291,7 @@ def get_ic(self, node): class NodeFactory: def __init__(self,label_dir,biolink_version): + self.biolink_version = biolink_version self.toolkit = get_biolink_model_toolkit(biolink_version) self.ancestor_map = {} self.prefix_map = {} @@ -306,7 +313,7 @@ def get_ancestors(self,input_type): def get_prefixes(self,input_type): if input_type in self.prefix_map: return self.prefix_map[input_type] - print(input_type) + logger.info(f"NodeFactory({self.label_dir}, {self.biolink_version}).get_prefixes({input_type}) called") j = self.toolkit.get_element(input_type) prefs = j['id_prefixes'] # biolink doesnt yet include UMLS as a valid prefix for biological process. There is a PR here: @@ -361,16 +368,15 @@ def clean_list(self,input_identifiers): wrote = True break if not wrote: - print(input_identifiers) - exit() + raise ValueError(f"Can't clean up list {v}") return cleaned def load_extra_labels(self,prefix): if self.label_dir is None: - print (f"WARNING: no label_dir specified in load_extra_labels({self}, {prefix}), can't load extra labels for {prefix}. Skipping.") + logger.warning(f"no label_dir specified in load_extra_labels({self}, {prefix}), can't load extra labels for {prefix}. Skipping.") return if prefix is None: - print (f"WARNING: no prefix specified in load_extra_labels({self}, {prefix}), can't load extra labels. Skipping.") + logger.warning(f"no prefix specified in load_extra_labels({self}, {prefix}), can't load extra labels. 
Skipping.") return labelfname = os.path.join(self.label_dir,prefix,'labels') lbs = {} @@ -404,7 +410,7 @@ def apply_labels(self, input_identifiers, labels): continue self.common_labels[x[0]] = x[1] count_common_file_labels += 1 - logging.info(f"Loaded {count_common_file_labels} common labels from {common_labels_path}") + logger.info(f"Loaded {count_common_file_labels} common labels from {common_labels_path}") #Originally we needed to clean up the identifer lists, because there would be both labeledids and # string ids and we had to reconcile them. @@ -419,7 +425,7 @@ def apply_labels(self, input_identifiers, labels): try: prefix = Text.get_prefix(iid) except ValueError as e: - print(f"ERROR: Unable to apply_labels({self}, {input_identifiers}, {labels}): could not obtain prefix for identifier {iid}") + logger.error(f"Unable to apply_labels({self}, {input_identifiers}, {labels}): could not obtain prefix for identifier {iid}") raise e if prefix not in self.extra_labels: self.load_extra_labels(prefix) @@ -441,7 +447,7 @@ def create_node(self,input_identifiers,node_type,labels={},extra_prefixes=[]): if len(input_identifiers) == 0: return None if len(input_identifiers) > 1000: - logging.warning(f'this seems like a lot of input_identifiers in node.create_node() [{len(input_identifiers)}]: {input_identifiers}') + logger.warning(f'this seems like a lot of input_identifiers in node.create_node() [{len(input_identifiers)}]: {input_identifiers}') cleaned = self.apply_labels(input_identifiers,labels) try: idmap = defaultdict(list) @@ -476,7 +482,7 @@ def create_node(self,input_identifiers,node_type,labels={},extra_prefixes=[]): try: newids.sort() except TypeError as e: - print(newids) + logger.error(f"Could not sort {newids} because of a TypeError: {e}") raise e if pupper == PUBCHEMCOMPOUND.upper() and len(newids) > 1: newids = pubchemsort(newids,cleaned) @@ -485,7 +491,7 @@ def create_node(self,input_identifiers,node_type,labels={},extra_prefixes=[]): for k,vals in idmap.items(): for v in vals: if v not in accepted_ids and (k,node_type) not in self.ignored_prefixes: - print(f'Ignoring prefix {k} for type {node_type}, identifier {v}') + logger.warning(f'Ignoring prefix {k} for type {node_type}, identifier {v}') self.ignored_prefixes.add( (k,node_type) ) if len(identifiers) == 0: return None From d4e83de591a325918d899f5f06f2522dc8e516a1 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Sun, 10 Aug 2025 14:16:57 -0400 Subject: [PATCH 082/167] Documented a future improvement. --- src/util.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/util.py b/src/util.py index d4625600..f143a3b1 100644 --- a/src/util.py +++ b/src/util.py @@ -26,6 +26,7 @@ def get_logger(name=None, level=logging.INFO): so that Snakemake can capture it is fine. However, we """ if name is None: + # TODO: what we really want to get here is the Snakemake job we're currently in, but that's tricky. name = f"{__name__} ({__file__})" # Set up a logger. From 85e6b46d44474d0fb81170f77b8872481e078d9d Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Sun, 10 Aug 2025 14:34:08 -0400 Subject: [PATCH 083/167] Reduced number of log outputs. 
--- src/createcompendia/protein.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/createcompendia/protein.py b/src/createcompendia/protein.py index 0d4cc33d..7d67524f 100644 --- a/src/createcompendia/protein.py +++ b/src/createcompendia/protein.py @@ -173,7 +173,7 @@ def build_protein_compendia(concordances, metadata_yamls, identifiers, icrdf_fil pairs = [] with open(infile, 'r') as inf: for line_index, line in enumerate(inf): - if line_index % 100000 == 0: + if line_index % 1000000 == 0: logging.info(f"Loading concordance file {infile}: line {line_index:,}") x = line.strip().split('\t') pairs.append(set([x[0], x[2]])) From 8d430f983856d41cfe993270d5f7705bd8156a5b Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Sun, 10 Aug 2025 15:30:34 -0400 Subject: [PATCH 084/167] Fixed build_compendium() call. --- src/snakefiles/chemical.snakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/snakefiles/chemical.snakefile b/src/snakefiles/chemical.snakefile index 4cecbcd4..fccccaf2 100644 --- a/src/snakefiles/chemical.snakefile +++ b/src/snakefiles/chemical.snakefile @@ -240,7 +240,7 @@ rule chemical_compendia: temp(expand("{od}/synonyms/{ap}", od = config['output_directory'], ap = config['chemical_outputs'])), expand("{od}/metadata/{ap}.yaml", od = config['output_directory'], ap = config['chemical_outputs']), run: - chemicals.build_compendia(input.typesfile, input.untyped_file, input.properties_jsonl_gz, input.icrdf_filename) + chemicals.build_compendia(input.typesfile, input.untyped_file, input.properties_jsonl_gz, input.metadata_yamls, input.icrdf_filename) rule check_chemical_completeness: input: From 3e37331c93389e83b9c668984d8de937fc0cf1da Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Mon, 11 Aug 2025 10:52:17 -0400 Subject: [PATCH 085/167] Improved log message. --- src/babel_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/babel_utils.py b/src/babel_utils.py index 87205c14..53e5ab9a 100644 --- a/src/babel_utils.py +++ b/src/babel_utils.py @@ -433,7 +433,7 @@ def write_compendium(metadata_yamls, synonym_list, ofname, node_type, labels=Non remaining_slist = total_slist - count_slist # count_slist --> time_elapsed_seconds # remaining_slist --> remaining_slist/count_slit*time_elapsed_seconds - logger.info(f"Generated compendia and synonyms for {count_slist:,} out of {total_slist:,} ({count_slist/total_slist*100:.2f}%).") + logger.info(f"Generating compendia and synonyms for {ofname} currently at {count_slist:,} out of {total_slist:,} ({count_slist/total_slist*100:.2f}%).") logger.info(f" - Current rate: {count_slist/time_elapsed_seconds:.2f} cliques/second or {time_elapsed_seconds/count_slist:.2f} seconds/clique.") time_remaining_seconds = (time_elapsed_seconds / count_slist * remaining_slist) From 3ea3426d4d58bc4ce59e5af18530808de5afb1a4 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Mon, 11 Aug 2025 11:00:05 -0400 Subject: [PATCH 086/167] Standardized logging of new protein compendium building code. 
--- src/createcompendia/protein.py | 32 ++++++++++++++------------------ 1 file changed, 14 insertions(+), 18 deletions(-) diff --git a/src/createcompendia/protein.py b/src/createcompendia/protein.py index 7d67524f..920759d2 100644 --- a/src/createcompendia/protein.py +++ b/src/createcompendia/protein.py @@ -11,13 +11,9 @@ from src.babel_utils import read_identifier_file,glom,write_compendium,Text import os -import json -import gzip +from src.util import get_memory_usage_summary, get_logger -import logging -from src.util import LoggingUtil, get_memory_usage_summary - -logger = LoggingUtil.init_logging(__name__, level=logging.WARNING) +logger = get_logger() def extract_taxon_ids_from_uniprotkb(idmapping_filename, uniprotkb_taxa_filename): @@ -160,30 +156,30 @@ def build_protein_compendia(concordances, metadata_yamls, identifiers, icrdf_fil dicts = {} types = {} uniques = [UNIPROTKB,PR] - logging.info(f"Started building protein comendium ({concordances}, {metadata_yamls}, {identifiers}, {icrdf_filename}) with uniques {uniques}") + logger.info(f"Started building protein compendia ({concordances}, {metadata_yamls}, {identifiers}, {icrdf_filename}) with uniques {uniques}") for ifile in identifiers: - logging.info(f"Loading identifier file {ifile}") + logger.info(f"Loading identifier file {ifile}") new_identifiers, new_types = read_identifier_file(ifile) glom(dicts, new_identifiers, unique_prefixes= uniques) types.update(new_types) - logging.info(f"Loaded identifier file {ifile}") - logging.info(f"Finished loading identifiers, memory usage: {get_memory_usage_summary()}") + logger.info(f"Loaded identifier file {ifile}") + logger.info(f"Finished loading identifiers, memory usage: {get_memory_usage_summary()}") for infile in concordances: - logging.info(f"Loading concordance file {infile}") + logger.info(f"Loading concordance file {infile}") pairs = [] with open(infile, 'r') as inf: for line_index, line in enumerate(inf): if line_index % 1000000 == 0: - logging.info(f"Loading concordance file {infile}: line {line_index:,}") + logger.info(f"Loading concordance file {infile}: line {line_index:,}") x = line.strip().split('\t') pairs.append(set([x[0], x[2]])) # print("glomming", infile) # This takes a while, but doesn't add much to the memory glom(dicts, pairs, unique_prefixes=uniques) - logging.info(f"Loaded concordance file {infile}") - logging.info(f"Finished loading concordances, memory usage: {get_memory_usage_summary()}") - logging.info(f"Building gene sets") + logger.info(f"Loaded concordance file {infile}") + logger.info(f"Finished loading concordances, memory usage: {get_memory_usage_summary()}") + logger.info(f"Building gene sets") gene_sets = set([frozenset(x) for x in dicts.values()]) - logging.info(f"Gene sets built, memory usage: {get_memory_usage_summary()}") + logger.info(f"Gene sets built, memory usage: {get_memory_usage_summary()}") #Try to preserve some memory here. dicts.clear() @@ -192,6 +188,6 @@ def build_protein_compendia(concordances, metadata_yamls, identifiers, icrdf_fil # only then generate the compendium from those input files. 
baretype = PROTEIN.split(':')[-1] - logging.info(f"Writing compendium for {baretype}, memory usage: {get_memory_usage_summary()}") + logger.info(f"Writing compendium for {baretype}, memory usage: {get_memory_usage_summary()}") write_compendium(metadata_yamls, gene_sets, f'{baretype}.txt', PROTEIN, {}, icrdf_filename=icrdf_filename) - logging.info(f"Wrote compendium for {baretype}, memory usage: {get_memory_usage_summary()}") + logger.info(f"Wrote compendium for {baretype}, memory usage: {get_memory_usage_summary()}") From 6a047db65efe7f1219401d511719e455aefebbb8 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Mon, 11 Aug 2025 12:32:34 -0400 Subject: [PATCH 087/167] Added more memory checks in various places. --- src/node.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/node.py b/src/node.py index a7236859..c9eb3a78 100644 --- a/src/node.py +++ b/src/node.py @@ -11,6 +11,7 @@ get_biolink_model_toolkit, get_biolink_prefix_map, get_logger, + get_memory_usage_summary, ) from src.LabeledID import LabeledID from src.prefixes import PUBCHEMCOMPOUND @@ -46,7 +47,7 @@ def __init__(self,syndir): def load_synonyms(self,prefix): lbs = defaultdict(set) labelfname = os.path.join(self.synonym_dir, prefix, 'labels') - logger.info(f'Loading synonyms for {prefix} from {labelfname}') + logger.info(f'Loading synonyms for {prefix} from {labelfname}: {get_memory_usage_summary()}') count_labels = 0 count_synonyms = 0 if os.path.exists(labelfname): @@ -65,7 +66,7 @@ def load_synonyms(self,prefix): lbs[x[0]].add( (x[1], x[2]) ) count_synonyms += 1 self.synonyms[prefix] = lbs - logger.info(f'Loaded {count_labels:,} labels and {count_synonyms:,} synonyms for {prefix} from {labelfname}') + logger.info(f'Loaded {count_labels:,} labels and {count_synonyms:,} synonyms for {prefix} from {labelfname}: {get_memory_usage_summary()}') def get_synonyms(self,node): config = get_config() @@ -83,7 +84,7 @@ def get_synonyms(self,node): row = json.loads(line) self.common_synonyms[row['curie']].add((row['predicate'], row['synonym'])) count_common_file_synonyms += 1 - logger.info(f"Loaded {count_common_file_synonyms:,} common synonyms from {common_synonyms_path}") + logger.info(f"Loaded {count_common_file_synonyms:,} common synonyms from {common_synonyms_path}: {get_memory_usage_summary()}") node_synonyms = set() for ident in node['identifiers']: @@ -160,7 +161,7 @@ def __init__(self,rootdir): self.taxa = {} def load_taxa(self, prefix): - logger.info(f'Loading taxa for {prefix}') + logger.info(f'Loading taxa for {prefix}: {get_memory_usage_summary()}') taxa_per_prefix = defaultdict(set) taxafilename = os.path.join(self.root_dir, prefix, 'taxa') taxon_count = 0 @@ -171,7 +172,7 @@ def load_taxa(self, prefix): taxa_per_prefix[x[0]].add("\t".join(x[1:])) taxon_count += 1 self.taxa[prefix] = taxa_per_prefix - logger.info(f'Loaded {taxon_count} taxon-CURIE mappings for {prefix}') + logger.info(f'Loaded {taxon_count} taxon-CURIE mappings for {prefix}: {get_memory_usage_summary()}') def get_taxa(self, node): node_taxa = defaultdict(set) From 7dca09548e2a45c897bcdd4f01c12d914e67e09d Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Mon, 11 Aug 2025 14:02:44 -0400 Subject: [PATCH 088/167] Increased the frequency of updates. 
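The estimate being logged more often here is a simple linear extrapolation over the cliques processed so far; as a
standalone sketch (names mirror write_compendium(), the total is made up):

    import time
    from humanfriendly import format_timespan

    start_time = time.time_ns()
    total_slist = 1_000_000          # made-up total for illustration

    def progress_message(count_slist):
        # Guard against dividing by zero on the very first report.
        elapsed = max((time.time_ns() - start_time) / 1e9, 0.001)
        remaining = total_slist - count_slist
        rate = count_slist / elapsed                 # cliques per second
        eta = elapsed / count_slist * remaining      # seconds left at the current rate
        return (f"{count_slist:,}/{total_slist:,} cliques, "
                f"{rate:.2f} cliques/s, ~{format_timespan(eta)} remaining")

    print(progress_message(100_000))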
--- src/babel_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/babel_utils.py b/src/babel_utils.py index 53e5ab9a..5b9daa78 100644 --- a/src/babel_utils.py +++ b/src/babel_utils.py @@ -428,7 +428,7 @@ def write_compendium(metadata_yamls, synonym_list, ofname, node_type, labels=Non for slist in synonym_list: # Before we get started, let's estimate where we're at. count_slist += 1 - if count_slist % 1000000 == 0: + if count_slist % 100000 == 0: time_elapsed_seconds = (time.time_ns() - start_time) / 1E9 remaining_slist = total_slist - count_slist # count_slist --> time_elapsed_seconds From b9eb9980e7aa80ef9e0f888b8a0243b5a968fefe Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Mon, 11 Aug 2025 14:05:05 -0400 Subject: [PATCH 089/167] Tweaks. --- src/babel_utils.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/babel_utils.py b/src/babel_utils.py index 5b9daa78..a46dda77 100644 --- a/src/babel_utils.py +++ b/src/babel_utils.py @@ -422,7 +422,7 @@ def write_compendium(metadata_yamls, synonym_list, ofname, node_type, labels=Non with jsonlines.open(os.path.join(cdir,'compendia',ofname),'w') as outf, jsonlines.open(os.path.join(cdir,'synonyms',ofname),'w') as sfile: # Calculate an estimated time to completion. start_time = time.time_ns() - count_slist = 0 + count_slist = -1 # So that we display one when we start. total_slist = len(synonym_list) for slist in synonym_list: @@ -430,6 +430,9 @@ def write_compendium(metadata_yamls, synonym_list, ofname, node_type, labels=Non count_slist += 1 if count_slist % 100000 == 0: time_elapsed_seconds = (time.time_ns() - start_time) / 1E9 + if time_elapsed_seconds < 0.001: + # We don't want to divide by zero. + time_elapsed_seconds = 0.001 remaining_slist = total_slist - count_slist # count_slist --> time_elapsed_seconds # remaining_slist --> remaining_slist/count_slit*time_elapsed_seconds From cb57fdfd329609448917dc4eae9373fb4ca134e7 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Mon, 11 Aug 2025 14:16:14 -0400 Subject: [PATCH 090/167] Make loggers singletons (sort of). --- src/util.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/src/util.py b/src/util.py index f143a3b1..e838186d 100644 --- a/src/util.py +++ b/src/util.py @@ -18,6 +18,7 @@ from src.prefixes import OMIM, OMIMPS, UMLS, SNOMEDCT, KEGGPATHWAY, KEGGREACTION, NCIT, ICD10, ICD10CM, ICD11FOUNDATION import src.prefixes as prefixes +babel_loggers = {} def get_logger(name=None, level=logging.INFO): """ Get a logger with the specified name. @@ -25,13 +26,18 @@ def get_logger(name=None, level=logging.INFO): The LoggingUtil is inconsistently used, and we don't want rolling logs anyway -- just logging everything to STDOUT so that Snakemake can capture it is fine. However, we """ + if name is None: # TODO: what we really want to get here is the Snakemake job we're currently in, but that's tricky. name = f"{__name__} ({__file__})" + global babel_loggers + if name in babel_loggers: + return babel_loggers[name] + # Set up a logger. - logger = logging.getLogger(name) - logger.setLevel(level) + babel_logger = logging.getLogger(name) + babel_logger.setLevel(level) # Set up a formatter. We want to use UTC time. formatter = logging.Formatter('%(levelname)s %(name)s [%(asctime)s]: %(message)s') @@ -40,9 +46,11 @@ def get_logger(name=None, level=logging.INFO): # Set up a stream handler for STDERR. 
stream_handler = logging.StreamHandler(sys.stderr) stream_handler.setFormatter(formatter) - logger.addHandler(stream_handler) + babel_logger.addHandler(stream_handler) + + babel_loggers[name] = babel_logger - return logger + return babel_logger #loggers = {} class LoggingUtil(object): From 785430bbd86cfe544311ee96c0bc52e798c0268c Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Mon, 11 Aug 2025 15:31:13 -0400 Subject: [PATCH 091/167] Improved logger, output. --- src/babel_utils.py | 23 +++++++++++++++++------ src/createcompendia/protein.py | 6 +++--- src/node.py | 2 +- src/util.py | 33 +++++++++++---------------------- 4 files changed, 32 insertions(+), 32 deletions(-) diff --git a/src/babel_utils.py b/src/babel_utils.py index a46dda77..3680c82d 100644 --- a/src/babel_utils.py +++ b/src/babel_utils.py @@ -23,8 +23,8 @@ import sqlite3 from typing import List, Tuple -# Set up logger. -logger = get_logger() +# Set up a logger. +logger = get_logger(__name__) def make_local_name(fname,subpath=None): config = get_config() @@ -374,8 +374,7 @@ def write_compendium(metadata_yamls, synonym_list, ofname, node_type, labels=Non :param properties_files: (OPTIONAL) A list of SQLite3 files containing properties to be added to the output. :return: """ - logger.info(f"Starting write_compendium({metadata_yamls}, {len(synonym_list)} slists, {ofname}, {node_type}, {len(labels)} labels, {extra_prefixes}, {icrdf_filename}, {properties_jsonl_gz_files})") - logger.info(f" - Memory usage: {get_memory_usage_summary()}") + logger.info(f"Starting write_compendium({metadata_yamls}, {len(synonym_list)} slists, {ofname}, {node_type}, {len(labels)} labels, {extra_prefixes}, {icrdf_filename}, {properties_jsonl_gz_files}): {get_memory_usage_summary()}") if extra_prefixes is None: extra_prefixes = [] @@ -384,8 +383,11 @@ def write_compendium(metadata_yamls, synonym_list, ofname, node_type, labels=Non config = get_config() cdir = config['output_directory'] biolink_version = config['biolink_version'] + node_factory = NodeFactory(make_local_name(''),biolink_version) + logger.info(f"NodeFactory ready: {node_factory} with {get_memory_usage_summary()}") synonym_factory = SynonymFactory(make_local_name('')) + logger.info(f"SynonymFactory ready: {synonym_factory} with {get_memory_usage_summary()}") # Load the preferred_name_boost_prefixes -- this tells us which prefixes to boost when # coming up with a preferred label for a particular Biolink class. @@ -396,10 +398,16 @@ def write_compendium(metadata_yamls, synonym_list, ofname, node_type, labels=Non if not icrdf_filename: raise RuntimeError("No icrdf_filename parameter provided to write_compendium() -- this is required!") ic_factory = InformationContentFactory(icrdf_filename) + logger.info(f"InformationContentFactory ready: {ic_factory} with {get_memory_usage_summary()}") description_factory = DescriptionFactory(make_local_name('')) + logger.info(f"DescriptionFactory ready: {description_factory} with {get_memory_usage_summary()}") + taxon_factory = TaxonFactory(make_local_name('')) + logger.info(f"TaxonFactory ready: {taxon_factory} with {get_memory_usage_summary()}") + node_test = node_factory.create_node(input_identifiers=[],node_type=node_type,labels={},extra_prefixes = extra_prefixes) + logger.info(f"NodeFactory test complete: {node_test} with {get_memory_usage_summary()}") # Create compendia and synonyms directories, just in case they haven't been created yet. 
os.makedirs(os.path.join(cdir, 'compendia'), exist_ok=True) @@ -409,7 +417,10 @@ def write_compendium(metadata_yamls, synonym_list, ofname, node_type, labels=Non property_list = PropertyList() if properties_jsonl_gz_files: for properties_jsonl_gz_file in properties_jsonl_gz_files: + logger.info(f"Loading properties from {properties_jsonl_gz_file}...") property_list.add_properties_jsonl_gz(properties_jsonl_gz_file) + logger.info(f"Loaded {properties_jsonl_gz_file}") + logger.info(f"All property files loaded: {get_memory_usage_summary()}") property_source_count = defaultdict(int) @@ -422,13 +433,13 @@ def write_compendium(metadata_yamls, synonym_list, ofname, node_type, labels=Non with jsonlines.open(os.path.join(cdir,'compendia',ofname),'w') as outf, jsonlines.open(os.path.join(cdir,'synonyms',ofname),'w') as sfile: # Calculate an estimated time to completion. start_time = time.time_ns() - count_slist = -1 # So that we display one when we start. + count_slist = 0 total_slist = len(synonym_list) for slist in synonym_list: # Before we get started, let's estimate where we're at. count_slist += 1 - if count_slist % 100000 == 0: + if (count_slist == 1) or (count_slist % 100000 == 0): time_elapsed_seconds = (time.time_ns() - start_time) / 1E9 if time_elapsed_seconds < 0.001: # We don't want to divide by zero. diff --git a/src/createcompendia/protein.py b/src/createcompendia/protein.py index 920759d2..2901ff97 100644 --- a/src/createcompendia/protein.py +++ b/src/createcompendia/protein.py @@ -13,7 +13,7 @@ import os from src.util import get_memory_usage_summary, get_logger -logger = get_logger() +logger = get_logger(__name__) def extract_taxon_ids_from_uniprotkb(idmapping_filename, uniprotkb_taxa_filename): @@ -162,7 +162,7 @@ def build_protein_compendia(concordances, metadata_yamls, identifiers, icrdf_fil new_identifiers, new_types = read_identifier_file(ifile) glom(dicts, new_identifiers, unique_prefixes= uniques) types.update(new_types) - logger.info(f"Loaded identifier file {ifile}") + logger.info(f"Loaded identifier file {ifile}: {get_memory_usage_summary()}") logger.info(f"Finished loading identifiers, memory usage: {get_memory_usage_summary()}") for infile in concordances: logger.info(f"Loading concordance file {infile}") @@ -175,7 +175,7 @@ def build_protein_compendia(concordances, metadata_yamls, identifiers, icrdf_fil pairs.append(set([x[0], x[2]])) # print("glomming", infile) # This takes a while, but doesn't add much to the memory glom(dicts, pairs, unique_prefixes=uniques) - logger.info(f"Loaded concordance file {infile}") + logger.info(f"Loaded concordance file {infile}: {get_memory_usage_summary()}") logger.info(f"Finished loading concordances, memory usage: {get_memory_usage_summary()}") logger.info(f"Building gene sets") gene_sets = set([frozenset(x) for x in dicts.values()]) diff --git a/src/node.py b/src/node.py index c9eb3a78..6cc8bd88 100644 --- a/src/node.py +++ b/src/node.py @@ -16,7 +16,7 @@ from src.LabeledID import LabeledID from src.prefixes import PUBCHEMCOMPOUND -logger = get_logger() +logger = get_logger(__name__) class SynonymFactory: """ diff --git a/src/util.py b/src/util.py index e838186d..1cee4091 100644 --- a/src/util.py +++ b/src/util.py @@ -18,39 +18,28 @@ from src.prefixes import OMIM, OMIMPS, UMLS, SNOMEDCT, KEGGPATHWAY, KEGGREACTION, NCIT, ICD10, ICD10CM, ICD11FOUNDATION import src.prefixes as prefixes -babel_loggers = {} -def get_logger(name=None, level=logging.INFO): +def get_logger(name, loglevel=logging.INFO): """ Get a logger with the specified name. 
- The LoggingUtil is inconsistently used, and we don't want rolling logs anyway -- just logging everything to STDOUT - so that Snakemake can capture it is fine. However, we + The LoggingUtil is inconsistently used, and we don't want rolling logs anyway -- just logging everything to STDERR + so that Snakemake can capture it is fine. However, we do want every logger to be configured identically and without + duplicated handlers. """ - if name is None: - # TODO: what we really want to get here is the Snakemake job we're currently in, but that's tricky. - name = f"{__name__} ({__file__})" - - global babel_loggers - if name in babel_loggers: - return babel_loggers[name] - - # Set up a logger. - babel_logger = logging.getLogger(name) - babel_logger.setLevel(level) - - # Set up a formatter. We want to use UTC time. + # Set up the root handler for a logger. Ideally we would call this in one central location, but I'm not sure + # what they would be for Snakemake. basicConfig() should be safe to call from multiple threads after formatter = logging.Formatter('%(levelname)s %(name)s [%(asctime)s]: %(message)s') formatter.converter = gmtime - # Set up a stream handler for STDERR. stream_handler = logging.StreamHandler(sys.stderr) stream_handler.setFormatter(formatter) - babel_logger.addHandler(stream_handler) - - babel_loggers[name] = babel_logger + logging.basicConfig(level=logging.INFO, handlers=[stream_handler]) - return babel_logger + # Set up a logger for the specified loglevel and return it. + logger = logging.getLogger(name) + logger.setLevel(loglevel) + return logger #loggers = {} class LoggingUtil(object): From 9571698a96c191e3964cb5ed2c32ed0b1760b4cc Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Mon, 11 Aug 2025 15:33:16 -0400 Subject: [PATCH 092/167] Updated datefmt to be a bit more ISO8601ish. --- src/util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/util.py b/src/util.py index 1cee4091..51eeca68 100644 --- a/src/util.py +++ b/src/util.py @@ -29,7 +29,7 @@ def get_logger(name, loglevel=logging.INFO): # Set up the root handler for a logger. Ideally we would call this in one central location, but I'm not sure # what they would be for Snakemake. basicConfig() should be safe to call from multiple threads after - formatter = logging.Formatter('%(levelname)s %(name)s [%(asctime)s]: %(message)s') + formatter = logging.Formatter('%(levelname)s %(name)s [%(asctime)s]: %(message)s', "%Y-%m-%dT%H:%M:%S%z") formatter.converter = gmtime stream_handler = logging.StreamHandler(sys.stderr) From eb811482c0c1fe34c25a47f93ae7cc2e9b432ff1 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Mon, 11 Aug 2025 15:40:11 -0400 Subject: [PATCH 093/167] Only set up basicConfig() if we don't have any handlers. --- src/util.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/util.py b/src/util.py index 51eeca68..13e988b9 100644 --- a/src/util.py +++ b/src/util.py @@ -28,13 +28,15 @@ def get_logger(name, loglevel=logging.INFO): """ # Set up the root handler for a logger. Ideally we would call this in one central location, but I'm not sure - # what they would be for Snakemake. basicConfig() should be safe to call from multiple threads after - formatter = logging.Formatter('%(levelname)s %(name)s [%(asctime)s]: %(message)s', "%Y-%m-%dT%H:%M:%S%z") - formatter.converter = gmtime + # what they would be for Snakemake. 
basicConfig() should be safe to call from multiple threads after Python 3.2, but + # we might as well check. + if not logging.getLogger().hasHandlers(): + formatter = logging.Formatter('%(levelname)s %(name)s [%(asctime)s]: %(message)s', "%Y-%m-%dT%H:%M:%S%z") + formatter.converter = gmtime - stream_handler = logging.StreamHandler(sys.stderr) - stream_handler.setFormatter(formatter) - logging.basicConfig(level=logging.INFO, handlers=[stream_handler]) + stream_handler = logging.StreamHandler(sys.stderr) + stream_handler.setFormatter(formatter) + logging.basicConfig(level=logging.INFO, handlers=[stream_handler]) # Set up a logger for the specified loglevel and return it. logger = logging.getLogger(name) From d8e76f05b97fa498e1eed0bec6676b5ba520a01a Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Mon, 11 Aug 2025 16:19:33 -0400 Subject: [PATCH 094/167] Improved logging. --- src/babel_utils.py | 3 --- src/node.py | 2 +- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/src/babel_utils.py b/src/babel_utils.py index 3680c82d..7756a055 100644 --- a/src/babel_utils.py +++ b/src/babel_utils.py @@ -451,9 +451,6 @@ def write_compendium(metadata_yamls, synonym_list, ofname, node_type, labels=Non logger.info(f" - Current rate: {count_slist/time_elapsed_seconds:.2f} cliques/second or {time_elapsed_seconds/count_slist:.2f} seconds/clique.") time_remaining_seconds = (time_elapsed_seconds / count_slist * remaining_slist) - hours, remainder = divmod(time_remaining_seconds, 3600) - minutes, seconds = divmod(remainder, 60) - logger.info(f" - Estimated time remaining: {time_remaining_seconds:.2f} seconds ({hours:} hours, {minutes:02} minutes, {seconds:02} seconds)") logger.info(f" - Estimated time remaining: {format_timespan(time_remaining_seconds)}") # At this point, we insert any HAS_ADDITIONAL_ID properties we have. diff --git a/src/node.py b/src/node.py index 6cc8bd88..a0070b57 100644 --- a/src/node.py +++ b/src/node.py @@ -411,7 +411,7 @@ def apply_labels(self, input_identifiers, labels): continue self.common_labels[x[0]] = x[1] count_common_file_labels += 1 - logger.info(f"Loaded {count_common_file_labels} common labels from {common_labels_path}") + logger.info(f"Loaded {count_common_file_labels} common labels from {common_labels_path}: {get_memory_usage_summary()}") #Originally we needed to clean up the identifer lists, because there would be both labeledids and # string ids and we had to reconcile them. From f98983665612198b11cf65deba96ccfd4fe0893f Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Mon, 11 Aug 2025 17:16:45 -0400 Subject: [PATCH 095/167] Improved logging. 
--- src/babel_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/babel_utils.py b/src/babel_utils.py index 7756a055..83ec7cf8 100644 --- a/src/babel_utils.py +++ b/src/babel_utils.py @@ -447,7 +447,7 @@ def write_compendium(metadata_yamls, synonym_list, ofname, node_type, labels=Non remaining_slist = total_slist - count_slist # count_slist --> time_elapsed_seconds # remaining_slist --> remaining_slist/count_slit*time_elapsed_seconds - logger.info(f"Generating compendia and synonyms for {ofname} currently at {count_slist:,} out of {total_slist:,} ({count_slist/total_slist*100:.2f}%).") + logger.info(f"Generating compendia and synonyms for {ofname} currently at {count_slist:,} out of {total_slist:,} ({count_slist/total_slist*100:.2f}%): {get_memory_usage_summary()}") logger.info(f" - Current rate: {count_slist/time_elapsed_seconds:.2f} cliques/second or {time_elapsed_seconds/count_slist:.2f} seconds/clique.") time_remaining_seconds = (time_elapsed_seconds / count_slist * remaining_slist) From 4a01e0650af831c1fcc6ed2d0e2aaa912d6005cf Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Mon, 11 Aug 2025 18:23:53 -0400 Subject: [PATCH 096/167] Can we save memory by using dict[list] instead of defaultdict[set]? --- src/node.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/src/node.py b/src/node.py index a0070b57..912c68c6 100644 --- a/src/node.py +++ b/src/node.py @@ -162,24 +162,29 @@ def __init__(self,rootdir): def load_taxa(self, prefix): logger.info(f'Loading taxa for {prefix}: {get_memory_usage_summary()}') - taxa_per_prefix = defaultdict(set) + taxa_per_prefix = dict(list) taxafilename = os.path.join(self.root_dir, prefix, 'taxa') taxon_count = 0 if os.path.exists(taxafilename): with open(taxafilename, 'r') as inf: for line in inf: - x = line.strip().split('\t') - taxa_per_prefix[x[0]].add("\t".join(x[1:])) - taxon_count += 1 + x = line.strip().split('\t', 1) + curie = x[0] + taxon_id = x[1] + if curie not in taxa_per_prefix: + taxa_per_prefix[curie] = list() + if taxon_id not in taxa_per_prefix[curie]: + taxa_per_prefix[curie].add(taxon_id) + taxon_count += 1 self.taxa[prefix] = taxa_per_prefix - logger.info(f'Loaded {taxon_count} taxon-CURIE mappings for {prefix}: {get_memory_usage_summary()}') + logger.info(f'Loaded {taxon_count:,} taxon-CURIE mappings for {prefix}: {get_memory_usage_summary()}') def get_taxa(self, node): node_taxa = defaultdict(set) for ident in node['identifiers']: thisid = ident['identifier'] pref = thisid.split(':', 1)[0] - if not pref in self.taxa: + if pref not in self.taxa: self.load_taxa(pref) node_taxa[thisid].update(self.taxa[pref][thisid]) return node_taxa From fac33b333b5cc02201687c55bb93e69dce1f5746 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Mon, 11 Aug 2025 18:26:45 -0400 Subject: [PATCH 097/167] Some other improvements. 
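The question raised in the previous commit can be checked directly: for a mapping that is overwhelmingly one taxon
per CURIE, per-key set objects carry noticeably more overhead than short lists. A rough, self-contained comparison
(the row count and CURIEs are fabricated; only container overhead is measured):

    import sys
    from collections import defaultdict

    rows = [(f"UniProtKB:P{i:05d}", "NCBITaxon:9606") for i in range(100_000)]

    as_sets = defaultdict(set)        # old layout: curie -> set of taxa
    for curie, taxon in rows:
        as_sets[curie].add(taxon)

    as_lists = {}                     # proposed layout: curie -> list of taxa
    for curie, taxon in rows:
        values = as_lists.setdefault(curie, [])
        if taxon not in values:
            values.append(taxon)

    set_bytes = sum(sys.getsizeof(v) for v in as_sets.values())
    list_bytes = sum(sys.getsizeof(v) for v in as_lists.values())
    print(f"sets:  {set_bytes:,} bytes of container overhead")
    print(f"lists: {list_bytes:,} bytes of container overhead")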
--- src/node.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/node.py b/src/node.py index 912c68c6..191f674b 100644 --- a/src/node.py +++ b/src/node.py @@ -174,19 +174,19 @@ def load_taxa(self, prefix): if curie not in taxa_per_prefix: taxa_per_prefix[curie] = list() if taxon_id not in taxa_per_prefix[curie]: - taxa_per_prefix[curie].add(taxon_id) + taxa_per_prefix[curie].append(taxon_id) taxon_count += 1 self.taxa[prefix] = taxa_per_prefix logger.info(f'Loaded {taxon_count:,} taxon-CURIE mappings for {prefix}: {get_memory_usage_summary()}') def get_taxa(self, node): - node_taxa = defaultdict(set) + node_taxa = dict(set) for ident in node['identifiers']: thisid = ident['identifier'] pref = thisid.split(':', 1)[0] if pref not in self.taxa: self.load_taxa(pref) - node_taxa[thisid].update(self.taxa[pref][thisid]) + node_taxa[thisid] = set(self.taxa[pref][thisid]) return node_taxa From f6f3f11ed1fcd8600773677965d0d20aef1330a0 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Mon, 11 Aug 2025 19:17:51 -0400 Subject: [PATCH 098/167] Minor fixes. --- src/node.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/node.py b/src/node.py index 191f674b..ceb47fdd 100644 --- a/src/node.py +++ b/src/node.py @@ -180,10 +180,10 @@ def load_taxa(self, prefix): logger.info(f'Loaded {taxon_count:,} taxon-CURIE mappings for {prefix}: {get_memory_usage_summary()}') def get_taxa(self, node): - node_taxa = dict(set) + node_taxa = dict[str, set] for ident in node['identifiers']: thisid = ident['identifier'] - pref = thisid.split(':', 1)[0] + pref = Text.get_prefix(thisid) if pref not in self.taxa: self.load_taxa(pref) node_taxa[thisid] = set(self.taxa[pref][thisid]) @@ -416,7 +416,7 @@ def apply_labels(self, input_identifiers, labels): continue self.common_labels[x[0]] = x[1] count_common_file_labels += 1 - logger.info(f"Loaded {count_common_file_labels} common labels from {common_labels_path}: {get_memory_usage_summary()}") + logger.info(f"Loaded {count_common_file_labels:,} common labels from {common_labels_path}: {get_memory_usage_summary()}") #Originally we needed to clean up the identifer lists, because there would be both labeledids and # string ids and we had to reconcile them. From f3b93aca4f538607878e246572620d1f16dbcb0a Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Mon, 11 Aug 2025 22:57:45 -0400 Subject: [PATCH 099/167] A semi-complete TSVDuckDBLoader implementation. 
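At its core, the loader added below pushes each per-prefix TSV into a DuckDB table and answers lookups with indexed
SQL instead of holding the whole mapping in a Python dict. A minimal standalone sketch of that idea, reusing the same
read_csv call as the implementation (the path and CURIE are taken from the test block at the bottom of node.py and
are assumed to exist locally):

    import duckdb

    conn = duckdb.connect(":memory:")
    conn.execute(
        "CREATE TABLE taxa AS "
        "SELECT curie1, curie2 FROM read_csv($tsv, header=false, sep='\\t', "
        "column_names=['curie1', 'curie2'])",
        {"tsv": "babel_downloads/UniProtKB/taxa"},
    )
    conn.execute("CREATE INDEX curie1_idx ON taxa(curie1)")

    rows = conn.execute(
        "SELECT curie2 FROM taxa WHERE curie1 = ?", ["UniProtKB:I6L8L4"]
    ).fetchall()
    print({row[0] for row in rows})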
--- src/node.py | 146 +++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 144 insertions(+), 2 deletions(-) diff --git a/src/node.py b/src/node.py index ceb47fdd..b76230bb 100644 --- a/src/node.py +++ b/src/node.py @@ -1,9 +1,13 @@ +import itertools import json import os +import uuid from collections import defaultdict +from contextlib import AbstractContextManager from urllib.parse import urlparse import curies +import duckdb from src.util import ( Text, @@ -162,7 +166,7 @@ def __init__(self,rootdir): def load_taxa(self, prefix): logger.info(f'Loading taxa for {prefix}: {get_memory_usage_summary()}') - taxa_per_prefix = dict(list) + taxa_per_prefix = dict() taxafilename = os.path.join(self.root_dir, prefix, 'taxa') taxon_count = 0 if os.path.exists(taxafilename): @@ -180,7 +184,7 @@ def load_taxa(self, prefix): logger.info(f'Loaded {taxon_count:,} taxon-CURIE mappings for {prefix}: {get_memory_usage_summary()}') def get_taxa(self, node): - node_taxa = dict[str, set] + node_taxa = dict() for ident in node['identifiers']: thisid = ident['identifier'] pref = Text.get_prefix(thisid) @@ -190,6 +194,128 @@ def get_taxa(self, node): return node_taxa +class TSVDuckDBLoader(AbstractContextManager): + """ + All of the files we load here (SynonymFactory, DescriptionFactory, TaxonFactory and InformationContentFactory) + are TSV files in very similar formats (either <curie>\t<value> or <curie>\t<predicate>\t<value>). Some of these + TSV files are very large, so we don't want to load them all into memory at once. Instead, we use DuckDB to: + 1. Load them into DuckDB files (e.g. `UniProtKB/taxa` -> `UniProtKB/duckdbs/{random}.duckdb`), but without explicitly saving + them -- they're just there so that DuckDB can dump to disk if needed. There are a bunch of configuration + items so we can specify what kind of file we have. + 2. Query identifiers by identifier prefix. + 3. Close and delete the DuckDB files when we're done. + """ + + def __init__(self, download_dir, filename, file_format): + self.download_dir = download_dir + self.filename = filename + self.duckdbs = {} + self.duckdb_filenames = {} + + # We only support one format for now. + self.format = format + if file_format in {'curie-curie'}: + # Acceptable format! + pass + else: + raise ValueError(f"Unknown TSVDuckDBLoader file format: {file_format}") + + def __str__(self): + duckdb_counts = self.get_duckdb_counts() + duckdb_counts_str = ", ".join( + f"{prefix}: {count:,} rows" + for prefix, count in sorted(duckdb_counts.items(), key=lambda x: x[1], reverse=True) + ) + return f"TSVDuckDBLoader({self.download_dir}, {self.filename}, {self.format}) containing {len(self.duckdbs)} DuckDBs ({duckdb_counts_str})" + + def get_duckdb_counts(self): + counts = dict() + for prefix in self.duckdbs: + counts[prefix] = self.duckdbs[prefix].execute(f"SELECT COUNT(*) FROM {prefix}").fetchone()[0] + return counts + + def load_prefix(self, prefix): + if prefix in self.duckdbs: + # We've already loaded this prefix! + return True + + # Set up filenames. + tsv_filename = os.path.join(self.download_dir, prefix, self.filename) + + # If the TSV file doesn't exist, we don't need to do anything. + if not os.path.exists(tsv_filename): + return False + + # If we knew that only a single DuckDB process was going to load a prefix at a time (or -- even better -- that + # the DuckDB file was created by Snakemake before we got to this point), then we could simply open and reuse + # that DuckDB file between these jobs. 
Unfortunately, we have to account for the possibility that: + # 1. Multiple processes or threads might create overlapping TSVDuckDBLoaders on the same prefix, and + # 2. We don't know when the DuckDB file is completely loaded and therefore safe for another process to use. + # + # Luckily, there's an easy way to ensure that both criteria don't matter: give the DuckDB file a random name, + # and delete it once the TSVDuckDBLoader is done. + duckdbs_dir = os.path.join(self.download_dir, prefix, "duckdbs") + os.makedirs(duckdbs_dir, exist_ok=True) + duckdb_filename = os.path.join(str(duckdbs_dir), f"{prefix}_{self.filename}_{uuid.uuid4()}.duckdb") + + # Set up a DuckDB instance. + logger.info(f"Loading {prefix} into {duckdb_filename}...") + conn = duckdb.connect(duckdb_filename) + conn.execute(f"CREATE TABLE {prefix} AS SELECT curie1, curie2 FROM read_csv($tsv_filename, header=false, sep='\\t', column_names=['curie1', 'curie2'])", { + 'tsv_filename': tsv_filename, + }) + self.duckdb_filenames[prefix] = duckdb_filename + self.duckdbs[prefix] = conn + logger.info(f"Loaded {prefix} into {duckdb_filename}") + return True + + def get_curies(self, curies_to_query: list) -> dict[str, set[str]]: + results = defaultdict(set) + + curies_sorted_by_prefix = sorted(curies_to_query, key=lambda curie: Text.get_prefix(curie)) + curies_grouped_by_prefix = itertools.groupby(curies_sorted_by_prefix, key=lambda curie: Text.get_prefix(curie)) + for prefix, curies_group in curies_grouped_by_prefix: + curies = list(curies_group) + logger.info(f"Looking up {prefix} for {curies} curies") + if prefix not in self.duckdbs: + logger.info(f"No DuckDB for {prefix} found, attempting to load it.") + if not self.load_prefix(prefix): + # Nothing to load. + logger.warning(f"No DuckDB for {prefix} found, so can't query it for {curies}") + for curie in curies: + results[curie] = set() + continue + + # Query the DuckDB. + query = f"SELECT DISTINCT curie1, curie2 FROM {prefix} WHERE curie1 = ?" + for curie in curies: + query_result = self.duckdbs[prefix].execute(query, [curie]).fetchall() + if not query_result: + results[curie] = set() + continue + + for row in query_result: + curie1 = row[0] + curie2 = row[1] + results[curie1].add(curie2) + + return dict(results) + + def close(self): + """ + Close all of the DuckDB connections and delete the DuckDB files. 
+ """ + for prefix, db in self.duckdbs.items(): + db.close() + os.remove(self.duckdb_filenames[prefix]) + self.duckdbs = dict() + + def __del__(self): + self.close() + + def __exit__(self, exc_type, exc_val, exc_tb): + self.close() + class InformationContentFactory: """ @@ -565,3 +691,19 @@ def pubchemsort(pc_ids, labeled_ids): best_pubchem = pcelement pc_ids.remove(best_pubchem) return [best_pubchem] + pc_ids + +if __name__ == '__main__': + if False: + tf = TaxonFactory('babel_downloads/') + logger.info(f"Started: {get_memory_usage_summary()}") + result = tf.get_taxa({ + 'identifiers': [{'identifier': 'UniProtKB:I6L8L4'}, {'identifier': 'UniProtKB:C6H147'}], + }) + logger.info(f"Got result from {tf}: {result} with {get_memory_usage_summary()}") + del tf + + tsvdb = TSVDuckDBLoader('babel_downloads/', filename='taxa', file_format='curie-curie') + logger.info(f"Started TSVDuckDBLoader {tsvdb}: {get_memory_usage_summary()}") + result = tsvdb.get_curies(['UniProtKB:I6L8L4', 'UniProtKB:C6H147']) + logger.info(f"Got result from {tsvdb}: {result} with {get_memory_usage_summary()}") + tsvdb.close() From 7e043bd9940f4a81c5cbf5ae3b89801935210e9e Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Mon, 11 Aug 2025 23:02:12 -0400 Subject: [PATCH 100/167] Replaced TaxonFactory by wrapping a TSVDuckDBLoader. --- src/babel_utils.py | 4 ++++ src/node.py | 37 +++++++++---------------------------- 2 files changed, 13 insertions(+), 28 deletions(-) diff --git a/src/babel_utils.py b/src/babel_utils.py index 83ec7cf8..1d48c45c 100644 --- a/src/babel_utils.py +++ b/src/babel_utils.py @@ -641,6 +641,10 @@ def write_compendium(metadata_yamls, synonym_list, ofname, node_type, labels=Non combined_from_filenames=metadata_yamls, ) + # Close all the factories. + taxon_factory.close() + + def glom(conc_set, newgroups, unique_prefixes=['INCHIKEY'],pref='HP',close={}): """We want to construct sets containing equivalent identifiers. conc_set is a dictionary where the values are these equivalent identifier sets and diff --git a/src/node.py b/src/node.py index b76230bb..05c634f8 100644 --- a/src/node.py +++ b/src/node.py @@ -160,41 +160,22 @@ class TaxonFactory: """ A factory for loading taxa for CURIEs where available. 
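With TaxonFactory reduced to a thin wrapper, the call pattern for consumers such as write_compendium() looks roughly
like this (a sketch; the directory and identifier are the ones used in the node.py test block, and the factory must
now be closed explicitly):

    from src.node import TaxonFactory

    taxon_factory = TaxonFactory('babel_downloads/')
    node = {'identifiers': [{'identifier': 'UniProtKB:I6L8L4'}]}

    # get_taxa() now delegates to TSVDuckDBLoader.get_curies() under the hood.
    print(taxon_factory.get_taxa(node))

    # New responsibility: close the per-prefix DuckDB connections when done.
    taxon_factory.close()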
""" - def __init__(self,rootdir): + def __init__(self, rootdir): self.root_dir = rootdir - self.taxa = {} + self.tsvloader = TSVDuckDBLoader(rootdir, 'taxa', 'curie-curie') def load_taxa(self, prefix): - logger.info(f'Loading taxa for {prefix}: {get_memory_usage_summary()}') - taxa_per_prefix = dict() - taxafilename = os.path.join(self.root_dir, prefix, 'taxa') - taxon_count = 0 - if os.path.exists(taxafilename): - with open(taxafilename, 'r') as inf: - for line in inf: - x = line.strip().split('\t', 1) - curie = x[0] - taxon_id = x[1] - if curie not in taxa_per_prefix: - taxa_per_prefix[curie] = list() - if taxon_id not in taxa_per_prefix[curie]: - taxa_per_prefix[curie].append(taxon_id) - taxon_count += 1 - self.taxa[prefix] = taxa_per_prefix - logger.info(f'Loaded {taxon_count:,} taxon-CURIE mappings for {prefix}: {get_memory_usage_summary()}') + return self.tsvloader.load_prefix(prefix) def get_taxa(self, node): - node_taxa = dict() - for ident in node['identifiers']: - thisid = ident['identifier'] - pref = Text.get_prefix(thisid) - if pref not in self.taxa: - self.load_taxa(pref) - node_taxa[thisid] = set(self.taxa[pref][thisid]) - return node_taxa + curies = list({ident['identifier'] for ident in node['identifiers']}) + return self.tsvloader.get_curies(curies) + + def close(self): + self.tsvloader.close() -class TSVDuckDBLoader(AbstractContextManager): +class TSVDuckDBLoader: """ All of the files we load here (SynonymFactory, DescriptionFactory, TaxonFactory and InformationContentFactory) are TSV files in very similar formats (either <curie>\t<value> or <curie>\t<predicate>\t<value>). Some of these From a812db0f05ea70cbb4c7d4747530e1fb7e0e04a2 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Tue, 12 Aug 2025 00:32:04 -0400 Subject: [PATCH 101/167] Turned off some unnecessary log entries. --- src/node.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/node.py b/src/node.py index 05c634f8..8b5c763f 100644 --- a/src/node.py +++ b/src/node.py @@ -257,12 +257,12 @@ def get_curies(self, curies_to_query: list) -> dict[str, set[str]]: curies_grouped_by_prefix = itertools.groupby(curies_sorted_by_prefix, key=lambda curie: Text.get_prefix(curie)) for prefix, curies_group in curies_grouped_by_prefix: curies = list(curies_group) - logger.info(f"Looking up {prefix} for {curies} curies") + logger.debug(f"Looking up {prefix} for {curies} curies") if prefix not in self.duckdbs: - logger.info(f"No DuckDB for {prefix} found, attempting to load it.") + logger.debug(f"No DuckDB for {prefix} found, attempting to load it.") if not self.load_prefix(prefix): # Nothing to load. - logger.warning(f"No DuckDB for {prefix} found, so can't query it for {curies}") + logger.debug(f"No DuckDB for {prefix} found, so can't query it for {curies}") for curie in curies: results[curie] = set() continue From 3d20a59d06dd34b845c693e9cc1127cea1d8c563 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Tue, 12 Aug 2025 01:12:37 -0400 Subject: [PATCH 102/167] Commit database before querying it. 
--- src/node.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/node.py b/src/node.py index 8b5c763f..4f4d3ab8 100644 --- a/src/node.py +++ b/src/node.py @@ -245,6 +245,7 @@ def load_prefix(self, prefix): conn.execute(f"CREATE TABLE {prefix} AS SELECT curie1, curie2 FROM read_csv($tsv_filename, header=false, sep='\\t', column_names=['curie1', 'curie2'])", { 'tsv_filename': tsv_filename, }) + conn.commit() self.duckdb_filenames[prefix] = duckdb_filename self.duckdbs[prefix] = conn logger.info(f"Loaded {prefix} into {duckdb_filename}") From 287d0be412d716cee5eb2919dd7446415956a8ed Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Tue, 12 Aug 2025 01:19:16 -0400 Subject: [PATCH 103/167] Add index, optimize query. --- src/node.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/node.py b/src/node.py index 4f4d3ab8..2a46f338 100644 --- a/src/node.py +++ b/src/node.py @@ -245,6 +245,7 @@ def load_prefix(self, prefix): conn.execute(f"CREATE TABLE {prefix} AS SELECT curie1, curie2 FROM read_csv($tsv_filename, header=false, sep='\\t', column_names=['curie1', 'curie2'])", { 'tsv_filename': tsv_filename, }) + conn.execute(f"CREATE INDEX curie1_idx ON {prefix}(curie1)") conn.commit() self.duckdb_filenames[prefix] = duckdb_filename self.duckdbs[prefix] = conn @@ -269,7 +270,7 @@ def get_curies(self, curies_to_query: list) -> dict[str, set[str]]: continue # Query the DuckDB. - query = f"SELECT DISTINCT curie1, curie2 FROM {prefix} WHERE curie1 = ?" + query = f"SELECT curie1, curie2 FROM {prefix} WHERE curie1 = ?" for curie in curies: query_result = self.duckdbs[prefix].execute(query, [curie]).fetchall() if not query_result: From 32714a43815dfd88855f695ab56697bcc96357a3 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Tue, 12 Aug 2025 14:56:16 -0400 Subject: [PATCH 104/167] Changed DuckDB to be in-memory and sorted by curie1. --- src/node.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/node.py b/src/node.py index 2a46f338..f1bd3378 100644 --- a/src/node.py +++ b/src/node.py @@ -241,8 +241,8 @@ def load_prefix(self, prefix): # Set up a DuckDB instance. logger.info(f"Loading {prefix} into {duckdb_filename}...") - conn = duckdb.connect(duckdb_filename) - conn.execute(f"CREATE TABLE {prefix} AS SELECT curie1, curie2 FROM read_csv($tsv_filename, header=false, sep='\\t', column_names=['curie1', 'curie2'])", { + conn = duckdb.connect(":memory:") + conn.execute(f"CREATE TABLE {prefix} AS SELECT curie1, curie2 FROM read_csv($tsv_filename, header=false, sep='\\t', column_names=['curie1', 'curie2']) ORDER BY curie1", { 'tsv_filename': tsv_filename, }) conn.execute(f"CREATE INDEX curie1_idx ON {prefix}(curie1)") From f89e3774dad6c82f04ff9f21627c38bc7600d1f0 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Tue, 12 Aug 2025 14:57:34 -0400 Subject: [PATCH 105/167] Make curie1 case-insensitive. --- src/node.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/node.py b/src/node.py index f1bd3378..23cda5ad 100644 --- a/src/node.py +++ b/src/node.py @@ -242,7 +242,7 @@ def load_prefix(self, prefix): # Set up a DuckDB instance. 
logger.info(f"Loading {prefix} into {duckdb_filename}...") conn = duckdb.connect(":memory:") - conn.execute(f"CREATE TABLE {prefix} AS SELECT curie1, curie2 FROM read_csv($tsv_filename, header=false, sep='\\t', column_names=['curie1', 'curie2']) ORDER BY curie1", { + conn.execute(f"CREATE TABLE {prefix} AS SELECT UPPER(curie1_in) AS curie1, curie2 FROM read_csv($tsv_filename, header=false, sep='\\t', column_names=['curie1_in', 'curie2']) ORDER BY curie1", { 'tsv_filename': tsv_filename, }) conn.execute(f"CREATE INDEX curie1_idx ON {prefix}(curie1)") @@ -272,7 +272,7 @@ def get_curies(self, curies_to_query: list) -> dict[str, set[str]]: # Query the DuckDB. query = f"SELECT curie1, curie2 FROM {prefix} WHERE curie1 = ?" for curie in curies: - query_result = self.duckdbs[prefix].execute(query, [curie]).fetchall() + query_result = self.duckdbs[prefix].execute(query, [curie.upper()]).fetchall() if not query_result: results[curie] = set() continue From 99767ef890e6c70cd3a7d0492f67d45e7a98d536 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Tue, 12 Aug 2025 16:17:38 -0400 Subject: [PATCH 106/167] Fix: bug in the case-insensitive code. --- src/node.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/node.py b/src/node.py index 23cda5ad..ea131000 100644 --- a/src/node.py +++ b/src/node.py @@ -278,7 +278,7 @@ def get_curies(self, curies_to_query: list) -> dict[str, set[str]]: continue for row in query_result: - curie1 = row[0] + curie1 = curie curie2 = row[1] results[curie1].add(curie2) From c5635eb9f35068c807e90cc2cff9d474330c8110 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Tue, 12 Aug 2025 16:41:39 -0400 Subject: [PATCH 107/167] Replaced DuckDB loader with an SQLite loader. --- src/node.py | 94 +++++++++++++++++++++++++---------------------------- 1 file changed, 45 insertions(+), 49 deletions(-) diff --git a/src/node.py b/src/node.py index ea131000..ce4f4133 100644 --- a/src/node.py +++ b/src/node.py @@ -1,6 +1,7 @@ import itertools import json import os +import sqlite3 import uuid from collections import defaultdict from contextlib import AbstractContextManager @@ -162,7 +163,7 @@ class TaxonFactory: def __init__(self, rootdir): self.root_dir = rootdir - self.tsvloader = TSVDuckDBLoader(rootdir, 'taxa', 'curie-curie') + self.tsvloader = TSVSQLiteLoader(rootdir, 'taxa', 'curie-curie') def load_taxa(self, prefix): return self.tsvloader.load_prefix(prefix) @@ -175,23 +176,21 @@ def close(self): self.tsvloader.close() -class TSVDuckDBLoader: +class TSVSQLiteLoader: """ All of the files we load here (SynonymFactory, DescriptionFactory, TaxonFactory and InformationContentFactory) are TSV files in very similar formats (either <curie>\t<value> or <curie>\t<predicate>\t<value>). Some of these - TSV files are very large, so we don't want to load them all into memory at once. Instead, we use DuckDB to: - 1. Load them into DuckDB files (e.g. `UniProtKB/taxa` -> `UniProtKB/duckdbs/{random}.duckdb`), but without explicitly saving - them -- they're just there so that DuckDB can dump to disk if needed. There are a bunch of configuration - items so we can specify what kind of file we have. + TSV files are very large, so we don't want to load them all into memory at once. Instead, we use SQLite to: + 1. Load them into SQLite files. SQLite supports "temporary databases" (https://www.sqlite.org/inmemorydb.html) -- + the database is kept in memory, but data can spill onto the disk if the database gets large. 2. 
Query identifiers by identifier prefix. - 3. Close and delete the DuckDB files when we're done. + 3. Close and delete the SQLite files when we're done. """ def __init__(self, download_dir, filename, file_format): self.download_dir = download_dir self.filename = filename - self.duckdbs = {} - self.duckdb_filenames = {} + self.sqlites = {} # We only support one format for now. self.format = format @@ -199,24 +198,24 @@ def __init__(self, download_dir, filename, file_format): # Acceptable format! pass else: - raise ValueError(f"Unknown TSVDuckDBLoader file format: {file_format}") + raise ValueError(f"Unknown TSVSQLiteLoader file format: {file_format}") def __str__(self): - duckdb_counts = self.get_duckdb_counts() - duckdb_counts_str = ", ".join( + sqlite_counts = self.get_sqlite_counts() + sqlite_counts_str = ", ".join( f"{prefix}: {count:,} rows" - for prefix, count in sorted(duckdb_counts.items(), key=lambda x: x[1], reverse=True) + for prefix, count in sorted(sqlite_counts.items(), key=lambda x: x[1], reverse=True) ) - return f"TSVDuckDBLoader({self.download_dir}, {self.filename}, {self.format}) containing {len(self.duckdbs)} DuckDBs ({duckdb_counts_str})" + return f"TSVSQLiteLoader({self.download_dir}, {self.filename}, {self.format}) containing {len(self.sqlites)} SQLite DBs ({sqlite_counts_str})" - def get_duckdb_counts(self): + def get_sqlite_counts(self): counts = dict() - for prefix in self.duckdbs: - counts[prefix] = self.duckdbs[prefix].execute(f"SELECT COUNT(*) FROM {prefix}").fetchone()[0] + for prefix in self.sqlites: + counts[prefix] = self.sqlites[prefix].execute(f"SELECT COUNT(*) FROM {prefix}").fetchone()[0] return counts def load_prefix(self, prefix): - if prefix in self.duckdbs: + if prefix in self.sqlites: # We've already loaded this prefix! return True @@ -227,29 +226,27 @@ def load_prefix(self, prefix): if not os.path.exists(tsv_filename): return False - # If we knew that only a single DuckDB process was going to load a prefix at a time (or -- even better -- that - # the DuckDB file was created by Snakemake before we got to this point), then we could simply open and reuse - # that DuckDB file between these jobs. Unfortunately, we have to account for the possibility that: - # 1. Multiple processes or threads might create overlapping TSVDuckDBLoaders on the same prefix, and - # 2. We don't know when the DuckDB file is completely loaded and therefore safe for another process to use. - # - # Luckily, there's an easy way to ensure that both criteria don't matter: give the DuckDB file a random name, - # and delete it once the TSVDuckDBLoader is done. - duckdbs_dir = os.path.join(self.download_dir, prefix, "duckdbs") - os.makedirs(duckdbs_dir, exist_ok=True) - duckdb_filename = os.path.join(str(duckdbs_dir), f"{prefix}_{self.filename}_{uuid.uuid4()}.duckdb") - - # Set up a DuckDB instance. - logger.info(f"Loading {prefix} into {duckdb_filename}...") - conn = duckdb.connect(":memory:") - conn.execute(f"CREATE TABLE {prefix} AS SELECT UPPER(curie1_in) AS curie1, curie2 FROM read_csv($tsv_filename, header=false, sep='\\t', column_names=['curie1_in', 'curie2']) ORDER BY curie1", { - 'tsv_filename': tsv_filename, - }) + # Write to a SQLite in-memory database so we don't need to hold it in memory all at once. + logger.info(f"Loading {prefix} into SQLite: {get_memory_usage_summary()}") + # Setting a SQLite database as "" does exactly what we want: create an in-memory database that will spill onto + # a temporary file if needed. 
+ conn = sqlite3.connect('') + conn.execute(f"CREATE TABLE {prefix} (curie1 TEXT, curie2 TEXT)") conn.execute(f"CREATE INDEX curie1_idx ON {prefix}(curie1)") + + # Load taxa into memory. + logger.info(f"Loading taxa for {prefix} into memory: {get_memory_usage_summary()}") + records = [] + record_count = 0 + if os.path.exists(tsv_filename): + with open(tsv_filename, 'r') as inf: + for line in inf: + x = line.strip().split('\t', maxsplit=1) + records.append([x[0], x[1]]) + record_count += 1 + conn.executemany(f"INSERT INTO {prefix} VALUES (?, ?)", records) conn.commit() - self.duckdb_filenames[prefix] = duckdb_filename - self.duckdbs[prefix] = conn - logger.info(f"Loaded {prefix} into {duckdb_filename}") + logger.info(f"Loaded {record_count:,} taxa for {prefix} into SQLite: {get_memory_usage_summary()}") return True def get_curies(self, curies_to_query: list) -> dict[str, set[str]]: @@ -260,19 +257,19 @@ def get_curies(self, curies_to_query: list) -> dict[str, set[str]]: for prefix, curies_group in curies_grouped_by_prefix: curies = list(curies_group) logger.debug(f"Looking up {prefix} for {curies} curies") - if prefix not in self.duckdbs: - logger.debug(f"No DuckDB for {prefix} found, attempting to load it.") + if prefix not in self.sqlites: + logger.debug(f"No SQLite for {prefix} found, attempting to load it.") if not self.load_prefix(prefix): # Nothing to load. - logger.debug(f"No DuckDB for {prefix} found, so can't query it for {curies}") + logger.debug(f"No TSV file for {prefix} found, so can't query it for {curies}") for curie in curies: results[curie] = set() continue - # Query the DuckDB. + # Query the SQLite. query = f"SELECT curie1, curie2 FROM {prefix} WHERE curie1 = ?" for curie in curies: - query_result = self.duckdbs[prefix].execute(query, [curie.upper()]).fetchall() + query_result = self.sqlites[prefix].execute(query, [curie.upper()]).fetchall() if not query_result: results[curie] = set() continue @@ -286,12 +283,11 @@ def get_curies(self, curies_to_query: list) -> dict[str, set[str]]: def close(self): """ - Close all of the DuckDB connections and delete the DuckDB files. + Close all of the SQLite connections. """ - for prefix, db in self.duckdbs.items(): + for prefix, db in self.sqlites.items(): db.close() - os.remove(self.duckdb_filenames[prefix]) - self.duckdbs = dict() + self.sqlites = dict() def __del__(self): self.close() @@ -685,7 +681,7 @@ def pubchemsort(pc_ids, labeled_ids): logger.info(f"Got result from {tf}: {result} with {get_memory_usage_summary()}") del tf - tsvdb = TSVDuckDBLoader('babel_downloads/', filename='taxa', file_format='curie-curie') + tsvdb = TSVSQLiteLoader('babel_downloads/', filename='taxa', file_format='curie-curie') logger.info(f"Started TSVDuckDBLoader {tsvdb}: {get_memory_usage_summary()}") result = tsvdb.get_curies(['UniProtKB:I6L8L4', 'UniProtKB:C6H147']) logger.info(f"Got result from {tsvdb}: {result} with {get_memory_usage_summary()}") From 6eb2a3f06dd68a5fa2158772ff37a6b7fb0f0f9f Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Tue, 12 Aug 2025 16:42:20 -0400 Subject: [PATCH 108/167] Added sqlite3 to requirements. --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index d7e473e3..2129792b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -35,3 +35,4 @@ git+https://github.com/gaurav/apybiomart.git@change-check-url # Added by Gaurav, Aug 2025 to check for memory information while Babel is running. 
psutil humanfriendly +sqlite3 From ea2dae3a193df663cabb9700af8a41a24c793c90 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Tue, 12 Aug 2025 16:50:36 -0400 Subject: [PATCH 109/167] We should make the index after loading the data. --- src/node.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/node.py b/src/node.py index ce4f4133..8ed9be0c 100644 --- a/src/node.py +++ b/src/node.py @@ -232,7 +232,6 @@ def load_prefix(self, prefix): # a temporary file if needed. conn = sqlite3.connect('') conn.execute(f"CREATE TABLE {prefix} (curie1 TEXT, curie2 TEXT)") - conn.execute(f"CREATE INDEX curie1_idx ON {prefix}(curie1)") # Load taxa into memory. logger.info(f"Loading taxa for {prefix} into memory: {get_memory_usage_summary()}") @@ -245,6 +244,7 @@ def load_prefix(self, prefix): records.append([x[0], x[1]]) record_count += 1 conn.executemany(f"INSERT INTO {prefix} VALUES (?, ?)", records) + conn.execute(f"CREATE INDEX curie1_idx ON {prefix}(curie1)") conn.commit() logger.info(f"Loaded {record_count:,} taxa for {prefix} into SQLite: {get_memory_usage_summary()}") return True From 2bce0b1e9eb41f8aee767bd6baa948bd673f1399 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Tue, 12 Aug 2025 16:51:12 -0400 Subject: [PATCH 110/167] Simplified "test". --- src/node.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/src/node.py b/src/node.py index 8ed9be0c..5743bc6e 100644 --- a/src/node.py +++ b/src/node.py @@ -672,15 +672,6 @@ def pubchemsort(pc_ids, labeled_ids): return [best_pubchem] + pc_ids if __name__ == '__main__': - if False: - tf = TaxonFactory('babel_downloads/') - logger.info(f"Started: {get_memory_usage_summary()}") - result = tf.get_taxa({ - 'identifiers': [{'identifier': 'UniProtKB:I6L8L4'}, {'identifier': 'UniProtKB:C6H147'}], - }) - logger.info(f"Got result from {tf}: {result} with {get_memory_usage_summary()}") - del tf - tsvdb = TSVSQLiteLoader('babel_downloads/', filename='taxa', file_format='curie-curie') logger.info(f"Started TSVDuckDBLoader {tsvdb}: {get_memory_usage_summary()}") result = tsvdb.get_curies(['UniProtKB:I6L8L4', 'UniProtKB:C6H147']) From b9a7ffb8873446b661e22fb197bd23be263c763e Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Tue, 12 Aug 2025 17:03:02 -0400 Subject: [PATCH 111/167] Improved SQLite loader. --- src/node.py | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/src/node.py b/src/node.py index 5743bc6e..0d51c999 100644 --- a/src/node.py +++ b/src/node.py @@ -234,19 +234,27 @@ def load_prefix(self, prefix): conn.execute(f"CREATE TABLE {prefix} (curie1 TEXT, curie2 TEXT)") # Load taxa into memory. - logger.info(f"Loading taxa for {prefix} into memory: {get_memory_usage_summary()}") + logger.info(f"Reading records from {tsv_filename} into memory to load into SQLite: {get_memory_usage_summary()}") records = [] record_count = 0 - if os.path.exists(tsv_filename): - with open(tsv_filename, 'r') as inf: - for line in inf: - x = line.strip().split('\t', maxsplit=1) - records.append([x[0], x[1]]) - record_count += 1 + with open(tsv_filename, 'r') as inf: + for line in inf: + x = line.strip().split('\t', maxsplit=1) + records.append([x[0].upper(), x[1]]) + record_count += 1 + if len(records) % 1_000_000 == 0: + # Insert every 1,000,000 records. 
+ logger.info(f"Inserting {len(records):,} records from {tsv_filename} into SQLite: {get_memory_usage_summary()}") + conn.executemany(f"INSERT INTO {prefix} VALUES (?, ?)", records) + records = [] + + # Insert any remaining records. + logger.info(f"Inserting {len(records):,} records from {tsv_filename} into SQLite: {get_memory_usage_summary()}") conn.executemany(f"INSERT INTO {prefix} VALUES (?, ?)", records) + logger.info(f"Creating a case-insensitive index for the {record_count:,} records loaded into SQLite: {get_memory_usage_summary()}") conn.execute(f"CREATE INDEX curie1_idx ON {prefix}(curie1)") conn.commit() - logger.info(f"Loaded {record_count:,} taxa for {prefix} into SQLite: {get_memory_usage_summary()}") + logger.info(f"Loaded {record_count:,} records from {tsv_filename} into SQLite table {prefix}: {get_memory_usage_summary()}") return True def get_curies(self, curies_to_query: list) -> dict[str, set[str]]: From 9e1c7d51dca1b444e7960eb7ac9710d79fbc2c05 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Tue, 12 Aug 2025 17:10:20 -0400 Subject: [PATCH 112/167] Improved loader. --- src/node.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/node.py b/src/node.py index 0d51c999..2a06f789 100644 --- a/src/node.py +++ b/src/node.py @@ -242,9 +242,9 @@ def load_prefix(self, prefix): x = line.strip().split('\t', maxsplit=1) records.append([x[0].upper(), x[1]]) record_count += 1 - if len(records) % 1_000_000 == 0: - # Insert every 1,000,000 records. - logger.info(f"Inserting {len(records):,} records from {tsv_filename} into SQLite: {get_memory_usage_summary()}") + if len(records) % 10_000_000 == 0: + # Insert every 10,000,000 records. + logger.info(f"Inserting {len(records):,} records (total so far: {record_count:,}) from {tsv_filename} into SQLite: {get_memory_usage_summary()}") conn.executemany(f"INSERT INTO {prefix} VALUES (?, ?)", records) records = [] From af1fd80c54f72b417758f0f20f9fb897b5bc4b66 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Tue, 12 Aug 2025 17:27:42 -0400 Subject: [PATCH 113/167] We should save the SQLite so we can query it later. --- src/node.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/node.py b/src/node.py index 2a06f789..2d524d33 100644 --- a/src/node.py +++ b/src/node.py @@ -228,6 +228,7 @@ def load_prefix(self, prefix): # Write to a SQLite in-memory database so we don't need to hold it in memory all at once. logger.info(f"Loading {prefix} into SQLite: {get_memory_usage_summary()}") + # Setting a SQLite database as "" does exactly what we want: create an in-memory database that will spill onto # a temporary file if needed. conn = sqlite3.connect('') @@ -255,6 +256,7 @@ def load_prefix(self, prefix): conn.execute(f"CREATE INDEX curie1_idx ON {prefix}(curie1)") conn.commit() logger.info(f"Loaded {record_count:,} records from {tsv_filename} into SQLite table {prefix}: {get_memory_usage_summary()}") + self.sqlites[prefix] = conn return True def get_curies(self, curies_to_query: list) -> dict[str, set[str]]: From 2339e69e8614c499714900a248961d0891a56cc8 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Tue, 12 Aug 2025 17:35:40 -0400 Subject: [PATCH 114/167] Added a note. 
--- src/node.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/node.py b/src/node.py index 2d524d33..695b414f 100644 --- a/src/node.py +++ b/src/node.py @@ -185,6 +185,10 @@ class TSVSQLiteLoader: the database is kept in memory, but data can spill onto the disk if the database gets large. 2. Query identifiers by identifier prefix. 3. Close and delete the SQLite files when we're done. + + TODO: note that on Sterling, SQLite might not be able to detect when it's running out of memory (we have a limit + of around 500Gi, but the node will have 1.5Ti, so SQLite won't detect a low-mem situation correctly). We should + figure out how to configure that. """ def __init__(self, download_dir, filename, file_format): From 1dc03bc1842d5ef0a3446cb38bc303a46ffed818 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Tue, 12 Aug 2025 18:53:04 -0400 Subject: [PATCH 115/167] Set up overall TMPDIR setting so that SQLite temp files go there. Otherwise, they would go to /tmp, which is not on a mounted area and has limited space. --- Snakefile | 5 +++++ config.yaml | 1 + src/node.py | 3 --- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/Snakefile b/Snakefile index 0c595a9a..8465ebcf 100644 --- a/Snakefile +++ b/Snakefile @@ -20,6 +20,11 @@ include: "src/snakefiles/duckdb.snakefile" include: "src/snakefiles/reports.snakefile" include: "src/snakefiles/exports.snakefile" +# Some global settings. +import os +os.environ['TMPDIR'] = config['tmp_directory'] + +# Top-level rules. rule all: input: # See rule all_outputs later in this file for how we generate all the outputs. diff --git a/config.yaml b/config.yaml index 1c6621b3..0eaa6290 100644 --- a/config.yaml +++ b/config.yaml @@ -3,6 +3,7 @@ input_directory: input_data download_directory: babel_downloads intermediate_directory: babel_outputs/intermediate output_directory: babel_outputs +tmp_directory: babel_downloads/tmp # Versions that need to be updated on every release. biolink_version: "4.2.6-rc5" diff --git a/src/node.py b/src/node.py index 695b414f..f7403163 100644 --- a/src/node.py +++ b/src/node.py @@ -2,13 +2,10 @@ import json import os import sqlite3 -import uuid from collections import defaultdict -from contextlib import AbstractContextManager from urllib.parse import urlparse import curies -import duckdb from src.util import ( Text, From 025f6e3dd62211471c958e8896806346c6992838 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Tue, 12 Aug 2025 18:59:04 -0400 Subject: [PATCH 116/167] Slightly sped up prefix checks. --- src/node.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/node.py b/src/node.py index f7403163..6397b924 100644 --- a/src/node.py +++ b/src/node.py @@ -225,6 +225,7 @@ def load_prefix(self, prefix): # If the TSV file doesn't exist, we don't need to do anything. if not os.path.exists(tsv_filename): + self.sqlites[prefix] = None return False # Write to a SQLite in-memory database so we don't need to hold it in memory all at once. @@ -268,7 +269,7 @@ def get_curies(self, curies_to_query: list) -> dict[str, set[str]]: for prefix, curies_group in curies_grouped_by_prefix: curies = list(curies_group) logger.debug(f"Looking up {prefix} for {curies} curies") - if prefix not in self.sqlites: + if prefix not in self.sqlites and self.sqlites[prefix] is not None: logger.debug(f"No SQLite for {prefix} found, attempting to load it.") if not self.load_prefix(prefix): # Nothing to load. 
From edc6ec55550965989ec0855c52f7c959176ecb00 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Tue, 12 Aug 2025 21:07:03 -0400 Subject: [PATCH 117/167] Removed "sqlite3", which is a core module apparently. --- requirements.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 2129792b..d7e473e3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -35,4 +35,3 @@ git+https://github.com/gaurav/apybiomart.git@change-check-url # Added by Gaurav, Aug 2025 to check for memory information while Babel is running. psutil humanfriendly -sqlite3 From 1c18793045b372a07bddf4efd199f621292680bf Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Tue, 12 Aug 2025 21:29:36 -0400 Subject: [PATCH 118/167] Improved seconds/clique display. --- src/babel_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/babel_utils.py b/src/babel_utils.py index 1d48c45c..5f517db0 100644 --- a/src/babel_utils.py +++ b/src/babel_utils.py @@ -448,7 +448,7 @@ def write_compendium(metadata_yamls, synonym_list, ofname, node_type, labels=Non # count_slist --> time_elapsed_seconds # remaining_slist --> remaining_slist/count_slit*time_elapsed_seconds logger.info(f"Generating compendia and synonyms for {ofname} currently at {count_slist:,} out of {total_slist:,} ({count_slist/total_slist*100:.2f}%): {get_memory_usage_summary()}") - logger.info(f" - Current rate: {count_slist/time_elapsed_seconds:.2f} cliques/second or {time_elapsed_seconds/count_slist:.2f} seconds/clique.") + logger.info(f" - Current rate: {count_slist/time_elapsed_seconds:.2f} cliques/second or {time_elapsed_seconds/count_slist:.4f} seconds/clique.") time_remaining_seconds = (time_elapsed_seconds / count_slist * remaining_slist) logger.info(f" - Estimated time remaining: {format_timespan(time_remaining_seconds)}") From 26abcd8395f465e8b5a0371acd688d983eb10499 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Wed, 13 Aug 2025 00:43:13 -0400 Subject: [PATCH 119/167] Fixed a bug in checking if a prefix has already been loaded. --- src/node.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/node.py b/src/node.py index 6397b924..cdeb5333 100644 --- a/src/node.py +++ b/src/node.py @@ -269,14 +269,19 @@ def get_curies(self, curies_to_query: list) -> dict[str, set[str]]: for prefix, curies_group in curies_grouped_by_prefix: curies = list(curies_group) logger.debug(f"Looking up {prefix} for {curies} curies") - if prefix not in self.sqlites and self.sqlites[prefix] is not None: - logger.debug(f"No SQLite for {prefix} found, attempting to load it.") + if prefix not in self.sqlites: + logger.debug(f"No SQLite for {prefix} found, trying to load it.") if not self.load_prefix(prefix): # Nothing to load. logger.debug(f"No TSV file for {prefix} found, so can't query it for {curies}") for curie in curies: results[curie] = set() continue + if self.sqlites[prefix] is None: + logger.debug(f"No {self.filename} file for {prefix} found, so can't query it for {curies}") + for curie in curies: + results[curie] = set() + continue # Query the SQLite. query = f"SELECT curie1, curie2 FROM {prefix} WHERE curie1 = ?" From 37b6f08d72c4ad9da19d861c29032bb2682dfd2c Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Tue, 12 Aug 2025 21:29:36 -0400 Subject: [PATCH 120/167] Improved seconds/clique display. 
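To make the arithmetic in the hunk below concrete (the numbers are purely illustrative): if 2,000,000 cliques have been processed in 400 seconds, the rate is 2,000,000 / 400 = 5,000 cliques/second, i.e. 400 / 2,000,000 = 0.0002 seconds/clique, and with 8,000,000 cliques still to go the estimated time remaining is 400 / 2,000,000 * 8,000,000 = 1,600 seconds. At two decimal places the per-clique figure would print as 0.00, which is why the format is widened to four decimal places here (and to six in a later patch).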
--- src/babel_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/babel_utils.py b/src/babel_utils.py index 83ec7cf8..d1ecf0e1 100644 --- a/src/babel_utils.py +++ b/src/babel_utils.py @@ -448,7 +448,7 @@ def write_compendium(metadata_yamls, synonym_list, ofname, node_type, labels=Non # count_slist --> time_elapsed_seconds # remaining_slist --> remaining_slist/count_slit*time_elapsed_seconds logger.info(f"Generating compendia and synonyms for {ofname} currently at {count_slist:,} out of {total_slist:,} ({count_slist/total_slist*100:.2f}%): {get_memory_usage_summary()}") - logger.info(f" - Current rate: {count_slist/time_elapsed_seconds:.2f} cliques/second or {time_elapsed_seconds/count_slist:.2f} seconds/clique.") + logger.info(f" - Current rate: {count_slist/time_elapsed_seconds:.2f} cliques/second or {time_elapsed_seconds/count_slist:.4f} seconds/clique.") time_remaining_seconds = (time_elapsed_seconds / count_slist * remaining_slist) logger.info(f" - Estimated time remaining: {format_timespan(time_remaining_seconds)}") From 9d58f69dedf6d37823a75c58bfc896e57b4ebf18 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Tue, 12 Aug 2025 18:53:04 -0400 Subject: [PATCH 121/167] Added a TMPDIR placed within our downloads directory. --- Snakefile | 5 +++++ config.yaml | 1 + 2 files changed, 6 insertions(+) diff --git a/Snakefile b/Snakefile index 0c595a9a..8465ebcf 100644 --- a/Snakefile +++ b/Snakefile @@ -20,6 +20,11 @@ include: "src/snakefiles/duckdb.snakefile" include: "src/snakefiles/reports.snakefile" include: "src/snakefiles/exports.snakefile" +# Some global settings. +import os +os.environ['TMPDIR'] = config['tmp_directory'] + +# Top-level rules. rule all: input: # See rule all_outputs later in this file for how we generate all the outputs. diff --git a/config.yaml b/config.yaml index 1c6621b3..0eaa6290 100644 --- a/config.yaml +++ b/config.yaml @@ -3,6 +3,7 @@ input_directory: input_data download_directory: babel_downloads intermediate_directory: babel_outputs/intermediate output_directory: babel_outputs +tmp_directory: babel_downloads/tmp # Versions that need to be updated on every release. biolink_version: "4.2.6-rc5" From 448a5004d54429a6f54445f7897a730d93be203e Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Wed, 13 Aug 2025 17:59:41 -0400 Subject: [PATCH 122/167] Cache get_config() so we don't keep reparsing the YAML file. --- src/util.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/util.py b/src/util.py index 13e988b9..f06921d3 100644 --- a/src/util.py +++ b/src/util.py @@ -314,16 +314,22 @@ def to_named_tuple (type_name, d): return namedtuple(type_name, d.keys())(**d) +# Cache the config.yaml so we don't need to load it every time get_config() is called. +config_yaml = None def get_config(): """ Retrieve the configuration data from the 'config.yaml' file. :return: The configuration data loaded from the 'config.yaml' file.
""" + global config_yaml + if config_yaml is not None: + return config_yaml + cname = os.path.join(os.path.dirname(__file__),'..', 'config.yaml') with open(cname,'r') as yaml_file: - data = yaml.safe_load(yaml_file) - return data + config_yaml = yaml.safe_load(yaml_file) + return config_yaml def get_biolink_model_toolkit(biolink_version): From f61abf2d2c39598aabc858829f49c93bb4302c9f Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Wed, 13 Aug 2025 18:04:25 -0400 Subject: [PATCH 123/167] Prevent factories from running get_config() on every node. --- src/babel_utils.py | 2 +- src/node.py | 70 ++++++++++++++++++++++------------------------ 2 files changed, 35 insertions(+), 37 deletions(-) diff --git a/src/babel_utils.py b/src/babel_utils.py index d1ecf0e1..a946919f 100644 --- a/src/babel_utils.py +++ b/src/babel_utils.py @@ -625,7 +625,7 @@ def write_compendium(metadata_yamls, synonym_list, ofname, node_type, labels=Non print(node["type"]) print(node_factory.get_ancestors(node["type"])) traceback.print_exc() - exit() + raise ex # Write out the metadata.yaml file combining information from all the metadata.yaml files. write_combined_metadata( diff --git a/src/node.py b/src/node.py index a0070b57..d9139125 100644 --- a/src/node.py +++ b/src/node.py @@ -41,7 +41,24 @@ class SynonymFactory: def __init__(self,syndir): self.synonym_dir = syndir self.synonyms = {} - self.common_synonyms = None + self.config = get_config() + + # Load the common synonyms. + common_synonyms = defaultdict(set) + + for common_synonyms_file in self.config['common']['synonyms']: + common_synonyms_path = os.path.join(self.config['download_directory'], 'common', common_synonyms_file) + count_common_file_synonyms = 0 + with open(common_synonyms_path, 'r') as synonymsf: + # Note that these files may contain ANY prefix -- we should only fallback to this if we have no other + # option. + for line in synonymsf: + row = json.loads(line) + self.common_synonyms[row['curie']].add((row['predicate'], row['synonym'])) + count_common_file_synonyms += 1 + logger.info(f"Loaded {count_common_file_synonyms:,} common synonyms from {common_synonyms_path}: {get_memory_usage_summary()}") + + self.common_synonyms = common_synonyms logger.info(f"Created SynonymFactory for directory {syndir}") def load_synonyms(self,prefix): @@ -69,23 +86,6 @@ def load_synonyms(self,prefix): logger.info(f'Loaded {count_labels:,} labels and {count_synonyms:,} synonyms for {prefix} from {labelfname}: {get_memory_usage_summary()}') def get_synonyms(self,node): - config = get_config() - if self.common_synonyms is None: - # Load the common synonyms. - self.common_synonyms = defaultdict(set) - - for common_synonyms_file in config['common']['synonyms']: - common_synonyms_path = os.path.join(config['download_directory'], 'common', common_synonyms_file) - count_common_file_synonyms = 0 - with open(common_synonyms_path, 'r') as synonymsf: - # Note that these files may contain ANY prefix -- we should only fallback to this if we have no other - # option. 
- for line in synonymsf: - row = json.loads(line) - self.common_synonyms[row['curie']].add((row['predicate'], row['synonym'])) - count_common_file_synonyms += 1 - logger.info(f"Loaded {count_common_file_synonyms:,} common synonyms from {common_synonyms_path}: {get_memory_usage_summary()}") - node_synonyms = set() for ident in node['identifiers']: thisid = ident['identifier'] @@ -106,6 +106,22 @@ def __init__(self,rootdir): self.root_dir = rootdir self.descriptions = {} self.common_descriptions = None + + self.config = get_config() + common_descriptions = defaultdict(list) + for common_descriptions_file in self.config['common']['descriptions']: + common_descriptions_path = os.path.join(self.config['download_directory'], 'common', common_descriptions_file) + count_common_file_descriptions = 0 + with open(common_descriptions_path, 'r') as descriptionsf: + # Note that these files may contain ANY CURIE -- we should only fallback to this if we have no other + # option. + for line in descriptionsf: + row = json.loads(line) + self.common_descriptions[row['curie']].extend(row['descriptions']) + count_common_file_descriptions += 1 + logger.info(f"Loaded {count_common_file_descriptions} common descriptions from {common_descriptions_path}") + self.common_descriptions = common_descriptions + logger.info(f"Created DescriptionFactory for directory {rootdir}") def load_descriptions(self,prefix): @@ -123,24 +139,6 @@ def load_descriptions(self,prefix): logger.info(f'Loaded {desc_count:,} descriptions for {prefix}') def get_descriptions(self,node): - config = get_config() - if self.common_descriptions is None: - # Load the common synonyms. - self.common_descriptions = defaultdict(list) - - for common_descriptions_file in config['common']['descriptions']: - common_descriptions_path = os.path.join(config['download_directory'], 'common', common_descriptions_file) - count_common_file_descriptions = 0 - with open(common_descriptions_path, 'r') as descriptionsf: - # Note that these files may contain ANY CURIE -- we should only fallback to this if we have no other - # option. - for line in descriptionsf: - row = json.loads(line) - self.common_descriptions[row['curie']].extend(row['descriptions']) - count_common_file_descriptions += 1 - logger.info(f"Loaded {count_common_file_descriptions} common descriptions from {common_descriptions_path}") - - node_descriptions = defaultdict(set) for ident in node['identifiers']: thisid = ident['identifier'] From 9e2d8495fa3e6016757e0a5c7e70759feac0709a Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Wed, 13 Aug 2025 18:06:47 -0400 Subject: [PATCH 124/167] Added py-spy to requirements. --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index d7e473e3..c0e54939 100644 --- a/requirements.txt +++ b/requirements.txt @@ -35,3 +35,4 @@ git+https://github.com/gaurav/apybiomart.git@change-check-url # Added by Gaurav, Aug 2025 to check for memory information while Babel is running. psutil humanfriendly +py-spy From d938217cf15ff5bd947545233afd0f8e7a3792a8 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Wed, 13 Aug 2025 19:07:20 -0400 Subject: [PATCH 125/167] Fixed bug in accessible instance variable. --- src/node.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/node.py b/src/node.py index d9139125..c8c4c825 100644 --- a/src/node.py +++ b/src/node.py @@ -44,7 +44,7 @@ def __init__(self,syndir): self.config = get_config() # Load the common synonyms. 
- common_synonyms = defaultdict(set) + self.common_synonyms = defaultdict(set) for common_synonyms_file in self.config['common']['synonyms']: common_synonyms_path = os.path.join(self.config['download_directory'], 'common', common_synonyms_file) @@ -58,7 +58,6 @@ def __init__(self,syndir): count_common_file_synonyms += 1 logger.info(f"Loaded {count_common_file_synonyms:,} common synonyms from {common_synonyms_path}: {get_memory_usage_summary()}") - self.common_synonyms = common_synonyms logger.info(f"Created SynonymFactory for directory {syndir}") def load_synonyms(self,prefix): @@ -108,7 +107,7 @@ def __init__(self,rootdir): self.common_descriptions = None self.config = get_config() - common_descriptions = defaultdict(list) + self.common_descriptions = defaultdict(list) for common_descriptions_file in self.config['common']['descriptions']: common_descriptions_path = os.path.join(self.config['download_directory'], 'common', common_descriptions_file) count_common_file_descriptions = 0 @@ -120,7 +119,6 @@ def __init__(self,rootdir): self.common_descriptions[row['curie']].extend(row['descriptions']) count_common_file_descriptions += 1 logger.info(f"Loaded {count_common_file_descriptions} common descriptions from {common_descriptions_path}") - self.common_descriptions = common_descriptions logger.info(f"Created DescriptionFactory for directory {rootdir}") From c086eee387dc28fc3b6ac4fbc0dfc188cefacd25 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Wed, 13 Aug 2025 19:11:21 -0400 Subject: [PATCH 126/167] Cleaned up code. --- src/node.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/node.py b/src/node.py index c8c4c825..51f8f078 100644 --- a/src/node.py +++ b/src/node.py @@ -141,7 +141,7 @@ def get_descriptions(self,node): for ident in node['identifiers']: thisid = ident['identifier'] pref = thisid.split(':', 1)[0] - if not pref in self.descriptions: + if pref not in self.descriptions: self.load_descriptions(pref) node_descriptions[thisid].update( self.descriptions[pref][thisid] ) node_descriptions[thisid].update( self.common_descriptions.get(thisid, {}) ) @@ -175,7 +175,7 @@ def get_taxa(self, node): for ident in node['identifiers']: thisid = ident['identifier'] pref = thisid.split(':', 1)[0] - if not pref in self.taxa: + if pref not in self.taxa: self.load_taxa(pref) node_taxa[thisid].update(self.taxa[pref][thisid]) return node_taxa From 8644adc984ee7d08c59239de4e3fdc55e1e18cc4 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Thu, 14 Aug 2025 09:20:44 -0400 Subject: [PATCH 127/167] Add time elapsed to write_compendium(). 
--- src/babel_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/babel_utils.py b/src/babel_utils.py index a946919f..16e458c9 100644 --- a/src/babel_utils.py +++ b/src/babel_utils.py @@ -447,7 +447,7 @@ def write_compendium(metadata_yamls, synonym_list, ofname, node_type, labels=Non remaining_slist = total_slist - count_slist # count_slist --> time_elapsed_seconds # remaining_slist --> remaining_slist/count_slit*time_elapsed_seconds - logger.info(f"Generating compendia and synonyms for {ofname} currently at {count_slist:,} out of {total_slist:,} ({count_slist/total_slist*100:.2f}%): {get_memory_usage_summary()}") + logger.info(f"Generating compendia and synonyms for {ofname} currently at {count_slist:,} out of {total_slist:,} ({count_slist/total_slist*100:.2f}%) in {format_timespan(time_elapsed_seconds)}: {get_memory_usage_summary()}") logger.info(f" - Current rate: {count_slist/time_elapsed_seconds:.2f} cliques/second or {time_elapsed_seconds/count_slist:.4f} seconds/clique.") time_remaining_seconds = (time_elapsed_seconds / count_slist * remaining_slist) From eb1e24e440be843ec6850369ffda7e9eb783f4d4 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Thu, 14 Aug 2025 09:25:53 -0400 Subject: [PATCH 128/167] Increased sig digits for seconds/clique rate. --- src/babel_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/babel_utils.py b/src/babel_utils.py index 16e458c9..68e7e0b3 100644 --- a/src/babel_utils.py +++ b/src/babel_utils.py @@ -448,7 +448,7 @@ def write_compendium(metadata_yamls, synonym_list, ofname, node_type, labels=Non # count_slist --> time_elapsed_seconds # remaining_slist --> remaining_slist/count_slit*time_elapsed_seconds logger.info(f"Generating compendia and synonyms for {ofname} currently at {count_slist:,} out of {total_slist:,} ({count_slist/total_slist*100:.2f}%) in {format_timespan(time_elapsed_seconds)}: {get_memory_usage_summary()}") - logger.info(f" - Current rate: {count_slist/time_elapsed_seconds:.2f} cliques/second or {time_elapsed_seconds/count_slist:.4f} seconds/clique.") + logger.info(f" - Current rate: {count_slist/time_elapsed_seconds:.2f} cliques/second or {time_elapsed_seconds/count_slist:.6f} seconds/clique.") time_remaining_seconds = (time_elapsed_seconds / count_slist * remaining_slist) logger.info(f" - Estimated time remaining: {format_timespan(time_remaining_seconds)}") From a0416eb2f3994643062c3da943490bba404ee15f Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Thu, 14 Aug 2025 10:42:23 -0400 Subject: [PATCH 129/167] Fixed bug in TSVSQLiteLoader. --- src/node.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/node.py b/src/node.py index 8f87e833..d1fda547 100644 --- a/src/node.py +++ b/src/node.py @@ -299,7 +299,8 @@ def close(self): Close all of the SQLite connections. """ for prefix, db in self.sqlites.items(): - db.close() + if db is not None: + db.close() self.sqlites = dict() def __del__(self): From dcd9555e1657d809c768081b74bbd5690e602796 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Thu, 14 Aug 2025 11:01:27 -0400 Subject: [PATCH 130/167] Made the write_compendium log a little more configurable. 
--- src/babel_utils.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/babel_utils.py b/src/babel_utils.py index 68e7e0b3..af381152 100644 --- a/src/babel_utils.py +++ b/src/babel_utils.py @@ -23,6 +23,9 @@ import sqlite3 from typing import List, Tuple +# Configuration items +WRITE_COMPENDIUM_LOG_EVERY_X_CLIQUES = 1_000_000 + # Set up a logger. logger = get_logger(__name__) @@ -439,7 +442,7 @@ def write_compendium(metadata_yamls, synonym_list, ofname, node_type, labels=Non for slist in synonym_list: # Before we get started, let's estimate where we're at. count_slist += 1 - if (count_slist == 1) or (count_slist % 100000 == 0): + if (count_slist == 1) or (count_slist % WRITE_COMPENDIUM_LOG_EVERY_X_CLIQUES == 0): time_elapsed_seconds = (time.time_ns() - start_time) / 1E9 if time_elapsed_seconds < 0.001: # We don't want to divide by zero. From f3af42359ee5914e3e083227a990a24d4286e1db Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Thu, 14 Aug 2025 11:01:27 -0400 Subject: [PATCH 131/167] Made the write_compendium log a little more configurable. --- src/babel_utils.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/babel_utils.py b/src/babel_utils.py index ec41282a..237a5657 100644 --- a/src/babel_utils.py +++ b/src/babel_utils.py @@ -23,6 +23,9 @@ import sqlite3 from typing import List, Tuple +# Configuration items +WRITE_COMPENDIUM_LOG_EVERY_X_CLIQUES = 1_000_000 + # Set up a logger. logger = get_logger(__name__) @@ -439,7 +442,7 @@ def write_compendium(metadata_yamls, synonym_list, ofname, node_type, labels=Non for slist in synonym_list: # Before we get started, let's estimate where we're at. count_slist += 1 - if (count_slist == 1) or (count_slist % 100000 == 0): + if (count_slist == 1) or (count_slist % WRITE_COMPENDIUM_LOG_EVERY_X_CLIQUES == 0): time_elapsed_seconds = (time.time_ns() - start_time) / 1E9 if time_elapsed_seconds < 0.001: # We don't want to divide by zero. From e809642c64b2f594bdd3bc5663237a70ed9e39a6 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Thu, 14 Aug 2025 12:57:23 -0400 Subject: [PATCH 132/167] Replaced sources with tuples so that Property is hashable. --- src/properties.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/src/properties.py b/src/properties.py index 15fd2be3..a64c1e89 100644 --- a/src/properties.py +++ b/src/properties.py @@ -31,7 +31,7 @@ # and write these properties. # -@dataclass +@dataclass(frozen=True) class Property: """ A property value for a CURIE. 
@@ -40,7 +40,7 @@ class Property: curie: str predicate: str value: str - sources: list[str] = field(default_factory=list[str]) + sources: tuple[str] = field(default_factory=tuple) @staticmethod def valid_keys(): @@ -88,7 +88,7 @@ def to_json_line(self): 'curie': self.curie, 'predicate': self.predicate, 'value': self.value, - 'sources': self.sources, + 'sources': list(self.sources), }) + '\n' # @@ -169,3 +169,14 @@ def add_properties_jsonl_gz(self, filename_gz: str): props_to_add.add(Property.from_dict(json.loads(line), source=filename_gz)) return self.add_properties(props_to_add) + +if __name__ == '__main__': + pl = PropertyList() + ps = set[Property]() + ps.add(Property('A', HAS_ADDITIONAL_ID, 'B')) + ps.add(Property('A', HAS_ADDITIONAL_ID, 'C')) + ps.add(Property('A', HAS_ADDITIONAL_ID, 'D')) + ps.add(Property('A', HAS_ADDITIONAL_ID, 'C')) + pl.add_properties(ps) + print(pl.properties) + assert len(pl.properties) == 3 From ab7df9c8e117ebd20c57b331e648063ff0bc474e Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Thu, 14 Aug 2025 12:58:42 -0400 Subject: [PATCH 133/167] Renamed HAS_ADDITIONAL_ID to HAS_ALTERNATIVE_ID. --- src/babel_utils.py | 4 ++-- src/createcompendia/chemicals.py | 4 ++-- src/properties.py | 17 +++++++++-------- 3 files changed, 13 insertions(+), 12 deletions(-) diff --git a/src/babel_utils.py b/src/babel_utils.py index bb56e3ce..5cf53d7d 100644 --- a/src/babel_utils.py +++ b/src/babel_utils.py @@ -16,7 +16,7 @@ from src.metadata.provenance import write_combined_metadata from src.node import NodeFactory, SynonymFactory, DescriptionFactory, InformationContentFactory, TaxonFactory -from src.properties import PropertyList, HAS_ADDITIONAL_ID +from src.properties import PropertyList, HAS_ALTERNATIVE_ID from src.util import Text, get_config, get_memory_usage_summary, get_logger from src.LabeledID import LabeledID from collections import defaultdict @@ -462,7 +462,7 @@ def write_compendium(metadata_yamls, synonym_list, ofname, node_type, labels=Non identifier_list = [] for iid in slist: identifier_list.append(iid) - additional_curies = property_list.get_all(iid, HAS_ADDITIONAL_ID) + additional_curies = property_list.get_all(iid, HAS_ALTERNATIVE_ID) if additional_curies: for ac in additional_curies: if ac.curie not in slist: diff --git a/src/createcompendia/chemicals.py b/src/createcompendia/chemicals.py index ea422990..d871e50b 100644 --- a/src/createcompendia/chemicals.py +++ b/src/createcompendia/chemicals.py @@ -8,7 +8,7 @@ import ast import gzip -from src.properties import Property, HAS_ADDITIONAL_ID +from src.properties import Property, HAS_ALTERNATIVE_ID from src.metadata.provenance import write_concord_metadata, write_combined_metadata from src.ubergraph import UberGraph from src.prefixes import MESH, CHEBI, UNII, DRUGBANK, INCHIKEY, PUBCHEMCOMPOUND,GTOPDB, KEGGCOMPOUND, DRUGCENTRAL, CHEMBLCOMPOUND, UMLS, RXCUI @@ -497,7 +497,7 @@ def make_chebi_relations(sdf,dbx,outfile,propfile_gz,metadata_yaml): for secondary_id in secondary_ids: propf.write(Property( curie = cid, - predicate = HAS_ADDITIONAL_ID, + predicate = HAS_ALTERNATIVE_ID, value = secondary_id, sources = [f'Listed as a CHEBI secondary ID in the ChEBI SDF file ({sdf})'] ).to_json_line()) diff --git a/src/properties.py b/src/properties.py index a64c1e89..5a0c3101 100644 --- a/src/properties.py +++ b/src/properties.py @@ -17,13 +17,14 @@ # SUPPORTED PROPERTIES # -# HAS_ADDITIONAL_ID indicates -# - Used by write_compendia() to -HAS_ADDITIONAL_ID = 
'http://www.geneontology.org/formats/oboInOwl#hasAlternativeId' +# HAS_ALTERNATIVE_ID indicates that CURIE has an alternative ID that should be included in the clique, but NOT +# treated as part of the clique for the purposes of choosing the clique leader. This is used for e.g. ChEBI secondary +# IDs or other deprecated identifiers. +HAS_ALTERNATIVE_ID = 'http://www.geneontology.org/formats/oboInOwl#hasAlternativeId' # Properties currently supported in the property store in one set for validation. supported_predicates = { - HAS_ADDITIONAL_ID, + HAS_ALTERNATIVE_ID, } # @@ -173,10 +174,10 @@ def add_properties_jsonl_gz(self, filename_gz: str): if __name__ == '__main__': pl = PropertyList() ps = set[Property]() - ps.add(Property('A', HAS_ADDITIONAL_ID, 'B')) - ps.add(Property('A', HAS_ADDITIONAL_ID, 'C')) - ps.add(Property('A', HAS_ADDITIONAL_ID, 'D')) - ps.add(Property('A', HAS_ADDITIONAL_ID, 'C')) + ps.add(Property('A', HAS_ALTERNATIVE_ID, 'B')) + ps.add(Property('A', HAS_ALTERNATIVE_ID, 'C')) + ps.add(Property('A', HAS_ALTERNATIVE_ID, 'D')) + ps.add(Property('A', HAS_ALTERNATIVE_ID, 'C')) pl.add_properties(ps) print(pl.properties) assert len(pl.properties) == 3 From c0edf6e3db249ed1e578b036573e101c2e65ab69 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Thu, 14 Aug 2025 15:53:49 -0400 Subject: [PATCH 134/167] Added sources to test. --- src/properties.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/properties.py b/src/properties.py index 5a0c3101..efaf5310 100644 --- a/src/properties.py +++ b/src/properties.py @@ -174,7 +174,7 @@ def add_properties_jsonl_gz(self, filename_gz: str): if __name__ == '__main__': pl = PropertyList() ps = set[Property]() - ps.add(Property('A', HAS_ALTERNATIVE_ID, 'B')) + ps.add(Property('A', HAS_ALTERNATIVE_ID, 'B', sources=['E', 'F'])) ps.add(Property('A', HAS_ALTERNATIVE_ID, 'C')) ps.add(Property('A', HAS_ALTERNATIVE_ID, 'D')) ps.add(Property('A', HAS_ALTERNATIVE_ID, 'C')) From b6eb1ec3fa7f07be3ae4b6faa2f43110d4d6a2e6 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Thu, 14 Aug 2025 16:07:57 -0400 Subject: [PATCH 135/167] Made sources into a single string. --- src/createcompendia/chemicals.py | 2 +- src/properties.py | 15 ++++++--------- 2 files changed, 7 insertions(+), 10 deletions(-) diff --git a/src/createcompendia/chemicals.py b/src/createcompendia/chemicals.py index d871e50b..ac8f8f3e 100644 --- a/src/createcompendia/chemicals.py +++ b/src/createcompendia/chemicals.py @@ -499,7 +499,7 @@ def make_chebi_relations(sdf,dbx,outfile,propfile_gz,metadata_yaml): curie = cid, predicate = HAS_ALTERNATIVE_ID, value = secondary_id, - sources = [f'Listed as a CHEBI secondary ID in the ChEBI SDF file ({sdf})'] + source = f'Listed as a CHEBI secondary ID in the ChEBI SDF file ({sdf})' ).to_json_line()) if kk in props: outf.write(f'{cid}\txref\t{KEGGCOMPOUND}:{props[kk]}\n') diff --git a/src/properties.py b/src/properties.py index efaf5310..82b9a1c5 100644 --- a/src/properties.py +++ b/src/properties.py @@ -41,11 +41,12 @@ class Property: curie: str predicate: str value: str - sources: tuple[str] = field(default_factory=tuple) + source: str = "" # TODO: making this a list would be better, but that would make a Property non-frozen, which + # would make it harder to uniquify. 
@staticmethod def valid_keys(): - return ['curie', 'predicate', 'value', 'sources'] + return ['curie', 'predicate', 'value', 'source'] def __post_init__(self): """ @@ -70,10 +71,6 @@ def from_dict(prop_dict, source=None): raise ValueError(f'Unexpected keys in dictionary to be converted to Property ({unexpected_keys}): {json.dumps(prop_dict, sort_keys=True, indent=2)}') prop = Property(**prop_dict) - if source is not None: - # Add the source to the end of the sources list. - prop.sources.append(source) - return prop # TODO: we should have some validation code in here so people don't make nonsense properties, which means @@ -89,11 +86,11 @@ def to_json_line(self): 'curie': self.curie, 'predicate': self.predicate, 'value': self.value, - 'sources': list(self.sources), + 'source': list(self.source), }) + '\n' # -# The PropertyList object can be used to load and query properties from multiple sources. +# The PropertyList object can be used to load and query properties from a particular source. # # We could write them into a DuckDB file as we load them so they can overflow onto disk as needed, but that's overkill # for right now, so we'll just load them all into memory. @@ -174,7 +171,7 @@ def add_properties_jsonl_gz(self, filename_gz: str): if __name__ == '__main__': pl = PropertyList() ps = set[Property]() - ps.add(Property('A', HAS_ALTERNATIVE_ID, 'B', sources=['E', 'F'])) + ps.add(Property('A', HAS_ALTERNATIVE_ID, 'B', source='E and F')) ps.add(Property('A', HAS_ALTERNATIVE_ID, 'C')) ps.add(Property('A', HAS_ALTERNATIVE_ID, 'D')) ps.add(Property('A', HAS_ALTERNATIVE_ID, 'C')) From 1f5576f3eb78e7cb544e3d46f3a995853edf8e40 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Fri, 15 Aug 2025 23:09:28 -0400 Subject: [PATCH 136/167] Fixed bug in converting sources into a list. --- src/properties.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/properties.py b/src/properties.py index 82b9a1c5..b021cf24 100644 --- a/src/properties.py +++ b/src/properties.py @@ -86,7 +86,7 @@ def to_json_line(self): 'curie': self.curie, 'predicate': self.predicate, 'value': self.value, - 'source': list(self.source), + 'source': self.source, }) + '\n' # From ed818181b28c8070a9da7f50232d67fee13ecdc1 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Sat, 16 Aug 2025 12:21:54 -0400 Subject: [PATCH 137/167] Fixed bug in metadata_yamls where a single item was passed in. 
--- src/createcompendia/chemicals.py | 2 +- src/metadata/provenance.py | 2 +- src/snakefiles/chemical.snakefile | 4 +++- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/src/createcompendia/chemicals.py b/src/createcompendia/chemicals.py index ac8f8f3e..40fa3c38 100644 --- a/src/createcompendia/chemicals.py +++ b/src/createcompendia/chemicals.py @@ -610,7 +610,7 @@ def get_wikipedia_relationships(outfile, metadata_yaml): concord_filename=outfile, ) -def build_untyped_compendia(concordances, identifiers,unichem_partial, untyped_concord, type_file, metadata_yaml, input_metadata_yamls): +def build_untyped_compendia(concordances, identifiers, unichem_partial, untyped_concord, type_file, metadata_yaml, input_metadata_yamls): """:concordances: a list of files from which to read relationships :identifiers: a list of files from which to read identifiers and optional categories""" dicts = read_partial_unichem(unichem_partial) diff --git a/src/metadata/provenance.py b/src/metadata/provenance.py index 4e83db98..7e7cf100 100644 --- a/src/metadata/provenance.py +++ b/src/metadata/provenance.py @@ -55,7 +55,7 @@ def write_concord_metadata(filename, *, name, concord_filename, url='', descript write_metadata(filename, 'concord', name, url=url, description=description, sources=sources, counts=counts) -def write_combined_metadata(filename, typ, name, *, sources=None, url='', description='', counts=None, combined_from_filenames=None, also_combined_from=None): +def write_combined_metadata(filename, typ, name, *, sources=None, url='', description='', counts=None, combined_from_filenames:list[str]=None, also_combined_from=None): combined_from = {} if combined_from_filenames is not None: for metadata_yaml in combined_from_filenames: diff --git a/src/snakefiles/chemical.snakefile b/src/snakefiles/chemical.snakefile index fccccaf2..15446cd9 100644 --- a/src/snakefiles/chemical.snakefile +++ b/src/snakefiles/chemical.snakefile @@ -230,7 +230,9 @@ rule chemical_compendia: input: typesfile = config['intermediate_directory'] + '/chemicals/partials/types', untyped_file = config['intermediate_directory'] + '/chemicals/partials/untyped_compendium', - metadata_yamls = config['intermediate_directory'] + '/chemicals/partials/metadata-untyped_compendium.yaml', + metadata_yamls = [ + config['intermediate_directory'] + '/chemicals/partials/metadata-untyped_compendium.yaml' + ], properties_jsonl_gz = [ config['intermediate_directory'] + '/chemicals/properties/get_chebi_concord.jsonl.gz' ], From c2403c3e0842bfcd518ece487c15bfb215802672 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Sat, 16 Aug 2025 12:25:44 -0400 Subject: [PATCH 138/167] write_concord_metadata() now handles single YAML input files. 
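Why this class of fix keeps coming up: iterating over a bare path string in Python yields single characters rather than filenames, so a Snakemake input declared as one string quietly misbehaves once downstream code loops over it expecting a list. A minimal illustration with a made-up path:

    single_path = 'chemicals/partials/metadata-untyped_compendium.yaml'

    # Passed as a bare string, a for-loop sees characters, not filenames:
    assert [x for x in single_path][:3] == ['c', 'h', 'e']

    # Wrapped in a one-element list, the loop sees the whole filename:
    assert [x for x in [single_path]] == [single_path]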
--- src/metadata/provenance.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/metadata/provenance.py b/src/metadata/provenance.py index 7e7cf100..1e22c201 100644 --- a/src/metadata/provenance.py +++ b/src/metadata/provenance.py @@ -1,5 +1,6 @@ import logging import os.path +import traceback from collections import defaultdict from datetime import datetime @@ -58,6 +59,10 @@ def write_concord_metadata(filename, *, name, concord_filename, url='', descript def write_combined_metadata(filename, typ, name, *, sources=None, url='', description='', counts=None, combined_from_filenames:list[str]=None, also_combined_from=None): combined_from = {} if combined_from_filenames is not None: + if isinstance(combined_from_filenames, str): + logging.warning(f"write_combined_metadata() got a single string for combined_from_files ('{combined_from_filenames}'), converting to a single item list, at: " + f"{''.join(traceback.format_stack())}") + combined_from_filenames = [combined_from_filenames] for metadata_yaml in combined_from_filenames: with open(metadata_yaml, 'r') as metaf: metadata_block = yaml.safe_load(metaf) From 297d47f853382d0d741c8437f5ef360413e9b889 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Sat, 16 Aug 2025 21:39:49 -0400 Subject: [PATCH 139/167] Modified behavior when node_factory.create_node() fails. --- src/babel_utils.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/babel_utils.py b/src/babel_utils.py index 5cf53d7d..d2484b7a 100644 --- a/src/babel_utils.py +++ b/src/babel_utils.py @@ -472,7 +472,11 @@ def write_compendium(metadata_yamls, synonym_list, ofname, node_type, labels=Non node = node_factory.create_node(input_identifiers=slist, node_type=node_type,labels = labels, extra_prefixes = extra_prefixes) if node is None: - raise RuntimeError(f"Could not create node for ({slist}, {node_type}, {labels}, {extra_prefixes}): returned None.") + # This usually happens because every CURIE in the node is not in the id_prefixes list for that node_type. + # Something to fix at some point, but we don't want to break the pipeline for this, so + # we emit a warning and skip this clique. + logger.warning(f"Could not create node for ({slist}, {node_type}, {labels}, {extra_prefixes}): returned None.") + continue else: count_cliques += 1 count_eq_ids += len(slist) From ec6dae301f88e9e3b155508c5c70f92c9ac11a15 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Sun, 17 Aug 2025 11:26:09 -0400 Subject: [PATCH 140/167] Fixed input_yamls (prev dict, now just a list of filenames). 
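The traceback.format_stack() call in the guard above is what makes the warning actionable, since it records which rule or helper passed the bare string. A small, hypothetical stand-alone version of the same pattern (names are illustrative, not the project's API):

    import logging
    import traceback

    def normalise_filenames(filenames):
        # Accept a single path for convenience, but warn and record the call site.
        if isinstance(filenames, str):
            logging.warning("Got a single string (%r); wrapping it in a list. Called from:\n%s",
                            filenames, ''.join(traceback.format_stack()))
            filenames = [filenames]
        return filenames

    assert normalise_filenames('metadata.yaml') == ['metadata.yaml']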
--- src/snakefiles/drugchemical.snakefile | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/snakefiles/drugchemical.snakefile b/src/snakefiles/drugchemical.snakefile index f9748aaa..7e99392d 100644 --- a/src/snakefiles/drugchemical.snakefile +++ b/src/snakefiles/drugchemical.snakefile @@ -59,11 +59,12 @@ rule drugchemical_conflation: input.chemical_compendia, input.icrdf_filename, output.outfile, - input_metadata_yamls={ - 'RXNORM': input.rxnorm_metadata, - 'UMLS': input.umls_metadata, - 'PUBCHEM_RXNORM': input.pubchem_metadata, - }, output_metadata_yaml=output.metadata_yaml) + input_metadata_yamls=[ + input.rxnorm_metadata, + input.umls_metadata, + input.pubchem_metadata + ], + output_metadata_yaml=output.metadata_yaml) rule drugchemical_conflated_synonyms: input: From e07776bbf18b03d6cbac00869604d8a8950c5dac Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Sun, 17 Aug 2025 17:55:41 -0400 Subject: [PATCH 141/167] Marked synoynms/Publication.txt as temp(). --- src/snakefiles/publications.snakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/snakefiles/publications.snakefile b/src/snakefiles/publications.snakefile index 687c6b2b..27522ce8 100644 --- a/src/snakefiles/publications.snakefile +++ b/src/snakefiles/publications.snakefile @@ -57,7 +57,7 @@ rule generate_pubmed_compendia: metadata_yaml = config['intermediate_directory'] + '/publications/concords/metadata.yaml', icrdf_filename=config['download_directory'] + '/icRDF.tsv', output: - publication_compendium = config['output_directory'] + '/compendia/Publication.txt', + publication_compendium = temp(config['output_directory'] + '/compendia/Publication.txt'), # We generate an empty Publication Synonyms files, but we still need to generate one. publication_synonyms_gz = config['output_directory'] + '/synonyms/Publication.txt.gz', run: From 6b6a5161769a14bdd8687de85afa708803eb2707 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Sun, 17 Aug 2025 18:02:45 -0400 Subject: [PATCH 142/167] Improved error message when a directory content test fails. --- src/reports/compendia_per_file_reports.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/reports/compendia_per_file_reports.py b/src/reports/compendia_per_file_reports.py index ebe0f1de..9a69ee1c 100644 --- a/src/reports/compendia_per_file_reports.py +++ b/src/reports/compendia_per_file_reports.py @@ -20,28 +20,28 @@ def get_datetime_as_string(): return datetime.now().isoformat() -def assert_files_in_directory(dir, files, report_file): +def assert_files_in_directory(dir, expected_files, report_file): """ Asserts that the list of files in a given directory are the list of files provided. :param dir: The directory to check files in. - :param files: The files to compare the list against. + :param expected_files: The files to compare the list against. :param report_file: Write a report to this file. We assume that this file is not intended to be read, but is created so that we can check this assertion has been checked. """ - logging.info(f"Expect files in directory {dir} to be equal to {files}") all_file_list = os.listdir(dir) # On Sterling, we sometimes have `.nfs*` files that represent NFS cached files that weren't properly deleted. # These shouldn't interfere with these tests. 
file_list = filter(lambda fn: not fn.startswith('.nfs'), all_file_list) - assert set(file_list) == set(files) + assert set(file_list) == set(expected_files), f"Expected files in directory {dir} to be equal to {expected_files} but found {file_list}: " + \ + f"{set(file_list) - set(expected_files)} added, {set(expected_files) - set(file_list)} missing." # If we passed, write the output to the check_file. with open(report_file, "w") as f: - f.write(f"Confirmed that {dir} contains only the files {files} at {get_datetime_as_string()}\n") + f.write(f"Confirmed that {dir} contains only the files {expected_files} at {get_datetime_as_string()}\n") def generate_content_report_for_compendium(compendium_path, report_path): From d2ad9c81ae3b09a78b185933541aab58eb62d2af Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Sun, 17 Aug 2025 18:04:00 -0400 Subject: [PATCH 143/167] Fixed bug in error output. --- src/reports/compendia_per_file_reports.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/reports/compendia_per_file_reports.py b/src/reports/compendia_per_file_reports.py index 9a69ee1c..531a4bca 100644 --- a/src/reports/compendia_per_file_reports.py +++ b/src/reports/compendia_per_file_reports.py @@ -36,7 +36,7 @@ def assert_files_in_directory(dir, expected_files, report_file): # These shouldn't interfere with these tests. file_list = filter(lambda fn: not fn.startswith('.nfs'), all_file_list) - assert set(file_list) == set(expected_files), f"Expected files in directory {dir} to be equal to {expected_files} but found {file_list}: " + \ + assert set(file_list) == set(expected_files), f"Expected files in directory {dir} to be equal to {expected_files} but found {set(file_list)}: " + \ f"{set(file_list) - set(expected_files)} added, {set(expected_files) - set(file_list)} missing." # If we passed, write the output to the check_file. From e8ed0508038e8b2bf60e78a8a9cf8bce1c9257b9 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Sun, 17 Aug 2025 18:06:14 -0400 Subject: [PATCH 144/167] Moved DrugChemical metadata into metadata/ --- src/snakefiles/drugchemical.snakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/snakefiles/drugchemical.snakefile b/src/snakefiles/drugchemical.snakefile index 7e99392d..6265964e 100644 --- a/src/snakefiles/drugchemical.snakefile +++ b/src/snakefiles/drugchemical.snakefile @@ -48,7 +48,7 @@ rule drugchemical_conflation: icrdf_filename=config['download_directory']+'/icRDF.tsv', output: outfile=config['output_directory']+'/conflation/DrugChemical.txt', - metadata_yaml=config['output_directory']+'/conflation/metadata.yaml', + metadata_yaml=config['output_directory']+'/metadata/DrugChemical.yaml', run: drugchemical.build_conflation( input.drugchemical_manual_concord, From ec78408d558a58a1aa4025add12730b08bfac227 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Sun, 17 Aug 2025 18:09:27 -0400 Subject: [PATCH 145/167] Cleaned up code. --- src/reports/compendia_per_file_reports.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/reports/compendia_per_file_reports.py b/src/reports/compendia_per_file_reports.py index 531a4bca..1370ae2a 100644 --- a/src/reports/compendia_per_file_reports.py +++ b/src/reports/compendia_per_file_reports.py @@ -36,8 +36,10 @@ def assert_files_in_directory(dir, expected_files, report_file): # These shouldn't interfere with these tests. 
file_list = filter(lambda fn: not fn.startswith('.nfs'), all_file_list) - assert set(file_list) == set(expected_files), f"Expected files in directory {dir} to be equal to {expected_files} but found {set(file_list)}: " + \ - f"{set(file_list) - set(expected_files)} added, {set(expected_files) - set(file_list)} missing." + file_list_set = set(file_list) + expected_files_set = set(expected_files) + assert file_list_set == expected_files_set, f"Expected files in directory {dir} to be equal to {expected_files_set} but found {file_list_set}: " + \ + f"{file_list_set - expected_files_set} added, {expected_files_set - file_list_set} missing." # If we passed, write the output to the check_file. with open(report_file, "w") as f: From 6a71101e7cbec86da9caece83f17a65340b65140 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Mon, 18 Aug 2025 01:32:29 -0400 Subject: [PATCH 146/167] Moved one configuration option into its own section. --- config.yaml | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/config.yaml b/config.yaml index 0eaa6290..02295c74 100644 --- a/config.yaml +++ b/config.yaml @@ -12,11 +12,17 @@ rxnorm_version: "07072025" drugbank_version: "5-1-13" # -# The rest of these configs need to be cleaned up. +# PROTEINS # +# Chris Bizon prepared a list of UMLS/UniProtKB mappings which we download and use. UMLS_UniProtKB_download_raw_url: "https://raw.githubusercontent.com/cbizon/UMLS_UniProtKB/refs/heads/main/outputs/UMLS_UniProtKB.tsv" +# +# The rest of these configs need to be cleaned up. +# + + ncbi_files: - gene2ensembl.gz - gene_info.gz From 45e8e8e6fdedd08a658bcd683a9fa51bb6501f58 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Mon, 18 Aug 2025 10:16:58 -0400 Subject: [PATCH 147/167] Ack no that's not a temp file. --- src/snakefiles/publications.snakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/snakefiles/publications.snakefile b/src/snakefiles/publications.snakefile index 27522ce8..687c6b2b 100644 --- a/src/snakefiles/publications.snakefile +++ b/src/snakefiles/publications.snakefile @@ -57,7 +57,7 @@ rule generate_pubmed_compendia: metadata_yaml = config['intermediate_directory'] + '/publications/concords/metadata.yaml', icrdf_filename=config['download_directory'] + '/icRDF.tsv', output: - publication_compendium = temp(config['output_directory'] + '/compendia/Publication.txt'), + publication_compendium = config['output_directory'] + '/compendia/Publication.txt', # We generate an empty Publication Synonyms files, but we still need to generate one. publication_synonyms_gz = config['output_directory'] + '/synonyms/Publication.txt.gz', run: From 675671e46677306dbcff0076fb93119b2076a96e Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Mon, 18 Aug 2025 10:16:58 -0400 Subject: [PATCH 148/167] Ack no that's not a temp file. 
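Background on the assert_files_in_directory changes in the last few patches: filter() returns a lazy, single-use iterator, so consuming it in the assert condition leaves nothing behind for the error message, and its repr is unhelpful in an f-string anyway. Materialising it into a set once, as the cleanup does, avoids both problems. A minimal reproduction with made-up filenames:

    all_file_list = ['Protein.txt', '.nfs0000000123']
    file_list = filter(lambda fn: not fn.startswith('.nfs'), all_file_list)

    first = set(file_list)   # consumes the iterator
    second = set(file_list)  # already exhausted
    assert first == {'Protein.txt'}
    assert second == set()

    # Materialise once, reuse everywhere:
    file_list_set = {fn for fn in all_file_list if not fn.startswith('.nfs')}
    assert file_list_set == {'Protein.txt'}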
--- src/snakefiles/publications.snakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/snakefiles/publications.snakefile b/src/snakefiles/publications.snakefile index 27522ce8..687c6b2b 100644 --- a/src/snakefiles/publications.snakefile +++ b/src/snakefiles/publications.snakefile @@ -57,7 +57,7 @@ rule generate_pubmed_compendia: metadata_yaml = config['intermediate_directory'] + '/publications/concords/metadata.yaml', icrdf_filename=config['download_directory'] + '/icRDF.tsv', output: - publication_compendium = temp(config['output_directory'] + '/compendia/Publication.txt'), + publication_compendium = config['output_directory'] + '/compendia/Publication.txt', # We generate an empty Publication Synonyms files, but we still need to generate one. publication_synonyms_gz = config['output_directory'] + '/synonyms/Publication.txt.gz', run: From eca8284c5839ee217253c88ce37a3ce067534d2e Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Mon, 18 Aug 2025 22:52:09 -0400 Subject: [PATCH 149/167] Added GeneProteinConflated.txt.gz as an output. --- src/snakefiles/geneprotein.snakefile | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/src/snakefiles/geneprotein.snakefile b/src/snakefiles/geneprotein.snakefile index 7e540bd9..a94e6923 100644 --- a/src/snakefiles/geneprotein.snakefile +++ b/src/snakefiles/geneprotein.snakefile @@ -1,5 +1,6 @@ import src.createcompendia.geneprotein as geneprotein -import src.assess_compendia as assessments +from src.synonyms import synonymconflation +from util import gzip_files ### Gene / Protein @@ -22,9 +23,28 @@ rule geneprotein_conflation: run: geneprotein.build_conflation(input.geneprotein_concord,input.gene_compendium,input.protein_compendium,output.outfile) +rule geneprotein_conflated_synonyms: + input: + geneprotein_conflations=[config['output_directory']+'/conflation/GeneProtein.txt'], + gene_outputs=expand("{od}/synonyms/{ap}", od = config['output_directory'], ap = config['gene_outputs']), + protein_outputs=expand("{od}/synonyms/{ap}", od = config['output_directory'], ap = config['protein_outputs']) + output: + geneprotein_conflated_synonyms=temp(config['output_directory']+'/synonyms/GeneProteinConflated.txt') + run: + synonymconflation.conflate_synonyms(input.gene_outputs + input.protein_outputs, input.geneprotein_conflations, output.geneprotein_conflated_synonyms) + +rule geneprotein_conflated_synonyms_gz: + input: + geneprotein_conflated_synonyms=config['output_directory']+'/synonyms/GeneProteinConflated.txt' + output: + geneprotein_conflated_synonyms_gz=config['output_directory']+'/synonyms/GeneProteinConflated.txt.gz' + run: + gzip_files(input.geneprotein_conflated_synonyms) + rule geneprotein: input: - config['output_directory']+'/conflation/GeneProtein.txt' + config['output_directory']+'/conflation/GeneProtein.txt', + config['output_directory']+'/synonyms/GeneProteinConflated.txt.gz' output: x=config['output_directory']+'/reports/geneprotein_done' shell: From 6dbf5beca04b5456aee77e453da74159ca028cb8 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Mon, 18 Aug 2025 22:56:40 -0400 Subject: [PATCH 150/167] Tweaked code so it lines up with DrugChemicalConflated. 
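For readers unfamiliar with Snakemake's expand(), used throughout the geneprotein rules: it formats the pattern once per combination of the supplied values and returns a plain list of paths. A rough Python equivalent of those inputs, using hypothetical config values for illustration:

    config = {
        'output_directory': 'babel_outputs',
        'gene_outputs': ['Gene.txt'],        # hypothetical values for illustration
        'protein_outputs': ['Protein.txt'],
    }

    gene_synonyms_gz = [f"{config['output_directory']}/synonyms/{ap}.gz"
                        for ap in config['gene_outputs']]
    protein_compendia = [f"{config['output_directory']}/compendia/{ap}"
                         for ap in config['protein_outputs']]

    assert gene_synonyms_gz == ['babel_outputs/synonyms/Gene.txt.gz']
    assert protein_compendia == ['babel_outputs/compendia/Protein.txt']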
--- src/snakefiles/geneprotein.snakefile | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/src/snakefiles/geneprotein.snakefile b/src/snakefiles/geneprotein.snakefile index a94e6923..72f286b1 100644 --- a/src/snakefiles/geneprotein.snakefile +++ b/src/snakefiles/geneprotein.snakefile @@ -26,20 +26,18 @@ rule geneprotein_conflation: rule geneprotein_conflated_synonyms: input: geneprotein_conflations=[config['output_directory']+'/conflation/GeneProtein.txt'], - gene_outputs=expand("{od}/synonyms/{ap}", od = config['output_directory'], ap = config['gene_outputs']), - protein_outputs=expand("{od}/synonyms/{ap}", od = config['output_directory'], ap = config['protein_outputs']) - output: - geneprotein_conflated_synonyms=temp(config['output_directory']+'/synonyms/GeneProteinConflated.txt') - run: - synonymconflation.conflate_synonyms(input.gene_outputs + input.protein_outputs, input.geneprotein_conflations, output.geneprotein_conflated_synonyms) - -rule geneprotein_conflated_synonyms_gz: - input: - geneprotein_conflated_synonyms=config['output_directory']+'/synonyms/GeneProteinConflated.txt' + gene_compendia=expand("{od}/compendia/{ap}", od = config['output_directory'], ap = config['gene_outputs']), + protein_compendia=expand("{od}/compendia/{ap}", od = config['output_directory'], ap = config['protein_outputs']), + gene_synonyms_gz=expand("{od}/synonyms/{ap}.gz", od = config['output_directory'], ap = config['gene_outputs']), + protein_synonyms_gz=expand("{od}/synonyms/{ap}.gz", od = config['output_directory'], ap = config['protein_outputs']) output: geneprotein_conflated_synonyms_gz=config['output_directory']+'/synonyms/GeneProteinConflated.txt.gz' run: - gzip_files(input.geneprotein_conflated_synonyms) + synonymconflation.conflate_synonyms( + input.gene_synonyms_gz + input.protein_synonyms_gz, + input.gene_compendia + input.protein_compendia, + input.geneprotein_conflations, + output.geneprotein_conflated_synonyms_gz) rule geneprotein: input: From 0546767f1c65d93153bc137e536ea2ee7a79d5b7 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Mon, 18 Aug 2025 22:58:40 -0400 Subject: [PATCH 151/167] Added to various reports. --- config.yaml | 3 +++ src/snakefiles/util.py | 3 +++ 2 files changed, 6 insertions(+) diff --git a/config.yaml b/config.yaml index 0eaa6290..296f3587 100644 --- a/config.yaml +++ b/config.yaml @@ -289,6 +289,9 @@ chemical_outputs: drugchemicalconflated_synonym_outputs: - DrugChemicalConflated.txt +geneproteinconflated_synonym_outputs: + - GeneProteinConflated.txt + taxon_labels: - NCBITaxon - MESH diff --git a/src/snakefiles/util.py b/src/snakefiles/util.py index b28f1a08..cf46e27a 100644 --- a/src/snakefiles/util.py +++ b/src/snakefiles/util.py @@ -62,6 +62,7 @@ def get_all_synonyms(config): config['cell_line_outputs'] + config['genefamily_outputs'] + config['drugchemicalconflated_synonym_outputs'] + + config['geneproteinconflated_synonym_outputs'] + config['umls_outputs'] + config['macromolecularcomplex_outputs'] + # Publication.txt is empty, but it's still created, so it needs to be here. 
@@ -87,6 +88,7 @@ def get_all_synonyms_except_drugchemicalconflated(config): config['cell_line_outputs'] + config['genefamily_outputs'] + # config['drugchemicalconflated_synonym_outputs'] + + config['geneproteinconflated_synonym_outputs'] + config['umls_outputs'] + config['macromolecularcomplex_outputs'] ) @@ -110,6 +112,7 @@ def get_all_synonyms_with_drugchemicalconflated(config): config['cell_line_outputs'] + config['genefamily_outputs'] + config['drugchemicalconflated_synonym_outputs'] + + config['geneproteinconflated_synonym_outputs'] + config['umls_outputs'] + config['macromolecularcomplex_outputs'] ) From 531692d0b551905b2f24c4c8d2d3a318b33f86e3 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Tue, 19 Aug 2025 21:29:46 -0400 Subject: [PATCH 152/167] Fixed bug in handling taxa. --- src/synonyms/synonymconflation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/synonyms/synonymconflation.py b/src/synonyms/synonymconflation.py index 46864f03..70353ee2 100644 --- a/src/synonyms/synonymconflation.py +++ b/src/synonyms/synonymconflation.py @@ -213,7 +213,7 @@ def conflate_synonyms(synonym_files_gz, compendia_files, conflation_file, output if 'taxa' in synonym: if 'taxa' not in final_conflation: final_conflation['taxa'] = set() - final_conflation.update(synonym['taxa']) + final_conflation['taxa'].update(synonym['taxa']) # Convert the taxa into a list. final_conflation['taxa'] = sorted(final_conflation['taxa']) From 675939f09cc0a09616209bb81f4b62d1ad1691ea Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Tue, 26 Aug 2025 19:22:37 -0400 Subject: [PATCH 153/167] Fixed some bugs in actually exporting the ChEBI alternate properties. --- src/babel_utils.py | 11 +++++------ src/properties.py | 4 ++++ 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/src/babel_utils.py b/src/babel_utils.py index d2484b7a..91a8dc6a 100644 --- a/src/babel_utils.py +++ b/src/babel_utils.py @@ -421,9 +421,9 @@ def write_compendium(metadata_yamls, synonym_list, ofname, node_type, labels=Non if properties_jsonl_gz_files: for properties_jsonl_gz_file in properties_jsonl_gz_files: logger.info(f"Loading properties from {properties_jsonl_gz_file}...") - property_list.add_properties_jsonl_gz(properties_jsonl_gz_file) - logger.info(f"Loaded {properties_jsonl_gz_file}") - logger.info(f"All property files loaded: {get_memory_usage_summary()}") + count_loaded = property_list.add_properties_jsonl_gz(properties_jsonl_gz_file) + logger.info(f"Loaded {count_loaded} unique properties from {properties_jsonl_gz_file}") + logger.info(f"All property files loaded (total unique properties: {len(property_list)}: {get_memory_usage_summary()}") property_source_count = defaultdict(int) @@ -467,10 +467,9 @@ def write_compendium(metadata_yamls, synonym_list, ofname, node_type, labels=Non for ac in additional_curies: if ac.curie not in slist: identifier_list.append(ac.curie) - for source in ac.sources: - property_source_count[source] += 1 + property_source_count[ac.source] += 1 - node = node_factory.create_node(input_identifiers=slist, node_type=node_type,labels = labels, extra_prefixes = extra_prefixes) + node = node_factory.create_node(input_identifiers=identifier_list, node_type=node_type,labels = labels, extra_prefixes = extra_prefixes) if node is None: # This usually happens because every CURIE in the node is not in the id_prefixes list for that node_type. 
# Something to fix at some point, but we don't want to break the pipeline for this, so diff --git a/src/properties.py b/src/properties.py index b021cf24..43317aa1 100644 --- a/src/properties.py +++ b/src/properties.py @@ -117,6 +117,10 @@ def __init__(self): self._properties = set[Property]() self._properties_by_curie = defaultdict(set[Property]) + @property + def __sizeof__(self): + return len(self._properties) + @property def properties(self) -> set[Property]: return self._properties From 7a53ce62f17663a582b27ac659ddaf95a325b5c7 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Tue, 26 Aug 2025 19:28:01 -0400 Subject: [PATCH 154/167] Attempt to fix a bug in KEGG xrefs from ChEBI. --- src/createcompendia/chemicals.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/createcompendia/chemicals.py b/src/createcompendia/chemicals.py index 40fa3c38..55390e33 100644 --- a/src/createcompendia/chemicals.py +++ b/src/createcompendia/chemicals.py @@ -502,7 +502,12 @@ def make_chebi_relations(sdf,dbx,outfile,propfile_gz,metadata_yaml): source = f'Listed as a CHEBI secondary ID in the ChEBI SDF file ({sdf})' ).to_json_line()) if kk in props: - outf.write(f'{cid}\txref\t{KEGGCOMPOUND}:{props[kk]}\n') + # This is apparently a list now sometimes? + kegg_ids = props[kk] + if not isinstance(kegg_ids, list): + kegg_ids = [kegg_ids] + for kegg_id in kegg_ids: + outf.write(f'{cid}\txref\t{KEGGCOMPOUND}:{kegg_id}\n') if pk in props: #Apparently there's a lot of structure here? database_links = props[pk] From 4ffb0ad248678a130f5b954be07b66dc3c0a413b Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Tue, 26 Aug 2025 20:29:11 -0400 Subject: [PATCH 155/167] Added a UMLS-MeSH concord to proteins. Since MeSH is not an ids file for proteins, this should only pull in MeSH IDs that are associated with a UMLS ID. 
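Back on the KEGG xref fix above: the parsed SDF property is sometimes a single string and sometimes a list of strings. One compact way to express the same normalisation, with illustrative identifiers and prefix rather than a real ChEBI record:

    def as_list(value):
        # Treat a scalar as a one-element list so callers can always iterate.
        return value if isinstance(value, list) else [value]

    for props_kk in ('C00031', ['C00031', 'C00221']):
        for kegg_id in as_list(props_kk):
            print(f"CHEBI:17234\txref\tKEGG.COMPOUND:{kegg_id}")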
--- config.yaml | 1 + src/createcompendia/protein.py | 5 ++++- src/snakefiles/protein.snakefile | 11 +++++++++++ 3 files changed, 16 insertions(+), 1 deletion(-) diff --git a/config.yaml b/config.yaml index 5f8c30c8..ce42b1fe 100644 --- a/config.yaml +++ b/config.yaml @@ -158,6 +158,7 @@ protein_concords: - PR - NCIT_UniProtKB - NCIT_UMLS + - UMLS_MESH - UMLS_UniProtKB protein_outputs: diff --git a/src/createcompendia/protein.py b/src/createcompendia/protein.py index 2901ff97..1ab48230 100644 --- a/src/createcompendia/protein.py +++ b/src/createcompendia/protein.py @@ -1,7 +1,7 @@ import re from src.metadata.provenance import write_concord_metadata -from src.prefixes import ENSEMBL, UMLS, PR, UNIPROTKB, NCIT, NCBITAXON +from src.prefixes import ENSEMBL, UMLS, PR, UNIPROTKB, NCIT, NCBITAXON, MESH from src.categories import PROTEIN import src.datahandlers.umls as umls @@ -150,6 +150,9 @@ def build_ncit_uniprot_relationships(infile,outfile, metadata_yaml): def build_umls_ncit_relationships(mrconso, idfile, outfile, metadata_yaml): umls.build_sets(mrconso, idfile, outfile, {'NCI': NCIT}, provenance_metadata_yaml=metadata_yaml) +def build_umls_mesh_relationships(mrconso, idfile, outfile, metadata_yaml): + umls.build_sets(mrconso, idfile, outfile, {'MSH': MESH}, provenance_metadata_yaml=metadata_yaml) + def build_protein_compendia(concordances, metadata_yamls, identifiers, icrdf_filename): """:concordances: a list of files from which to read relationships :identifiers: a list of files from which to read identifiers and optional categories""" diff --git a/src/snakefiles/protein.snakefile b/src/snakefiles/protein.snakefile index 74f4cdce..94f37d43 100644 --- a/src/snakefiles/protein.snakefile +++ b/src/snakefiles/protein.snakefile @@ -79,6 +79,17 @@ rule get_protein_ncit_umls_relationships: run: protein.build_umls_ncit_relationships(input.mrconso, input.infile, output.outfile, output.metadata_yaml) +rule get_protein_umls_mesh_relationships: + input: + mrconso=config['download_directory']+"/UMLS/MRCONSO.RRF", + infile=config['intermediate_directory']+"/protein/ids/UMLS" + output: + outfile=config['intermediate_directory']+'/protein/concords/UMLS_MESH', + metadata_yaml=config['intermediate_directory']+'/protein/concords/metadata-UMLS_MESH.yaml' + run: + protein.build_umls_mesh_relationships(input.mrconso, input.infile, output.outfile, output.metadata_yaml) + + rule protein_compendia: input: labels=expand("{dd}/{ap}/labels",dd=config['download_directory'],ap=config['protein_labels']), From 9495a6dacfdf38e60f96259cb8fea18637046807 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Wed, 27 Aug 2025 11:12:20 -0400 Subject: [PATCH 156/167] Added DRUGBANK mappings to UMLS/protein concords. 
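This is not the project's umls.build_sets() implementation, but a rough sketch of the kind of UMLS-to-MeSH concord the new rule produces, assuming standard MRCONSO.RRF columns (CUI in field 0, source abbreviation in field 11, source code in field 13) and an SAB-to-prefix map like the one passed in above:

    def sketch_umls_concord(mrconso_path, umls_ids, sab_to_prefix):
        # umls_ids: CURIEs (e.g. 'UMLS:C0027651') already accepted as protein ids.
        pairs = set()
        with open(mrconso_path, encoding='utf-8') as inf:
            for line in inf:
                fields = line.rstrip('\n').split('|')
                cui, sab, code = fields[0], fields[11], fields[13]
                if f'UMLS:{cui}' in umls_ids and sab in sab_to_prefix:
                    pairs.add((f'UMLS:{cui}', f'{sab_to_prefix[sab]}:{code}'))
        return pairs

    # e.g. sketch_umls_concord('MRCONSO.RRF', umls_protein_ids, {'MSH': 'MESH'})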
--- config.yaml | 2 +- src/createcompendia/protein.py | 12 +++++++++--- src/snakefiles/protein.snakefile | 8 ++++---- 3 files changed, 14 insertions(+), 8 deletions(-) diff --git a/config.yaml b/config.yaml index ce42b1fe..e38fc714 100644 --- a/config.yaml +++ b/config.yaml @@ -158,7 +158,7 @@ protein_concords: - PR - NCIT_UniProtKB - NCIT_UMLS - - UMLS_MESH + - UMLS - UMLS_UniProtKB protein_outputs: diff --git a/src/createcompendia/protein.py b/src/createcompendia/protein.py index 1ab48230..4ee2c24b 100644 --- a/src/createcompendia/protein.py +++ b/src/createcompendia/protein.py @@ -1,7 +1,7 @@ import re from src.metadata.provenance import write_concord_metadata -from src.prefixes import ENSEMBL, UMLS, PR, UNIPROTKB, NCIT, NCBITAXON, MESH +from src.prefixes import ENSEMBL, UMLS, PR, UNIPROTKB, NCIT, NCBITAXON, MESH, DRUGBANK from src.categories import PROTEIN import src.datahandlers.umls as umls @@ -150,8 +150,14 @@ def build_ncit_uniprot_relationships(infile,outfile, metadata_yaml): def build_umls_ncit_relationships(mrconso, idfile, outfile, metadata_yaml): umls.build_sets(mrconso, idfile, outfile, {'NCI': NCIT}, provenance_metadata_yaml=metadata_yaml) -def build_umls_mesh_relationships(mrconso, idfile, outfile, metadata_yaml): - umls.build_sets(mrconso, idfile, outfile, {'MSH': MESH}, provenance_metadata_yaml=metadata_yaml) +def build_umls_relationships(mrconso, idfile, outfile, metadata_yaml): + # The corresponding code in chemicals also includes (1) {'RXNORM': RXCUI}, and (2) we also pull in RxNorm to + # provide the inverse concords (i.e. RxNorm -> MESH and DRUGBANK). Doing so will probably fix some RXCUI IDs, + # but assigning RXCUI to proteins seems like a bridge too far for me. + # + # TODO: we should probably add some kind of filtering so we don't include concords that point to chemicals rather + # than proteins, which could result in duplicates (if the same ID is picked up in both chemicals and proteins). 
+ umls.build_sets(mrconso, idfile, outfile, {'MSH': MESH, 'DRUGBANK': DRUGBANK}, provenance_metadata_yaml=metadata_yaml) def build_protein_compendia(concordances, metadata_yamls, identifiers, icrdf_filename): """:concordances: a list of files from which to read relationships diff --git a/src/snakefiles/protein.snakefile b/src/snakefiles/protein.snakefile index 94f37d43..b493fae6 100644 --- a/src/snakefiles/protein.snakefile +++ b/src/snakefiles/protein.snakefile @@ -79,15 +79,15 @@ rule get_protein_ncit_umls_relationships: run: protein.build_umls_ncit_relationships(input.mrconso, input.infile, output.outfile, output.metadata_yaml) -rule get_protein_umls_mesh_relationships: +rule get_protein_umls_relationships: input: mrconso=config['download_directory']+"/UMLS/MRCONSO.RRF", infile=config['intermediate_directory']+"/protein/ids/UMLS" output: - outfile=config['intermediate_directory']+'/protein/concords/UMLS_MESH', - metadata_yaml=config['intermediate_directory']+'/protein/concords/metadata-UMLS_MESH.yaml' + outfile=config['intermediate_directory']+'/protein/concords/UMLS', + metadata_yaml=config['intermediate_directory']+'/protein/concords/metadata-UMLS.yaml' run: - protein.build_umls_mesh_relationships(input.mrconso, input.infile, output.outfile, output.metadata_yaml) + protein.build_umls_relationships(input.mrconso, input.infile, output.outfile, output.metadata_yaml) rule protein_compendia: From 0fe069038046fbd59364f97d6f52a45b41a798dd Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Wed, 27 Aug 2025 11:21:04 -0400 Subject: [PATCH 157/167] DuckDB files are temporary, but regenerating them is a pain. --- src/snakefiles/duckdb.snakefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/snakefiles/duckdb.snakefile b/src/snakefiles/duckdb.snakefile index 55fba662..fd78b98d 100644 --- a/src/snakefiles/duckdb.snakefile +++ b/src/snakefiles/duckdb.snakefile @@ -23,7 +23,7 @@ rule export_compendia_to_duckdb: input: compendium_file=config['output_directory'] + "/compendia/{filename}.txt", output: - duckdb_filename=temp(config['output_directory'] + "/duckdb/duckdbs/filename={filename}/compendium.duckdb"), + duckdb_filename=config['output_directory'] + "/duckdb/duckdbs/filename={filename}/compendium.duckdb", clique_parquet_file=config['output_directory'] + "/duckdb/parquet/filename={filename}/Clique.parquet", run: duckdb_exporters.export_compendia_to_parquet(input.compendium_file, output.clique_parquet_file, output.duckdb_filename) @@ -47,7 +47,7 @@ rule export_synonyms_to_duckdb: input: synonyms_file=config['output_directory'] + "/synonyms/{filename}.txt.gz", output: - duckdb_filename=temp(config['output_directory'] + "/duckdb/duckdbs/filename={filename}/synonyms.duckdb"), + duckdb_filename=config['output_directory'] + "/duckdb/duckdbs/filename={filename}/synonyms.duckdb", synonyms_parquet_filename=config['output_directory'] + "/duckdb/parquet/filename={filename}/Synonyms.parquet", run: duckdb_exporters.export_synonyms_to_parquet(input.synonyms_file, output.duckdb_filename, output.synonyms_parquet_filename) From 14219c3efda749e1a3d038b9836a484cd5aeee7b Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Wed, 27 Aug 2025 14:25:57 -0400 Subject: [PATCH 158/167] Added comments to document what chemicals.build_compendia() is doing. Could also be useful to track memory in the future. 
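The get_memory_usage_summary() helper referenced in the diff that follows is not part of this series; a minimal stdlib stand-in (not the project's implementation) that would serve the same purpose in these log lines might look like this:

    import resource

    def memory_usage_summary() -> str:
        # Peak resident set size; reported in kilobytes on Linux, bytes on macOS.
        peak = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
        return f"peak RSS ~{peak / 1024:.1f} MiB (Linux units)"

    # logger.info(f"Loaded {len(types)} types from {type_file}: {memory_usage_summary()}")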
--- src/createcompendia/chemicals.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/createcompendia/chemicals.py b/src/createcompendia/chemicals.py index 55390e33..356d7575 100644 --- a/src/createcompendia/chemicals.py +++ b/src/createcompendia/chemicals.py @@ -8,6 +8,7 @@ import ast import gzip +from src import util from src.properties import Property, HAS_ALTERNATIVE_ID from src.metadata.provenance import write_concord_metadata, write_combined_metadata from src.ubergraph import UberGraph @@ -20,6 +21,9 @@ import src.datahandlers.mesh as mesh import src.datahandlers.umls as umls +from src.util import get_memory_usage_summary + +logger = util.get_logger(__name__) def get_type_from_smiles(smiles): if '.' in smiles: @@ -678,12 +682,18 @@ def build_compendia(type_file, untyped_compendia_file, properties_jsonl_gz_files for line in inf: x = line.strip().split('\t') types[x[0]] = x[1] + logger.info(f'Loaded {len(types)} types from {type_file}: {get_memory_usage_summary()}') + untyped_sets = set() with open(untyped_compendia_file,'r') as inf: for line in inf: s = ast.literal_eval(line.strip()) untyped_sets.add(frozenset(s)) + logger.info(f'Loaded {len(untyped_sets)} untyped sets from {untyped_compendia_file}: {get_memory_usage_summary()}') + typed_sets = create_typed_sets(untyped_sets, types) + logger.info(f'Created {len(typed_sets)} typed sets from {len(untyped_sets)} untyped sets: {get_memory_usage_summary()}') + for biotype, sets in typed_sets.items(): baretype = biotype.split(':')[-1] if biotype == DRUG: From 656032edbdce321aae0cf25b9fcb3de5badaa6a6 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Wed, 27 Aug 2025 21:39:40 -0400 Subject: [PATCH 159/167] Fixed PropertyList count. --- src/babel_utils.py | 2 +- src/properties.py | 7 ++++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/src/babel_utils.py b/src/babel_utils.py index 91a8dc6a..4e73801c 100644 --- a/src/babel_utils.py +++ b/src/babel_utils.py @@ -423,7 +423,7 @@ def write_compendium(metadata_yamls, synonym_list, ofname, node_type, labels=Non logger.info(f"Loading properties from {properties_jsonl_gz_file}...") count_loaded = property_list.add_properties_jsonl_gz(properties_jsonl_gz_file) logger.info(f"Loaded {count_loaded} unique properties from {properties_jsonl_gz_file}") - logger.info(f"All property files loaded (total unique properties: {len(property_list)}: {get_memory_usage_summary()}") + logger.info(f"All property files loaded (total unique properties: {property_list.count_unique()}: {get_memory_usage_summary()}") property_source_count = defaultdict(int) diff --git a/src/properties.py b/src/properties.py index 43317aa1..35711232 100644 --- a/src/properties.py +++ b/src/properties.py @@ -118,7 +118,12 @@ def __init__(self): self._properties_by_curie = defaultdict(set[Property]) @property - def __sizeof__(self): + def count_unique(self): + """ + Return the number of unique Properties in this PropertyList. + + :return: The number of unique Properties in this PropertyList. + """ return len(self._properties) @property From 97fd066e97d6d7dd382029e48fce1707877077d4 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Wed, 27 Aug 2025 21:39:40 -0400 Subject: [PATCH 160/167] Fixed PropertyList count. 
--- src/babel_utils.py | 2 +- src/properties.py | 7 ++++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/src/babel_utils.py b/src/babel_utils.py index 91a8dc6a..4e73801c 100644 --- a/src/babel_utils.py +++ b/src/babel_utils.py @@ -423,7 +423,7 @@ def write_compendium(metadata_yamls, synonym_list, ofname, node_type, labels=Non logger.info(f"Loading properties from {properties_jsonl_gz_file}...") count_loaded = property_list.add_properties_jsonl_gz(properties_jsonl_gz_file) logger.info(f"Loaded {count_loaded} unique properties from {properties_jsonl_gz_file}") - logger.info(f"All property files loaded (total unique properties: {len(property_list)}: {get_memory_usage_summary()}") + logger.info(f"All property files loaded (total unique properties: {property_list.count_unique()}: {get_memory_usage_summary()}") property_source_count = defaultdict(int) diff --git a/src/properties.py b/src/properties.py index 43317aa1..35711232 100644 --- a/src/properties.py +++ b/src/properties.py @@ -118,7 +118,12 @@ def __init__(self): self._properties_by_curie = defaultdict(set[Property]) @property - def __sizeof__(self): + def count_unique(self): + """ + Return the number of unique Properties in this PropertyList. + + :return: The number of unique Properties in this PropertyList. + """ return len(self._properties) @property From 3afc6f9140745afd0d6354d88fa92a72f600909a Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Wed, 27 Aug 2025 21:45:33 -0400 Subject: [PATCH 161/167] Improved log message. --- src/babel_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/babel_utils.py b/src/babel_utils.py index 4e73801c..e8eaeb1b 100644 --- a/src/babel_utils.py +++ b/src/babel_utils.py @@ -423,7 +423,7 @@ def write_compendium(metadata_yamls, synonym_list, ofname, node_type, labels=Non logger.info(f"Loading properties from {properties_jsonl_gz_file}...") count_loaded = property_list.add_properties_jsonl_gz(properties_jsonl_gz_file) logger.info(f"Loaded {count_loaded} unique properties from {properties_jsonl_gz_file}") - logger.info(f"All property files loaded (total unique properties: {property_list.count_unique()}: {get_memory_usage_summary()}") + logger.info(f"All {len(properties_jsonl_gz_files)} property files loaded ({property_list.count_unique()} total unique properties): {get_memory_usage_summary()}") property_source_count = defaultdict(int) From 30eb4e84501eff8f01882f14382cc240d7c8b3aa Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Wed, 27 Aug 2025 22:30:29 -0400 Subject: [PATCH 162/167] Fixed bug in PropertyList count. 
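The bug fixed here comes from the @property decorator left on count_unique: a property is read as a plain attribute, so the call sites written as property_list.count_unique() end up calling the integer it returned. A short reproduction with a stripped-down class:

    class PropertyListSketch:
        @property
        def count_unique(self):
            return 3

    pl = PropertyListSketch()
    assert pl.count_unique == 3      # attribute access works
    try:
        pl.count_unique()            # what the log line effectively did
    except TypeError as e:
        print(e)                     # 'int' object is not callable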
--- src/babel_utils.py | 4 +++- src/properties.py | 1 - 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/babel_utils.py b/src/babel_utils.py index e8eaeb1b..856d064e 100644 --- a/src/babel_utils.py +++ b/src/babel_utils.py @@ -423,7 +423,9 @@ def write_compendium(metadata_yamls, synonym_list, ofname, node_type, labels=Non logger.info(f"Loading properties from {properties_jsonl_gz_file}...") count_loaded = property_list.add_properties_jsonl_gz(properties_jsonl_gz_file) logger.info(f"Loaded {count_loaded} unique properties from {properties_jsonl_gz_file}") - logger.info(f"All {len(properties_jsonl_gz_files)} property files loaded ({property_list.count_unique()} total unique properties): {get_memory_usage_summary()}") + logger.info(f"All {len(properties_jsonl_gz_files)} property files loaded ({property_list.count_unique()} total unique properties): {get_memory_usage_summary()}") + else: + logger.info("No property files provided or loaded.") property_source_count = defaultdict(int) diff --git a/src/properties.py b/src/properties.py index 35711232..52dbebad 100644 --- a/src/properties.py +++ b/src/properties.py @@ -117,7 +117,6 @@ def __init__(self): self._properties = set[Property]() self._properties_by_curie = defaultdict(set[Property]) - @property def count_unique(self): """ Return the number of unique Properties in this PropertyList. From 472ff5946dc7dc45ca5dec988d3c012418eedf1e Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Wed, 27 Aug 2025 23:51:09 -0400 Subject: [PATCH 163/167] Added DRUGBANK as an extra prefix for the Protein compendia. --- src/createcompendia/protein.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/createcompendia/protein.py b/src/createcompendia/protein.py index 4ee2c24b..1fd22d1b 100644 --- a/src/createcompendia/protein.py +++ b/src/createcompendia/protein.py @@ -198,5 +198,5 @@ def build_protein_compendia(concordances, metadata_yamls, identifiers, icrdf_fil baretype = PROTEIN.split(':')[-1] logger.info(f"Writing compendium for {baretype}, memory usage: {get_memory_usage_summary()}") - write_compendium(metadata_yamls, gene_sets, f'{baretype}.txt', PROTEIN, {}, icrdf_filename=icrdf_filename) + write_compendium(metadata_yamls, gene_sets, f'{baretype}.txt', PROTEIN, {}, extra_prefixes=[DRUGBANK], icrdf_filename=icrdf_filename) logger.info(f"Wrote compendium for {baretype}, memory usage: {get_memory_usage_summary()}") From 55fe9fe0a60b4e1b3b23ca836a0b71302af8bb86 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Thu, 28 Aug 2025 04:16:50 -0400 Subject: [PATCH 164/167] Fixed minor bug in properties. --- src/properties.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/properties.py b/src/properties.py index 52dbebad..84cd49a1 100644 --- a/src/properties.py +++ b/src/properties.py @@ -139,6 +139,9 @@ def get_all(self, curie: str, predicate: str = None) -> set[Property]: """ props = self._properties_by_curie[curie] + if predicate is None: + return props + if predicate not in supported_predicates: raise ValueError(f'Predicate {predicate} is not supported (supported predicates: {supported_predicates})') From 9d8a0ceda3ef9bfde8624d40e12ce9d87886ab9c Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Thu, 28 Aug 2025 04:18:23 -0400 Subject: [PATCH 165/167] Fixed bug in getting the alternative ID. 
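The one-word fix that follows swaps ac.curie for ac.value: for a HAS_ALTERNATIVE_ID property, the alternative identifier is carried in value, while curie is the identifier the property is attached to, so appending ac.curie only re-added the primary ID. Illustrative CURIEs (the alternative ID here is made up):

    from src.properties import Property, HAS_ALTERNATIVE_ID

    prop = Property(curie='CHEBI:17234', predicate=HAS_ALTERNATIVE_ID, value='CHEBI:99999')

    # Buggy: identifier_list.append(prop.curie)  -> appends 'CHEBI:17234' again
    # Fixed: identifier_list.append(prop.value)  -> appends the alternative 'CHEBI:99999'
    assert prop.value == 'CHEBI:99999'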
--- src/babel_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/babel_utils.py b/src/babel_utils.py index 856d064e..77620f60 100644 --- a/src/babel_utils.py +++ b/src/babel_utils.py @@ -467,8 +467,8 @@ def write_compendium(metadata_yamls, synonym_list, ofname, node_type, labels=Non additional_curies = property_list.get_all(iid, HAS_ALTERNATIVE_ID) if additional_curies: for ac in additional_curies: - if ac.curie not in slist: - identifier_list.append(ac.curie) + if ac.value not in slist: + identifier_list.append(ac.value) property_source_count[ac.source] += 1 node = node_factory.create_node(input_identifiers=identifier_list, node_type=node_type,labels = labels, extra_prefixes = extra_prefixes) From b5731eda71df85b1263730ddec84c61240fb0132 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Thu, 28 Aug 2025 15:43:59 -0400 Subject: [PATCH 166/167] Major cleanup of additional CURIEs with labels and properties. --- src/babel_utils.py | 70 +++++++++++++++++++++++++++++++++------------- src/node.py | 10 +++---- 2 files changed, 54 insertions(+), 26 deletions(-) diff --git a/src/babel_utils.py b/src/babel_utils.py index 77620f60..e99ea2ed 100644 --- a/src/babel_utils.py +++ b/src/babel_utils.py @@ -458,19 +458,6 @@ def write_compendium(metadata_yamls, synonym_list, ofname, node_type, labels=Non time_remaining_seconds = (time_elapsed_seconds / count_slist * remaining_slist) logger.info(f" - Estimated time remaining: {format_timespan(time_remaining_seconds)}") - # At this point, we insert any HAS_ADDITIONAL_ID properties we have. - # The logic we use is: we insert all additional IDs for a CURIE *AFTER* that CURIE, in a random order, as long - # as the additional CURIE is not already in the list of CURIEs. - identifier_list = [] - for iid in slist: - identifier_list.append(iid) - additional_curies = property_list.get_all(iid, HAS_ALTERNATIVE_ID) - if additional_curies: - for ac in additional_curies: - if ac.value not in slist: - identifier_list.append(ac.value) - property_source_count[ac.source] += 1 - node = node_factory.create_node(input_identifiers=identifier_list, node_type=node_type,labels = labels, extra_prefixes = extra_prefixes) if node is None: # This usually happens because every CURIE in the node is not in the id_prefixes list for that node_type. @@ -544,20 +531,63 @@ def write_compendium(metadata_yamls, synonym_list, ofname, node_type, labels=Non else: preferred_name = '' - # Generate the node. - descs = description_factory.get_descriptions(node) - taxa = taxon_factory.get_taxa(node) + # At this point, we insert any HAS_ADDITIONAL_ID IDs we have. + # The logic we use is: we insert all additional IDs for a CURIE *AFTER* that CURIE, in a random order, as long + # as the additional CURIE is not already in the list of CURIEs. + # + # We will attempt to retrieve a label or description for this ID as well. + current_curies = set() + identifier_list = [] + curie_labels = dict() + for nid in node['identifiers']: + iid = nid['identifier'] + + # Prevent duplicates (might happen if e.g. we have an additional CURIE that duplicates an existing one later in the list). + if iid in current_curies: + continue + + identifier_list.append(iid) + current_curies.add(iid) + + if 'label' in nid: + curie_labels[iid] = nid['label'] + + # Are there any additional CURIEs for this CURIE? 
+ additional_curies = property_list.get_all(iid, HAS_ALTERNATIVE_ID) + if additional_curies: + # ac_labelled will be a list that consists of either LabeledID (if the CURIE could be labeled) + # or str objects (consisting of an unlabeled CURIE). + ac_labelled = node_factory.apply_labels(input_identifiers=additional_curies, labels=labels) + + for ac, label in zip(additional_curies, list(ac_labelled)): + additional_curie = Text.get_curie(label) + if additional_curie not in current_curies: + identifier_list.append(additional_curie) + current_curies.add(additional_curie) + + # Track the property sources we used. + property_source_count[ac.source] += 1 + + if isinstance(label, LabeledID) and label.label: + curie_labels[additional_curie] = label.label + + # Add description and taxon information and construct the final nw object. + descs = description_factory.get_descriptions(identifier_list) + taxa = taxon_factory.get_taxa(identifier_list) + + # Construct the written-out identifier objects. nw['identifiers'] = [] - for nids in node['identifiers']: - id_info = {'i': nids['identifier']} + for iid in identifier_list: + id_info = {'i': iid} - if 'label' in nids: - id_info['l'] = nids['label'] + if iid in curie_labels: + id_info['l'] = curie_labels[iid] if id_info['i'] in descs: # Sort descriptions from the shortest to the longest. id_info['d'] = list(sorted(descs[id_info['i']], key=lambda x: len(x))) + if id_info['i'] in taxa: # Sort taxa by CURIE suffix. id_info['t'] = list(sorted(taxa[id_info['i']], key=get_numerical_curie_suffix)) diff --git a/src/node.py b/src/node.py index d1fda547..b33de36f 100644 --- a/src/node.py +++ b/src/node.py @@ -138,11 +138,10 @@ def load_descriptions(self,prefix): self.descriptions[prefix] = descs logger.info(f'Loaded {desc_count:,} descriptions for {prefix}') - def get_descriptions(self,node): + def get_descriptions(self, ids: list[str]): node_descriptions = defaultdict(set) - for ident in node['identifiers']: - thisid = ident['identifier'] - pref = thisid.split(':', 1)[0] + for thisid in ids: + pref = Text.get_prefix(thisid) if pref not in self.descriptions: self.load_descriptions(pref) node_descriptions[thisid].update( self.descriptions[pref][thisid] ) @@ -161,8 +160,7 @@ def __init__(self, rootdir): def load_taxa(self, prefix): return self.tsvloader.load_prefix(prefix) - def get_taxa(self, node): - curies = list({ident['identifier'] for ident in node['identifiers']}) + def get_taxa(self, curies: list[str]): return self.tsvloader.get_curies(curies) def close(self): From ece1da84391a96888489319ecf501dbd402974e6 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya <gaurav@ggvaidya.com> Date: Thu, 28 Aug 2025 17:33:52 -0400 Subject: [PATCH 167/167] Added DuckDB to the Dockerfile so we can use the CLI. --- Dockerfile | 1 + 1 file changed, 1 insertion(+) diff --git a/Dockerfile b/Dockerfile index dac38223..4ada0106 100644 --- a/Dockerfile +++ b/Dockerfile @@ -27,6 +27,7 @@ RUN apt-get install -y screen RUN apt-get install -y vim RUN apt-get install -y rsync RUN apt-get install -y jq +RUN apt-get install -y duckdb # Create a non-root-user. RUN adduser --home ${ROOT} --uid 1000 nru
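Taken together, the write_compendium cleanup in the last babel_utils patch orders each clique as: primary identifier, then any not-yet-seen alternative identifiers, before labels, descriptions and taxa are attached. A small sketch of that ordering rule, with made-up identifiers and without the label lookup:

    alternative_ids = {'CHEBI:17234': ['CHEBI:99999'], 'MESH:D005947': []}  # illustrative

    current_curies, identifier_list = set(), []
    for iid in ['CHEBI:17234', 'MESH:D005947']:
        if iid in current_curies:
            continue
        identifier_list.append(iid)
        current_curies.add(iid)
        for alt in alternative_ids.get(iid, []):
            if alt not in current_curies:
                identifier_list.append(alt)
                current_curies.add(alt)

    assert identifier_list == ['CHEBI:17234', 'CHEBI:99999', 'MESH:D005947']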