diff --git a/Dockerfile b/Dockerfile index dac38223..4ada0106 100644 --- a/Dockerfile +++ b/Dockerfile @@ -27,6 +27,7 @@ RUN apt-get install -y screen RUN apt-get install -y vim RUN apt-get install -y rsync RUN apt-get install -y jq +RUN apt-get install -y duckdb # Create a non-root-user. RUN adduser --home ${ROOT} --uid 1000 nru diff --git a/Snakefile b/Snakefile index 0c595a9a..8465ebcf 100644 --- a/Snakefile +++ b/Snakefile @@ -20,6 +20,11 @@ include: "src/snakefiles/duckdb.snakefile" include: "src/snakefiles/reports.snakefile" include: "src/snakefiles/exports.snakefile" +# Some global settings. +import os +os.environ['TMPDIR'] = config['tmp_directory'] + +# Top-level rules. rule all: input: # See rule all_outputs later in this file for how we generate all the outputs. diff --git a/config.yaml b/config.yaml index 76f8c0b4..e38fc714 100644 --- a/config.yaml +++ b/config.yaml @@ -1,15 +1,28 @@ +# Overall inputs and outputs. input_directory: input_data download_directory: babel_downloads intermediate_directory: babel_outputs/intermediate output_directory: babel_outputs +tmp_directory: babel_downloads/tmp +# Versions that need to be updated on every release. biolink_version: "4.2.6-rc5" -umls_version: "2024AB" -rxnorm_version: "03032025" +umls_version: "2025AA" +rxnorm_version: "07072025" drugbank_version: "5-1-13" +# +# PROTEINS +# + +# Chris Bizon prepared a list of UMLS/UniProtKB mappings which we download and use. UMLS_UniProtKB_download_raw_url: "https://raw.githubusercontent.com/cbizon/UMLS_UniProtKB/refs/heads/main/outputs/UMLS_UniProtKB.tsv" +# +# The rest of these configs need to be cleaned up. +# + + ncbi_files: - gene2ensembl.gz - gene_info.gz @@ -145,6 +158,7 @@ protein_concords: - PR - NCIT_UniProtKB - NCIT_UMLS + - UMLS - UMLS_UniProtKB protein_outputs: @@ -282,6 +296,9 @@ chemical_outputs: drugchemicalconflated_synonym_outputs: - DrugChemicalConflated.txt +geneproteinconflated_synonym_outputs: + - GeneProteinConflated.txt + taxon_labels: - NCBITaxon - MESH diff --git a/requirements.lock b/requirements.lock index 9bc7b588..a812926e 100644 --- a/requirements.lock +++ b/requirements.lock @@ -92,7 +92,7 @@ pronto==2.7.0 propcache==0.3.1 psutil==7.0.0 psycopg2-binary==2.9.10 -PuLP==3.1.1 +PuLP==2.7.0 pydantic==2.11.4 pydantic_core==2.33.2 PyJSG==0.11.10 @@ -128,7 +128,7 @@ ShExJSG==0.8.2 six==1.17.0 smart-open==7.1.0 smmap==5.0.2 -snakemake==9.3.3 +snakemake==7.32.4 snakemake-interface-common==1.17.4 snakemake-interface-executor-plugins==9.3.5 snakemake-interface-logger-plugins==1.2.3 @@ -142,10 +142,12 @@ SQLAlchemy==2.0.40 SQLAlchemy-Utils==0.38.3 sssom==0.4.15 sssom-schema==1.0.0 +stopit==1.1.2 stringcase==1.2.0 tabulate==0.9.0 tenacity==8.5.0 throttler==1.2.2 +toposort==1.10 tqdm==4.67.1 traitlets==5.14.3 types-python-dateutil==2.9.0.20241206 diff --git a/requirements.txt b/requirements.txt index 4422caef..c0e54939 100644 --- a/requirements.txt +++ b/requirements.txt @@ -31,3 +31,8 @@ duckdb # on checking if it's online via http://httpstat.us/200, which is often offline. My branch of this # https://github.com/gaurav/apybiomart/tree/change-check-url and changes that to https://example.org. git+https://github.com/gaurav/apybiomart.git@change-check-url + +# Added by Gaurav, Aug 2025 to check for memory information while Babel is running. 
+psutil +humanfriendly +py-spy diff --git a/src/babel_utils.py b/src/babel_utils.py index a96120d1..e99ea2ed 100644 --- a/src/babel_utils.py +++ b/src/babel_utils.py @@ -1,24 +1,34 @@ -import logging import subprocess import traceback from enum import Enum from ftplib import FTP from io import BytesIO import gzip -from datetime import datetime as dt +from datetime import datetime as dt, datetime from datetime import timedelta import time import requests import os import urllib import jsonlines +import yaml +from humanfriendly import format_timespan + +from src.metadata.provenance import write_combined_metadata from src.node import NodeFactory, SynonymFactory, DescriptionFactory, InformationContentFactory, TaxonFactory -from src.util import Text, get_config +from src.properties import PropertyList, HAS_ALTERNATIVE_ID +from src.util import Text, get_config, get_memory_usage_summary, get_logger from src.LabeledID import LabeledID from collections import defaultdict import sqlite3 from typing import List, Tuple +# Configuration items +WRITE_COMPENDIUM_LOG_EVERY_X_CLIQUES = 1_000_000 + +# Set up a logger. +logger = get_logger(__name__) + def make_local_name(fname,subpath=None): config = get_config() if subpath is None: @@ -279,7 +289,7 @@ def pull_via_wget( wget_command_line.extend(['--recursive', '--no-parent', '--no-directories', '--level=1', '--directory-prefix=' + dl_file_name]) # Execute wget. - logging.info(f"Downloading {dl_file_name} using wget: {wget_command_line}") + logger.info(f"Downloading {dl_file_name} using wget: {wget_command_line}") process = subprocess.run(wget_command_line) if process.returncode != 0: raise RuntimeError(f"Could not execute wget {wget_command_line}: {process.stderr}") @@ -297,17 +307,17 @@ def pull_via_wget( if os.path.isfile(uncompressed_filename): file_size = os.path.getsize(uncompressed_filename) - logging.info(f"Downloaded {uncompressed_filename} from {url}, file size {file_size} bytes.") + logger.info(f"Downloaded {uncompressed_filename} from {url}, file size {file_size} bytes.") else: raise RuntimeError(f'Expected uncompressed file {uncompressed_filename} does not exist.') else: if os.path.isfile(dl_file_name): file_size = os.path.getsize(dl_file_name) - logging.info(f"Downloaded {dl_file_name} from {url}, file size {file_size} bytes.") + logger.info(f"Downloaded {dl_file_name} from {url}, file size {file_size} bytes.") elif os.path.isdir(dl_file_name): # Count the number of files in directory dl_file_name dir_size = sum(os.path.getsize(os.path.join(dl_file_name, f)) for f in os.listdir(dl_file_name) if os.path.isfile(os.path.join(dl_file_name, f))) - logging.info(f"Downloaded {dir_size} files from {url} to {dl_file_name}.") + logger.info(f"Downloaded {dir_size} files from {url} to {dl_file_name}.") else: raise RuntimeError(f'Unknown file type {dl_file_name}') @@ -349,10 +359,11 @@ def get_numerical_curie_suffix(curie): return None -def write_compendium(synonym_list,ofname,node_type,labels={},extra_prefixes=[],icrdf_filename=None): +def write_compendium(metadata_yamls, synonym_list, ofname, node_type, labels=None, extra_prefixes=None, icrdf_filename=None, properties_jsonl_gz_files=None): """ + :param metadata_yaml: The YAML files containing the metadata for this compendium. :param synonym_list: - :param ofname: + :param ofname: Output filename. A file with this filename will be created in both the `compendia` and `synonyms` output directories. 
    :param node_type:
     :param labels: A map of identifiers
     Not needed if each identifier will have a label in the correct directory (i.e. downloads/PMID/labels for PMID:xxx).
@@ -363,13 +374,23 @@ def write_compendium(synonym_list,ofname,node_type,labels={},extra_prefixes=[],i
         write_compendium() will throw a RuntimeError if it is not specified. This is to ensure that it has
         been properly specified as a prerequisite in a Snakemake file, so that write_compendium() is not run
         until after icRDF.tsv has been generated.
+    :param properties_jsonl_gz_files: (OPTIONAL) A list of gzipped JSON Lines (.jsonl.gz) files containing properties to be added to the output.
     :return:
     """
+    logger.info(f"Starting write_compendium({metadata_yamls}, {len(synonym_list)} slists, {ofname}, {node_type}, {len(labels)} labels, {extra_prefixes}, {icrdf_filename}, {properties_jsonl_gz_files}): {get_memory_usage_summary()}")
+
+    if extra_prefixes is None:
+        extra_prefixes = []
+    if labels is None:
+        labels = {}
     config = get_config()
     cdir = config['output_directory']
     biolink_version = config['biolink_version']
+
     node_factory = NodeFactory(make_local_name(''),biolink_version)
+    logger.info(f"NodeFactory ready: {node_factory} with {get_memory_usage_summary()}")
     synonym_factory = SynonymFactory(make_local_name(''))
+    logger.info(f"SynonymFactory ready: {synonym_factory} with {get_memory_usage_summary()}")

     # Load the preferred_name_boost_prefixes -- this tells us which prefixes to boost when
     # coming up with a preferred label for a particular Biolink class.
@@ -380,20 +401,74 @@ def write_compendium(synonym_list,ofname,node_type,labels={},extra_prefixes=[],i
     if not icrdf_filename:
         raise RuntimeError("No icrdf_filename parameter provided to write_compendium() -- this is required!")
     ic_factory = InformationContentFactory(icrdf_filename)
+    logger.info(f"InformationContentFactory ready: {ic_factory} with {get_memory_usage_summary()}")
     description_factory = DescriptionFactory(make_local_name(''))
+    logger.info(f"DescriptionFactory ready: {description_factory} with {get_memory_usage_summary()}")
+
     taxon_factory = TaxonFactory(make_local_name(''))
+    logger.info(f"TaxonFactory ready: {taxon_factory} with {get_memory_usage_summary()}")
+
     node_test = node_factory.create_node(input_identifiers=[],node_type=node_type,labels={},extra_prefixes = extra_prefixes)
+    logger.info(f"NodeFactory test complete: {node_test} with {get_memory_usage_summary()}")

     # Create compendia and synonyms directories, just in case they haven't been created yet.
     os.makedirs(os.path.join(cdir, 'compendia'), exist_ok=True)
     os.makedirs(os.path.join(cdir, 'synonyms'), exist_ok=True)

+    # Load all the properties.
+    property_list = PropertyList()
+    if properties_jsonl_gz_files:
+        for properties_jsonl_gz_file in properties_jsonl_gz_files:
+            logger.info(f"Loading properties from {properties_jsonl_gz_file}...")
+            count_loaded = property_list.add_properties_jsonl_gz(properties_jsonl_gz_file)
+            logger.info(f"Loaded {count_loaded} unique properties from {properties_jsonl_gz_file}")
+        logger.info(f"All {len(properties_jsonl_gz_files)} property files loaded ({property_list.count_unique()} total unique properties): {get_memory_usage_summary()}")
+    else:
+        logger.info("No property files provided or loaded.")
+
+    property_source_count = defaultdict(int)
+
+    # Counts.
+    count_cliques = 0
+    count_eq_ids = 0
+    count_synonyms = 0
+    # Write compendium and synonym files.
     with jsonlines.open(os.path.join(cdir,'compendia',ofname),'w') as outf, jsonlines.open(os.path.join(cdir,'synonyms',ofname),'w') as sfile:
+        # Calculate an estimated time to completion.
+        start_time = time.time_ns()
+        count_slist = 0
+        total_slist = len(synonym_list)
+
         for slist in synonym_list:
-            node = node_factory.create_node(input_identifiers=slist, node_type=node_type,labels = labels, extra_prefixes = extra_prefixes)
-            if node is not None:
+            # Before we get started, let's estimate where we're at.
+            count_slist += 1
+            if (count_slist == 1) or (count_slist % WRITE_COMPENDIUM_LOG_EVERY_X_CLIQUES == 0):
+                time_elapsed_seconds = (time.time_ns() - start_time) / 1E9
+                if time_elapsed_seconds < 0.001:
+                    # We don't want to divide by zero.
+                    time_elapsed_seconds = 0.001
+                remaining_slist = total_slist - count_slist
+                # count_slist --> time_elapsed_seconds
+                # remaining_slist --> remaining_slist/count_slist*time_elapsed_seconds
+                logger.info(f"Generating compendia and synonyms for {ofname} currently at {count_slist:,} out of {total_slist:,} ({count_slist/total_slist*100:.2f}%) in {format_timespan(time_elapsed_seconds)}: {get_memory_usage_summary()}")
+                logger.info(f" - Current rate: {count_slist/time_elapsed_seconds:.2f} cliques/second or {time_elapsed_seconds/count_slist:.6f} seconds/clique.")

+                time_remaining_seconds = (time_elapsed_seconds / count_slist * remaining_slist)
+                logger.info(f" - Estimated time remaining: {format_timespan(time_remaining_seconds)}")
+
+            node = node_factory.create_node(input_identifiers=slist, node_type=node_type,labels = labels, extra_prefixes = extra_prefixes)
+            if node is None:
+                # This usually happens because every CURIE in the node is not in the id_prefixes list for that node_type.
+                # Something to fix at some point, but we don't want to break the pipeline for this, so
+                # we emit a warning and skip this clique.
+                logger.warning(f"Could not create node for ({slist}, {node_type}, {labels}, {extra_prefixes}): returned None.")
+                continue
+            else:
+                count_cliques += 1
+                count_eq_ids += len(slist)
+
                 nw = {"type": node['type']}
                 ic = ic_factory.get_ic(node)
                 nw['ic'] = ic
@@ -456,20 +531,63 @@ def write_compendium(synonym_list,ofname,node_type,labels={},extra_prefixes=[],i
                 else:
                     preferred_name = ''
-                # Generate the node.
-                descs = description_factory.get_descriptions(node)
-                taxa = taxon_factory.get_taxa(node)
+                # At this point, we insert any HAS_ALTERNATIVE_ID IDs we have.
+                # The logic we use is: we insert all additional IDs for a CURIE *AFTER* that CURIE, in a random order, as long
+                # as the additional CURIE is not already in the list of CURIEs.
+                #
+                # We will attempt to retrieve a label or description for this ID as well.
+                current_curies = set()
+                identifier_list = []
+                curie_labels = dict()
+                for nid in node['identifiers']:
+                    iid = nid['identifier']
+
+                    # Prevent duplicates (might happen if e.g. we have an additional CURIE that duplicates an existing one later in the list).
+                    if iid in current_curies:
+                        continue
+
+                    identifier_list.append(iid)
+                    current_curies.add(iid)
+
+                    if 'label' in nid:
+                        curie_labels[iid] = nid['label']
+
+                    # Are there any additional CURIEs for this CURIE?
+                    additional_curies = property_list.get_all(iid, HAS_ALTERNATIVE_ID)
+                    if additional_curies:
+                        # ac_labelled will be a list that consists of either LabeledID (if the CURIE could be labeled)
+                        # or str objects (consisting of an unlabeled CURIE).
+ ac_labelled = node_factory.apply_labels(input_identifiers=additional_curies, labels=labels) + + for ac, label in zip(additional_curies, list(ac_labelled)): + additional_curie = Text.get_curie(label) + if additional_curie not in current_curies: + identifier_list.append(additional_curie) + current_curies.add(additional_curie) + + # Track the property sources we used. + property_source_count[ac.source] += 1 + + if isinstance(label, LabeledID) and label.label: + curie_labels[additional_curie] = label.label + + # Add description and taxon information and construct the final nw object. + descs = description_factory.get_descriptions(identifier_list) + taxa = taxon_factory.get_taxa(identifier_list) + + # Construct the written-out identifier objects. nw['identifiers'] = [] - for nids in node['identifiers']: - id_info = {'i': nids['identifier']} + for iid in identifier_list: + id_info = {'i': iid} - if 'label' in nids: - id_info['l'] = nids['label'] + if iid in curie_labels: + id_info['l'] = curie_labels[iid] if id_info['i'] in descs: # Sort descriptions from the shortest to the longest. id_info['d'] = list(sorted(descs[id_info['i']], key=lambda x: len(x))) + if id_info['i'] in taxa: # Sort taxa by CURIE suffix. id_info['t'] = list(sorted(taxa[id_info['i']], key=get_numerical_curie_suffix)) @@ -498,11 +616,13 @@ def write_compendium(synonym_list,ofname,node_type,labels={},extra_prefixes=[],i "names": synonyms_list, "types": [t[8:] for t in types]} # remove biolink: + count_synonyms += len(synonyms_list) + # Write out the preferred name. if preferred_name: document["preferred_name"] = preferred_name else: - logging.debug( + logger.debug( f"No preferred name for {node}, probably because all names were filtered out, skipping." ) continue @@ -515,7 +635,7 @@ def write_compendium(synonym_list,ofname,node_type,labels={},extra_prefixes=[],i # Since synonyms_list is sorted, we can use the length of the first term as the synonym. if len(synonyms_list) == 0: - logging.debug(f"Synonym list for {node} is empty: no valid name. Skipping.") + logger.debug(f"Synonym list for {node} is empty: no valid name. Skipping.") continue else: document["shortest_name_length"] = len(synonyms_list[0]) @@ -543,7 +663,25 @@ def write_compendium(synonym_list,ofname,node_type,labels={},extra_prefixes=[],i print(node["type"]) print(node_factory.get_ancestors(node["type"])) traceback.print_exc() - exit() + raise ex + + # Write out the metadata.yaml file combining information from all the metadata.yaml files. + write_combined_metadata( + os.path.join(cdir, 'metadata', ofname + '.yaml'), + typ='compendium', + name=ofname, + counts={ + 'cliques': count_cliques, + 'eq_ids': count_eq_ids, + 'synonyms': count_synonyms, + 'property_sources': property_source_count, + }, + combined_from_filenames=metadata_yamls, + ) + + # Close all the factories. + taxon_factory.close() + def glom(conc_set, newgroups, unique_prefixes=['INCHIKEY'],pref='HP',close={}): """We want to construct sets containing equivalent identifiers. 
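As an illustration of the reworked write_compendium() signature and the HAS_ALTERNATIVE_ID handling above, here is a minimal, hypothetical sketch of a call site. The clique, file paths, and property file name are made up for illustration; only the keyword arguments mirror the signature introduced in this patch.

    from src.babel_utils import write_compendium

    # One hypothetical clique of equivalent CURIEs.
    cliques = [{"CHEBI:15377", "PUBCHEM.COMPOUND:962"}]

    write_compendium(
        metadata_yamls=["babel_outputs/intermediate/chemicals/metadata/example.yaml"],  # combined into metadata/<ofname>.yaml
        synonym_list=cliques,
        ofname="ChemicalEntity.txt",                 # written under both compendia/ and synonyms/
        node_type="biolink:ChemicalEntity",
        icrdf_filename="babel_downloads/icRDF.tsv",  # still required; omitting it raises RuntimeError
        properties_jsonl_gz_files=["babel_outputs/intermediate/chemicals/properties/example.jsonl.gz"],  # hypothetical path
    )
    # Any HAS_ALTERNATIVE_ID value found for a CURIE in the properties files is appended to the
    # clique's identifier list immediately after that CURIE, skipping CURIEs already present.

The counters added at the top of the clique loop (count_cliques, count_eq_ids, count_synonyms, property_source_count) are what end up in the combined metadata/<ofname>.yaml file written at the end of the function.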
diff --git a/src/createcompendia/anatomy.py b/src/createcompendia/anatomy.py index dce606d5..75920f28 100644 --- a/src/createcompendia/anatomy.py +++ b/src/createcompendia/anatomy.py @@ -2,6 +2,7 @@ import requests import src.datahandlers.obo as obo +from src.metadata.provenance import write_concord_metadata from src.util import Text from src.prefixes import MESH, NCIT, CL, GO, UBERON, SNOMEDCT, WIKIDATA, UMLS, FMA @@ -91,14 +92,37 @@ def write_umls_ids(mrsty, outfile): #CL only shows up as an xref once in uberon, and it's a mistake. It doesn't show up in anything else. #GO only shows up as an xref once in uberon, and it's a mistake. It doesn't show up in anything else. #PMID is just wrong -def build_anatomy_obo_relationships(outdir): +def build_anatomy_obo_relationships(outdir, metadata_yamls): ignore_list = ['PMID','BTO','BAMS','FMA','CALOHA','GOC','WIKIPEDIA.EN','CL','GO','NIF_SUBCELLULAR','HTTP','OPENCYC'] #Create the equivalence pairs with open(f'{outdir}/{UBERON}', 'w') as uberon, open(f'{outdir}/{GO}', 'w') as go, open(f'{outdir}/{CL}', 'w') as cl: build_sets(f'{UBERON}:0001062', {UBERON:uberon, GO:go, CL:cl},'xref', ignore_list=ignore_list) build_sets(f'{GO}:0005575', {UBERON:uberon, GO:go, CL:cl},'xref', ignore_list=ignore_list) - -def build_wikidata_cell_relationships(outdir): + # CL is now being handled by Wikidata (build_wikidata_cell_relationships), so we can probably remove it from here. + + # Write out metadata. + for metadata_name in [UBERON, GO, CL]: + write_concord_metadata(metadata_yamls[metadata_name], + name='build_anatomy_obo_relationships()', + sources=[ + { + 'type': 'UberGraph', + 'name': 'UBERON' + }, + { + 'type': 'UberGraph', + 'name': 'GO' + }, + { + 'type': 'UberGraph', + 'name': 'CL' + } + ], + description=f'get_subclasses_and_xrefs() of {UBERON}:0001062 and {GO}:0005575', + concord_filename=f'{outdir}/{metadata_name}', + ) + +def build_wikidata_cell_relationships(outdir, metadata_yaml): #This sparql returns all the wikidata items that have a UMLS identifier and a CL identifier sparql = """PREFIX wdt: PREFIX wdtn: @@ -135,10 +159,21 @@ def build_wikidata_cell_relationships(outdir): else: print(f'Pair {pair} is not unique {counts[pair[0]]} {counts[pair[1]]}') -def build_anatomy_umls_relationships(mrconso, idfile,outfile): - umls.build_sets(mrconso, idfile, outfile, {'SNOMEDCT_US':SNOMEDCT,'MSH': MESH, 'NCI': NCIT, 'GO': GO, 'FMA': FMA}) - -def build_compendia(concordances, identifiers, icrdf_filename): + # Write out metadata + write_concord_metadata(metadata_yaml, + name='build_wikidata_cell_relationships()', + sources=[{ + 'type': 'Frink', + 'name': 'Frink Direct Normalized Graph via SPARQL' + }], + description='wd:P7963 ("Cell Ontology ID") and wd:P2892 ("UMLS CUI") from Wikidata', + concord_filename=f'{outdir}/{WIKIDATA}', + ) + +def build_anatomy_umls_relationships(mrconso, idfile, outfile, umls_metadata): + umls.build_sets(mrconso, idfile, outfile, {'SNOMEDCT_US':SNOMEDCT,'MSH': MESH, 'NCI': NCIT, 'GO': GO, 'FMA': FMA}, provenance_metadata_yaml=umls_metadata) + +def build_compendia(concordances, metadata_yamls, identifiers, icrdf_filename): """:concordances: a list of files from which to read relationships :identifiers: a list of files from which to read identifiers and optional categories""" dicts = {} @@ -175,7 +210,7 @@ def build_compendia(concordances, identifiers, icrdf_filename): typed_sets = create_typed_sets(set([frozenset(x) for x in dicts.values()]),types) for biotype,sets in typed_sets.items(): baretype = biotype.split(':')[-1] - 
write_compendium(sets,f'{baretype}.txt',biotype,{}, icrdf_filename=icrdf_filename) + write_compendium(metadata_yamls, sets,f'{baretype}.txt',biotype,{}, icrdf_filename=icrdf_filename) def create_typed_sets(eqsets,types): """Given a set of sets of equivalent identifiers, we want to type each one into diff --git a/src/createcompendia/cell_line.py b/src/createcompendia/cell_line.py index 89252c1e..d61176ff 100644 --- a/src/createcompendia/cell_line.py +++ b/src/createcompendia/cell_line.py @@ -2,7 +2,7 @@ from src.babel_utils import read_identifier_file,glom,write_compendium -def build_compendia(ifile, icrdf_filename): +def build_compendia(ifile, metadata_yamls, icrdf_filename): """:identifiers: a list of files from which to read identifiers and optional categories""" dicts = {} types = {} @@ -13,5 +13,5 @@ def build_compendia(ifile, icrdf_filename): types.update(new_types) idsets = set([frozenset(x) for x in dicts.values()]) baretype = CELL_LINE.split(':')[-1] - write_compendium(idsets, f'{baretype}.txt', CELL_LINE, {}, icrdf_filename=icrdf_filename) + write_compendium(metadata_yamls, idsets, f'{baretype}.txt', CELL_LINE, {}, icrdf_filename=icrdf_filename) diff --git a/src/createcompendia/chemicals.py b/src/createcompendia/chemicals.py index 0a6dd39f..356d7575 100644 --- a/src/createcompendia/chemicals.py +++ b/src/createcompendia/chemicals.py @@ -1,11 +1,16 @@ import logging +import os from collections import defaultdict +from os.path import dirname + import jsonlines import requests import ast import gzip -from gzip import GzipFile +from src import util +from src.properties import Property, HAS_ALTERNATIVE_ID +from src.metadata.provenance import write_concord_metadata, write_combined_metadata from src.ubergraph import UberGraph from src.prefixes import MESH, CHEBI, UNII, DRUGBANK, INCHIKEY, PUBCHEMCOMPOUND,GTOPDB, KEGGCOMPOUND, DRUGCENTRAL, CHEMBLCOMPOUND, UMLS, RXCUI from src.categories import MOLECULAR_MIXTURE, SMALL_MOLECULE, CHEMICAL_ENTITY, POLYPEPTIDE, COMPLEX_MOLECULAR_MIXTURE, CHEMICAL_MIXTURE, DRUG @@ -16,6 +21,9 @@ import src.datahandlers.mesh as mesh import src.datahandlers.umls as umls +from src.util import get_memory_usage_summary + +logger = util.get_logger(__name__) def get_type_from_smiles(smiles): if '.' in smiles: @@ -70,11 +78,11 @@ def write_rxnorm_ids(infile, outfile): umlsmap ["A1.4.1.1.1"] = DRUG umls.write_rxnorm_ids(umlsmap, filter_types, infile, outfile, prefix=RXCUI, styfile="RXNSTY.RRF") -def build_chemical_umls_relationships(mrconso, idfile,outfile): - umls.build_sets(mrconso, idfile, outfile, {'MSH': MESH, 'DRUGBANK': DRUGBANK, 'RXNORM': RXCUI }) +def build_chemical_umls_relationships(mrconso, idfile,outfile, metadata_yaml): + umls.build_sets(mrconso, idfile, outfile, {'MSH': MESH, 'DRUGBANK': DRUGBANK, 'RXNORM': RXCUI }, provenance_metadata_yaml=metadata_yaml) -def build_chemical_rxnorm_relationships(conso, idfile,outfile): - umls.build_sets(conso, idfile, outfile, {'MSH': MESH, 'DRUGBANK': DRUGBANK}, cui_prefix=RXCUI) +def build_chemical_rxnorm_relationships(conso, idfile, outfile, metadata_yaml): + umls.build_sets(conso, idfile, outfile, {'MSH': MESH, 'DRUGBANK': DRUGBANK}, cui_prefix=RXCUI, provenance_metadata_yaml=metadata_yaml) def write_pubchem_ids(labelfile,smilesfile,outfile): #Trying to be memory efficient here. 
We could just ingest the whole smilesfile which would make this code easier @@ -358,14 +366,22 @@ def is_cas(thing): return False return True -def make_pubchem_cas_concord(pubchemsynonyms, outfile): +def make_pubchem_cas_concord(pubchemsynonyms, outfile, metadata_yaml): with open(pubchemsynonyms,'r') as inf, open(outfile,'w') as outf: for line in inf: x = line.strip().split('\t') if is_cas(x[1]): outf.write(f'{x[0]}\txref\tCAS:{x[1]}\n') -def make_pubchem_mesh_concord(pubcheminput,meshlabels,outfile): + write_concord_metadata( + metadata_yaml, + name='make_pubchem_cas_concord()', + description='make_pubchem_cas_concord() creates xrefs from PUBCHEM identifiers in the PubChem synonyms file ' + + f'({pubchemsynonyms}) to Chemical Abstracts Service (CAS) identifiers.', + concord_filename=outfile, + ) + +def make_pubchem_mesh_concord(pubcheminput,meshlabels,outfile, metadata_yaml): mesh_label_to_id={} #Meshlabels has all kinds of stuff. e.g. these are both in there: #MESH:D014867 Water @@ -380,7 +396,7 @@ def make_pubchem_mesh_concord(pubcheminput,meshlabels,outfile): # first mapping is the 'best' i.e. the one most frequently reported. # We will only use the first one used_pubchem = set() - with open(pubcheminput,'r') as inf, open(outfile,'w') as outf: + with open(pubcheminput,'r', encoding='ISO-8859-1') as inf, open(outfile,'w') as outf: for line in inf: x = line.strip().split('\t') # x[0] = puchemid (no prefix), x[1] = mesh label if x[0] in used_pubchem: @@ -393,7 +409,15 @@ def make_pubchem_mesh_concord(pubcheminput,meshlabels,outfile): outf.write(f'{PUBCHEMCOMPOUND}:{x[0]}\txref\t{mesh_id}\n') used_pubchem.add(x[0]) -def build_drugcentral_relations(infile,outfile): + write_concord_metadata( + metadata_yaml, + name='make_pubchem_mesh_concord()', + description=f'make_pubchem_mesh_concord() loads MeSH labels from {meshlabels}, then creates xrefs from PubChem ' + + f'identifiers in the PubChem input file ({pubcheminput}) to those MeSH identifiers using the labels as keys.', + concord_filename=outfile, + ) + +def build_drugcentral_relations(infile,outfile, metadata_yaml): prefixmap = { 'CHEBI': CHEBI, 'ChEMBL_ID': CHEMBLCOMPOUND, 'DRUGBANK_ID': DRUGBANK, @@ -417,8 +441,14 @@ def build_drugcentral_relations(infile,outfile): #print('ok') outf.write(f'{DRUGCENTRAL}:{parts[drugcentral_id_col]}\txref\t{prefixmap[external_ns]}:{parts[external_id_col]}\n') + write_concord_metadata( + metadata_yaml, + name='build_drugcentral_relations()', + description=f'Build xrefs from DrugCentral ({infile}) to {DRUGCENTRAL} using the prefix map {prefixmap}.', + concord_filename=outfile, + ) -def make_gtopdb_relations(infile,outfile): +def make_gtopdb_relations(infile,outfile, metadata_yaml): with open(infile,'r') as inf, open(outfile,'w') as outf: h = inf.readline() # We might have a header/version line. If so, skip to the next line. @@ -435,7 +465,14 @@ def make_gtopdb_relations(infile,outfile): inchi = f'{INCHIKEY}:{x[inchi_index][1:-1]}' outf.write(f'{gid}\txref\t{inchi}\n') -def make_chebi_relations(sdf,dbx,outfile): + write_concord_metadata( + metadata_yaml, + name='make_gtopdb_relations()', + description=f'Transform Ligand ID/InChIKey mappings from {infile} into a concord.', + concord_filename=outfile, + ) + +def make_chebi_relations(sdf,dbx,outfile,propfile_gz,metadata_yaml): """CHEBI contains relations both about chemicals with and without inchikeys. 
You might think that because everything is based on unichem, we could avoid the with structures part, but history has shown that we lose links in that case, so we will use both the structured and unstructured chemical entries.""" @@ -451,14 +488,35 @@ def make_chebi_relations(sdf,dbx,outfile): dbxdata = inf.read() kk = 'keggcompounddatabaselinks' pk = 'pubchemdatabaselinks' - with open(outfile,'w') as outf: + secondary_chebi_id = 'secondarychebiid' + + # What if we don't have a propfile directory? + os.makedirs(dirname(propfile_gz), exist_ok=True) + + with open(outfile,'w') as outf, gzip.open(propfile_gz, 'wt') as propf: #Write SDF structured things for cid,props in chebi_sdf_dat.items(): + if secondary_chebi_id in props: + secondary_ids = props[secondary_chebi_id] + for secondary_id in secondary_ids: + propf.write(Property( + curie = cid, + predicate = HAS_ALTERNATIVE_ID, + value = secondary_id, + source = f'Listed as a CHEBI secondary ID in the ChEBI SDF file ({sdf})' + ).to_json_line()) if kk in props: - outf.write(f'{cid}\txref\t{KEGGCOMPOUND}:{props[kk]}\n') + # This is apparently a list now sometimes? + kegg_ids = props[kk] + if not isinstance(kegg_ids, list): + kegg_ids = [kegg_ids] + for kegg_id in kegg_ids: + outf.write(f'{cid}\txref\t{KEGGCOMPOUND}:{kegg_id}\n') if pk in props: #Apparently there's a lot of structure here? - v = props[pk] + database_links = props[pk] + # To simulate previous behavior, I'll concatenate previous values together. + v = "".join(database_links) parts = v.split('SID: ') for p in parts: if 'CID' in p: @@ -479,10 +537,15 @@ def make_chebi_relations(sdf,dbx,outfile): if x[3] == 'Pubchem accession': outf.write(f'{cid}\txref\t{PUBCHEMCOMPOUND}:{x[4]}\n') + write_concord_metadata( + metadata_yaml, + name='make_chebi_relations()', + description=f'make_chebi_relations() creates xrefs from the ChEBI database ({sdf}) to {PUBCHEMCOMPOUND} and {KEGGCOMPOUND}.', + concord_filename=outfile, + ) - -def get_mesh_relationships(mesh_id_file,cas_out, unii_out): +def get_mesh_relationships(mesh_id_file,cas_out, unii_out, cas_metadata, unii_metadata): meshes = set() with open(mesh_id_file,'r') as inf: for line in inf: @@ -504,7 +567,31 @@ def get_mesh_relationships(mesh_id_file,cas_out, unii_out): #is a unii? uniiout.write(f'{meshid}\txref\tUNII:{reg}\n') -def get_wikipedia_relationships(outfile): + write_concord_metadata( + cas_metadata, + name='get_mesh_relationships()', + sources=[{ + 'type': 'MeSH Registry', + 'name': 'MeSH Registry', + }], + description=f'get_mesh_relationships() iterates through the MeSH registry, filters it to the MeSH IDs ' + f'in {mesh_id_file}, then writes out CAS mappings to {cas_out}', + concord_filename=cas_out, + ) + + write_concord_metadata( + unii_metadata, + name='get_mesh_relationships()', + sources=[{ + 'type': 'MeSH Registry', + 'name': 'MeSH Registry', + }], + description=f'get_mesh_relationships() iterates through the MeSH registry, filters it to the MeSH IDs ' + f'in {mesh_id_file}, then writes out non-CAS mappings (i.e. UNII mappings) to {unii_out}', + concord_filename=unii_out, + ) + +def get_wikipedia_relationships(outfile, metadata_yaml): url = 'https://query.wikidata.org/sparql?format=json&query=SELECT ?chebi ?mesh WHERE { ?compound wdt:P683 ?chebi . ?compound wdt:P486 ?mesh. 
}' results = requests.get(url).json() pairs = [(f'{MESH}:{r["mesh"]["value"]}', f'{CHEBI}:{r["chebi"]["value"]}') @@ -521,7 +608,18 @@ def get_wikipedia_relationships(outfile): for m,c in pairs: outf.write(f'{m}\txref\t{c}\n') -def build_untyped_compendia(concordances, identifiers,unichem_partial, untyped_concord, type_file): + write_concord_metadata( + metadata_yaml, + name="get_wikipedia_relationships()", + sources=[{ + 'type': 'Wikidata', + 'name': 'Wikidata SPARQL query', + }], + description='Wikidata SPARQL query to find Wikidata entities with both CHEBI and MESH IDs, and build a concordance between them.', + concord_filename=outfile, + ) + +def build_untyped_compendia(concordances, identifiers, unichem_partial, untyped_concord, type_file, metadata_yaml, input_metadata_yamls): """:concordances: a list of files from which to read relationships :identifiers: a list of files from which to read identifiers and optional categories""" dicts = read_partial_unichem(unichem_partial) @@ -567,24 +665,41 @@ def build_untyped_compendia(concordances, identifiers,unichem_partial, untyped_c for s in untyped_sets: outf.write(f'{set(s)}\n') -def build_compendia(type_file, untyped_compendia_file, icrdf_filename): + # Build the metadata file by combining the input metadata_yamls. + write_combined_metadata( + filename=metadata_yaml, + typ='untyped_compendium', + name='chemicals.build_untyped_compendia()', + description=f'Generate an untyped compendium from concordances {concordances}, identifiers {identifiers}, " +' + f'unichem_partial {unichem_partial}, untyped_concord {untyped_concord}, and type file {type_file}.', + # sources=None, url='', counts=None, + combined_from_filenames=input_metadata_yamls, + ) + +def build_compendia(type_file, untyped_compendia_file, properties_jsonl_gz_files, metadata_yamls, icrdf_filename): types = {} with open(type_file,'r') as inf: for line in inf: x = line.strip().split('\t') types[x[0]] = x[1] + logger.info(f'Loaded {len(types)} types from {type_file}: {get_memory_usage_summary()}') + untyped_sets = set() with open(untyped_compendia_file,'r') as inf: for line in inf: s = ast.literal_eval(line.strip()) untyped_sets.add(frozenset(s)) + logger.info(f'Loaded {len(untyped_sets)} untyped sets from {untyped_compendia_file}: {get_memory_usage_summary()}') + typed_sets = create_typed_sets(untyped_sets, types) + logger.info(f'Created {len(typed_sets)} typed sets from {len(untyped_sets)} untyped sets: {get_memory_usage_summary()}') + for biotype, sets in typed_sets.items(): baretype = biotype.split(':')[-1] if biotype == DRUG: - write_compendium(sets, f'{baretype}.txt', biotype, {}, extra_prefixes=[MESH,UNII], icrdf_filename=icrdf_filename) + write_compendium(metadata_yamls, sets, f'{baretype}.txt', biotype, {}, extra_prefixes=[MESH,UNII], icrdf_filename=icrdf_filename, properties_jsonl_gz_files=properties_jsonl_gz_files) else: - write_compendium(sets, f'{baretype}.txt', biotype, {}, extra_prefixes=[RXCUI], icrdf_filename=icrdf_filename) + write_compendium(metadata_yamls, sets, f'{baretype}.txt', biotype, {}, extra_prefixes=[RXCUI], icrdf_filename=icrdf_filename, properties_jsonl_gz_files=properties_jsonl_gz_files) def create_typed_sets(eqsets, types): """ diff --git a/src/createcompendia/diseasephenotype.py b/src/createcompendia/diseasephenotype.py index 65431d63..d341a67f 100644 --- a/src/createcompendia/diseasephenotype.py +++ b/src/createcompendia/diseasephenotype.py @@ -2,6 +2,7 @@ from collections import defaultdict import src.datahandlers.obo as obo +from 
src.metadata.provenance import write_concord_metadata from src.prefixes import MESH, NCIT, MONDO, OMIM, HP, SNOMEDCT, MEDDRA, EFO, ORPHANET, ICD0, ICD9, ICD10, UMLS, KEGGDISEASE from src.categories import DISEASE, PHENOTYPIC_FEATURE @@ -87,28 +88,69 @@ def write_umls_ids(mrsty, outfile,badumlsfile): umls.write_umls_ids(mrsty, umlsmap, outfile, blocklist_umls_ids=badumls) -def build_disease_obo_relationships(outdir): +def build_disease_obo_relationships(outdir, metadata_yamls): #Create the equivalence pairs with open(f'{outdir}/{HP}', 'w') as outfile: + other_prefixes = {'MSH':MESH,'SNOMEDCT_US':SNOMEDCT,'SNOMED_CT': SNOMEDCT, 'ORPHANET':ORPHANET, 'ICD-9':ICD9, 'ICD-10':ICD10, 'ICD-0':ICD0, 'ICD-O':ICD0 } build_sets(f'{HP}:0000118', {HP:outfile}, ignore_list=['ICD'], - other_prefixes={'MSH':MESH,'SNOMEDCT_US':SNOMEDCT,'SNOMED_CT': SNOMEDCT, 'ORPHANET':ORPHANET, 'ICD-9':ICD9, 'ICD-10':ICD10, 'ICD-0':ICD0, 'ICD-O':ICD0 }, + other_prefixes=other_prefixes, set_type='xref') + + write_concord_metadata( + metadata_yamls['HP'], + name='build_disease_obo_relationships()', + sources=[ + { + 'type': 'UberGraph', + 'name': 'HP' + } + ], + description=f'ubergraph.build_sets() of {HP}:0000118 with other_prefixes {other_prefixes}', + concord_filename=f'{outdir}/{HP}' + ) + with open(f'{outdir}/{MONDO}', 'w') as outfile: #Orphanet here is confusing. In mondo it comes out mixed case like "Orphanet" and we want to cap it. We have a normer # in build sets, but it is based on the UPPERCASED prefix. So we're passing in that we want to change uppercase orphanet to uppercase # orphanet. In actuality that matching key will pick up any case orphanet, including the one that actually occurs. build_sets('MONDO:0000001', {MONDO:outfile}, set_type='exact', other_prefixes={'ORPHANET':ORPHANET}) build_sets('MONDO:0042489', {MONDO:outfile}, set_type='exact', other_prefixes={'ORPHANET':ORPHANET}) + + write_concord_metadata(metadata_yamls['MONDO'], + name='build_disease_obo_relationships()', + sources=[ + { + 'type': 'UberGraph', + 'name': 'MONDO' + } + ], + description=f'ubergraph.build_sets() (exact) of {MONDO}:0000001 and {MONDO}:0042489, including ORPHANET prefixes', + concord_filename=f'{outdir}/{MONDO}' + ) + with open(f'{outdir}/{MONDO}_close', 'w') as outfile: build_sets('MONDO:0000001', {MONDO:outfile}, set_type='close', other_prefixes={'ORPHANET':ORPHANET}) build_sets('MONDO:0042489', {MONDO:outfile}, set_type='close', other_prefixes={'ORPHANET':ORPHANET}) -def build_disease_efo_relationships(idfile,outfile): - efo.make_concords(idfile, outfile) + write_concord_metadata( + metadata_yamls['MONDO_close'], + name='build_disease_obo_relationships()', + sources=[ + { + 'type': 'UberGraph', + 'name': 'MONDO' + } + ], + description=f'ubergraph.build_sets() (close matches) of {MONDO}:0000001 and {MONDO}:0042489, including ORPHANET prefixes', + concord_filename=f'{outdir}/{MONDO}_close' + ) + +def build_disease_efo_relationships(idfile,outfile, metadata_yaml): + efo.make_concords(idfile, outfile, provenance_metadata=metadata_yaml) -def build_disease_umls_relationships(mrconso, idfile, outfile, omimfile, ncitfile): +def build_disease_umls_relationships(mrconso, idfile, outfile, omimfile, ncitfile, metadata_yaml): #UMLS contains xrefs between a disease UMLS and a gene OMIM. So here we are saying: if you are going to link to # an omim identifier, make sure it's a disease omim, not some other thing. 
good_ids = {} @@ -118,15 +160,31 @@ def build_disease_umls_relationships(mrconso, idfile, outfile, omimfile, ncitfil for line in inf: x = line.split()[0] good_ids[prefix].add(x) - umls.build_sets(mrconso, idfile, outfile, {'SNOMEDCT_US':SNOMEDCT,'MSH': MESH, 'NCI': NCIT, 'HPO': HP, 'MDR':MEDDRA, 'OMIM': OMIM},acceptable_identifiers=good_ids) + umls.build_sets(mrconso, idfile, outfile, + {'SNOMEDCT_US':SNOMEDCT,'MSH': MESH, 'NCI': NCIT, 'HPO': HP, 'MDR':MEDDRA, 'OMIM': OMIM}, + acceptable_identifiers=good_ids, + provenance_metadata_yaml=metadata_yaml) -def build_disease_doid_relationships(idfile,outfile): - doid.build_xrefs(idfile, outfile, other_prefixes={'ICD10CM':ICD10, 'ICD9CM':ICD9, 'ICDO': ICD0, 'NCI': NCIT, - 'SNOMEDCT_US_2018_03_01': SNOMEDCT, 'SNOMEDCT_US_2019_09_01': SNOMEDCT, - 'SNOMEDCT_US_2020_03_01': SNOMEDCT, 'SNOMEDCT_US_2020_09_01': SNOMEDCT, - 'UMLS_CUI': UMLS, 'KEGG': KEGGDISEASE}) +def build_disease_doid_relationships(idfile,outfile, metadata_yaml): + other_prefixes = {'ICD10CM':ICD10, 'ICD9CM':ICD9, 'ICDO': ICD0, 'NCI': NCIT, + 'SNOMEDCT_US_2018_03_01': SNOMEDCT, 'SNOMEDCT_US_2019_09_01': SNOMEDCT, + 'SNOMEDCT_US_2020_03_01': SNOMEDCT, 'SNOMEDCT_US_2020_09_01': SNOMEDCT, + 'UMLS_CUI': UMLS, 'KEGG': KEGGDISEASE} + doid.build_xrefs(idfile, outfile, other_prefixes=other_prefixes) + write_concord_metadata( + metadata_yaml, + name='build_disease_doid_relationships()', + description=f'build_disease_doid_relationships() using the DOID ID file {idfile} and other_prefixes {other_prefixes}', + concord_filename=outfile, + sources=[ + { + 'type': 'DOID', + 'name': 'doid.build_xrefs' + } + ] + ) -def build_compendium(concordances, identifiers, mondoclose, badxrefs, icrdf_filename): +def build_compendium(concordances, metadata_yamls, identifiers, mondoclose, badxrefs, icrdf_filename): """:concordances: a list of files from which to read relationships :identifiers: a list of files from which to read identifiers and optional categories""" dicts = {} @@ -173,7 +231,7 @@ def build_compendium(concordances, identifiers, mondoclose, badxrefs, icrdf_file typed_sets = create_typed_sets(set([frozenset(x) for x in dicts.values()]),types) for biotype,sets in typed_sets.items(): baretype = biotype.split(':')[-1] - write_compendium(sets,f'{baretype}.txt',biotype,{}, icrdf_filename=icrdf_filename) + write_compendium(metadata_yamls, sets,f'{baretype}.txt',biotype,{}, icrdf_filename=icrdf_filename) def create_typed_sets(eqsets,types): """Given a set of sets of equivalent identifiers, we want to type each one into @@ -231,6 +289,7 @@ def read_badxrefs(fn): return morebad def load_diseases_and_phenotypes(concords,idlists,badhpos,badhpoxrefs, icrdf_filename): + metadata_yamls = [] #print('disease/phenotype') #print('get and write hp sets') #bad_mappings = read_bad_hp_mappings(badhpos) @@ -301,8 +360,8 @@ def load_diseases_and_phenotypes(concords,idlists,badhpos,badhpoxrefs, icrdf_fil print('dump it') fs = set([frozenset(x) for x in dicts.values()]) diseases,phenotypes = create_typed_sets(fs) - write_compendium(diseases,'disease.txt','biolink:Disease',labels, icrdf_filename=icrdf_filename) - write_compendium(phenotypes,'phenotypes.txt','biolink:PhenotypicFeature',labels, icrdf_filename=icrdf_filename) + write_compendium(metadata_yamls, diseases,'disease.txt','biolink:Disease',labels, icrdf_filename=icrdf_filename) + write_compendium(metadata_yamls, phenotypes,'phenotypes.txt','biolink:PhenotypicFeature',labels, icrdf_filename=icrdf_filename) if __name__ == '__main__': with open('crapfile','w') as 
crapfile:
diff --git a/src/createcompendia/drugchemical.py b/src/createcompendia/drugchemical.py
index 2de48047..3f9410c7 100644
--- a/src/createcompendia/drugchemical.py
+++ b/src/createcompendia/drugchemical.py
@@ -1,5 +1,6 @@
 import csv

+from src.metadata.provenance import write_combined_metadata, write_concord_metadata
 from src.node import NodeFactory, InformationContentFactory
 from src.prefixes import RXCUI, PUBCHEMCOMPOUND, UMLS
 from src.categories import (CHEMICAL_ENTITY, DRUG, MOLECULAR_MIXTURE, FOOD, COMPLEX_MOLECULAR_MIXTURE,
@@ -139,7 +140,7 @@ def get_cui(x,indicator_column,cui_column,aui_column,aui_to_cui,sdui_to_cui):
         print(x)
         exit()

-def build_rxnorm_relationships(conso, relfile, outfile):
+def build_rxnorm_relationships(conso, relfile, outfile, metadata_yaml):
     """RXNREL is a lousy file. The subject and object can sometimes be a CUI and sometimes an AUI
     and you have to use CONSO to figure out how to go back and forth.
@@ -167,8 +168,32 @@ def build_rxnorm_relationships(conso, relfile, outfile):
     #This is maybe relying on convention a bit too much.
     if outfile == "UMLS":
         prefix = UMLS
+        sources = [
+            {
+                'type': 'UMLS',
+                'name': 'MRCONSO',
+                'filename': conso
+            },
+            {
+                'type': 'UMLS',
+                'name': 'MRREL',
+                'filename': relfile
+            }
+        ]
     else:
         prefix = RXCUI
+        sources = [
+            {
+                'type': 'RXNORM',
+                'name': 'RXNCONSO',
+                'filename': conso
+            },
+            {
+                'type': 'RXNORM',
+                'name': 'RXNREL',
+                'filename': relfile
+            }
+        ]
     aui_to_cui, sdui_to_cui = get_aui_to_cui(conso)
     # relfile = os.path.join('input_data', 'private', "RXNREL.RRF")
     single_use_relations = {"has_active_ingredient": defaultdict(set),
@@ -214,6 +239,14 @@ def build_rxnorm_relationships(conso, relfile, outfile):
                 continue
             outf.write(f"{prefix}:{subject}\t{predicate}\t{prefix}:{next(iter(objects))}\n")

+    write_concord_metadata(
+        metadata_yaml,
+        name='build_rxnorm_relationships()',
+        description=f'Builds relationships between RxCUI and other identifiers from a CONSO ({conso}) and a REL ({relfile}).',
+        sources=sources,
+        concord_filename=outfile,
+    )
+
 def load_cliques(compendium):
     rx_to_clique = {}
@@ -228,7 +261,7 @@ def load_cliques(compendium):
             rx_to_clique[terms["i"]] = clique
     return rx_to_clique

-def build_pubchem_relationships(infile,outfile):
+def build_pubchem_relationships(infile,outfile, metadata_yaml):
     with open(infile,"r") as inf:
         document = json.load(inf)
     with open(outfile,"w") as outf:
@@ -238,7 +271,20 @@ def build_pubchem_relationships(infile,outfile):
             for cid in cids:
                 outf.write(f"{RXCUI}:{rxnid}\tlinked\t{PUBCHEMCOMPOUND}:{cid}\n")

-def build_conflation(manual_concord_filename, rxn_concord, umls_concord, pubchem_rxn_concord, drug_compendium, chemical_compendia, icrdf_filename, outfilename):
+    write_concord_metadata(
+        metadata_yaml,
+        name='build_pubchem_relationships()',
+        description=f'Builds relationships between RxCUI and PubChem Compound identifiers from a PubChem annotations file ({infile}).',
+        sources=[{
+            'type': 'PubChem',
+            'name': 'PubChem RxNorm annotations',
+            'description': 'PubChem RxNorm mappings generated by pubchem.pull_rxnorm_annotations()',
+            'filename': infile
+        }],
+        concord_filename=outfile,
+    )
+
+def build_conflation(manual_concord_filename, rxn_concord, umls_concord, pubchem_rxn_concord, drug_compendium, chemical_compendia, icrdf_filename, outfilename, input_metadata_yamls, output_metadata_yaml):
     """RXN_concord contains relationshps between rxcuis that can be used to conflate
     Now we don't want all of them.
We want the ones that are between drugs and chemicals, and the ones between drugs and drugs. @@ -251,6 +297,9 @@ def build_conflation(manual_concord_filename, rxn_concord, umls_concord, pubchem print("Loading manual concords ...") manual_concords = [] + manual_concords_curies = set() + manual_concords_predicate_counts = defaultdict(int) + manual_concords_curie_prefix_counts = defaultdict(int) with open(manual_concord_filename,"r") as manualf: csv_reader = csv.DictReader(manualf, dialect=csv.excel_tab) for row in csv_reader: @@ -260,6 +309,13 @@ def build_conflation(manual_concord_filename, rxn_concord, umls_concord, pubchem if row['subject'].strip() == '' or row['object'].strip() == '': raise RuntimeError(f"Empty subject or object fields in {manual_concord_filename}: {row}") manual_concords.append((row['subject'], row['object'])) + manual_concords_predicate_counts[row['predicate']] += 1 + manual_concords_curies.add(row['subject']) + manual_concords_curies.add(row['object']) + + sorted_curies = sorted([row['subject'], row['object']]) + prefix_count_label = row['predicate'] + '(' + (' ,'.join(sorted_curies)) + ')' + manual_concords_curie_prefix_counts[prefix_count_label] += 1 print(f"{len(manual_concords)} manual concords loaded.") print("load all chemical conflations so we can normalize identifiers") @@ -556,6 +612,27 @@ def build_conflation(manual_concord_filename, rxn_concord, umls_concord, pubchem outfile.write(f"{json.dumps(final_conflation_id_list)}\n") written.add(fs) + # Write out metadata.yaml + write_combined_metadata( + output_metadata_yaml, + typ='conflation', + name='drugchemical.build_conflation()', + description='Build DrugChemical conflation.', + combined_from_filenames=input_metadata_yamls, + also_combined_from={ + 'Manual': { + 'name': 'DrugChemical Manual', + 'filename': manual_concord_filename, + 'counts': { + 'count_concords': len(manual_concords), + 'count_distinct_curies': len(manual_concords_curies), + 'predicates': dict(manual_concords_predicate_counts), + 'prefix_counts': dict(manual_concords_curie_prefix_counts), + } + } + } + ) + def sort_by_curie_suffix(curie): """ diff --git a/src/createcompendia/gene.py b/src/createcompendia/gene.py index 1c89da02..9a574f2e 100644 --- a/src/createcompendia/gene.py +++ b/src/createcompendia/gene.py @@ -1,6 +1,7 @@ import re from src import babel_utils +from src.metadata.provenance import write_concord_metadata from src.prefixes import OMIM,ENSEMBL,NCBIGENE,WORMBASE, MGI, ZFIN, DICTYBASE, FLYBASE, RGD, SGD, HGNC, UMLS from src.categories import GENE @@ -23,8 +24,18 @@ def write_mods_ids(dd,id,modlist): x = line.split('\t')[0] outf.write(f'{x}\n') -def build_gene_ensembl_relationships(ensembl_dir, outfile): +def build_gene_ensembl_relationships(ensembl_dir, outfile, metadata_yaml): """Loop over all the ensembl species. Find any protein-coding gene""" + # Identifiers to extract. 
+ column_to_prefix = { 'NCBI gene (formerly Entrezgene) ID': {NCBIGENE}, + 'ZFIN ID': {ZFIN}, + 'SGD gene name ID': {SGD}, + 'WormBase Gene ID': {WORMBASE}, + 'FlyBase ID': {FLYBASE}, + 'MGI ID': {MGI}, + 'RGD ID': {RGD} + } + with open(outfile,'w') as outf: #find all the ensembl directories dirlisting = os.listdir(ensembl_dir) @@ -39,14 +50,6 @@ def build_gene_ensembl_relationships(ensembl_dir, outfile): h = inf.readline() x = h[:-1].split('\t') gene_column = x.index('Gene stable ID') - column_to_prefix = { 'NCBI gene (formerly Entrezgene) ID': {NCBIGENE}, - 'ZFIN ID': {ZFIN}, - 'SGD gene name ID': {SGD}, - 'WormBase Gene ID': {WORMBASE}, - 'FlyBase ID': {FLYBASE}, - 'MGI ID': {MGI}, - 'RGD ID': {RGD} - } protein_column = x.index('Protein stable ID') columnno_to_prefix = {} for i,v in enumerate(x): @@ -74,6 +77,17 @@ def build_gene_ensembl_relationships(ensembl_dir, outfile): ensembl_id_without_version = res.group(1) outf.write(f'{ENSEMBL}:{ensembl_id_without_version}\teq\t{gid}\n') + write_concord_metadata( + metadata_yaml, + name='build_gene_ensembl_relationships()', + description=f'Extracts gene-ensembl relationships from the ensembl files ({ensembl_dir}) for prefixes: {column_to_prefix.values()}', + sources=[{ + 'name': 'ENSEMBL', + 'filename': ensembl_dir, + }], + concord_filename=outfile, + ) + def write_zfin_ids(infile,outfile): with open(infile,'r') as inf, open(outfile,'w') as outf: for line in inf: @@ -155,7 +169,7 @@ def read_ncbi_idfile(ncbi_idfile): ncbi_ids.add(x) return ncbi_ids -def build_gene_ncbi_ensembl_relationships(infile,ncbi_idfile,outfile): +def build_gene_ncbi_ensembl_relationships(infile,ncbi_idfile,outfile, metadata_yaml): ncbi_ids = read_ncbi_idfile(ncbi_idfile) with gzip.open(infile,'r') as inf, open(outfile,'w') as outf: h = inf.readline() @@ -181,7 +195,20 @@ def build_gene_ncbi_ensembl_relationships(infile,ncbi_idfile,outfile): ensembl_id_without_version = res.group(1) outf.write(f'{ncbigene_id}\teq\t{ENSEMBL}:{ensembl_id_without_version}\n') -def build_gene_ncbigene_xrefs(infile,ncbi_idfile,outfile): + write_concord_metadata( + metadata_yaml, + name='build_gene_ncbi_ensembl_relationships()', + description=f'Extracts gene-ensembl relationships from the NCBIGene gene2ensembl.gz file ({infile}), filtering to ' + + f'NCBIGene IDs in {ncbi_idfile}', + sources=[{ + 'type': 'NCBIGENE', + 'name': 'NCBIGene gene2ensembl.gz', + 'filename': infile, + }], + concord_filename=outfile, + ) + +def build_gene_ncbigene_xrefs(infile,ncbi_idfile,outfile, metadata_yaml): mappings = {'WormBase': WORMBASE, 'FLYBASE': FLYBASE, 'ZFIN': ZFIN, 'HGNC': HGNC, 'MGI': MGI, 'RGD': RGD, 'dictyBase': DICTYBASE, 'SGD': SGD } @@ -202,7 +229,20 @@ def build_gene_ncbigene_xrefs(infile,ncbi_idfile,outfile): if found_prefix in mappings: outf.write(f'{ncbigene_id}\txref\t{mappings[found_prefix]}:{xref_parts[-1]}\n') -def build_gene_medgen_relationships(infile,outfile): + write_concord_metadata( + metadata_yaml, + name='build_gene_ncbigene_xrefs()', + description=f'Extracts gene-xref relationships from the NCBIGene gene_info.gz file ({infile}), filtering to ' + + f'NCBIGene IDs in {ncbi_idfile} and extracting mappings for prefixes {mappings.values()}', + sources=[{ + 'type': 'NCBIGENE', + 'name': 'NCBIGene gene_info.gz', + 'filename': infile, + }], + concord_filename=outfile, + ) + +def build_gene_medgen_relationships(infile,outfile, metadata_yaml): with open(infile, 'r') as inf, open(outfile, 'w') as outf: h = inf.readline() for line in inf: @@ -217,7 +257,18 @@ def 
build_gene_medgen_relationships(infile,outfile): umls_id = f'{UMLS}:{x[4]}' outf.write(f'{ncbigene_id}\teq\t{umls_id}\n') -def write_ensembl_ids(ensembl_dir, outfile): + write_concord_metadata( + metadata_yaml, + name='build_gene_medgen_relationships()', + description=f'Extracts gene-OMIM relationships from the mim2gene_medgen file ({infile})', + sources=[{ + 'name': 'MIM2Gene MEDGEN', + 'filename': infile, + }], + concord_filename=outfile, + ) + +def write_ensembl_gene_ids(ensembl_dir, outfile): """Loop over all the ensembl species. Find any protein-coding gene""" with open(outfile,'w') as outf: #find all the ensembl directories @@ -247,11 +298,11 @@ def write_ensembl_ids(ensembl_dir, outfile): outf.write(f'{gid}\n') -def build_gene_umls_hgnc_relationships(mrconso, umls_idfile, outfile): +def build_gene_umls_hgnc_relationships(mrconso, umls_idfile, outfile, metadata_yaml): #Could also add MESH, if that were a valid gene prefix - umls.build_sets(mrconso, umls_idfile, outfile, {'HGNC':HGNC}) + umls.build_sets(mrconso, umls_idfile, outfile, {'HGNC':HGNC}, provenance_metadata_yaml=metadata_yaml) -def build_gene_compendia(concordances, identifiers, icrdf_filename): +def build_gene_compendia(concordances, metadata_yamls, identifiers, icrdf_filename): """:concordances: a list of files from which to read relationships :identifiers: a list of files from which to read identifiers and optional categories""" dicts = {} @@ -273,5 +324,5 @@ def build_gene_compendia(concordances, identifiers, icrdf_filename): glom(dicts, pairs, unique_prefixes=uniques) gene_sets = set([frozenset(x) for x in dicts.values()]) baretype = GENE.split(':')[-1] - write_compendium(gene_sets, f'{baretype}.txt', GENE, {}, icrdf_filename=icrdf_filename) + write_compendium(metadata_yamls, gene_sets, f'{baretype}.txt', GENE, {}, icrdf_filename=icrdf_filename) diff --git a/src/createcompendia/genefamily.py b/src/createcompendia/genefamily.py index fb26f9f4..b2163177 100644 --- a/src/createcompendia/genefamily.py +++ b/src/createcompendia/genefamily.py @@ -2,7 +2,7 @@ from src.babel_utils import read_identifier_file,glom,write_compendium -def build_compendia(identifiers, icrdf_filename): +def build_compendia(identifiers, metadata_yamls, icrdf_filename): """:concordances: a list of files from which to read relationships :identifiers: a list of files from which to read identifiers and optional categories""" dicts = {} @@ -15,5 +15,5 @@ def build_compendia(identifiers, icrdf_filename): types.update(new_types) genefam_sets = set([frozenset(x) for x in dicts.values()]) baretype = GENE_FAMILY.split(':')[-1] - write_compendium(genefam_sets, f'{baretype}.txt', GENE_FAMILY, {}, icrdf_filename=icrdf_filename) + write_compendium(metadata_yamls, genefam_sets, f'{baretype}.txt', GENE_FAMILY, {}, icrdf_filename=icrdf_filename) diff --git a/src/createcompendia/geneprotein.py b/src/createcompendia/geneprotein.py index 0655b86f..c73d2bb5 100644 --- a/src/createcompendia/geneprotein.py +++ b/src/createcompendia/geneprotein.py @@ -1,3 +1,4 @@ +from src.metadata.provenance import write_concord_metadata from src.prefixes import UNIPROTKB, NCBIGENE from src.babel_utils import glom from collections import defaultdict @@ -8,7 +9,7 @@ from src.util import LoggingUtil logger = LoggingUtil.init_logging(__name__, level=logging.ERROR) -def build_uniprotkb_ncbigene_relationships(infile,outfile): +def build_uniprotkb_ncbigene_relationships(infile,outfile, metadata_yaml): #The trick is that the uniprot mapping file can have more than one gene per protein. 
# Our model is 1 gene, many proteins, so this causes trouble. # For the moment, we will not include that have more than one gene per protein @@ -26,6 +27,18 @@ def build_uniprotkb_ncbigene_relationships(infile,outfile): ncbigene_id = ncbigene_ids[0] outf.write(f'{uniprot_id}\trelated_to\t{ncbigene_id}\n') + write_concord_metadata( + metadata_yaml, + name='build_uniprotkb_ncbigene_relationships()', + description='Extract NCBIGene-UniProtKB relationships from UniProtKB id-mapping file {infile}', + sources=[{ + 'type': 'UniProtKB', + 'name': 'UniProtKB idmapping file', + 'filename': infile, + }], + concord_filename=outfile, + ) + def merge(geneproteinlist): """We have a gene and one or more proteins. We want to create a combined something.""" diff --git a/src/createcompendia/macromolecular_complex.py b/src/createcompendia/macromolecular_complex.py index ceb482b2..7ca4dd29 100644 --- a/src/createcompendia/macromolecular_complex.py +++ b/src/createcompendia/macromolecular_complex.py @@ -4,7 +4,7 @@ import src.datahandlers.complexportal as complexportal from src.babel_utils import read_identifier_file, glom, write_compendium -def build_compendia(identifiers, icrdf_filename): +def build_compendia(identifiers, metadata_yamls, icrdf_filename): """:concordances: a list of files from which to read relationships :identifiers: a list of files from which to read identifiers and optional categories""" dicts = {} @@ -16,4 +16,4 @@ def build_compendia(identifiers, icrdf_filename): types.update(new_types) sets = set([frozenset(x) for x in dicts.values()]) type = MACROMOLECULAR_COMPLEX.split(':')[-1] - write_compendium(sets, f'{type}.txt', MACROMOLECULAR_COMPLEX, {}, extra_prefixes=[COMPLEXPORTAL], icrdf_filename=icrdf_filename) + write_compendium(metadata_yamls, sets, f'{type}.txt', MACROMOLECULAR_COMPLEX, {}, extra_prefixes=[COMPLEXPORTAL], icrdf_filename=icrdf_filename) diff --git a/src/createcompendia/processactivitypathway.py b/src/createcompendia/processactivitypathway.py index a303efd8..1d4b71ee 100644 --- a/src/createcompendia/processactivitypathway.py +++ b/src/createcompendia/processactivitypathway.py @@ -6,6 +6,7 @@ import src.datahandlers.rhea as rhea import src.datahandlers.ec as ec import src.datahandlers.umls as umls +from src.metadata.provenance import write_concord_metadata from src.prefixes import GO, REACT, WIKIPATHWAYS, RHEA, SMPDB, EC, PANTHERPATHWAY, TCDB from src.categories import BIOLOGICAL_PROCESS, MOLECULAR_ACTIVITY, PATHWAY @@ -42,10 +43,10 @@ def write_umls_ids(mrsty, outfile): } umls.write_umls_ids(mrsty, umlsmap, outfile) -def build_process_umls_relationships(mrconso, idfile,outfile): - umls.build_sets(mrconso, idfile, outfile, {'GO': GO}) +def build_process_umls_relationships(mrconso, idfile,outfile, metadata_yaml): + umls.build_sets(mrconso, idfile, outfile, {'GO': GO}, provenance_metadata_yaml=metadata_yaml) -def build_process_obo_relationships(outdir): +def build_process_obo_relationships(outdir, metadata_yaml): #Create the equivalence pairs #op={'MSH':MESH,'SNOMEDCT_US':SNOMEDCT,'SNOMED_CT': SNOMEDCT, 'ORPHANET':ORPHANET, 'ICD-9':ICD9, 'ICD-10':ICD10, 'ICD-0':ICD0, 'ICD-O':ICD0 } op={'WIKIPEDIA': WIKIPATHWAYS, 'REACTOME':REACT, 'TC':TCDB } @@ -54,11 +55,23 @@ def build_process_obo_relationships(outdir): build_sets(f'{GO}:0008150', {GO:outfile}, set_type='xref', other_prefixes=op ) build_sets(f'{GO}:0003674', {GO:outfile}, set_type='xref', other_prefixes=op ) -def build_process_rhea_relationships(outfile): - rhea.make_concord(outfile) + write_concord_metadata( + 
metadata_yaml, + name='build_process_obo_relationships()', + description=f"Extract GO-GO relationships from UberGraph with get_subclasses_and_xrefs() from {GO}:0007165, {GO}:0008150 and {GO}:0003674," + f"with other_prefixes {op.values()}", + sources=[{ + 'type': 'UberGraph', + 'name': 'GO-GO relationships from UberGraph', + }], + concord_filename=f'{outdir}/{GO}' + ) +def build_process_rhea_relationships(outfile, metadata_yaml): + rhea.make_concord(outfile, metadata_yaml) -def build_compendia(concordances, identifiers, icrdf_filename): + +def build_compendia(concordances, metadata_yamls, identifiers, icrdf_filename): """:concordances: a list of files from which to read relationships :identifiers: a list of files from which to read identifiers and optional categories""" #These are concords that cause problems and are being special cased out. In disease/process we put these in some @@ -105,7 +118,7 @@ def build_compendia(concordances, identifiers, icrdf_filename): typed_sets = create_typed_sets(set([frozenset(x) for x in dicts.values()]),types) for biotype,sets in typed_sets.items(): baretype = biotype.split(':')[-1] - write_compendium(sets,f'{baretype}.txt',biotype,{}, icrdf_filename=icrdf_filename) + write_compendium(metadata_yamls, sets,f'{baretype}.txt',biotype,{}, icrdf_filename=icrdf_filename) def create_typed_sets(eqsets,types): """Given a set of sets of equivalent identifiers, we want to type each one into diff --git a/src/createcompendia/protein.py b/src/createcompendia/protein.py index 20bf9287..1fd22d1b 100644 --- a/src/createcompendia/protein.py +++ b/src/createcompendia/protein.py @@ -1,6 +1,7 @@ import re -from src.prefixes import ENSEMBL, UMLS, PR, UNIPROTKB, NCIT, NCBITAXON +from src.metadata.provenance import write_concord_metadata +from src.prefixes import ENSEMBL, UMLS, PR, UNIPROTKB, NCIT, NCBITAXON, MESH, DRUGBANK from src.categories import PROTEIN import src.datahandlers.umls as umls @@ -10,12 +11,9 @@ from src.babel_utils import read_identifier_file,glom,write_compendium,Text import os -import json -import gzip +from src.util import get_memory_usage_summary, get_logger -import logging -from src.util import LoggingUtil -logger = LoggingUtil.init_logging(__name__, level=logging.WARNING) +logger = get_logger(__name__) def extract_taxon_ids_from_uniprotkb(idmapping_filename, uniprotkb_taxa_filename): @@ -30,35 +28,6 @@ def extract_taxon_ids_from_uniprotkb(idmapping_filename, uniprotkb_taxa_filename outf.write(f'{UNIPROTKB}:{x[0]}\t{NCBITAXON}:{x[2]}\n') -def write_ensembl_ids(ensembl_dir, outfile): - """Loop over all the ensembl species. Find any protein-coding gene""" - with open(outfile,'w') as outf: - #find all the ensembl directories - dirlisting = os.listdir(ensembl_dir) - for dl in dirlisting: - dlpath = os.path.join(ensembl_dir,dl) - if os.path.isdir(dlpath): - infname = os.path.join(dlpath,'BioMart.tsv') - if os.path.exists(infname): - #open each ensembl file, find the id column, and put it in the output - with open(infname,'r') as inf: - wrote=set() - h = inf.readline() - x = h[:-1].split('\t') - gene_column = x.index('Gene stable ID') - protein_column = x.index('Protein stable ID') - for line in inf: - x = line[:-1].split('\t') - #Is it protein coding? 
- if x[protein_column] == '': - continue - gid = f'{ENSEMBL}:{x[gene_column]}' - #The gid is not unique, so don't write the same one over again - if gid in wrote: - continue - wrote.add(gid) - outf.write(f'{gid}\n') - def write_umls_ids(mrsty, outfile): umlsmap = {} umlsmap['A1.4.1.2.1.7'] = PROTEIN @@ -69,7 +38,7 @@ def write_pr_ids(outfile): obo.write_obo_ids([(protein_id, PROTEIN)], outfile, [PROTEIN]) -def write_ensembl_ids(ensembl_dir, outfile): +def write_ensembl_protein_ids(ensembl_dir, outfile): """Loop over all the ensembl species. Find any protein-coding gene""" with open(outfile, 'w') as outf: # find all the ensembl directories @@ -97,7 +66,7 @@ def write_ensembl_ids(ensembl_dir, outfile): wrote.add(pid) outf.write(f'{pid}\n') -def build_pr_uniprot_relationships(outfile, ignore_list = []): +def build_pr_uniprot_relationships(outfile, ignore_list = [], metadata_yaml = None): """Given an IRI create a list of sets. Each set is a set of equivalent LabeledIDs, and there is a set for each subclass of the input iri. Write these lists to concord files, indexed by the prefix""" iri = 'PR:000000001' @@ -110,7 +79,19 @@ def build_pr_uniprot_relationships(outfile, ignore_list = []): if k.startswith('PR'): concfile.write(f'{k}\txref\t{x}\n') -def build_protein_uniprotkb_ensemble_relationships(infile,outfile): + if metadata_yaml: + write_concord_metadata( + metadata_yaml, + name='build_pr_uniprot_relationships()', + description=f"Extracts {PR} xrefs from UberGraph after getting subclasses and xrefs of {iri}, ignoring {ignore_list}.", + sources=[{ + 'type': 'UberGraph', + 'name': 'UberGraph', + }], + concord_filename=outfile, + ) + +def build_protein_uniprotkb_ensemble_relationships(infile,outfile, metadata_yaml): with open(infile,'r') as inf, open(outfile,'w') as outf: for line in inf: x = line.strip().split() @@ -128,8 +109,19 @@ def build_protein_uniprotkb_ensemble_relationships(infile,outfile): ensembl_id_without_version = res.group(1) outf.write(f'{ensembl_id}\teq\t{ENSEMBL}:{ensembl_id_without_version}\n') + write_concord_metadata( + metadata_yaml, + name='build_protein_uniprotkb_ensemble_relationships()', + description=f'Extracts {UNIPROTKB}-to-{ENSEMBL} relationships from the ENSEMBL id-mapping file ({infile}) file.', + sources=[{ + 'name': 'ENSEMBL', + 'filename': infile, + }], + concord_filename=outfile, + ) + -def build_ncit_uniprot_relationships(infile,outfile): +def build_ncit_uniprot_relationships(infile,outfile, metadata_yaml): with open(infile,'r') as inf, open(outfile,'w') as outf: for line in inf: # These lines are sometimes empty (I think because the @@ -144,36 +136,59 @@ def build_ncit_uniprot_relationships(infile,outfile): uniprot_id = f'{UNIPROTKB}:{x[1]}' outf.write(f'{ncit_id}\teq\t{uniprot_id}\n') -def build_umls_ncit_relationships(mrconso, idfile, outfile): - umls.build_sets(mrconso, idfile, outfile, {'NCI': NCIT}) - -def build_protein_compendia(concordances, identifiers, icrdf_filename): + write_concord_metadata( + metadata_yaml, + name='build_ncit_uniprot_relationships()', + description=f'Extracts {NCIT}-to-{UNIPROTKB} relationships from the NCIt-SwissProt_Mapping file ({infile}).', + sources=[{ + 'name': 'NCIt-SwissProt Mapping file', + 'filename': infile, + }], + concord_filename=outfile, + ) + +def build_umls_ncit_relationships(mrconso, idfile, outfile, metadata_yaml): + umls.build_sets(mrconso, idfile, outfile, {'NCI': NCIT}, provenance_metadata_yaml=metadata_yaml) + +def build_umls_relationships(mrconso, idfile, outfile, metadata_yaml): + # The corresponding 
code in chemicals also includes (1) {'RXNORM': RXCUI}, and (2) we also pull in RxNorm to + # provide the inverse concords (i.e. RxNorm -> MESH and DRUGBANK). Doing so will probably fix some RXCUI IDs, + # but assigning RXCUI to proteins seems like a bridge too far for me. + # + # TODO: we should probably add some kind of filtering so we don't include concords that point to chemicals rather + # than proteins, which could result in duplicates (if the same ID is picked up in both chemicals and proteins). + umls.build_sets(mrconso, idfile, outfile, {'MSH': MESH, 'DRUGBANK': DRUGBANK}, provenance_metadata_yaml=metadata_yaml) + +def build_protein_compendia(concordances, metadata_yamls, identifiers, icrdf_filename): """:concordances: a list of files from which to read relationships :identifiers: a list of files from which to read identifiers and optional categories""" dicts = {} types = {} uniques = [UNIPROTKB,PR] + logger.info(f"Started building protein compendia ({concordances}, {metadata_yamls}, {identifiers}, {icrdf_filename}) with uniques {uniques}") for ifile in identifiers: - print(ifile) + logger.info(f"Loading identifier file {ifile}") new_identifiers, new_types = read_identifier_file(ifile) glom(dicts, new_identifiers, unique_prefixes= uniques) types.update(new_types) + logger.info(f"Loaded identifier file {ifile}: {get_memory_usage_summary()}") + logger.info(f"Finished loading identifiers, memory usage: {get_memory_usage_summary()}") for infile in concordances: - print(infile) - print('loading', infile) + logger.info(f"Loading concordance file {infile}") pairs = [] with open(infile, 'r') as inf: for line_index, line in enumerate(inf): - # if line_index % 10000 == 0: - # print("Loaded line count", line_index) + if line_index % 1000000 == 0: + logger.info(f"Loading concordance file {infile}: line {line_index:,}") x = line.strip().split('\t') pairs.append(set([x[0], x[2]])) # print("glomming", infile) # This takes a while, but doesn't add much to the memory glom(dicts, pairs, unique_prefixes=uniques) - print("glommed", infile) - # print("merging dicts") # This seems to increase memory usage slightly. + logger.info(f"Loaded concordance file {infile}: {get_memory_usage_summary()}") + logger.info(f"Finished loading concordances, memory usage: {get_memory_usage_summary()}") + logger.info(f"Building gene sets") gene_sets = set([frozenset(x) for x in dicts.values()]) - print("merged dicts", infile) + logger.info(f"Gene sets built, memory usage: {get_memory_usage_summary()}") #Try to preserve some memory here. dicts.clear() @@ -182,5 +197,6 @@ def build_protein_compendia(concordances, identifiers, icrdf_filename): # only then generate the compendium from those input files. 
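The memory logging added in build_protein_compendia() above calls get_memory_usage_summary() from src.util, whose implementation is not part of this diff. A minimal sketch of what such a helper could look like, assuming it is built on the psutil and humanfriendly packages pulled into the requirements (names and output format here are illustrative, not the project's actual code):

import psutil
from humanfriendly import format_size

def get_memory_usage_summary():
    # Hypothetical sketch: report this process's resident set size plus overall system usage.
    rss = psutil.Process().memory_info().rss
    vm = psutil.virtual_memory()
    return (f"RSS {format_size(rss)}; "
            f"system memory {format_size(vm.used)} used of {format_size(vm.total)} ({vm.percent}%)")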
baretype = PROTEIN.split(':')[-1] - write_compendium(gene_sets, f'{baretype}.txt', PROTEIN, {}, icrdf_filename=icrdf_filename) - + logger.info(f"Writing compendium for {baretype}, memory usage: {get_memory_usage_summary()}") + write_compendium(metadata_yamls, gene_sets, f'{baretype}.txt', PROTEIN, {}, extra_prefixes=[DRUGBANK], icrdf_filename=icrdf_filename) + logger.info(f"Wrote compendium for {baretype}, memory usage: {get_memory_usage_summary()}") diff --git a/src/createcompendia/publications.py b/src/createcompendia/publications.py index 41c8eafd..01d95c44 100644 --- a/src/createcompendia/publications.py +++ b/src/createcompendia/publications.py @@ -11,6 +11,7 @@ from src.babel_utils import pull_via_wget, WgetRecursionOptions, glom, read_identifier_file, write_compendium from src.categories import JOURNAL_ARTICLE, PUBLICATION +from src.metadata.provenance import write_concord_metadata from src.prefixes import PMID, DOI, PMC @@ -50,7 +51,8 @@ def download_pubmed(download_file, timestamping=True) # Step 3. Download the PMC/PMID mapping file from PMC. - pull_via_wget(pmc_base, 'PMC-ids.csv.gz', decompress=True, subpath='PubMed') + # We don't actually use this file -- we currently only use the PMC IDs already included in the PubMed XML files. + # pull_via_wget(pmc_base, 'PMC-ids.csv.gz', decompress=True, subpath='PubMed') # We're all done! Path.touch(download_file) @@ -136,7 +138,7 @@ def verify_pubmed_downloads(pubmed_directories, def parse_pubmed_into_tsvs(baseline_dir, updatefiles_dir, titles_file, status_file, pmid_id_file, - pmid_doi_concord_file): + pmid_doi_concord_file, metadata_yaml): """ Read through the PubMed files in the baseline_dir and updatefiles_dir, and writes out label and status information. @@ -145,6 +147,7 @@ def parse_pubmed_into_tsvs(baseline_dir, updatefiles_dir, titles_file, status_fi :param titles_file: An output TSV file in the format `\t`. :param status_file: A JSON file containing publication status information. :param pmid_doi_concord_file: A concord file in the format `<PMID>\teq\t<DOI>` and other identifiers. + :param metadata_yaml: The metadata YAML file to write. """ # We can write labels and concords as we go. @@ -245,8 +248,26 @@ def parse_pubmed_into_tsvs(baseline_dir, updatefiles_dir, titles_file, status_fi for pmid, statuses in pmid_status.items(): statusf.write(json.dumps({'id': pmid, 'statuses': sorted(statuses)}, sort_keys=True) + '\n') - -def generate_compendium(concordances, identifiers, titles, publication_compendium, icrdf_filename): + write_concord_metadata( + metadata_yaml, + name='parse_pubmed_into_tsvs()', + description="Parse PubMed files into TSVs and JSONL status files.", + sources=[{ + 'type': 'download', + 'name': 'PubMed Baseline', + 'url': 'ftp://ftp.ncbi.nlm.nih.gov/pubmed/baseline/' + }, { + 'type': 'download', + 'name': 'PubMed Updates', + 'url': 'ftp://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/' + }], + counts={ + 'pmid_count': len(pmid_status.keys()), + }, + concord_filename=pmid_doi_concord_file, + ) + +def generate_compendium(concordances, metadata_yamls, identifiers, titles, publication_compendium, icrdf_filename): """ Generate a Publication compendium using the ID and Concord files provided. @@ -295,5 +316,5 @@ def generate_compendium(concordances, identifiers, titles, publication_compendiu # Write out the compendium. 
publication_sets = set([frozenset(x) for x in dicts.values()]) baretype = PUBLICATION.split(':')[-1] - write_compendium(publication_sets, os.path.basename(publication_compendium), PUBLICATION, labels, + write_compendium(metadata_yamls, publication_sets, os.path.basename(publication_compendium), PUBLICATION, labels, icrdf_filename=icrdf_filename) diff --git a/src/createcompendia/taxon.py b/src/createcompendia/taxon.py index f9a71661..bfa40099 100644 --- a/src/createcompendia/taxon.py +++ b/src/createcompendia/taxon.py @@ -1,3 +1,4 @@ +from src.metadata.provenance import write_concord_metadata from src.prefixes import NCBITAXON,MESH,UMLS from src.categories import ORGANISM_TAXON @@ -61,10 +62,10 @@ def write_umls_ids(mrsty, outfile): ]} umls.write_umls_ids(mrsty, umlsmap,outfile) -def build_taxon_umls_relationships(mrconso, idfile, outfile): - umls.build_sets(mrconso, idfile, outfile, {'MSH': MESH, 'NCBITaxon': NCBITAXON}) +def build_taxon_umls_relationships(mrconso, idfile, outfile, metadata_yaml): + umls.build_sets(mrconso, idfile, outfile, {'MSH': MESH, 'NCBITaxon': NCBITAXON}, provenance_metadata_yaml=metadata_yaml) -def build_relationships(outfile,mesh_ids): +def build_relationships(outfile,mesh_ids, metadata_yaml): regis = mesh.pull_mesh_registry() with open(mesh_ids,'r') as inf: lines = inf.read().strip().split('\n') @@ -80,9 +81,20 @@ def build_relationships(outfile,mesh_ids): #left = list(all_mesh_taxa.difference( set([x[0] for x in regis]) )) #eutil.lookup(left) + write_concord_metadata( + metadata_yaml, + name='build_relationships()', + description='Builds relationships between MeSH and NCBI Taxon from the MeSH registry.', + sources=[{ + 'type': 'MeSH', + 'name': 'MeSH Registry', + 'url': 'ftp://ftp.nlm.nih.gov/online/mesh/rdf/mesh.nt.gz', + }], + concord_filename=outfile, + ) -def build_compendia(concordances, identifiers, icrdf_filename): +def build_compendia(concordances, metadata_yamls, identifiers, icrdf_filename): """:concordances: a list of files from which to read relationships :identifiers: a list of files from which to read identifiers and optional categories""" dicts = {} @@ -106,5 +118,5 @@ def build_compendia(concordances, identifiers, icrdf_filename): baretype = ORGANISM_TAXON.split(':')[-1] # We need to use extra_prefixes since UMLS is not listed as an identifier prefix at # https://biolink.github.io/biolink-model/docs/OrganismTaxon.html - write_compendium(gene_sets, f'{baretype}.txt', ORGANISM_TAXON, {}, icrdf_filename=icrdf_filename) + write_compendium(metadata_yamls, gene_sets, f'{baretype}.txt', ORGANISM_TAXON, {}, icrdf_filename=icrdf_filename) diff --git a/src/datahandlers/clo.py b/src/datahandlers/clo.py index 018f8d44..625c02e6 100644 --- a/src/datahandlers/clo.py +++ b/src/datahandlers/clo.py @@ -1,6 +1,7 @@ import logging import re +from src.metadata.provenance import write_download_metadata from src.prefixes import CLO from src.categories import CELL_LINE from src.babel_utils import pull_via_urllib @@ -10,8 +11,12 @@ logger = LoggingUtil.init_logging(__name__, level=logging.WARNING) -def pull_clo(): +def pull_clo(metadata_file): _=pull_via_urllib('http://purl.obolibrary.org/obo/','clo.owl', subpath='CLO', decompress=False) + write_download_metadata(metadata_file, + name='Cell Line Ontology', + url='http://purl.obolibrary.org/obo/clo.owl', + ) class CLOgraph: """Load the file for querying""" diff --git a/src/datahandlers/complexportal.py b/src/datahandlers/complexportal.py index 30dc3c7c..f7b679a9 100644 --- a/src/datahandlers/complexportal.py +++ 
b/src/datahandlers/complexportal.py @@ -1,10 +1,11 @@ from src.babel_utils import pull_via_urllib, make_local_name +from src.metadata.provenance import write_metadata from src.prefixes import COMPLEXPORTAL def pull_complexportal(): pull_via_urllib('http://ftp.ebi.ac.uk/pub/databases/intact/complex/current/complextab/',f'559292.tsv', decompress=False, subpath=COMPLEXPORTAL) -def make_labels_and_synonyms(infile, labelfile, synfile): +def make_labels_and_synonyms(infile, labelfile, synfile, metadata_yaml): usedsyns = set() with open(infile, 'r') as inf, open(labelfile, 'w') as outl, open(synfile, 'w') as outsyn: next(inf) # skip header @@ -20,3 +21,15 @@ def make_labels_and_synonyms(infile, labelfile, synfile): if not syn in usedsyns: outsyn.write(f'{COMPLEXPORTAL}:{id}\t{syn}\n') usedsyns.add(syn) + + write_metadata( + metadata_yaml, + typ='transform', + name='ComplexPortal', + description='Labels and synonyms extracted from ComplexPortal download of 559292 (Saccharomyces cerevisiae)', + sources=[{ + 'type': 'download', + 'name': 'ComplexPortal for organism 559292 (Saccharomyces cerevisiae)', + 'url': 'http://ftp.ebi.ac.uk/pub/databases/intact/complex/current/complextab/559292.tsv' + }] + ) diff --git a/src/datahandlers/efo.py b/src/datahandlers/efo.py index 03fd59f1..f3270e7d 100644 --- a/src/datahandlers/efo.py +++ b/src/datahandlers/efo.py @@ -1,6 +1,7 @@ import logging import re +from src.metadata.provenance import write_concord_metadata from src.prefixes import EFO,ORPHANET from src.babel_utils import pull_via_urllib from src.babel_utils import make_local_name @@ -165,7 +166,7 @@ def make_ids(roots,idfname): m = EFOgraph() m.pull_EFO_ids(roots,idfname) -def make_concords(idfilename, outfilename): +def make_concords(idfilename, outfilename, provenance_metadata=None): """Given a list of identifiers, find out all of the equivalent identifiers from the owl""" m = EFOgraph() with open(idfilename,"r") as inf, open(outfilename,"w") as concfile: @@ -174,3 +175,15 @@ def make_concords(idfilename, outfilename): nexacts = m.get_exacts(efo_id,concfile) if nexacts == 0: m.get_xrefs(efo_id,concfile) + + if provenance_metadata is not None: + write_concord_metadata( + provenance_metadata, + name='Experimental Factor Ontology (EFO) cross-references', + description=f'Cross-references from the Experimental Factor Ontology (EFO) for the EFO IDs in {idfilename}', + sources=[{ + 'name': 'Experimental Factor Ontology', + 'url': 'http://www.ebi.ac.uk/efo/efo.owl', + }], + concord_filename=outfilename, + ) diff --git a/src/datahandlers/hgncfamily.py b/src/datahandlers/hgncfamily.py index cc6f8c13..161a9d93 100644 --- a/src/datahandlers/hgncfamily.py +++ b/src/datahandlers/hgncfamily.py @@ -1,6 +1,7 @@ from pronto.utils.io import decompress from src.babel_utils import make_local_name, pull_via_ftp, pull_via_urllib +from src.metadata.provenance import write_metadata from src.prefixes import HGNCFAMILY def pull_hgncfamily(): @@ -10,7 +11,7 @@ def pull_hgncfamily(): decompress=False, subpath=HGNCFAMILY) -def pull_labels(infile,outfile): +def pull_labels(infile,outfile, metadata_yaml): with open(infile,'r') as inf: data = inf.read().strip() lines = data.split('\n') @@ -24,3 +25,15 @@ def pull_labels(infile,outfile): l = parts[2][1:-1] outf.write(f'{i}\t{l}\n') + write_metadata( + metadata_yaml, + typ='transform', + name='HGNC Gene Family labels', + description='Labels extracted from HGNC GeneFamily CSV download', + sources=[{ + 'type': 'download', + 'name': 'HGNC Gene Family', + 'url': 
'https://storage.googleapis.com/public-download-files/hgnc/csv/csv/genefamily_db_tables/family.csv', + 'description': 'HGNC GeneFamily CSV download' + }] + ) diff --git a/src/datahandlers/pantherfamily.py b/src/datahandlers/pantherfamily.py index f4a0c596..0947b403 100644 --- a/src/datahandlers/pantherfamily.py +++ b/src/datahandlers/pantherfamily.py @@ -1,4 +1,5 @@ from src.babel_utils import make_local_name, pull_via_ftp +from src.metadata.provenance import write_metadata from src.prefixes import PANTHERFAMILY def pull_pantherfamily(): @@ -7,7 +8,7 @@ def pull_pantherfamily(): # If you need to check this quickly, it's also available on HTTP at: # - http://data.pantherdb.org/ftp/sequence_classifications/current_release/PANTHER_Sequence_Classification_files/ -def pull_labels(infile,outfile): +def pull_labels(infile,outfile, metadata_yaml): with open(infile,'r') as inf: data = inf.read() lines = data.strip().split('\n') @@ -38,3 +39,15 @@ def pull_labels(infile,outfile): #labels[sub_family]=sfname labelf.write(f'{sub_family}\t{sfname}\n') done.add(sf) + + write_metadata( + metadata_yaml, + typ='transform', + name='pantherfamily.pull_labels()', + description='Main families and subfamily labels extracted from PANTHER Sequence Classification human.', + sources=[{ + 'type': 'download', + 'name': 'PANTHER Sequence Classification: Human', + 'url': 'ftp://ftp.pantherdb.org/sequence_classifications/current_release/PANTHER_Sequence_Classification_files/PTHR19.0_human', + }] + ) diff --git a/src/datahandlers/rhea.py b/src/datahandlers/rhea.py index 6546e8ff..6691b79a 100644 --- a/src/datahandlers/rhea.py +++ b/src/datahandlers/rhea.py @@ -1,3 +1,4 @@ +from src.metadata.provenance import write_concord_metadata from src.prefixes import RHEA,EC from src.babel_utils import pull_via_urllib from src.babel_utils import make_local_name, pull_via_ftp @@ -12,6 +13,7 @@ class Rhea: """Load the mesh rdf file for querying""" def __init__(self): ifname = make_local_name('rhea.rdf', subpath='RHEA') + self.filename = ifname from datetime import datetime as dt print('loading rhea') start = dt.now() @@ -37,7 +39,7 @@ def pull_rhea_labels(self,ofname): #The rhea ids in the rdf use the currently approved prefix, but just to be sure... 
rheaid = iterm.split(':')[-1] outf.write(f'{RHEA}:{rheaid}\t{label}\n') - def pull_rhea_ec_concs(self,ofname): + def pull_rhea_ec_concs(self,ofname, metadata_yaml): s=""" PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#> PREFIX rh: <http://rdf.rhea-db.org/> @@ -55,12 +57,29 @@ def pull_rhea_ec_concs(self,ofname): rheaid = iterm.split(':')[-1] outf.write(f'{RHEA}:{rheaid}\toio:equivalent\t{ec}\n') + write_concord_metadata( + metadata_yaml, + name='Rhea.pull_rhea_ec_concs()', + description=f'pull_rhea_ec_concs() extracts the EC number/accession number mappings from the Rhea RDF file ({self.filename}).', + sources=[{ + 'type': 'rdf', + 'name': 'rhea.rdf', + 'filename': self.filename, + 'sources': [{ + 'type': 'download', + 'name': 'rhea.rdf', + 'url': 'https://ftp.expasy.org/databases/rhea/rdf/rhea.rdf.gz', + }] + }], + concord_filename=ofname, + ) + #Ids are handled by just getting everything from the labels def make_labels(labelfile): m = Rhea() m.pull_rhea_labels(labelfile) -def make_concord(concfile): +def make_concord(concfile, metadata_yaml): m = Rhea() - m.pull_rhea_ec_concs(concfile) \ No newline at end of file + m.pull_rhea_ec_concs(concfile, metadata_yaml) diff --git a/src/datahandlers/umls.py b/src/datahandlers/umls.py index c840c896..32c5d580 100644 --- a/src/datahandlers/umls.py +++ b/src/datahandlers/umls.py @@ -1,3 +1,4 @@ +from src.metadata.provenance import write_concord_metadata from src.prefixes import UMLS, RXCUI from src.babel_utils import make_local_name from src.categories import DRUG, CHEMICAL_ENTITY, MOLECULAR_MIXTURE @@ -200,8 +201,8 @@ def write_rxnorm_ids(category_map, bad_categories, infile, outfile,prefix=RXCUI, # One is to keep from having to pass through the umls file more than once, but that's a bad reason # The second is because I want to use the UMLS as a source for some terminologies (SNOMED) even if there's another # way. I'm going to modify this to do one thing at a time, and if it takes a little longer, then so be it. -def build_sets(mrconso,umls_input, umls_output , other_prefixes, bad_mappings=defaultdict(set), acceptable_identifiers={}, - cui_prefix = UMLS): +def build_sets(mrconso, umls_input, umls_output , other_prefixes, bad_mappings=defaultdict(set), acceptable_identifiers={}, + cui_prefix = UMLS, provenance_metadata_yaml=None): """Given a list of umls identifiers we want to generate all the concordances between UMLS and that other entity""" # On UMLS / MESH: we have been getting all UMLS / MESH relationships. This has led to some clear mistakes @@ -259,6 +260,18 @@ def build_sets(mrconso,umls_input, umls_output , other_prefixes, bad_mappings=de concordfile.write(f'{tup[0]}\teq\t{tup[1]}\n') pairs.add(tup) + # Write provenance for this build_sets() call. 
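The write_concord_metadata() helper called here (added in src/metadata/provenance.py later in this diff) derives summary counts by scanning the three-column concord file it is handed. For a toy concord with two `eq` rows between UMLS and MESH CURIEs, the counts block it records would look roughly like this (values illustrative):

# Sketch of the counts['concords'] block computed by write_concord_metadata(),
# for a hypothetical concord file containing the two lines:
#   UMLS:C0000001	eq	MESH:D000001
#   UMLS:C0000002	eq	MESH:D000001
counts = {
    'concords': {
        'count_concords': 2,            # total well-formed three-column lines
        'count_distinct_curies': 3,     # UMLS:C0000001, UMLS:C0000002, MESH:D000001
        'predicates': {'eq': 2},
        'prefix_counts': {'eq(MESH, UMLS)': 2},   # prefixes sorted alphabetically per pair
    }
}

The full record written out by write_metadata() additionally carries created_at, type, name, url, description, sources and combined_from.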
+ if provenance_metadata_yaml is not None: + write_concord_metadata(provenance_metadata_yaml, + name='umls.build_sets()', + sources=[{ + 'type': 'UMLS', + 'name': 'MRCONSO' + }], + description=f'umls.build_sets() using UMLS MRCONSO with prefixes: {other_prefixes} with cui_prefix set to {cui_prefix}', + concord_filename=umls_output, + ) + def read_umls_priority(): mrp = os.path.join('input_data', 'umls_precedence.txt') pris = [] diff --git a/src/metadata/provenance.py b/src/metadata/provenance.py new file mode 100644 index 00000000..1e22c201 --- /dev/null +++ b/src/metadata/provenance.py @@ -0,0 +1,125 @@ +import logging +import os.path +import traceback +from collections import defaultdict +from datetime import datetime + +import yaml + +def write_download_metadata(filename, *, name, url='', description='', sources=None, counts=None): + write_metadata(filename, 'download', name, url=url, description=description, sources=sources, counts=None) + +def write_concord_metadata(filename, *, name, concord_filename, url='', description='', sources=None, counts=None): + # Concord files should all be in the format: + # <curie>\t<predicate>\t<curie> + # From this, we extract three counts: + # 'count_concords': Number of lines in this file. + # 'count_distinct_curies': Number of distinct CURIEs. + # 'predicates': A dictionary of counts per predicate. + # 'prefix_counts': A dictionary of prefix pairs along with the predicate + count_concords = 0 + distinct_curies = set() + predicate_counts = defaultdict(int) + curie_prefix_counts = defaultdict(int) + with open(concord_filename, 'r') as concordf: + for line in concordf: + row = line.strip().split('\t') + if len(row) != 3: + logging.warning(f"Concord file {concord_filename} has a line with {len(row)} columns, not 3 -- skipping: '{line}'") + # raise ValueError(f"Concord file {concord_filename} has a line with {len(row)} columns, not 3: {line}") + continue + curie1 = row[0] + predicate = row[1] + curie2 = row[2] + + count_concords += 1 + predicate_counts[predicate] += 1 + distinct_curies.add(curie1) + distinct_curies.add(curie2) + + prefixes = [curie1.split(':')[0], curie2.split(':')[0]] + sorted_prefixes = sorted(prefixes) + curie_prefix_counts[f"{predicate}({sorted_prefixes[0]}, {sorted_prefixes[1]})"] += 1 + + if counts is None: + counts = {} + + if 'concords' in counts: + raise ValueError(f"Cannot add counts to concord metadata for {name} because it already has counts: {counts}") + + counts['concords'] = { + 'count_concords': count_concords, + 'count_distinct_curies': len(distinct_curies), + 'predicates': dict(predicate_counts), + 'prefix_counts': dict(curie_prefix_counts), + } + + write_metadata(filename, 'concord', name, url=url, description=description, sources=sources, counts=counts) + +def write_combined_metadata(filename, typ, name, *, sources=None, url='', description='', counts=None, combined_from_filenames:list[str]=None, also_combined_from=None): + combined_from = {} + if combined_from_filenames is not None: + if isinstance(combined_from_filenames, str): + logging.warning(f"write_combined_metadata() got a single string for combined_from_files ('{combined_from_filenames}'), converting to a single item list, at: " + f"{''.join(traceback.format_stack())}") + combined_from_filenames = [combined_from_filenames] + for metadata_yaml in combined_from_filenames: + with open(metadata_yaml, 'r') as metaf: + metadata_block = yaml.safe_load(metaf) + if metadata_block is None or metadata_block == {}: + raise ValueError("Metadata file {metadata_yaml} is 
empty.") + + if 'name' not in metadata_block: + raise ValueError(f"Metadata file {metadata_yaml} is missing a 'name' field: {metadata_block}") + + metadata_name = metadata_block['name'] + + if type(metadata_name) is not str: + raise ValueError(f"Metadata file {metadata_yaml} has a 'name' field that is not a string: {metadata_block}") + + if metadata_name in combined_from: + # If it's not already a list, then make it into a list. + if type(combined_from[metadata_name]) is not list: + combined_from[metadata_name] = [combined_from[metadata_name]] + combined_from[metadata_name].append(metadata_block) + else: + combined_from[metadata_name] = metadata_block + if also_combined_from is not None: + combined_from.update(also_combined_from) + + write_metadata( + filename, + typ=typ, + name=name, + sources=sources, + url=url, + description=description, + counts=counts, + combined_from=combined_from + ) + +def write_metadata(filename, typ, name, *, sources=None, url='', description='', counts=None, combined_from=None): + if type(typ) is not str: + raise ValueError(f"Metadata entry type must be a string, not {type(typ)}: '{typ}'") + if type(name) is not str: + raise ValueError(f"Metadata entry name must be a string, not {type(name)}: '{name}'") + if sources is None: + sources = [] + if counts is None: + counts = [] + if combined_from is None: + combined_from = [] + + metadata_dir = os.path.dirname(filename) + os.makedirs(metadata_dir, exist_ok=True) + with open(filename, 'w') as fout: + yaml.dump({ + 'created_at': datetime.now().isoformat(), + 'type': typ, + 'name': name, + 'url': url, + 'description': description, + 'sources': sources, + 'counts': counts, + 'combined_from': combined_from, + }, fout) diff --git a/src/node.py b/src/node.py index 25f483a8..b33de36f 100644 --- a/src/node.py +++ b/src/node.py @@ -1,15 +1,24 @@ +import itertools import json -import logging import os +import sqlite3 from collections import defaultdict from urllib.parse import urlparse import curies -from src.util import Text, get_config, get_biolink_model_toolkit, get_biolink_prefix_map +from src.util import ( + Text, + get_config, + get_biolink_model_toolkit, + get_biolink_prefix_map, + get_logger, + get_memory_usage_summary, +) from src.LabeledID import LabeledID from src.prefixes import PUBCHEMCOMPOUND +logger = get_logger(__name__) class SynonymFactory: """ @@ -34,13 +43,29 @@ class SynonymFactory: def __init__(self,syndir): self.synonym_dir = syndir self.synonyms = {} - self.common_synonyms = None - print(f"Created SynonymFactory for directory {syndir}") + self.config = get_config() + + # Load the common synonyms. + self.common_synonyms = defaultdict(set) + + for common_synonyms_file in self.config['common']['synonyms']: + common_synonyms_path = os.path.join(self.config['download_directory'], 'common', common_synonyms_file) + count_common_file_synonyms = 0 + with open(common_synonyms_path, 'r') as synonymsf: + # Note that these files may contain ANY prefix -- we should only fallback to this if we have no other + # option. 
+ for line in synonymsf: + row = json.loads(line) + self.common_synonyms[row['curie']].add((row['predicate'], row['synonym'])) + count_common_file_synonyms += 1 + logger.info(f"Loaded {count_common_file_synonyms:,} common synonyms from {common_synonyms_path}: {get_memory_usage_summary()}") + + logger.info(f"Created SynonymFactory for directory {syndir}") def load_synonyms(self,prefix): lbs = defaultdict(set) labelfname = os.path.join(self.synonym_dir, prefix, 'labels') - print(f'Loading synonyms for {prefix} from {labelfname}') + logger.info(f'Loading synonyms for {prefix} from {labelfname}: {get_memory_usage_summary()}') count_labels = 0 count_synonyms = 0 if os.path.exists(labelfname): @@ -59,26 +84,9 @@ def load_synonyms(self,prefix): lbs[x[0]].add( (x[1], x[2]) ) count_synonyms += 1 self.synonyms[prefix] = lbs - print(f'Loaded {count_labels} labels and {count_synonyms} synonyms for {prefix} from {labelfname}') + logger.info(f'Loaded {count_labels:,} labels and {count_synonyms:,} synonyms for {prefix} from {labelfname}: {get_memory_usage_summary()}') def get_synonyms(self,node): - config = get_config() - if self.common_synonyms is None: - # Load the common synonyms. - self.common_synonyms = defaultdict(set) - - for common_synonyms_file in config['common']['synonyms']: - common_synonyms_path = os.path.join(config['download_directory'], 'common', common_synonyms_file) - count_common_file_synonyms = 0 - with open(common_synonyms_path, 'r') as synonymsf: - # Note that these files may contain ANY prefix -- we should only fallback to this if we have no other - # option. - for line in synonymsf: - row = json.loads(line) - self.common_synonyms[row['curie']].add((row['predicate'], row['synonym'])) - count_common_file_synonyms += 1 - logging.info(f"Loaded {count_common_file_synonyms} common synonyms from {common_synonyms_path}") - node_synonyms = set() for ident in node['identifiers']: thisid = ident['identifier'] @@ -99,10 +107,25 @@ def __init__(self,rootdir): self.root_dir = rootdir self.descriptions = {} self.common_descriptions = None - print(f"Created DescriptionFactory for directory {rootdir}") + + self.config = get_config() + self.common_descriptions = defaultdict(list) + for common_descriptions_file in self.config['common']['descriptions']: + common_descriptions_path = os.path.join(self.config['download_directory'], 'common', common_descriptions_file) + count_common_file_descriptions = 0 + with open(common_descriptions_path, 'r') as descriptionsf: + # Note that these files may contain ANY CURIE -- we should only fallback to this if we have no other + # option. + for line in descriptionsf: + row = json.loads(line) + self.common_descriptions[row['curie']].extend(row['descriptions']) + count_common_file_descriptions += 1 + logger.info(f"Loaded {count_common_file_descriptions} common descriptions from {common_descriptions_path}") + + logger.info(f"Created DescriptionFactory for directory {rootdir}") def load_descriptions(self,prefix): - print(f'Loading descriptions for {prefix}') + logger.info(f'Loading descriptions for {prefix}') descs = defaultdict(set) descfname = os.path.join(self.root_dir, prefix, 'descriptions') desc_count = 0 @@ -113,32 +136,13 @@ def load_descriptions(self,prefix): descs[x[0]].add("\t".join(x[1:])) desc_count += 1 self.descriptions[prefix] = descs - print(f'Loaded {desc_count} descriptions for {prefix}') - - def get_descriptions(self,node): - config = get_config() - if self.common_descriptions is None: - # Load the common synonyms. 
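The common synonym and description files loaded eagerly above are JSONL, one object per line, with the keys the parsing code expects (curie/predicate/synonym and curie/descriptions respectively). Illustrative lines only, showing the shape rather than real data:

import json

synonym_line = json.dumps({"curie": "EXAMPLE:0001", "predicate": "hasExactSynonym", "synonym": "example synonym"})
description_line = json.dumps({"curie": "EXAMPLE:0001", "descriptions": ["An illustrative description."]})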
- self.common_descriptions = defaultdict(list) - - for common_descriptions_file in config['common']['descriptions']: - common_descriptions_path = os.path.join(config['download_directory'], 'common', common_descriptions_file) - count_common_file_descriptions = 0 - with open(common_descriptions_path, 'r') as descriptionsf: - # Note that these files may contain ANY CURIE -- we should only fallback to this if we have no other - # option. - for line in descriptionsf: - row = json.loads(line) - self.common_descriptions[row['curie']].extend(row['descriptions']) - count_common_file_descriptions += 1 - logging.info(f"Loaded {count_common_file_descriptions} common descriptions from {common_descriptions_path}") - + logger.info(f'Loaded {desc_count:,} descriptions for {prefix}') + def get_descriptions(self, ids: list[str]): node_descriptions = defaultdict(set) - for ident in node['identifiers']: - thisid = ident['identifier'] - pref = thisid.split(':', 1)[0] - if not pref in self.descriptions: + for thisid in ids: + pref = Text.get_prefix(thisid) + if pref not in self.descriptions: self.load_descriptions(pref) node_descriptions[thisid].update( self.descriptions[pref][thisid] ) node_descriptions[thisid].update( self.common_descriptions.get(thisid, {}) ) @@ -149,34 +153,159 @@ class TaxonFactory: """ A factory for loading taxa for CURIEs where available. """ - def __init__(self,rootdir): + def __init__(self, rootdir): self.root_dir = rootdir - self.taxa = {} + self.tsvloader = TSVSQLiteLoader(rootdir, 'taxa', 'curie-curie') def load_taxa(self, prefix): - print(f'Loading taxa for {prefix}') - taxa_per_prefix = defaultdict(set) - taxafilename = os.path.join(self.root_dir, prefix, 'taxa') - taxon_count = 0 - if os.path.exists(taxafilename): - with open(taxafilename, 'r') as inf: - for line in inf: - x = line.strip().split('\t') - taxa_per_prefix[x[0]].add("\t".join(x[1:])) - taxon_count += 1 - self.taxa[prefix] = taxa_per_prefix - print(f'Loaded {taxon_count} taxon-CURIE mappings for {prefix}') + return self.tsvloader.load_prefix(prefix) - def get_taxa(self, node): - node_taxa = defaultdict(set) - for ident in node['identifiers']: - thisid = ident['identifier'] - pref = thisid.split(':', 1)[0] - if not pref in self.taxa: - self.load_taxa(pref) - node_taxa[thisid].update(self.taxa[pref][thisid]) - return node_taxa + def get_taxa(self, curies: list[str]): + return self.tsvloader.get_curies(curies) + + def close(self): + self.tsvloader.close() + + +class TSVSQLiteLoader: + """ + All of the files we load here (SynonymFactory, DescriptionFactory, TaxonFactory and InformationContentFactory) + are TSV files in very similar formats (either <curie>\t<value> or <curie>\t<predicate>\t<value>). Some of these + TSV files are very large, so we don't want to load them all into memory at once. Instead, we use SQLite to: + 1. Load them into SQLite files. SQLite supports "temporary databases" (https://www.sqlite.org/inmemorydb.html) -- + the database is kept in memory, but data can spill onto the disk if the database gets large. + 2. Query identifiers by identifier prefix. + 3. Close and delete the SQLite files when we're done. + + TODO: note that on Sterling, SQLite might not be able to detect when it's running out of memory (we have a limit + of around 500Gi, but the node will have 1.5Ti, so SQLite won't detect a low-mem situation correctly). We should + figure out how to configure that. 
+ """ + + def __init__(self, download_dir, filename, file_format): + self.download_dir = download_dir + self.filename = filename + self.sqlites = {} + + # We only support one format for now. + self.format = format + if file_format in {'curie-curie'}: + # Acceptable format! + pass + else: + raise ValueError(f"Unknown TSVSQLiteLoader file format: {file_format}") + + def __str__(self): + sqlite_counts = self.get_sqlite_counts() + sqlite_counts_str = ", ".join( + f"{prefix}: {count:,} rows" + for prefix, count in sorted(sqlite_counts.items(), key=lambda x: x[1], reverse=True) + ) + return f"TSVSQLiteLoader({self.download_dir}, {self.filename}, {self.format}) containing {len(self.sqlites)} SQLite DBs ({sqlite_counts_str})" + + def get_sqlite_counts(self): + counts = dict() + for prefix in self.sqlites: + counts[prefix] = self.sqlites[prefix].execute(f"SELECT COUNT(*) FROM {prefix}").fetchone()[0] + return counts + + def load_prefix(self, prefix): + if prefix in self.sqlites: + # We've already loaded this prefix! + return True + + # Set up filenames. + tsv_filename = os.path.join(self.download_dir, prefix, self.filename) + + # If the TSV file doesn't exist, we don't need to do anything. + if not os.path.exists(tsv_filename): + self.sqlites[prefix] = None + return False + + # Write to a SQLite in-memory database so we don't need to hold it in memory all at once. + logger.info(f"Loading {prefix} into SQLite: {get_memory_usage_summary()}") + + # Setting a SQLite database as "" does exactly what we want: create an in-memory database that will spill onto + # a temporary file if needed. + conn = sqlite3.connect('') + conn.execute(f"CREATE TABLE {prefix} (curie1 TEXT, curie2 TEXT)") + + # Load taxa into memory. + logger.info(f"Reading records from {tsv_filename} into memory to load into SQLite: {get_memory_usage_summary()}") + records = [] + record_count = 0 + with open(tsv_filename, 'r') as inf: + for line in inf: + x = line.strip().split('\t', maxsplit=1) + records.append([x[0].upper(), x[1]]) + record_count += 1 + if len(records) % 10_000_000 == 0: + # Insert every 10,000,000 records. + logger.info(f"Inserting {len(records):,} records (total so far: {record_count:,}) from {tsv_filename} into SQLite: {get_memory_usage_summary()}") + conn.executemany(f"INSERT INTO {prefix} VALUES (?, ?)", records) + records = [] + + # Insert any remaining records. 
+ logger.info(f"Inserting {len(records):,} records from {tsv_filename} into SQLite: {get_memory_usage_summary()}") + conn.executemany(f"INSERT INTO {prefix} VALUES (?, ?)", records) + logger.info(f"Creating a case-insensitive index for the {record_count:,} records loaded into SQLite: {get_memory_usage_summary()}") + conn.execute(f"CREATE INDEX curie1_idx ON {prefix}(curie1)") + conn.commit() + logger.info(f"Loaded {record_count:,} records from {tsv_filename} into SQLite table {prefix}: {get_memory_usage_summary()}") + self.sqlites[prefix] = conn + return True + + def get_curies(self, curies_to_query: list) -> dict[str, set[str]]: + results = defaultdict(set) + + curies_sorted_by_prefix = sorted(curies_to_query, key=lambda curie: Text.get_prefix(curie)) + curies_grouped_by_prefix = itertools.groupby(curies_sorted_by_prefix, key=lambda curie: Text.get_prefix(curie)) + for prefix, curies_group in curies_grouped_by_prefix: + curies = list(curies_group) + logger.debug(f"Looking up {prefix} for {curies} curies") + if prefix not in self.sqlites: + logger.debug(f"No SQLite for {prefix} found, trying to load it.") + if not self.load_prefix(prefix): + # Nothing to load. + logger.debug(f"No TSV file for {prefix} found, so can't query it for {curies}") + for curie in curies: + results[curie] = set() + continue + if self.sqlites[prefix] is None: + logger.debug(f"No {self.filename} file for {prefix} found, so can't query it for {curies}") + for curie in curies: + results[curie] = set() + continue + + # Query the SQLite. + query = f"SELECT curie1, curie2 FROM {prefix} WHERE curie1 = ?" + for curie in curies: + query_result = self.sqlites[prefix].execute(query, [curie.upper()]).fetchall() + if not query_result: + results[curie] = set() + continue + + for row in query_result: + curie1 = curie + curie2 = row[1] + results[curie1].add(curie2) + + return dict(results) + def close(self): + """ + Close all of the SQLite connections. + """ + for prefix, db in self.sqlites.items(): + if db is not None: + db.close() + self.sqlites = dict() + + def __del__(self): + self.close() + + def __exit__(self, exc_type, exc_val, exc_tb): + self.close() class InformationContentFactory: """ @@ -244,10 +373,10 @@ def __init__(self, ic_file): # Sort the dictionary items by value in descending order sorted_by_prefix = sorted(count_by_prefix.items(), key=lambda item: item[1], reverse=True) - print(f"Loaded {len(self.ic)} InformationContent values from {len(count_by_prefix.keys())} prefixes:") + logger.info(f"Loaded {len(self.ic)} InformationContent values from {len(count_by_prefix.keys())} prefixes:") # Now you can print the sorted items for key, value in sorted_by_prefix: - print(f'- {key}: {value}') + logger.info(f'- {key}: {value}') # We see a number of URLs being mapped to None (250,871 at present). Let's optionally raise an error if that # happens. @@ -259,12 +388,12 @@ def __init__(self, ic_file): unmapped_urls_by_netloc[netloc].append(url) # Print them in reverse count order. 
- print(f"Found {len(unmapped_urls)} unmapped URLs:") + logger.info(f"Found {len(unmapped_urls)} unmapped URLs:") netlocs_by_count = sorted(unmapped_urls_by_netloc.items(), key=lambda item: len(item[1]), reverse=True) for netloc, urls in netlocs_by_count: - print(f" - {netloc} [{len(urls)}]") + logger.info(f" - {netloc} [{len(urls)}]") for url in sorted(urls): - print(f" - {url}") + logger.info(f" - {url}") assert None not in sorted_by_prefix, ("Found invalid CURIEs in information content values, probably " "because they couldn't be mapped from URLs to CURIEs.") @@ -285,6 +414,7 @@ def get_ic(self, node): class NodeFactory: def __init__(self,label_dir,biolink_version): + self.biolink_version = biolink_version self.toolkit = get_biolink_model_toolkit(biolink_version) self.ancestor_map = {} self.prefix_map = {} @@ -306,7 +436,7 @@ def get_ancestors(self,input_type): def get_prefixes(self,input_type): if input_type in self.prefix_map: return self.prefix_map[input_type] - print(input_type) + logger.info(f"NodeFactory({self.label_dir}, {self.biolink_version}).get_prefixes({input_type}) called") j = self.toolkit.get_element(input_type) prefs = j['id_prefixes'] # biolink doesnt yet include UMLS as a valid prefix for biological process. There is a PR here: @@ -361,16 +491,15 @@ def clean_list(self,input_identifiers): wrote = True break if not wrote: - print(input_identifiers) - exit() + raise ValueError(f"Can't clean up list {v}") return cleaned def load_extra_labels(self,prefix): if self.label_dir is None: - print (f"WARNING: no label_dir specified in load_extra_labels({self}, {prefix}), can't load extra labels for {prefix}. Skipping.") + logger.warning(f"no label_dir specified in load_extra_labels({self}, {prefix}), can't load extra labels for {prefix}. Skipping.") return if prefix is None: - print (f"WARNING: no prefix specified in load_extra_labels({self}, {prefix}), can't load extra labels. Skipping.") + logger.warning(f"no prefix specified in load_extra_labels({self}, {prefix}), can't load extra labels. Skipping.") return labelfname = os.path.join(self.label_dir,prefix,'labels') lbs = {} @@ -404,7 +533,7 @@ def apply_labels(self, input_identifiers, labels): continue self.common_labels[x[0]] = x[1] count_common_file_labels += 1 - logging.info(f"Loaded {count_common_file_labels} common labels from {common_labels_path}") + logger.info(f"Loaded {count_common_file_labels:,} common labels from {common_labels_path}: {get_memory_usage_summary()}") #Originally we needed to clean up the identifer lists, because there would be both labeledids and # string ids and we had to reconcile them. 
@@ -412,15 +541,14 @@ def apply_labels(self, input_identifiers, labels): labeled_list = [] for iid in input_identifiers: if isinstance(iid,LabeledID): - print('LabeledID dont belong here, pass in labels seperately',iid) - exit() + raise ValueError(f"LabeledID don't belong here ({iid}), pass in labels separately.") if iid in labels: labeled_list.append( LabeledID(identifier=iid, label = labels[iid])) else: try: prefix = Text.get_prefix(iid) except ValueError as e: - print(f"ERROR: Unable to apply_labels({self}, {input_identifiers}, {labels}): could not obtain prefix for identifier {iid}") + logger.error(f"Unable to apply_labels({self}, {input_identifiers}, {labels}): could not obtain prefix for identifier {iid}") raise e if prefix not in self.extra_labels: self.load_extra_labels(prefix) @@ -442,7 +570,7 @@ def create_node(self,input_identifiers,node_type,labels={},extra_prefixes=[]): if len(input_identifiers) == 0: return None if len(input_identifiers) > 1000: - print(f'this seems like a lot of input_identifiers in node.create_node() [{len(input_identifiers)}]: {input_identifiers}') + logger.warning(f'this seems like a lot of input_identifiers in node.create_node() [{len(input_identifiers)}]: {input_identifiers}') cleaned = self.apply_labels(input_identifiers,labels) try: idmap = defaultdict(list) @@ -457,7 +585,7 @@ def create_node(self,input_identifiers,node_type,labels={},extra_prefixes=[]): print(type(i)) print(Text.get_curie(i)) print(Text.get_curie(i).upper()) - exit() + raise RuntimeError('something very bad') identifiers = [] accepted_ids = set() #Converting identifiers from LabeledID to dicts @@ -477,7 +605,7 @@ def create_node(self,input_identifiers,node_type,labels={},extra_prefixes=[]): try: newids.sort() except TypeError as e: - print(newids) + logger.error(f"Could not sort {newids} because of a TypeError: {e}") raise e if pupper == PUBCHEMCOMPOUND.upper() and len(newids) > 1: newids = pubchemsort(newids,cleaned) @@ -486,23 +614,14 @@ def create_node(self,input_identifiers,node_type,labels={},extra_prefixes=[]): for k,vals in idmap.items(): for v in vals: if v not in accepted_ids and (k,node_type) not in self.ignored_prefixes: - print(f'Ignoring prefix {k} for type {node_type}, identifier {v}') + logger.warning(f'Ignoring prefix {k} for type {node_type}, identifier {v}') self.ignored_prefixes.add( (k,node_type) ) if len(identifiers) == 0: return None - best_id = identifiers[0]['identifier'] - # identifiers is in preferred order, so choose the first non-empty label to be the node label - labels = list(filter(lambda x:len(x) > 0, [ l['label'] for l in identifiers if 'label' in l ])) - label = None - if len(labels) > 0: - label = labels[0] - node = { 'identifiers': identifiers, 'type': node_type } - #if label is not None: - # node['id']['label'] = label return node def pubchemsort(pc_ids, labeled_ids): @@ -563,3 +682,10 @@ def pubchemsort(pc_ids, labeled_ids): best_pubchem = pcelement pc_ids.remove(best_pubchem) return [best_pubchem] + pc_ids + +if __name__ == '__main__': + tsvdb = TSVSQLiteLoader('babel_downloads/', filename='taxa', file_format='curie-curie') + logger.info(f"Started TSVDuckDBLoader {tsvdb}: {get_memory_usage_summary()}") + result = tsvdb.get_curies(['UniProtKB:I6L8L4', 'UniProtKB:C6H147']) + logger.info(f"Got result from {tsvdb}: {result} with {get_memory_usage_summary()}") + tsvdb.close() diff --git a/src/properties.py b/src/properties.py new file mode 100644 index 00000000..84cd49a1 --- /dev/null +++ b/src/properties.py @@ -0,0 +1,191 @@ +# +# properties.py: 
handle node- and clique-level properties for Babel. +# +# Property files are JSONL files that can be read into and out of the Property dataclass. +# So writing them is easy: you just add each property on its own line, and if you go through +# Property(...).to_json_line() we can even validate it for you (eventually). +# +# We generally need to read multiple properties files so you can run queries over all of them, which you can do by +# using the PropertyList class. +# +import gzip +import json +from collections import defaultdict +from dataclasses import dataclass, field + +# +# SUPPORTED PROPERTIES +# + +# HAS_ALTERNATIVE_ID indicates that CURIE has an alternative ID that should be included in the clique, but NOT +# treated as part of the clique for the purposes of choosing the clique leader. This is used for e.g. ChEBI secondary +# IDs or other deprecated identifiers. +HAS_ALTERNATIVE_ID = 'http://www.geneontology.org/formats/oboInOwl#hasAlternativeId' + +# Properties currently supported in the property store in one set for validation. +supported_predicates = { + HAS_ALTERNATIVE_ID, +} + +# +# The Property dataclass can be used to encapsulate a property for a CURIE. It has helper code to read +# and write these properties. +# + +@dataclass(frozen=True) +class Property: + """ + A property value for a CURIE. + """ + + curie: str + predicate: str + value: str + source: str = "" # TODO: making this a list would be better, but that would make a Property non-frozen, which + # would make it harder to uniquify. + + @staticmethod + def valid_keys(): + return ['curie', 'predicate', 'value', 'source'] + + def __post_init__(self): + """ + Make sure this Property makes sense. + """ + if self.predicate not in supported_predicates: + raise ValueError(f'Predicate {self.predicate} is not supported (supported predicates: {supported_predicates})') + + @staticmethod + def from_dict(prop_dict, source=None): + """ + Read this dictionary into a Property. + + :param prop_dict: A dictionary containing the property values. + :param source: The source of this property, if any. + :return: A Property version of this JSON line. + """ + + # Check if this dictionary includes keys that aren't valid in a Property. + unexpected_keys = prop_dict.keys() - Property.valid_keys() + if len(unexpected_keys) > 0: + raise ValueError(f'Unexpected keys in dictionary to be converted to Property ({unexpected_keys}): {json.dumps(prop_dict, sort_keys=True, indent=2)}') + + prop = Property(**prop_dict) + return prop + + # TODO: we should have some validation code in here so people don't make nonsense properties, which means + # validating both the property and the value. + + def to_json_line(self): + """ + Returns this property as a JSONL line, including the final newline (so you can write it directly to a file). + + :return: A string containing the JSONL line of this property. + """ + return json.dumps({ + 'curie': self.curie, + 'predicate': self.predicate, + 'value': self.value, + 'source': self.source, + }) + '\n' + +# +# The PropertyList object can be used to load and query properties from a particular source. +# +# We could write them into a DuckDB file as we load them so they can overflow onto disk as needed, but that's overkill +# for right now, so we'll just load them all into memory. +# + +class PropertyList: + """ + This class can be used to load multiple property files for simultaneous querying. 
+ + In order to support the existing property files, we will additionally support the two main alternate formats we use: + - A three column TSV file, with columns: CURIE, property, value + - A four column TSV file, with columns: CURIE, property, value, source + + But eventually all of those files will be subsumed into JSONL files. + """ + + def __init__(self): + """ + Create a new PropertyList object. + + Since most of our queries will be CURIE-based, we'll index properties by CURIE, but we'll also keep + a set of all properties. + """ + self._properties = set[Property]() + self._properties_by_curie = defaultdict(set[Property]) + + def count_unique(self): + """ + Return the number of unique Properties in this PropertyList. + + :return: The number of unique Properties in this PropertyList. + """ + return len(self._properties) + + @property + def properties(self) -> set[Property]: + return self._properties + + def get_all(self, curie: str, predicate: str = None) -> set[Property]: + """ + Get all properties for a given CURIE. + + :param curie: The CURIE to look up properties. + :param predicate: If specified, only return properties with this predicate. + :return: The set of properties for this CURIE. + """ + props = self._properties_by_curie[curie] + + if predicate is None: + return props + + if predicate not in supported_predicates: + raise ValueError(f'Predicate {predicate} is not supported (supported predicates: {supported_predicates})') + + return set(filter(lambda p: p.predicate == predicate, props)) + + def add_properties(self, props: set[Property]): + """ + Add a set of Property values to the list. + + :param props: A set of Property values. + :param source: The source of these properties, if any. + :return: The number of unique properties added. + """ + + props_to_be_added = (props - self._properties) + + self._properties.update(props) + for prop in props: + self._properties_by_curie[prop.curie].add(prop) + + return len(props_to_be_added) + + def add_properties_jsonl_gz(self, filename_gz: str): + """ + Add all the properties in a JSONL Gzipped file. + + :param filename_gz: The properties JSONL Gzipped filename to load. + :return: The number of unique properties loaded. + """ + + props_to_add = set[Property]() + with gzip.open(filename_gz, 'rt') as f: + for line in f: + props_to_add.add(Property.from_dict(json.loads(line), source=filename_gz)) + + return self.add_properties(props_to_add) + +if __name__ == '__main__': + pl = PropertyList() + ps = set[Property]() + ps.add(Property('A', HAS_ALTERNATIVE_ID, 'B', source='E and F')) + ps.add(Property('A', HAS_ALTERNATIVE_ID, 'C')) + ps.add(Property('A', HAS_ALTERNATIVE_ID, 'D')) + ps.add(Property('A', HAS_ALTERNATIVE_ID, 'C')) + pl.add_properties(ps) + print(pl.properties) + assert len(pl.properties) == 3 diff --git a/src/reports/compendia_per_file_reports.py b/src/reports/compendia_per_file_reports.py index ebe0f1de..1370ae2a 100644 --- a/src/reports/compendia_per_file_reports.py +++ b/src/reports/compendia_per_file_reports.py @@ -20,28 +20,30 @@ def get_datetime_as_string(): return datetime.now().isoformat() -def assert_files_in_directory(dir, files, report_file): +def assert_files_in_directory(dir, expected_files, report_file): """ Asserts that the list of files in a given directory are the list of files provided. :param dir: The directory to check files in. - :param files: The files to compare the list against. + :param expected_files: The files to compare the list against. :param report_file: Write a report to this file. 
We assume that this file is not intended to be read, but is created so that we can check this assertion has been checked. """ - logging.info(f"Expect files in directory {dir} to be equal to {files}") all_file_list = os.listdir(dir) # On Sterling, we sometimes have `.nfs*` files that represent NFS cached files that weren't properly deleted. # These shouldn't interfere with these tests. file_list = filter(lambda fn: not fn.startswith('.nfs'), all_file_list) - assert set(file_list) == set(files) + file_list_set = set(file_list) + expected_files_set = set(expected_files) + assert file_list_set == expected_files_set, f"Expected files in directory {dir} to be equal to {expected_files_set} but found {file_list_set}: " + \ + f"{file_list_set - expected_files_set} added, {expected_files_set - file_list_set} missing." # If we passed, write the output to the check_file. with open(report_file, "w") as f: - f.write(f"Confirmed that {dir} contains only the files {files} at {get_datetime_as_string()}\n") + f.write(f"Confirmed that {dir} contains only the files {expected_files} at {get_datetime_as_string()}\n") def generate_content_report_for_compendium(compendium_path, report_path): diff --git a/src/sdfreader.py b/src/sdfreader.py index c56b8592..8836ff10 100644 --- a/src/sdfreader.py +++ b/src/sdfreader.py @@ -30,10 +30,10 @@ def chebi_sdf_entry_to_dict(sdf_chunk, interesting_keys = {}): current_key = line.replace('>','').replace('<','').strip().replace(' ', '').lower() current_key = 'formula' if current_key == 'formulae' else current_key if current_key in interesting_keys: - final_dict[interesting_keys[current_key]] = '' + final_dict[interesting_keys[current_key]] = [] continue if current_key == 'chebiid': chebi_id = line if current_key in interesting_keys: - final_dict[interesting_keys[current_key]] += line - return (chebi_id, final_dict) \ No newline at end of file + final_dict[interesting_keys[current_key]].append(line) + return (chebi_id, final_dict) diff --git a/src/snakefiles/anatomy.snakefile b/src/snakefiles/anatomy.snakefile index 6e80a026..062a3b03 100644 --- a/src/snakefiles/anatomy.snakefile +++ b/src/snakefiles/anatomy.snakefile @@ -49,14 +49,22 @@ rule get_anatomy_obo_relationships: config['intermediate_directory']+'/anatomy/concords/UBERON', config['intermediate_directory']+'/anatomy/concords/CL', config['intermediate_directory']+'/anatomy/concords/GO', + uberon_metadata=config['intermediate_directory']+'/anatomy/concords/metadata-UBERON.yaml', + cl_metadata=config['intermediate_directory']+'/anatomy/concords/metadata-CL.yaml', + go_metadata=config['intermediate_directory']+'/anatomy/concords/metadata-GO.yaml', run: - anatomy.build_anatomy_obo_relationships(config['intermediate_directory']+'/anatomy/concords') + anatomy.build_anatomy_obo_relationships(config['intermediate_directory']+'/anatomy/concords', { + 'UBERON': output.uberon_metadata, + 'CL': output.cl_metadata, + 'GO': output.go_metadata, + }) rule get_wikidata_cell_relationships: output: config['intermediate_directory']+'/anatomy/concords/WIKIDATA', + wikidata_metadata=config['intermediate_directory']+'/anatomy/concords/metadata-WIKIDATA.yaml', run: - anatomy.build_wikidata_cell_relationships(config['intermediate_directory']+'/anatomy/concords') + anatomy.build_wikidata_cell_relationships(config['intermediate_directory']+'/anatomy/concords', output.wikidata_metadata) rule get_anatomy_umls_relationships: input: @@ -64,21 +72,24 @@ rule get_anatomy_umls_relationships: infile=config['intermediate_directory']+"/anatomy/ids/UMLS" 
output: outfile=config['intermediate_directory']+'/anatomy/concords/UMLS', + umls_metadata=config['intermediate_directory']+'/anatomy/concords/metadata-UMLS.yaml', run: - anatomy.build_anatomy_umls_relationships(input.mrconso, input.infile, output.outfile) + anatomy.build_anatomy_umls_relationships(input.mrconso, input.infile, output.outfile, output.umls_metadata) rule anatomy_compendia: input: labels=os.path.join(config["download_directory"], 'common', config["common"]["labels"][0]), synonyms=os.path.join(config["download_directory"], 'common', config["common"]["synonyms"][0]), concords=expand("{dd}/anatomy/concords/{ap}",dd=config['intermediate_directory'],ap=config['anatomy_concords']), + metadata_yamls=expand("{dd}/anatomy/concords/metadata-{ap}.yaml",dd=config['intermediate_directory'],ap=config['anatomy_concords']), idlists=expand("{dd}/anatomy/ids/{ap}",dd=config['intermediate_directory'],ap=config['anatomy_ids']), icrdf_filename=config['download_directory']+'/icRDF.tsv', output: expand("{od}/compendia/{ap}", od = config['output_directory'], ap = config['anatomy_outputs']), - temp(expand("{od}/synonyms/{ap}", od = config['output_directory'], ap = config['anatomy_outputs'])) + temp(expand("{od}/synonyms/{ap}", od = config['output_directory'], ap = config['anatomy_outputs'])), + expand("{od}/metadata/{ap}.yaml", od = config['output_directory'], ap = config['anatomy_outputs']), run: - anatomy.build_compendia(input.concords, input.idlists, input.icrdf_filename) + anatomy.build_compendia(input.concords, input.metadata_yamls, input.idlists, input.icrdf_filename) rule check_anatomy_completeness: input: @@ -124,6 +135,7 @@ rule anatomy: input: config['output_directory']+'/reports/anatomy_completeness.txt', synonyms=expand("{od}/synonyms/{ap}", od = config['output_directory'], ap = config['anatomy_outputs']), + metadata=expand("{od}/metadata/{ap}.yaml", od = config['output_directory'], ap = config['anatomy_outputs']), reports = expand("{od}/reports/{ap}",od=config['output_directory'], ap = config['anatomy_outputs']) output: synonyms_gzipped=expand("{od}/synonyms/{ap}.gz", od = config['output_directory'], ap = config['anatomy_outputs']), diff --git a/src/snakefiles/cell_line.snakefile b/src/snakefiles/cell_line.snakefile index b8e72965..62cb17fc 100644 --- a/src/snakefiles/cell_line.snakefile +++ b/src/snakefiles/cell_line.snakefile @@ -11,7 +11,7 @@ rule get_clo_ids: input: infile=config['download_directory']+"/CLO/clo.owl" output: - outfile=config['intermediate_directory']+"/cell_line/ids/CLO" + outfile=config['intermediate_directory']+"/cell_line/ids/CLO", run: clo.write_clo_ids(input.infile, output.outfile) @@ -24,12 +24,14 @@ rule cell_line_compendia: ids=config['intermediate_directory']+"/cell_line/ids/CLO", labelfile=config['download_directory'] + '/CLO/labels', synonymfile=config['download_directory'] + '/CLO/synonyms', + metadatafile=config['download_directory'] + '/CLO/metadata.yaml', icrdf_filename=config['download_directory']+'/icRDF.tsv', output: config['output_directory']+"/compendia/CellLine.txt", - temp(config['output_directory']+"/synonyms/CellLine.txt") + temp(config['output_directory']+"/synonyms/CellLine.txt"), + config['output_directory']+"/metadata/CellLine.txt.yaml", run: - cell_line.build_compendia(input.ids,input.icrdf_filename) + cell_line.build_compendia(input.ids, [input.metadatafile], input.icrdf_filename) rule check_cell_line_completeness: input: @@ -52,6 +54,7 @@ rule cell_line: config['output_directory']+'/reports/cell_line_completeness.txt', 
config['output_directory'] + "/reports/CellLine.txt", cell_line_synonyms=config['output_directory'] + "/synonyms/CellLine.txt", + metadata=config['output_directory']+"/metadata/CellLine.txt.yaml", output: config['output_directory'] + "/synonyms/CellLine.txt.gz", x=config['output_directory']+'/reports/cell_line_done' diff --git a/src/snakefiles/chemical.snakefile b/src/snakefiles/chemical.snakefile index 249fe30c..15446cd9 100644 --- a/src/snakefiles/chemical.snakefile +++ b/src/snakefiles/chemical.snakefile @@ -109,9 +109,10 @@ rule get_chemical_drugcentral_relationships: input: xreffile=config['download_directory']+"/DrugCentral/xrefs" output: - outfile=config['intermediate_directory']+'/chemicals/concords/DrugCentral' + outfile=config['intermediate_directory']+'/chemicals/concords/DrugCentral', + metadata_yaml=config['intermediate_directory']+'/chemicals/concords/metadata-DrugCentral.yaml', run: - chemicals.build_drugcentral_relations(input.xreffile,output.outfile) + chemicals.build_drugcentral_relations(input.xreffile,output.outfile, output.metadata_yaml) rule get_chemical_umls_relationships: input: @@ -119,8 +120,9 @@ rule get_chemical_umls_relationships: infile=config['intermediate_directory']+"/chemicals/ids/UMLS", output: outfile=config['intermediate_directory']+'/chemicals/concords/UMLS', + metadata_yaml=config['intermediate_directory']+'/chemicals/concords/metadata-UMLS.yaml', run: - chemicals.build_chemical_umls_relationships(input.mrconso, input.infile, output.outfile) + chemicals.build_chemical_umls_relationships(input.mrconso, input.infile, output.outfile, output.metadata_yaml) rule get_chemical_rxnorm_relationships: input: @@ -128,23 +130,27 @@ rule get_chemical_rxnorm_relationships: conso=config['download_directory'] + "/RxNorm/RXNCONSO.RRF" output: outfile=config['intermediate_directory']+'/chemicals/concords/RXNORM', + metadata_yaml=config['intermediate_directory']+'/chemicals/concords/metadata-RXNORM.yaml', run: - chemicals.build_chemical_rxnorm_relationships(input.conso, input.infile,output.outfile) + chemicals.build_chemical_rxnorm_relationships(input.conso, input.infile, output.outfile, output.metadata_yaml) rule get_chemical_wikipedia_relationships: output: - outfile = config['intermediate_directory'] + '/chemicals/concords/wikipedia_mesh_chebi' + outfile = config['intermediate_directory'] + '/chemicals/concords/wikipedia_mesh_chebi', + metadata_yaml = config['intermediate_directory'] + '/chemicals/concords/metadata-wikipedia_mesh_chebi.yaml' run: - chemicals.get_wikipedia_relationships(output.outfile) + chemicals.get_wikipedia_relationships(output.outfile, output.metadata_yaml) rule get_chemical_mesh_relationships: input: infile = config['intermediate_directory'] + '/chemicals/ids/MESH' output: casout = config['intermediate_directory'] + '/chemicals/concords/mesh_cas', - uniout = config['intermediate_directory'] + '/chemicals/concords/mesh_unii' + uniout = config['intermediate_directory'] + '/chemicals/concords/mesh_unii', + casout_metadata_yaml = config['intermediate_directory'] + '/chemicals/concords/metadata-mesh_cas.yaml', + uniout_metadata_yaml = config['intermediate_directory'] + '/chemicals/concords/metadata-mesh_unii.yaml', run: - chemicals.get_mesh_relationships(input.infile,output.casout,output.uniout) + chemicals.get_mesh_relationships(input.infile,output.casout,output.uniout,output.casout_metadata_yaml,output.uniout_metadata_yaml) #This is about a 2 hour step and requires something more than 256G of RAM. 512G works. 
rule get_chemical_unichem_relationships: @@ -161,35 +167,40 @@ rule get_chemical_pubchem_mesh_concord: pubchemfile=config['download_directory'] + '/PUBCHEM.COMPOUND/CID-MeSH', meshlabels=config['download_directory'] + '/MESH/labels' output: - outfile = config['intermediate_directory'] + '/chemicals/concords/PUBCHEM_MESH' + outfile = config['intermediate_directory'] + '/chemicals/concords/PUBCHEM_MESH', + metadata_yaml = config['intermediate_directory'] + '/chemicals/concords/metadata-PUBCHEM_MESH.yaml' run: - chemicals.make_pubchem_mesh_concord(input.pubchemfile,input.meshlabels,output.outfile) + chemicals.make_pubchem_mesh_concord(input.pubchemfile,input.meshlabels,output.outfile, output.metadata_yaml) rule get_chemical_pubchem_cas_concord: input: pubchemsynonyms=config['download_directory'] + '/PUBCHEM.COMPOUND/synonyms' output: - outfile = config['intermediate_directory'] + '/chemicals/concords/PUBCHEM_CAS' + outfile = config['intermediate_directory'] + '/chemicals/concords/PUBCHEM_CAS', + metadata_yaml = config['intermediate_directory'] + '/chemicals/concords/metadata-PUBCHEM_CAS.yaml' run: - chemicals.make_pubchem_cas_concord(input.pubchemsynonyms, output.outfile) + chemicals.make_pubchem_cas_concord(input.pubchemsynonyms, output.outfile, output.metadata_yaml) # There are some gtopdb inchikey relations that for some reason are not in unichem rule get_gtopdb_inchikey_concord: input: infile=config['download_directory']+'/GTOPDB/ligands.tsv' output: - outfile=config['intermediate_directory'] + '/chemicals/concords/GTOPDB' + outfile=config['intermediate_directory'] + '/chemicals/concords/GTOPDB', + metadata_yaml=config['intermediate_directory'] + '/chemicals/concords/metadata-GTOPDB.yaml', run: - chemicals.make_gtopdb_relations(input.infile,output.outfile) + chemicals.make_gtopdb_relations(input.infile,output.outfile, output.metadata_yaml) rule get_chebi_concord: input: sdf=config['download_directory']+'/CHEBI/ChEBI_complete.sdf', dbx=config['download_directory']+'/CHEBI/database_accession.tsv' output: - outfile=config['intermediate_directory']+'/chemicals/concords/CHEBI' + outfile=config['intermediate_directory']+'/chemicals/concords/CHEBI', + propfile=config['intermediate_directory']+'/chemicals/properties/get_chebi_concord.jsonl.gz', + metadata_yaml=config['intermediate_directory']+'/chemicals/concords/metadata-CHEBI.yaml' run: - chemicals.make_chebi_relations(input.sdf,input.dbx,output.outfile) + chemicals.make_chebi_relations(input.sdf,input.dbx,output.outfile,propfile_gz=output.propfile,metadata_yaml=output.metadata_yaml) rule chemical_unichem_concordia: input: @@ -205,24 +216,33 @@ rule untyped_chemical_compendia: synonyms=expand("{dd}/{ap}/synonyms",dd=config['download_directory'],ap=config['chemical_synonyms']), unichemgroup = config['intermediate_directory']+'/chemicals/partials/UNICHEM', concords = expand('{dd}/chemicals/concords/{cc}',dd=config['intermediate_directory'], cc=config['chemical_concords'] ), + metadata_yamls = expand('{dd}/chemicals/concords/metadata-{cc}.yaml',dd=config['intermediate_directory'], cc=config['chemical_concords'] ), idlists=expand("{dd}/chemicals/ids/{ap}",dd=config['intermediate_directory'],ap=config['chemical_ids']), output: typesfile = config['intermediate_directory'] + '/chemicals/partials/types', untyped_file = config['intermediate_directory'] + '/chemicals/partials/untyped_compendium', + untyped_meta = config['intermediate_directory'] + '/chemicals/partials/metadata-untyped_compendium.yaml' run: - 
chemicals.build_untyped_compendia(input.concords,input.idlists,input.unichemgroup,output.untyped_file,output.typesfile) + chemicals.build_untyped_compendia(input.concords,input.idlists,input.unichemgroup,output.untyped_file,output.typesfile, output.untyped_meta, input.metadata_yamls) rule chemical_compendia: input: typesfile = config['intermediate_directory'] + '/chemicals/partials/types', untyped_file = config['intermediate_directory'] + '/chemicals/partials/untyped_compendium', + metadata_yamls = [ + config['intermediate_directory'] + '/chemicals/partials/metadata-untyped_compendium.yaml' + ], + properties_jsonl_gz = [ + config['intermediate_directory'] + '/chemicals/properties/get_chebi_concord.jsonl.gz' + ], icrdf_filename = config['download_directory'] + '/icRDF.tsv', output: expand("{od}/compendia/{ap}", od = config['output_directory'], ap = config['chemical_outputs']), - temp(expand("{od}/synonyms/{ap}", od = config['output_directory'], ap = config['chemical_outputs'])) + temp(expand("{od}/synonyms/{ap}", od = config['output_directory'], ap = config['chemical_outputs'])), + expand("{od}/metadata/{ap}.yaml", od = config['output_directory'], ap = config['chemical_outputs']), run: - chemicals.build_compendia(input.typesfile,input.untyped_file, input.icrdf_filename) + chemicals.build_compendia(input.typesfile, input.untyped_file, input.properties_jsonl_gz, input.metadata_yamls, input.icrdf_filename) rule check_chemical_completeness: input: @@ -294,7 +314,8 @@ rule chemical: input: config['output_directory']+'/reports/chemical_completeness.txt', synonyms = expand("{od}/synonyms/{ap}", od = config['output_directory'], ap = config['chemical_outputs']), - reports = expand("{od}/reports/{ap}",od=config['output_directory'], ap = config['chemical_outputs']) + reports = expand("{od}/reports/{ap}",od=config['output_directory'], ap = config['chemical_outputs']), + metadata = expand("{od}/metadata/{ap}.yaml", od = config['output_directory'], ap = config['chemical_outputs']), output: synonyms_gzipped = expand("{od}/synonyms/{ap}.gz", od = config['output_directory'], ap = config['chemical_outputs']), x=config['output_directory']+'/reports/chemicals_done' diff --git a/src/snakefiles/datacollect.snakefile b/src/snakefiles/datacollect.snakefile index aa9641f6..1340086f 100644 --- a/src/snakefiles/datacollect.snakefile +++ b/src/snakefiles/datacollect.snakefile @@ -73,9 +73,10 @@ rule get_complexportal_labels_and_synonyms: infile = config['download_directory']+'/ComplexPortal'+'/559292.tsv' output: lfile = config['download_directory']+'/ComplexPortal'+'/559292_labels.tsv', - sfile = config['download_directory']+'/ComplexPortal'+'/559292_synonyms.tsv' + sfile = config['download_directory']+'/ComplexPortal'+'/559292_synonyms.tsv', + metadata_yaml = config['download_directory']+'/ComplexPortal/metadata.yaml' run: - complexportal.make_labels_and_synonyms(input.infile, output.lfile, output.sfile) + complexportal.make_labels_and_synonyms(input.infile, output.lfile, output.sfile, output.metadata_yaml) ### MODS @@ -122,19 +123,6 @@ rule get_uniprotkb_labels: run: uniprotkb.pull_uniprot_labels(input.sprot_input,input.trembl_input,output.outfile) -rule get_umls_gene_protein_mappings: - output: - umls_uniprotkb_filename=config['download_directory']+'/UMLS_UniProtKB/UMLS_UniProtKB.tsv', - umls_gene_concords=config['output_directory']+'/intermediate/gene/concords/UMLS_NCBIGene', - umls_protein_concords=config['output_directory']+'/intermediate/protein/concords/UMLS_UniProtKB', - run: - 
uniprotkb.download_umls_gene_protein_mappings( - config['UMLS_UniProtKB_download_raw_url'], - output.umls_uniprotkb_filename, - output.umls_gene_concords, - output.umls_protein_concords, - ) - ### MESH rule get_mesh: @@ -257,7 +245,7 @@ rule get_ncbigene_labels_synonyms_and_taxa: rule get_ensembl: output: - ensembl_dir=dir(config['download_directory']+'/ENSEMBL'), + ensembl_dir=directory(config['download_directory']+'/ENSEMBL'), complete_file=config['download_directory']+'/ENSEMBL/BioMartDownloadComplete' run: ensembl.pull_ensembl(output.ensembl_dir, output.complete_file) @@ -292,8 +280,9 @@ rule get_hgncfamily_labels: infile=rules.get_hgncfamily.output.outfile output: outfile = config['download_directory'] + '/HGNC.FAMILY/labels', + metadata_yaml = config['download_directory'] + '/HGNC.FAMILY/metadata.yaml', run: - hgncfamily.pull_labels(input.infile,output.outfile) + hgncfamily.pull_labels(input.infile,output.outfile, output.metadata_yaml) ### PANTHER.FAMILY @@ -308,8 +297,9 @@ rule get_pantherfamily_labels: infile=rules.get_pantherfamily.output.outfile output: outfile = config['download_directory'] + '/PANTHER.FAMILY/labels', + metadata_yaml = config['download_directory'] + '/PANTHER.FAMILY/metadata.yaml', run: - pantherfamily.pull_labels(input.infile,output.outfile) + pantherfamily.pull_labels(input.infile,output.outfile, output.metadata_yaml) ### OMIM @@ -641,9 +631,10 @@ rule get_chebi: rule get_clo: output: - config['download_directory']+'/CLO/clo.owl' + config['download_directory']+'/CLO/clo.owl', + metadata=config['download_directory']+'/CLO/metadata.yaml', run: - clo.pull_clo() + clo.pull_clo(output.metadata) rule get_CLO_labels: input: diff --git a/src/snakefiles/diseasephenotype.snakefile b/src/snakefiles/diseasephenotype.snakefile index 8a77dc11..6a51803c 100644 --- a/src/snakefiles/diseasephenotype.snakefile +++ b/src/snakefiles/diseasephenotype.snakefile @@ -1,6 +1,7 @@ import src.createcompendia.diseasephenotype as diseasephenotype import src.assess_compendia as assessments import src.snakefiles.util as util +from src.metadata.provenance import write_concord_metadata ### Disease / Phenotypic Feature @@ -84,16 +85,24 @@ rule get_disease_obo_relationships: config['intermediate_directory']+'/disease/concords/MONDO', config['intermediate_directory']+'/disease/concords/MONDO_close', config['intermediate_directory']+'/disease/concords/HP', + mondo_metadata_yaml=config['intermediate_directory']+'/disease/concords/metadata-MONDO.yaml', + mondo_close_metadata_yaml=config['intermediate_directory']+'/disease/concords/metadata-MONDO_close.yaml', + hp_metadata_yaml=config['intermediate_directory']+'/disease/concords/metadata-HP.yaml', run: - diseasephenotype.build_disease_obo_relationships(config['intermediate_directory']+'/disease/concords') + diseasephenotype.build_disease_obo_relationships(config['intermediate_directory']+'/disease/concords', { + 'MONDO': output.mondo_metadata_yaml, + 'MONDO_close': output.mondo_close_metadata_yaml, + 'HP': output.hp_metadata_yaml, + }) rule get_disease_efo_relationships: input: infile=config['intermediate_directory']+"/disease/ids/EFO", output: - outfile=config['intermediate_directory']+'/disease/concords/EFO' + outfile=config['intermediate_directory']+'/disease/concords/EFO', + metadata_yaml=config['intermediate_directory']+'/disease/concords/metadata-EFO.yaml', run: - diseasephenotype.build_disease_efo_relationships(input.infile,output.outfile) + diseasephenotype.build_disease_efo_relationships(input.infile,output.outfile, output.metadata_yaml) 
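The concord rules in this and the other snakefiles follow the same pattern: each rule gains a metadata-<SOURCE>.yaml output next to its concord file and passes it to the builder, and the downstream *_compendia rules collect those YAMLs with expand(...) and hand them to build_compendia. As a sketch only, assuming the write_concord_metadata keyword arguments used elsewhere in this diff (name, description, sources, concord_filename) and with placeholder source details, a builder that honours that contract might look like:

    # Hypothetical concord builder; everything other than the write_concord_metadata
    # keywords seen elsewhere in this patch is illustrative.
    from src.metadata.provenance import write_concord_metadata

    def build_example_relationships(infile, outfile, metadata_yaml):
        """Copy three-column concord rows from `infile` to `outfile`, then record provenance."""
        with open(infile) as inp, open(outfile, "w") as outp:
            for line in inp:
                if line.startswith("#") or not line.strip():
                    continue
                columns = line.rstrip("\n").split("\t")
                if len(columns) != 3:
                    raise RuntimeError(f"Expected 3 columns, found {len(columns)}: {line!r}")
                outp.write("\t".join(columns) + "\n")

        write_concord_metadata(
            metadata_yaml,
            name="build_example_relationships",
            description=f"Example cross-references copied from {infile}",
            sources=[{"type": "download", "name": "Example source", "url": "https://example.org/"}],
            concord_filename=outfile,
        )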
rule get_disease_umls_relationships: input: @@ -103,23 +112,27 @@ rule get_disease_umls_relationships: ncit=config['intermediate_directory'] + '/disease/ids/NCIT' output: outfile=config['intermediate_directory']+'/disease/concords/UMLS', + metadata_yaml=config['intermediate_directory']+'/disease/concords/metadata-UMLS.yaml', run: - diseasephenotype.build_disease_umls_relationships(input.mrconso, input.infile,output.outfile,input.omim,input.ncit) + diseasephenotype.build_disease_umls_relationships(input.mrconso, input.infile,output.outfile,input.omim,input.ncit, output.metadata_yaml) rule get_disease_doid_relationships: input: infile = config['download_directory']+'/DOID/doid.json' output: outfile=config['intermediate_directory']+'/disease/concords/DOID', + metadata_yaml=config['intermediate_directory']+'/disease/concords/metadata-DOID.yaml', run: - diseasephenotype.build_disease_doid_relationships(input.infile,output.outfile) + diseasephenotype.build_disease_doid_relationships(input.infile,output.outfile,output.metadata_yaml) rule disease_manual_concord: input: infile = 'input_data/manual_concords/disease.txt' output: - outfile = config['intermediate_directory']+'/disease/concords/Manual' + outfile = config['intermediate_directory']+'/disease/concords/Manual', + metadata_yaml = config['intermediate_directory']+'/disease/concords/metadata-Manual.yaml' run: + count_manual_concords = 0 with open(input.infile, 'r') as inp, open(output.outfile, 'w') as outp: for line in inp: # Remove any lines starting with '#', which we treat as comments. @@ -131,6 +144,19 @@ rule disease_manual_concord: if len(elements) != 3: raise RuntimeError(f"Found {len(elements)} elements on line {lstripped_line}, expected 3: {elements}") outp.writelines(["\t".join(elements)]) + count_manual_concords += 1 + + write_concord_metadata( + output.metadata_yaml, + name='Manual Disease/Phenotype Concords', + description='Manually curated Disease/Phenotype cross-references from the Babel repository', + sources=[{ + 'name': 'Babel repository', + 'url': 'https://github.com/TranslatorSRI/Babel', + }], + url='https://github.com/TranslatorSRI/Babel/blob/master/input_data/manual_concords/disease.txt', + concord_filename=output.outfile, + ) rule disease_compendia: input: @@ -141,15 +167,19 @@ rule disease_compendia: labels=expand("{dd}/{ap}/labels",dd=config['download_directory'],ap=config['disease_labelsandsynonyms']), synonyms=expand("{dd}/{ap}/synonyms",dd=config['download_directory'],ap=config['disease_labelsandsynonyms']), concords=expand("{dd}/disease/concords/{ap}",dd=config['intermediate_directory'],ap=config['disease_concords']), + metadata_yamls=expand("{dd}/disease/concords/metadata-{ap}.yaml",dd=config['intermediate_directory'],ap=config['disease_concords']), idlists=expand("{dd}/disease/ids/{ap}",dd=config['intermediate_directory'],ap=config['disease_ids']), icrdf_filename = config['download_directory'] + '/icRDF.tsv', output: expand("{od}/compendia/{ap}", od = config['output_directory'], ap = config['disease_outputs']), temp(expand("{od}/synonyms/{ap}", od = config['output_directory'], ap = config['disease_outputs'])) run: - diseasephenotype.build_compendium(input.concords,input.idlists,input.close_matches,{'HP':input.bad_hpo_xrefs, - 'MONDO':input.bad_mondo_xrefs, - 'UMLS':input.bad_umls_xrefs}, input.icrdf_filename ) + diseasephenotype.build_compendium(input.concords, input.metadata_yamls, input.idlists,input.close_matches, + { + 'HP':input.bad_hpo_xrefs, + 'MONDO':input.bad_mondo_xrefs, + 'UMLS':input.bad_umls_xrefs + 
}, input.icrdf_filename ) rule check_disease_completeness: input: @@ -185,4 +215,4 @@ rule disease: x=config['output_directory']+'/reports/disease_done' run: util.gzip_files(input.synonyms) - util.write_done(output.x) \ No newline at end of file + util.write_done(output.x) diff --git a/src/snakefiles/drugchemical.snakefile b/src/snakefiles/drugchemical.snakefile index 9640c13b..6265964e 100644 --- a/src/snakefiles/drugchemical.snakefile +++ b/src/snakefiles/drugchemical.snakefile @@ -1,6 +1,7 @@ import src.createcompendia.drugchemical as drugchemical import src.synonyms.synonymconflation as synonymconflation import src.snakefiles.util as util +from src.metadata.provenance import write_concord_metadata ### Drug / Chemical @@ -9,38 +10,45 @@ rule rxnorm_relationships: rxnconso = config['download_directory'] + "/RxNorm/RXNCONSO.RRF", rxnrel = config['download_directory'] + "/RxNorm/RXNREL.RRF", output: - outfile_concords = config['intermediate_directory'] + '/drugchemical/concords/RXNORM' + outfile_concords = config['intermediate_directory'] + '/drugchemical/concords/RXNORM', + metadata_yaml = config['intermediate_directory'] + '/drugchemical/concords/metadata-RXNORM.yaml' run: - drugchemical.build_rxnorm_relationships(input.rxnconso, input.rxnrel, output.outfile_concords) + drugchemical.build_rxnorm_relationships(input.rxnconso, input.rxnrel, output.outfile_concords, output.metadata_yaml) rule umls_relationships: input: umlsconso = config['download_directory'] + "/UMLS/MRCONSO.RRF", umlsrel = config['download_directory'] + "/UMLS/MRREL.RRF", output: - outfile_concords = config['intermediate_directory'] + '/drugchemical/concords/UMLS' + outfile_concords = config['intermediate_directory'] + '/drugchemical/concords/UMLS', + metadata_yaml = config['intermediate_directory'] + '/drugchemical/concords/metadata-UMLS.yaml' run: - drugchemical.build_rxnorm_relationships(input.umlsconso, input.umlsrel, output.outfile_concords) + drugchemical.build_rxnorm_relationships(input.umlsconso, input.umlsrel, output.outfile_concords, output.metadata_yaml) rule pubchem_rxnorm_relationships: input: infile = config['download_directory'] + '/PUBCHEM.COMPOUND/RXNORM.json', output: - outfile_concords = config['intermediate_directory'] + '/drugchemical/concords/PUBCHEM_RXNORM' + outfile_concords = config['intermediate_directory'] + '/drugchemical/concords/PUBCHEM_RXNORM', + metadata_yaml = config['intermediate_directory'] + '/drugchemical/concords/metadata-PUBCHEM_RXNORM.yaml' run: - drugchemical.build_pubchem_relationships(input.infile,output.outfile_concords) + drugchemical.build_pubchem_relationships(input.infile,output.outfile_concords, output.metadata_yaml) rule drugchemical_conflation: input: drug_compendium=config['output_directory']+'/compendia/'+'Drug.txt', chemical_compendia=expand("{do}/compendia/{co}", do=config['output_directory'], co=config['chemical_outputs']), rxnorm_concord=config['intermediate_directory']+'/drugchemical/concords/RXNORM', + rxnorm_metadata=config['intermediate_directory']+'/drugchemical/concords/metadata-RXNORM.yaml', umls_concord=config['intermediate_directory']+'/drugchemical/concords/UMLS', + umls_metadata=config['intermediate_directory']+'/drugchemical/concords/metadata-UMLS.yaml', pubchem_concord=config['intermediate_directory']+'/drugchemical/concords/PUBCHEM_RXNORM', + pubchem_metadata=config['intermediate_directory']+'/drugchemical/concords/metadata-PUBCHEM_RXNORM.yaml', drugchemical_manual_concord=config['input_directory']+'/manual_concords/drugchemical.tsv', 
icrdf_filename=config['download_directory']+'/icRDF.tsv', output: - outfile=config['output_directory']+'/conflation/DrugChemical.txt' + outfile=config['output_directory']+'/conflation/DrugChemical.txt', + metadata_yaml=config['output_directory']+'/metadata/DrugChemical.yaml', run: drugchemical.build_conflation( input.drugchemical_manual_concord, @@ -50,7 +58,13 @@ rule drugchemical_conflation: input.drug_compendium, input.chemical_compendia, input.icrdf_filename, - output.outfile) + output.outfile, + input_metadata_yamls=[ + input.rxnorm_metadata, + input.umls_metadata, + input.pubchem_metadata + ], + output_metadata_yaml=output.metadata_yaml) rule drugchemical_conflated_synonyms: input: diff --git a/src/snakefiles/duckdb.snakefile b/src/snakefiles/duckdb.snakefile index 55fba662..fd78b98d 100644 --- a/src/snakefiles/duckdb.snakefile +++ b/src/snakefiles/duckdb.snakefile @@ -23,7 +23,7 @@ rule export_compendia_to_duckdb: input: compendium_file=config['output_directory'] + "/compendia/{filename}.txt", output: - duckdb_filename=temp(config['output_directory'] + "/duckdb/duckdbs/filename={filename}/compendium.duckdb"), + duckdb_filename=config['output_directory'] + "/duckdb/duckdbs/filename={filename}/compendium.duckdb", clique_parquet_file=config['output_directory'] + "/duckdb/parquet/filename={filename}/Clique.parquet", run: duckdb_exporters.export_compendia_to_parquet(input.compendium_file, output.clique_parquet_file, output.duckdb_filename) @@ -47,7 +47,7 @@ rule export_synonyms_to_duckdb: input: synonyms_file=config['output_directory'] + "/synonyms/{filename}.txt.gz", output: - duckdb_filename=temp(config['output_directory'] + "/duckdb/duckdbs/filename={filename}/synonyms.duckdb"), + duckdb_filename=config['output_directory'] + "/duckdb/duckdbs/filename={filename}/synonyms.duckdb", synonyms_parquet_filename=config['output_directory'] + "/duckdb/parquet/filename={filename}/Synonyms.parquet", run: duckdb_exporters.export_synonyms_to_parquet(input.synonyms_file, output.duckdb_filename, output.synonyms_parquet_filename) diff --git a/src/snakefiles/gene.snakefile b/src/snakefiles/gene.snakefile index 7bb30832..4c57d517 100644 --- a/src/snakefiles/gene.snakefile +++ b/src/snakefiles/gene.snakefile @@ -1,6 +1,8 @@ import src.createcompendia.gene as gene import src.assess_compendia as assessments import src.snakefiles.util as util +from src.datahandlers import uniprotkb +from src.metadata.provenance import write_concord_metadata rule gene_mods_ids: input: @@ -33,7 +35,7 @@ rule gene_ensembl_ids: output: outfile=config['intermediate_directory']+"/gene/ids/ENSEMBL" run: - gene.write_ensembl_ids(config['download_directory'] + '/ENSEMBL',output.outfile) + gene.write_ensembl_gene_ids(config['download_directory'] + '/ENSEMBL',output.outfile) rule gene_hgnc_ids: input: @@ -57,57 +59,101 @@ rule get_gene_ncbigene_ensembl_relationships: infile=config['download_directory']+"/NCBIGene/gene2ensembl.gz", idfile=config['intermediate_directory'] + "/gene/ids/NCBIGene" output: - outfile=config['intermediate_directory']+'/gene/concords/NCBIGeneENSEMBL' + outfile=config['intermediate_directory']+'/gene/concords/NCBIGeneENSEMBL', + metadata_yaml=config['intermediate_directory']+'/gene/concords/metadata-NCBIGeneENSEMBL.yaml' run: - gene.build_gene_ncbi_ensembl_relationships(input.infile,input.idfile,output.outfile) + gene.build_gene_ncbi_ensembl_relationships(input.infile,input.idfile,output.outfile, output.metadata_yaml) rule get_gene_ncbigene_relationships: input: 
infile=config['download_directory']+"/NCBIGene/gene_info.gz", idfile=config['intermediate_directory']+"/gene/ids/NCBIGene" output: - outfile=config['intermediate_directory']+'/gene/concords/NCBIGene' + outfile=config['intermediate_directory']+'/gene/concords/NCBIGene', + metadata_yaml=config['intermediate_directory']+'/gene/concords/metadata-NCBIGene.yaml' run: - gene.build_gene_ncbigene_xrefs(input.infile,input.idfile,output.outfile) + gene.build_gene_ncbigene_xrefs(input.infile,input.idfile,output.outfile, output.metadata_yaml) rule get_gene_ensembl_relationships: input: infile =config['download_directory'] + '/ENSEMBL/BioMartDownloadComplete' output: - outfile=config['intermediate_directory']+'/gene/concords/ENSEMBL' + outfile=config['intermediate_directory']+'/gene/concords/ENSEMBL', + metadata_yaml=config['intermediate_directory']+'/gene/concords/metadata-ENSEMBL.yaml' run: - gene.build_gene_ensembl_relationships(config['download_directory']+'/ENSEMBL',output.outfile) + gene.build_gene_ensembl_relationships(config['download_directory']+'/ENSEMBL',output.outfile, output.metadata_yaml) rule get_gene_medgen_relationships: input: infile=config['download_directory']+'/NCBIGene/mim2gene_medgen' output: - outfile=config['intermediate_directory']+'/gene/concords/medgen' + outfile=config['intermediate_directory']+'/gene/concords/medgen', + metadata_yaml=config['intermediate_directory']+'/gene/concords/metadata-medgen.yaml', run: - gene.build_gene_medgen_relationships(input.infile, output.outfile) + gene.build_gene_medgen_relationships(input.infile, output.outfile, output.metadata_yaml) rule get_gene_umls_relationships: input: mrconso=config['download_directory']+"/UMLS/MRCONSO.RRF", infile=config['intermediate_directory']+'/gene/ids/UMLS' output: - outfile=config['intermediate_directory']+'/gene/concords/UMLS' + outfile=config['intermediate_directory']+'/gene/concords/UMLS', + metadata_yaml=config['intermediate_directory']+'/gene/concords/metadata-UMLS.yaml', run: - gene.build_gene_umls_hgnc_relationships(input.mrconso, input.infile, output.outfile) + gene.build_gene_umls_hgnc_relationships(input.mrconso, input.infile, output.outfile, output.metadata_yaml) + +rule get_umls_gene_protein_mappings: + output: + umls_uniprotkb_filename=config['download_directory']+'/UMLS_UniProtKB/UMLS_UniProtKB.tsv', + umls_gene_concords=config['output_directory']+'/intermediate/gene/concords/UMLS_NCBIGene', + umls_ncbigene_metadata_yaml=config['output_directory']+'/intermediate/gene/concords/metadata-UMLS_NCBIGene.yaml', + umls_protein_concords=config['output_directory']+'/intermediate/protein/concords/UMLS_UniProtKB', + umls_protein_metadata_yaml=config['output_directory']+'/intermediate/protein/concords/metadata-UMLS_UniProtKB.yaml', + run: + uniprotkb.download_umls_gene_protein_mappings( + config['UMLS_UniProtKB_download_raw_url'], + output.umls_uniprotkb_filename, + output.umls_gene_concords, + output.umls_protein_concords, + ) + + write_concord_metadata( + output.umls_ncbigene_metadata_yaml, + name='get_umls_gene_protein_mappings', + description=f"Download UMLS-UniProtKB gene mappings from {config['UMLS_UniProtKB_download_raw_url']}", + sources=[{ + 'type': 'download', + 'name': 'UMLS-UniProtKB mappings', + 'url': config['UMLS_UniProtKB_download_raw_url'], + }], + concord_filename=output.umls_gene_concords, + ) + write_concord_metadata( + output.umls_protein_metadata_yaml, + name='get_umls_gene_protein_mappings', + description=f"Download UMLS-UniProtKB protein mappings from 
{config['UMLS_UniProtKB_download_raw_url']}", + sources=[{ + 'type': 'download', + 'name': 'UMLS-UniProtKB mappings', + 'url': config['UMLS_UniProtKB_download_raw_url'], + }], + concord_filename=output.umls_protein_concords, + ) rule gene_compendia: input: labels=expand("{dd}/{ap}/labels",dd=config['download_directory'],ap=config['gene_labels']), synonyms=expand("{dd}/{ap}/synonyms",dd=config['download_directory'],ap=config['gene_labels']), concords=expand("{dd}/gene/concords/{ap}",dd=config['intermediate_directory'],ap=config['gene_concords']), + metadata_yamls=expand("{dd}/gene/concords/metadata-{ap}.yaml",dd=config['intermediate_directory'],ap=config['gene_concords']), idlists=expand("{dd}/gene/ids/{ap}",dd=config['intermediate_directory'],ap=config['gene_ids']), icrdf_filename=config['download_directory']+'/icRDF.tsv', output: expand("{od}/compendia/{ap}", od = config['output_directory'], ap = config['gene_outputs']), temp(expand("{od}/synonyms/{ap}", od = config['output_directory'], ap = config['gene_outputs'])) run: - gene.build_gene_compendia(input.concords,input.idlists, input.icrdf_filename) + gene.build_gene_compendia(input.concords, input.metadata_yamls, input.idlists, input.icrdf_filename) rule check_gene_completeness: input: diff --git a/src/snakefiles/genefamily.snakefile b/src/snakefiles/genefamily.snakefile index a38d2729..4bddff1c 100644 --- a/src/snakefiles/genefamily.snakefile +++ b/src/snakefiles/genefamily.snakefile @@ -15,7 +15,7 @@ rule genefamily_hgncfamily_ids: input: infile=config['download_directory']+'/HGNC.FAMILY/labels' output: - outfile=config['intermediate_directory']+"/genefamily/ids/HGNC.FAMILY" + outfile=config['intermediate_directory']+"/genefamily/ids/HGNC.FAMILY", shell: #This one is a simple enough transform to do with awk "awk '{{print $1\"\tbiolink:GeneFamily\"}}' {input.infile} > {output.outfile}" @@ -24,12 +24,14 @@ rule genefamily_compendia: input: labels=expand("{dd}/{ap}/labels",dd=config['download_directory'],ap=config['genefamily_labels']), idlists=expand("{dd}/genefamily/ids/{ap}",dd=config['intermediate_directory'],ap=config['genefamily_ids']), + metadata_yamls=expand("{dd}/{ap}/metadata.yaml",dd=config['download_directory'],ap=config['genefamily_labels']), icrdf_filename=config['download_directory'] + '/icRDF.tsv', output: expand("{od}/compendia/{ap}", od = config['output_directory'], ap = config['genefamily_outputs']), - temp(expand("{od}/synonyms/{ap}", od = config['output_directory'], ap = config['genefamily_outputs'])) + temp(expand("{od}/synonyms/{ap}", od = config['output_directory'], ap = config['genefamily_outputs'])), + metadata_yaml=config['output_directory']+'/metadata/GeneFamily.txt.yaml', run: - genefamily.build_compendia(input.idlists, input.icrdf_filename) + genefamily.build_compendia(input.idlists, input.metadata_yamls, input.icrdf_filename) rule check_genefamily_completeness: input: diff --git a/src/snakefiles/geneprotein.snakefile b/src/snakefiles/geneprotein.snakefile index 199cc6fc..72f286b1 100644 --- a/src/snakefiles/geneprotein.snakefile +++ b/src/snakefiles/geneprotein.snakefile @@ -1,5 +1,6 @@ import src.createcompendia.geneprotein as geneprotein -import src.assess_compendia as assessments +from src.synonyms import synonymconflation +from util import gzip_files ### Gene / Protein @@ -7,9 +8,10 @@ rule geneprotein_uniprot_relationships: input: infile = config['download_directory'] + '/UniProtKB/idmapping.dat' output: - outfile_concords = config['intermediate_directory'] + '/geneprotein/concords/UniProtNCBI' + 
outfile_concords = config['intermediate_directory'] + '/geneprotein/concords/UniProtNCBI', + metadata_yaml = config['intermediate_directory'] + '/geneprotein/concords/metadata-UniProtNCBI.yaml' run: - geneprotein.build_uniprotkb_ncbigene_relationships(input.infile,output.outfile_concords) + geneprotein.build_uniprotkb_ncbigene_relationships(input.infile,output.outfile_concords, output.metadata_yaml) rule geneprotein_conflation: input: @@ -21,9 +23,26 @@ rule geneprotein_conflation: run: geneprotein.build_conflation(input.geneprotein_concord,input.gene_compendium,input.protein_compendium,output.outfile) +rule geneprotein_conflated_synonyms: + input: + geneprotein_conflations=[config['output_directory']+'/conflation/GeneProtein.txt'], + gene_compendia=expand("{od}/compendia/{ap}", od = config['output_directory'], ap = config['gene_outputs']), + protein_compendia=expand("{od}/compendia/{ap}", od = config['output_directory'], ap = config['protein_outputs']), + gene_synonyms_gz=expand("{od}/synonyms/{ap}.gz", od = config['output_directory'], ap = config['gene_outputs']), + protein_synonyms_gz=expand("{od}/synonyms/{ap}.gz", od = config['output_directory'], ap = config['protein_outputs']) + output: + geneprotein_conflated_synonyms_gz=config['output_directory']+'/synonyms/GeneProteinConflated.txt.gz' + run: + synonymconflation.conflate_synonyms( + input.gene_synonyms_gz + input.protein_synonyms_gz, + input.gene_compendia + input.protein_compendia, + input.geneprotein_conflations, + output.geneprotein_conflated_synonyms_gz) + rule geneprotein: input: - config['output_directory']+'/conflation/GeneProtein.txt' + config['output_directory']+'/conflation/GeneProtein.txt', + config['output_directory']+'/synonyms/GeneProteinConflated.txt.gz' output: x=config['output_directory']+'/reports/geneprotein_done' shell: diff --git a/src/snakefiles/macromolecular_complex.snakefile b/src/snakefiles/macromolecular_complex.snakefile index cd8581f5..5494e65d 100644 --- a/src/snakefiles/macromolecular_complex.snakefile +++ b/src/snakefiles/macromolecular_complex.snakefile @@ -6,7 +6,7 @@ rule macromolecular_complex_ids: input: infile = config['download_directory']+'/ComplexPortal/559292_labels.tsv' output: - outfile = config['intermediate_directory']+'/macromolecular_complex/ids/ComplexPortal' + outfile = config['intermediate_directory']+'/macromolecular_complex/ids/ComplexPortal', shell: "awk '{{print $1\"\tbiolink:MacromolecularComplex\"}}' {input.infile} > {output.outfile}" @@ -15,12 +15,14 @@ rule macromolecular_complex_compendia: labels = config['download_directory']+'/ComplexPortal/559292_labels.tsv', synonyms = config['download_directory']+'/ComplexPortal/559292_synonyms.tsv', idlists = config['intermediate_directory']+'/macromolecular_complex/ids/ComplexPortal', + metadata_yaml = config['download_directory']+'/ComplexPortal/metadata.yaml', icrdf_filename = config['download_directory'] + '/icRDF.tsv', output: config['output_directory']+'/compendia/MacromolecularComplex.txt', - temp(config['output_directory']+'/synonyms/MacromolecularComplex.txt') + temp(config['output_directory']+'/synonyms/MacromolecularComplex.txt'), + output_metadata_yaml = config['output_directory']+'/metadata/MacromolecularComplex.txt.yaml', run: - macromolecular_complex.build_compendia([input.idlists], icrdf_filename=input.icrdf_filename) + macromolecular_complex.build_compendia([input.idlists], [input.metadata_yaml], icrdf_filename=input.icrdf_filename) rule check_macromolecular_complex_completeness: input: @@ -41,6 +43,7 @@ rule 
check_macromolecular_complex: rule macromolecular_complex: input: synonym=config['output_directory']+'/synonyms/MacromolecularComplex.txt', + output_metadata_yaml = config['output_directory']+'/metadata/MacromolecularComplex.txt.yaml', completeness=config['output_directory']+'/reports/macromolecular_complex_completeness.txt', reports = config['output_directory']+'/reports/MacromolecularComplex.txt' output: diff --git a/src/snakefiles/process.snakefile b/src/snakefiles/process.snakefile index 79b97ffd..fd3a16ab 100644 --- a/src/snakefiles/process.snakefile +++ b/src/snakefiles/process.snakefile @@ -64,16 +64,18 @@ rule process_umls_ids: rule get_process_go_relationships: output: config['intermediate_directory']+'/process/concords/GO', + metadata_yaml = config['intermediate_directory']+'/process/concords/metadata-GO.yaml' run: - pap.build_process_obo_relationships(config['intermediate_directory']+'/process/concords') + pap.build_process_obo_relationships(config['intermediate_directory']+'/process/concords', output.metadata_yaml) rule get_process_rhea_relationships: input: infile=config['download_directory']+"/RHEA/rhea.rdf", output: outfile=config['intermediate_directory']+'/process/concords/RHEA', + metadata_yaml=config['intermediate_directory']+'/process/concords/metadata-RHEA.yaml', run: - pap.build_process_rhea_relationships(output.outfile) + pap.build_process_rhea_relationships(output.outfile, output.metadata_yaml) rule get_process_umls_relationships: @@ -82,21 +84,23 @@ rule get_process_umls_relationships: infile=config['intermediate_directory']+"/process/ids/UMLS", output: outfile=config['intermediate_directory']+'/process/concords/UMLS', + metadata_yaml=config['intermediate_directory']+'/process/concords/metadata-UMLS.yaml' run: - pap.build_process_umls_relationships(input.mrconso, input.infile, output.outfile) + pap.build_process_umls_relationships(input.mrconso, input.infile, output.outfile, output.metadata_yaml) rule process_compendia: input: labels=expand("{dd}/{ap}/labels",dd=config['download_directory'],ap=config['process_labels']), #synonyms=expand("{dd}/{ap}/synonyms",dd=config['download_directory'],ap=config['process_labelsandsynonyms']), concords=expand("{dd}/process/concords/{ap}",dd=config['intermediate_directory'],ap=config['process_concords']), + metadata_yamls=expand("{dd}/process/concords/metadata-{ap}.yaml",dd=config['intermediate_directory'],ap=config['process_concords']), idlists=expand("{dd}/process/ids/{ap}",dd=config['intermediate_directory'],ap=config['process_ids']), icrdf_filename=config['download_directory']+'/icRDF.tsv', output: expand("{od}/compendia/{ap}", od = config['output_directory'], ap = config['process_outputs']), temp(expand("{od}/synonyms/{ap}", od = config['output_directory'], ap = config['process_outputs'])) run: - pap.build_compendia(input.concords,input.idlists,input.icrdf_filename) + pap.build_compendia(input.concords, input.metadata_yamls, input.idlists, input.icrdf_filename) rule check_process_completeness: input: @@ -140,4 +144,4 @@ rule process: x=config['output_directory']+'/reports/process_done' run: util.gzip_files(input.synonyms) - util.write_done(output.x) \ No newline at end of file + util.write_done(output.x) diff --git a/src/snakefiles/protein.snakefile b/src/snakefiles/protein.snakefile index b550d748..b493fae6 100644 --- a/src/snakefiles/protein.snakefile +++ b/src/snakefiles/protein.snakefile @@ -42,29 +42,32 @@ rule protein_ensembl_ids: output: outfile=config['intermediate_directory']+"/protein/ids/ENSEMBL" run: - 
protein.write_ensembl_ids(config['download_directory'] + '/ENSEMBL',output.outfile) + protein.write_ensembl_protein_ids(config['download_directory'] + '/ENSEMBL',output.outfile) rule get_protein_uniprotkb_ensembl_relationships: input: infile = config['download_directory'] + '/UniProtKB/idmapping.dat' output: - outfile = config['intermediate_directory'] + '/protein/concords/UniProtKB' + outfile = config['intermediate_directory'] + '/protein/concords/UniProtKB', + metadata_yaml = config['intermediate_directory'] + '/protein/concords/metadata-UniProtKB.yaml', run: - protein.build_protein_uniprotkb_ensemble_relationships(input.infile,output.outfile) + protein.build_protein_uniprotkb_ensemble_relationships(input.infile,output.outfile, output.metadata_yaml) rule get_protein_pr_uniprotkb_relationships: output: - outfile = config['intermediate_directory'] + '/protein/concords/PR' + outfile = config['intermediate_directory'] + '/protein/concords/PR', + metadata_yaml = config['intermediate_directory'] + '/protein/concords/metadata-PR.yaml' run: - protein.build_pr_uniprot_relationships(output.outfile) + protein.build_pr_uniprot_relationships(output.outfile, metadata_yaml=output.metadata_yaml) rule get_protein_ncit_uniprotkb_relationships: input: infile = config['download_directory'] + '/NCIT/NCIt-SwissProt_Mapping.txt' output: - outfile = config['intermediate_directory'] + '/protein/concords/NCIT_UniProtKB' + outfile = config['intermediate_directory'] + '/protein/concords/NCIT_UniProtKB', + metadata_yaml = config['intermediate_directory'] + '/protein/concords/metadata-NCIT_UniProtKB.yaml', run: - protein.build_ncit_uniprot_relationships(input.infile, output.outfile) + protein.build_ncit_uniprot_relationships(input.infile, output.outfile, output.metadata_yaml) rule get_protein_ncit_umls_relationships: input: @@ -72,14 +75,27 @@ rule get_protein_ncit_umls_relationships: infile=config['intermediate_directory']+"/protein/ids/UMLS" output: outfile=config['intermediate_directory']+'/protein/concords/NCIT_UMLS', + metadata_yaml=config['intermediate_directory']+'/protein/concords/metadata-NCIT_UMLS.yaml' run: - protein.build_umls_ncit_relationships(input.mrconso, input.infile, output.outfile) + protein.build_umls_ncit_relationships(input.mrconso, input.infile, output.outfile, output.metadata_yaml) + +rule get_protein_umls_relationships: + input: + mrconso=config['download_directory']+"/UMLS/MRCONSO.RRF", + infile=config['intermediate_directory']+"/protein/ids/UMLS" + output: + outfile=config['intermediate_directory']+'/protein/concords/UMLS', + metadata_yaml=config['intermediate_directory']+'/protein/concords/metadata-UMLS.yaml' + run: + protein.build_umls_relationships(input.mrconso, input.infile, output.outfile, output.metadata_yaml) + rule protein_compendia: input: labels=expand("{dd}/{ap}/labels",dd=config['download_directory'],ap=config['protein_labels']), synonyms=expand("{dd}/{ap}/synonyms",dd=config['download_directory'],ap=config['protein_synonyms']), concords=expand("{dd}/protein/concords/{ap}",dd=config['intermediate_directory'],ap=config['protein_concords']), + metadata_yamls=expand("{dd}/protein/concords/metadata-{ap}.yaml",dd=config['intermediate_directory'],ap=config['protein_concords']), idlists=expand("{dd}/protein/ids/{ap}",dd=config['intermediate_directory'],ap=config['protein_ids']), icrdf_filename=config['download_directory'] + '/icRDF.tsv', # Include the taxon information from UniProtKB @@ -88,7 +104,7 @@ rule protein_compendia: expand("{od}/compendia/{ap}", od = 
config['output_directory'], ap = config['protein_outputs']), temp(expand("{od}/synonyms/{ap}", od = config['output_directory'], ap = config['protein_outputs'])) run: - protein.build_protein_compendia(input.concords,input.idlists, input.icrdf_filename) + protein.build_protein_compendia(input.concords, input.metadata_yamls, input.idlists, input.icrdf_filename) rule check_protein_completeness: input: diff --git a/src/snakefiles/publications.snakefile b/src/snakefiles/publications.snakefile index 04704035..687c6b2b 100644 --- a/src/snakefiles/publications.snakefile +++ b/src/snakefiles/publications.snakefile @@ -36,6 +36,7 @@ rule generate_pubmed_concords: status_file = config['download_directory'] + '/PubMed/statuses.jsonl.gz', pmid_id_file = config['intermediate_directory'] + '/publications/ids/PMID', pmid_doi_concord_file = config['intermediate_directory'] + '/publications/concords/PMID_DOI', + metadata_yaml = config['intermediate_directory'] + '/publications/concords/metadata.yaml', run: publications.parse_pubmed_into_tsvs( input.baseline_dir, @@ -43,7 +44,8 @@ rule generate_pubmed_concords: output.titles_file, output.status_file, output.pmid_id_file, - output.pmid_doi_concord_file) + output.pmid_doi_concord_file, + output.metadata_yaml) rule generate_pubmed_compendia: input: @@ -52,6 +54,7 @@ rule generate_pubmed_compendia: titles = [ config['download_directory'] + '/PubMed/titles.tsv', ], + metadata_yaml = config['intermediate_directory'] + '/publications/concords/metadata.yaml', icrdf_filename=config['download_directory'] + '/icRDF.tsv', output: publication_compendium = config['output_directory'] + '/compendia/Publication.txt', @@ -60,6 +63,7 @@ rule generate_pubmed_compendia: run: publications.generate_compendium( [input.pmid_doi_concord_file], + [input.metadata_yaml], [input.pmid_id_file], input.titles, output.publication_compendium, diff --git a/src/snakefiles/taxon.snakefile b/src/snakefiles/taxon.snakefile index d3cdd7be..59d7c772 100644 --- a/src/snakefiles/taxon.snakefile +++ b/src/snakefiles/taxon.snakefile @@ -33,30 +33,34 @@ rule get_taxon_umls_relationships: infile=config['intermediate_directory']+"/taxon/ids/UMLS" output: outfile=config['intermediate_directory']+'/taxon/concords/UMLS', + metadata_yaml=config['intermediate_directory']+'/taxon/concords/metadata-UMLS.yaml', run: - taxon.build_taxon_umls_relationships(input.mrconso, input.infile, output.outfile) + taxon.build_taxon_umls_relationships(input.mrconso, input.infile, output.outfile, output.metadata_yaml) rule get_taxon_relationships: input: meshfile=config['download_directory']+"/MESH/mesh.nt", meshids=config['intermediate_directory']+"/taxon/ids/MESH", output: - outfile=config['intermediate_directory']+'/taxon/concords/NCBI_MESH' + outfile=config['intermediate_directory']+'/taxon/concords/NCBI_MESH', + metadata_yaml=config['intermediate_directory']+'/taxon/concords/metadata-NCBI_MESH.yaml', run: - taxon.build_relationships(output.outfile,input.meshids) + taxon.build_relationships(output.outfile,input.meshids, output.metadata_yaml) rule taxon_compendia: input: labels=expand("{dd}/{ap}/labels",dd=config['download_directory'],ap=config['taxon_labels']), synonyms=expand("{dd}/{ap}/synonyms",dd=config['download_directory'],ap=config['taxon_synonyms']), concords=expand("{dd}/taxon/concords/{ap}",dd=config['intermediate_directory'],ap=config['taxon_concords']), + metadata_yamls=expand("{dd}/taxon/concords/metadata-{ap}.yaml",dd=config['intermediate_directory'],ap=config['taxon_concords']), 
idlists=expand("{dd}/taxon/ids/{ap}",dd=config['intermediate_directory'],ap=config['taxon_ids']), icrdf_filename=config['download_directory'] + '/icRDF.tsv', output: expand("{od}/compendia/{ap}", od = config['output_directory'], ap = config['taxon_outputs']), - temp(expand("{od}/synonyms/{ap}", od = config['output_directory'], ap = config['taxon_outputs'])) + temp(expand("{od}/synonyms/{ap}", od = config['output_directory'], ap = config['taxon_outputs'])), + output_metadata=expand("{od}/metadata/{ap}.yaml", od = config['output_directory'], ap = config['taxon_outputs']), run: - taxon.build_compendia(input.concords,input.idlists, input.icrdf_filename) + taxon.build_compendia(input.concords, input.metadata_yamls, input.idlists, input.icrdf_filename) rule check_taxon_completeness: input: diff --git a/src/snakefiles/util.py b/src/snakefiles/util.py index b28f1a08..cf46e27a 100644 --- a/src/snakefiles/util.py +++ b/src/snakefiles/util.py @@ -62,6 +62,7 @@ def get_all_synonyms(config): config['cell_line_outputs'] + config['genefamily_outputs'] + config['drugchemicalconflated_synonym_outputs'] + + config['geneproteinconflated_synonym_outputs'] + config['umls_outputs'] + config['macromolecularcomplex_outputs'] + # Publication.txt is empty, but it's still created, so it needs to be here. @@ -87,6 +88,7 @@ def get_all_synonyms_except_drugchemicalconflated(config): config['cell_line_outputs'] + config['genefamily_outputs'] + # config['drugchemicalconflated_synonym_outputs'] + + config['geneproteinconflated_synonym_outputs'] + config['umls_outputs'] + config['macromolecularcomplex_outputs'] ) @@ -110,6 +112,7 @@ def get_all_synonyms_with_drugchemicalconflated(config): config['cell_line_outputs'] + config['genefamily_outputs'] + config['drugchemicalconflated_synonym_outputs'] + + config['geneproteinconflated_synonym_outputs'] + config['umls_outputs'] + config['macromolecularcomplex_outputs'] ) diff --git a/src/synonyms/synonymconflation.py b/src/synonyms/synonymconflation.py index 46864f03..70353ee2 100644 --- a/src/synonyms/synonymconflation.py +++ b/src/synonyms/synonymconflation.py @@ -213,7 +213,7 @@ def conflate_synonyms(synonym_files_gz, compendia_files, conflation_file, output if 'taxa' in synonym: if 'taxa' not in final_conflation: final_conflation['taxa'] = set() - final_conflation.update(synonym['taxa']) + final_conflation['taxa'].update(synonym['taxa']) # Convert the taxa into a list. final_conflation['taxa'] = sorted(final_conflation['taxa']) diff --git a/src/util.py b/src/util.py index 24eed6e2..f06921d3 100644 --- a/src/util.py +++ b/src/util.py @@ -1,19 +1,48 @@ import logging import json import os +import sys +from time import gmtime import curies import yaml +import psutil from collections import namedtuple import copy from logging.handlers import RotatingFileHandler from bmt import Toolkit +from humanfriendly import format_size from src.LabeledID import LabeledID from src.prefixes import OMIM, OMIMPS, UMLS, SNOMEDCT, KEGGPATHWAY, KEGGREACTION, NCIT, ICD10, ICD10CM, ICD11FOUNDATION import src.prefixes as prefixes +def get_logger(name, loglevel=logging.INFO): + """ + Get a logger with the specified name. + + The LoggingUtil is inconsistently used, and we don't want rolling logs anyway -- just logging everything to STDERR + so that Snakemake can capture it is fine. However, we do want every logger to be configured identically and without + duplicated handlers. + """ + + # Set up the root handler for a logger. 
Ideally we would call this in one central location, but I'm not sure + # what they would be for Snakemake. basicConfig() should be safe to call from multiple threads after Python 3.2, but + # we might as well check. + if not logging.getLogger().hasHandlers(): + formatter = logging.Formatter('%(levelname)s %(name)s [%(asctime)s]: %(message)s', "%Y-%m-%dT%H:%M:%S%z") + formatter.converter = gmtime + + stream_handler = logging.StreamHandler(sys.stderr) + stream_handler.setFormatter(formatter) + logging.basicConfig(level=logging.INFO, handlers=[stream_handler]) + + # Set up a logger for the specified loglevel and return it. + logger = logging.getLogger(name) + logger.setLevel(loglevel) + return logger + #loggers = {} class LoggingUtil(object): """ Logging utility controlling format and setting initial logging level """ @@ -72,7 +101,7 @@ class Munge(object): @staticmethod def gene (gene): return gene.split ("/")[-1:][0] if gene.startswith ("http://") else gene - + class Text: """ Utilities for processing text. """ prefixmap = { x.lower(): x for k,x in vars(prefixes).items() if not k.startswith("__")} @@ -114,7 +143,7 @@ def recurie(cls,text,new_prefix=None): @staticmethod def un_curie (text): return ':'.join(text.split (':', 1)[1:]) if ':' in text else text - + @staticmethod def short (obj, limit=80): text = str(obj) if obj else None @@ -171,7 +200,7 @@ def opt_to_curie (text): return Text.recurie(r) else: raise ValueError(f"Unable to opt_to_curie({text}): output calculated as {r}, which has no colon.") - + return r @staticmethod @@ -217,7 +246,7 @@ def load_yaml (path): with open (path, 'r') as stream: result = yaml.load (stream.read ()) return result - + def get_resource_obj (resource_name, format='json'): result = None path = Resource.get_resource_path (resource_name) @@ -285,16 +314,22 @@ def to_named_tuple (type_name, d): return namedtuple(type_name, d.keys())(**d) +# Cache the config.yaml so we don't need to load it every time get_config() is called. +config_yaml = None def get_config(): """ Retrieve the configuration data from the 'config.yaml' file. :return: The configuration data loaded from the 'config.yaml' file. """ + global config_yaml + if config_yaml is not None: + return config_yaml + cname = os.path.join(os.path.dirname(__file__),'..', 'config.yaml') with open(cname,'r') as yaml_file: - data = yaml.safe_load(yaml_file) - return data + config_yaml = yaml.safe_load(yaml_file) + return config_yaml def get_biolink_model_toolkit(biolink_version): @@ -330,3 +365,15 @@ def get_biolink_prefix_map(): f'https://raw.githubusercontent.com/biolink/biolink-model/v' + biolink_version + '/project/prefixmap/biolink_model_prefix_map.json' ) + +def get_memory_usage_summary(): + """ + Provide a short summary of current memory usage to write into logs. + + :return: A string summarizing current memory usage. + """ + process = psutil.Process() + process.memory_percent() + mem_info = process.memory_info() + + return f"Using {process.memory_percent():.2f}% of available memory (RSS: {format_size(mem_info.rss, binary=True)}, VMS: {format_size(mem_info.vms, binary=True)})"
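The new src/util.py helpers are intended to standardise logging and memory reporting across the pipeline. A minimal sketch of the intended call pattern (the function and loop below are illustrative; only get_logger and get_memory_usage_summary from this patch are assumed):

    from src.util import get_logger, get_memory_usage_summary

    # One logger per module: the first get_logger() call installs a single stderr handler on the
    # root logger (so Snakemake captures the output); later calls just return named loggers.
    logger = get_logger(__name__)

    def process_cliques(cliques):
        for count, clique in enumerate(cliques, start=1):
            # ... do work on the clique ...
            if count % 1_000_000 == 0:
                # Periodic progress line with RSS/VMS and percent-of-memory figures.
                logger.info(f"Processed {count:,} cliques so far. {get_memory_usage_summary()}")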