diff --git a/src/createcompendia/leftover_umls.py b/src/createcompendia/leftover_umls.py index 604ac248..7169bb91 100644 --- a/src/createcompendia/leftover_umls.py +++ b/src/createcompendia/leftover_umls.py @@ -48,8 +48,9 @@ def write_leftover_umls(compendia, umls_labels_filename, mrconso, mrsty, synonym Path(umls_compendium).touch() with open(umls_compendium, 'w') as compendiumf, open(report, 'w') as reportf: - # This defaults to the version of the Biolink model that is included with this BMT. - biolink_toolkit = Toolkit() + + umls_type_by_id = dict() + preferred_name_by_id = dict() for compendium in compendia: logging.info(f"Starting compendium: {compendium}") @@ -69,35 +70,13 @@ def write_leftover_umls(compendia, umls_labels_filename, mrconso, mrsty, synonym reportf.write(f"Completed all compendia with {len(umls_ids_in_other_compendia)} UMLS IDs.\n") # print(umls_ids_in_other_compendia) - # Load all the semantic types. - umls_type_by_id = dict() - preferred_name_by_id = dict() - types_by_id = dict() - types_by_tui = dict() - with open(mrsty, 'r') as inf: - for line in inf: - x = line.strip().split('|') - umls_id = f"{UMLS}:{x[0]}" - tui = x[1] - # stn = x[2] - sty = x[3] - - if umls_id not in types_by_id: - types_by_id[umls_id] = dict() - if tui not in types_by_id[umls_id]: - types_by_id[umls_id][tui] = set() - types_by_id[umls_id][tui].add(sty) - - if tui not in types_by_tui: - types_by_tui[tui] = set() - types_by_tui[tui].add(sty) - - logging.info(f"Completed loading {len(types_by_id.keys())} UMLS IDs from MRSTY.RRF.") - reportf.write(f"Completed loading {len(types_by_id.keys())} UMLS IDs from MRSTY.RRF.\n") + umls_to_biolink = umls.UMLSToBiolinkTypeConverter(mrsty) + logging.info(f"Completed loading {len(umls_to_biolink.types_by_id.keys())} UMLS IDs from MRSTY.RRF.") + reportf.write(f"Completed loading {len(umls_to_biolink.types_by_id.keys())} UMLS IDs from MRSTY.RRF.\n") with open('babel_outputs/reports/umls-types.tsv', 'w') as outf: - for tui in 
sorted(types_by_tui.keys()): - for sty in sorted(list(types_by_tui[tui])): + for tui in sorted(umls_to_biolink.types_by_tui.keys()): + for sty in sorted(list(umls_to_biolink.types_by_tui[tui])): outf.write(f"{tui}\t{sty}\n") # Create a compendium that consists solely of all MRCONSO entries that haven't been referenced. @@ -121,50 +100,20 @@ def write_leftover_umls(compendia, umls_labels_filename, mrconso, mrsty, synonym # The STR value should be the label. label = x[14] - # Lookup type. - def umls_type_to_biolink_type(umls_tui): - biolink_type = biolink_toolkit.get_element_by_mapping(f'STY:{umls_tui}', most_specific=True, formatted=True, mixin=True) - if biolink_type is None: - logging.debug(f"No Biolink type found for UMLS TUI {umls_tui}") - return biolink_type - - umls_type_results = types_by_id.get(umls_id, {'biolink:NamedThing': {'Named thing'}}) - biolink_types = set(list(map(umls_type_to_biolink_type, umls_type_results.keys()))) - - # How to deal with multiple Biolink types? We currently only have the following multiple - # types, so we can resolve these manually: - biolink_types_as_set = set(map(lambda t: "(None)" if t is None else t, list(biolink_types))) - biolink_types_as_str = '|'.join(sorted(list(biolink_types_as_set))) - - if None in biolink_types: - # One of the TUIs couldn't be converted; let's delete all of them so that we can report this. - biolink_types = list() - - # Some Biolink multiple types we handle manually. 
- if biolink_types_as_set == {DEVICE, DRUG}: - biolink_types = [DRUG] - elif biolink_types_as_set == {DRUG, SMALL_MOLECULE}: - biolink_types = [SMALL_MOLECULE] - elif biolink_types_as_set == {AGENT, PHYSICAL_ENTITY}: - biolink_types = [AGENT] - elif biolink_types_as_set == {PHYSICAL_ENTITY, PUBLICATION}: - biolink_types = [PUBLICATION] - elif biolink_types_as_set == {ACTIVITY, PROCEDURE}: - biolink_types = [PROCEDURE] - elif biolink_types_as_set == {DRUG, FOOD}: - biolink_types = [FOOD] - - if len(biolink_types) == 0: + biolink_types = umls_to_biolink.get_biolink_types(umls_id) + if len(biolink_types) > 1: + count_multiple_umls_type += 1 + biolink_type = umls_to_biolink.choose_single_biolink_type(umls_id, biolink_types) + + if biolink_type is None: + umls_type_results = umls_to_biolink.types_by_id.get(umls_id, {'biolink:NamedThing': {'Named thing'}}) logging.debug(f"No UMLS type found for {umls_id}: {umls_type_results} -> {biolink_types}, skipping") reportf.write(f"NO_UMLS_TYPE [{umls_id}]: {umls_type_results} -> {biolink_types}\n") count_no_umls_type += 1 - continue - if len(biolink_types) > 1: - logging.debug(f"Multiple UMLS types not yet supported for {umls_id}: {umls_type_results} -> {biolink_types}, skipping") - reportf.write(f"MULTIPLE_UMLS_TYPES [{umls_id}]\t{biolink_types_as_str}\t{umls_type_results} -> {biolink_types}\n") - count_multiple_umls_type += 1 - continue - biolink_type = list(biolink_types)[0] + + # Default to it being a biolink:NamedThing. 
class UMLSToBiolinkTypeConverter:
    """
    Convert UMLS CURIEs to Biolink types using the semantic types in MRSTY.RRF.

    After construction, two lookup tables are available:

    - ``types_by_id``: UMLS CURIE -> {TUI -> set of semantic type names (STY)}
    - ``types_by_tui``: TUI -> set of semantic type names (STY)
    """

    def __init__(self, mrsty_filename, biolink_model_url=None):
        """
        Create an UMLSToBiolinkTypeConverter with a MRSTY file and optional Biolink Model URL.

        :param mrsty_filename: The path to the MRSTY.RRF file.
        :param biolink_model_url: The link to the biolink-model.yaml file. If not provided,
            the Toolkit is created with no arguments, i.e. with its own default model.
        """

        # Set up Biolink Toolkit.
        if biolink_model_url:
            self.biolink_toolkit = Toolkit(biolink_model_url)
        else:
            self.biolink_toolkit = Toolkit()

        # Load MRSTY filename.
        # See https://www.ncbi.nlm.nih.gov/books/NBK9685/table/ch03.Tf/ for column information.
        self.types_by_id = dict()
        self.types_by_tui = dict()
        with open(mrsty_filename, 'r') as inf:
            for line in inf:
                stripped = line.strip()
                if not stripped:
                    # Tolerate blank lines (e.g. a trailing newline) instead of
                    # failing with an IndexError on the column lookups below.
                    continue

                x = stripped.split('|')
                curie = f"{UMLS}:{x[0]}"
                tui = x[1]
                # stn = x[2]
                sty = x[3]

                self.types_by_id.setdefault(curie, dict()).setdefault(tui, set()).add(sty)
                self.types_by_tui.setdefault(tui, set()).add(sty)

    def umls_type_to_biolink_type(self, umls_tui: str):
        """
        Look up the Biolink type mapped to a UMLS semantic type identifier (TUI).

        :param umls_tui: The TUI to look up; it is queried as the mapping ``STY:<tui>``.
        :return: Whatever the Biolink Toolkit returns for that mapping; a None result
            is logged at debug level and passed through to the caller.
        """
        biolink_type = self.biolink_toolkit.get_element_by_mapping(
            f'STY:{umls_tui}', most_specific=True, formatted=True, mixin=True)
        if biolink_type is None:
            logging.debug("No Biolink type found for UMLS TUI %s", umls_tui)
        return biolink_type

    def get_biolink_types(self, curie):
        """
        Return the set of Biolink types for every semantic type recorded for a UMLS CURIE.

        The set contains None for each TUI that has no Biolink mapping.

        :param curie: The UMLS CURIE to look up in ``types_by_id``.
        :return: A set of Biolink types (possibly containing None).
        """
        # CURIEs missing from MRSTY fall back to a placeholder entry whose key is then
        # looked up like a TUI. NOTE(review): this presumably has no STY mapping and so
        # yields {None} -- confirm against the Biolink Toolkit.
        umls_type_results = self.types_by_id.get(curie, {'biolink:NamedThing': {'Named thing'}})
        return {self.umls_type_to_biolink_type(tui) for tui in umls_type_results}

    def choose_single_biolink_type(self, curie: str, biolink_types):
        """
        Given a collection of Biolink types, choose the best one for a UMLS CURIE.

        :param curie: The CURIE to normalize. Only used in the error message.
        :param biolink_types: Any iterable of Biolink types for this CURIE, such as the
            set returned by get_biolink_types(). It may contain None entries.
        :return: A single Biolink type for this CURIE, or None if one could not be determined
            (no types at all, or at least one semantic type without a Biolink mapping).
        :raises RuntimeError: if there are several distinct types and no manual resolution
            has been defined for that combination.
        """

        # Normalise to a set so that sets, lists and lists with duplicates all behave
        # the same way; indexing with [0] would fail on the set that
        # get_biolink_types() returns.
        candidates = set(biolink_types)

        if not candidates:
            return None

        if None in candidates:
            # One of the TUIs couldn't be converted, so we can't trust the combination:
            # report "no type" and let the caller decide on a fallback.
            return None

        if len(candidates) == 1:
            return next(iter(candidates))

        # Some Biolink multiple types we handle manually.
        manual_resolutions = (
            ({DEVICE, DRUG}, DRUG),
            ({DRUG, SMALL_MOLECULE}, SMALL_MOLECULE),
            ({AGENT, PHYSICAL_ENTITY}, AGENT),
            ({PHYSICAL_ENTITY, PUBLICATION}, PUBLICATION),
            ({ACTIVITY, PROCEDURE}, PROCEDURE),
            ({DRUG, FOOD}, FOOD),
        )
        for combination, chosen_type in manual_resolutions:
            if candidates == combination:
                return chosen_type

        # No idea -- raise an Exception.
        raise RuntimeError(f"Could not choose a single Biolink type for {curie} with types {candidates}: no manual resolution.")