Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
89 changes: 19 additions & 70 deletions src/createcompendia/leftover_umls.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,8 +48,9 @@ def write_leftover_umls(compendia, umls_labels_filename, mrconso, mrsty, synonym
Path(umls_compendium).touch()

with open(umls_compendium, 'w') as compendiumf, open(report, 'w') as reportf:
# This defaults to the version of the Biolink model that is included with this BMT.
biolink_toolkit = Toolkit()

umls_type_by_id = dict()
preferred_name_by_id = dict()

for compendium in compendia:
logging.info(f"Starting compendium: {compendium}")
Expand All @@ -69,35 +70,13 @@ def write_leftover_umls(compendia, umls_labels_filename, mrconso, mrsty, synonym
reportf.write(f"Completed all compendia with {len(umls_ids_in_other_compendia)} UMLS IDs.\n")
# print(umls_ids_in_other_compendia)

# Load all the semantic types.
umls_type_by_id = dict()
preferred_name_by_id = dict()
types_by_id = dict()
types_by_tui = dict()
with open(mrsty, 'r') as inf:
for line in inf:
x = line.strip().split('|')
umls_id = f"{UMLS}:{x[0]}"
tui = x[1]
# stn = x[2]
sty = x[3]

if umls_id not in types_by_id:
types_by_id[umls_id] = dict()
if tui not in types_by_id[umls_id]:
types_by_id[umls_id][tui] = set()
types_by_id[umls_id][tui].add(sty)

if tui not in types_by_tui:
types_by_tui[tui] = set()
types_by_tui[tui].add(sty)

logging.info(f"Completed loading {len(types_by_id.keys())} UMLS IDs from MRSTY.RRF.")
reportf.write(f"Completed loading {len(types_by_id.keys())} UMLS IDs from MRSTY.RRF.\n")
umls_to_biolink = umls.UMLSToBiolinkTypeConverter(mrsty)
logging.info(f"Completed loading {len(umls_to_biolink.types_by_id.keys())} UMLS IDs from MRSTY.RRF.")
reportf.write(f"Completed loading {len(umls_to_biolink.types_by_id.keys())} UMLS IDs from MRSTY.RRF.\n")

with open('babel_outputs/reports/umls-types.tsv', 'w') as outf:
for tui in sorted(types_by_tui.keys()):
for sty in sorted(list(types_by_tui[tui])):
for tui in sorted(umls_to_biolink.types_by_tui.keys()):
for sty in sorted(list(umls_to_biolink.types_by_tui[tui])):
outf.write(f"{tui}\t{sty}\n")

# Create a compendium that consists solely of all MRCONSO entries that haven't been referenced.
Expand All @@ -121,50 +100,20 @@ def write_leftover_umls(compendia, umls_labels_filename, mrconso, mrsty, synonym
# The STR value should be the label.
label = x[14]

# Lookup type.
def umls_type_to_biolink_type(umls_tui):
biolink_type = biolink_toolkit.get_element_by_mapping(f'STY:{umls_tui}', most_specific=True, formatted=True, mixin=True)
if biolink_type is None:
logging.debug(f"No Biolink type found for UMLS TUI {umls_tui}")
return biolink_type

umls_type_results = types_by_id.get(umls_id, {'biolink:NamedThing': {'Named thing'}})
biolink_types = set(list(map(umls_type_to_biolink_type, umls_type_results.keys())))

# How to deal with multiple Biolink types? We currently only have the following multiple
# types, so we can resolve these manually:
biolink_types_as_set = set(map(lambda t: "(None)" if t is None else t, list(biolink_types)))
biolink_types_as_str = '|'.join(sorted(list(biolink_types_as_set)))

if None in biolink_types:
# One of the TUIs couldn't be converted; let's delete all of them so that we can report this.
biolink_types = list()

# Some Biolink multiple types we handle manually.
if biolink_types_as_set == {DEVICE, DRUG}:
biolink_types = [DRUG]
elif biolink_types_as_set == {DRUG, SMALL_MOLECULE}:
biolink_types = [SMALL_MOLECULE]
elif biolink_types_as_set == {AGENT, PHYSICAL_ENTITY}:
biolink_types = [AGENT]
elif biolink_types_as_set == {PHYSICAL_ENTITY, PUBLICATION}:
biolink_types = [PUBLICATION]
elif biolink_types_as_set == {ACTIVITY, PROCEDURE}:
biolink_types = [PROCEDURE]
elif biolink_types_as_set == {DRUG, FOOD}:
biolink_types = [FOOD]

if len(biolink_types) == 0:
biolink_types = umls_to_biolink.get_biolink_types(umls_id)
if len(biolink_types) > 1:
count_multiple_umls_type += 1
biolink_type = umls_to_biolink.choose_single_biolink_type(umls_id, biolink_types)

if biolink_type is None:
umls_type_results = umls_to_biolink.types_by_id.get(umls_id, {'biolink:NamedThing': {'Named thing'}})
logging.debug(f"No UMLS type found for {umls_id}: {umls_type_results} -> {biolink_types}, skipping")
reportf.write(f"NO_UMLS_TYPE [{umls_id}]: {umls_type_results} -> {biolink_types}\n")
count_no_umls_type += 1
continue
if len(biolink_types) > 1:
logging.debug(f"Multiple UMLS types not yet supported for {umls_id}: {umls_type_results} -> {biolink_types}, skipping")
reportf.write(f"MULTIPLE_UMLS_TYPES [{umls_id}]\t{biolink_types_as_str}\t{umls_type_results} -> {biolink_types}\n")
count_multiple_umls_type += 1
continue
biolink_type = list(biolink_types)[0]

# Default to it being a biolink:NamedThing.
biolink_type = 'biolink:NamedThing'

umls_type_by_id[umls_id] = biolink_type
preferred_name_by_id[umls_id] = label

Expand Down
89 changes: 88 additions & 1 deletion src/datahandlers/umls.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
from bmt import Toolkit

from src.prefixes import UMLS, RXCUI
from src.babel_utils import make_local_name
from src.categories import DRUG, CHEMICAL_ENTITY, MOLECULAR_MIXTURE
from src.categories import DRUG, CHEMICAL_ENTITY, MOLECULAR_MIXTURE, DEVICE, SMALL_MOLECULE, PHYSICAL_ENTITY, AGENT, \
PUBLICATION, ACTIVITY, PROCEDURE, FOOD

import shutil
from zipfile import ZipFile
Expand Down Expand Up @@ -350,3 +353,87 @@ def pull_umls(mrconso):
continue
synonyms.write(f'{UMLS}:{cui}\thttp://www.geneontology.org/formats/oboInOwl#hasExactSynonym\t{s}\n')

class UMLSToBiolinkTypeConverter:
    """
    Convert UMLS CURIEs to Biolink types using the semantic types listed in MRSTY.RRF.

    Attributes populated by the constructor:
      - biolink_toolkit: the BMT Toolkit used to look up Biolink mappings for UMLS TUIs.
      - types_by_id: dict of UMLS CURIE -> dict of TUI -> set of semantic type names (STY).
      - types_by_tui: dict of TUI -> set of semantic type names (STY).
    """

    def __init__(self, mrsty_filename, biolink_model_url=None):
        """
        Create an UMLSToBiolinkTypeConverter with a MRSTY file and optional Biolink Model URL.

        :param mrsty_filename: The path to the MRSTY.RRF file.
        :param biolink_model_url: The link to the biolink-model.yaml file. If not provided, the default Biolink Model
            URL will be used.
        """

        # Set up Biolink Toolkit.
        if biolink_model_url:
            self.biolink_toolkit = Toolkit(biolink_model_url)
        else:
            self.biolink_toolkit = Toolkit()

        # Load MRSTY filename.
        # See https://www.ncbi.nlm.nih.gov/books/NBK9685/table/ch03.Tf/ for column information.
        self.types_by_id = dict()
        self.types_by_tui = dict()
        with open(mrsty_filename, 'r') as inf:
            for line in inf:
                stripped = line.strip()
                if not stripped:
                    # Tolerate blank lines (e.g. a trailing newline at the end of the file)
                    # rather than failing with an IndexError on the column lookups below.
                    continue

                x = stripped.split('|')
                curie = f"{UMLS}:{x[0]}"
                tui = x[1]
                # stn = x[2]
                sty = x[3]

                # Index each semantic type name both by (CURIE, TUI) and by TUI alone.
                self.types_by_id.setdefault(curie, dict()).setdefault(tui, set()).add(sty)
                self.types_by_tui.setdefault(tui, set()).add(sty)

    def umls_type_to_biolink_type(self, umls_tui: str):
        """
        Look up the Biolink type mapped to a UMLS semantic type identifier (TUI).

        :param umls_tui: The TUI to look up; it is queried from the Biolink Toolkit as `STY:{umls_tui}`.
        :return: Whatever the Biolink Toolkit returns for this mapping, or None if it has no mapping.
        """
        biolink_type = self.biolink_toolkit.get_element_by_mapping(
            f'STY:{umls_tui}', most_specific=True, formatted=True, mixin=True
        )
        if biolink_type is None:
            logging.debug(f"No Biolink type found for UMLS TUI {umls_tui}")
        return biolink_type

    def get_biolink_types(self, curie):
        """
        Return the set of Biolink types for all the TUIs recorded for a UMLS CURIE.

        :param curie: The UMLS CURIE to look up in types_by_id.
        :return: A set with one entry per distinct lookup result. It may contain None if a TUI has no
            Biolink mapping.
        """
        # NOTE(review): for a CURIE missing from MRSTY the fallback key 'biolink:NamedThing' is itself
        # passed through the TUI lookup; this presumably yields {None} -- confirm that is intended.
        umls_type_results = self.types_by_id.get(curie, {'biolink:NamedThing': {'Named thing'}})
        return {self.umls_type_to_biolink_type(tui) for tui in umls_type_results}

    def choose_single_biolink_type(self, curie: str, biolink_types: "list[str] | set[str]"):
        """
        Given a collection of Biolink types, choose the best one for a UMLS CURIE.

        :param curie: The CURIE to normalize. Only used in the error message right now.
        :param biolink_types: The Biolink types for this CURIE. Any list or set is accepted, including the
            set returned by get_biolink_types().
        :return: A single Biolink type for this CURIE, or None if one could not be determined (no types
            at all, or at least one TUI that could not be converted to a Biolink type).
        :raises RuntimeError: If there are multiple valid Biolink types with no manual resolution.
        """

        # get_biolink_types() returns a set, which cannot be indexed with [0], so normalize the
        # input to a set and never subscript it.
        candidates = set(biolink_types)

        if not candidates:
            return None

        if None in candidates:
            # One of the TUIs couldn't be converted, so we can't determine a type for this CURIE;
            # the caller can then report it as having no UMLS type.
            return None

        if len(candidates) == 1:
            return next(iter(candidates))

        # Some Biolink multiple types we handle manually.
        manual_resolutions = (
            ({DEVICE, DRUG}, DRUG),
            ({DRUG, SMALL_MOLECULE}, SMALL_MOLECULE),
            ({AGENT, PHYSICAL_ENTITY}, AGENT),
            ({PHYSICAL_ENTITY, PUBLICATION}, PUBLICATION),
            ({ACTIVITY, PROCEDURE}, PROCEDURE),
            ({DRUG, FOOD}, FOOD),
        )
        for type_combination, chosen_type in manual_resolutions:
            if candidates == type_combination:
                return chosen_type

        # No idea -- raise an Exception.
        raise RuntimeError(f"Could not choose a single Biolink type for {curie} with types {candidates}: no manual resolution.")