#
# This file contains functions for helping to convert SSSOM files into concord files.
# This will be helpful in incorporating SSSOM mappings from the Mapping Commons and
# other SSSOM-based sources into Babel.
#
# Provenance: src/datahandlers/sssom_concords.py, introduced by commit
# 65ec899c04e310e616c9ad51f302c1a25b53f7ba ("Added a basic SSSOM to Concord converter.",
# Gaurav Vaidya, 2025-04-30). The same commit adds `sssom` to requirements.txt
# ("# Added by Gaurav, Apr 2025").
#
import logging


def convert_sssom_to_concord(sssom_filename, concord_filename, sssom_format='tsv', threshold=0, filter_predicates=None):
    """
    Convert an SSSOM mapping set into a tab-delimited concord file.

    Every retained mapping is written to the concord file as one line of the form
    ``subject_id<TAB>predicate_id<TAB>object_id``. Mappings whose subject or object
    is ``sssom:NoTermFound`` are skipped.

    :param sssom_filename: The SSSOM file (or URL) to load. It is handed unchanged to the SSSOM parser.
    :param concord_filename: Path of the concord file to write. It is opened in ``'w'`` mode, so an
        existing file is overwritten.
    :param sssom_format: ``'tsv'`` (loaded with ``parsers.parse_sssom_table``) or ``'rdf'``
        (loaded with ``parsers.parse_sssom_rdf``). The comparison is case-sensitive.
    :param threshold: When the mapping table has a ``confidence`` column, only rows with
        ``confidence > threshold`` are kept. NOTE(review): rows with a missing (NaN) confidence
        compare as False and are therefore dropped too -- confirm that this is intended.
        When there is no ``confidence`` column, no confidence filtering is done.
    :param filter_predicates: Optional collection of ``predicate_id`` values. When non-empty, only
        mappings with one of these predicates are kept; when empty or None, all predicates are kept.
    :return: None. The result is the file written to ``concord_filename``.
    :raises ValueError: If ``sssom_format`` is neither ``'tsv'`` nor ``'rdf'``.
    """

    if not filter_predicates:
        filter_predicates = set()

    # Step 1. Load the SSSOM file (or URL).
    logging.info(f"Loading SSSOM file {sssom_filename} with format {sssom_format}.")
    if sssom_format not in ('tsv', 'rdf'):
        raise ValueError(f"Unknown SSSOM format (only 'tsv' and 'rdf' are supported): {sssom_format}.")

    # Imported here rather than at module level so that importing this module (and rejecting
    # an unsupported format above) does not require sssom to be importable.
    from sssom import parsers

    if sssom_format == 'tsv':
        mappings = parsers.parse_sssom_table(sssom_filename)
    else:
        mappings = parsers.parse_sssom_rdf(sssom_filename)

    df = mappings.df

    # Step 2. Filter mappings using the threshold.
    if 'confidence' in df.columns:
        df_filtered = df[df['confidence'] > threshold]
        # Report row counts: DataFrame.size is rows * columns, not the number of mappings.
        logging.info(f"Filtered {len(df)} to {len(df_filtered)} by filtering by confidence > {threshold}")
    else:
        df_filtered = df

    # Step 3. Filter mappings using particular predicates.
    if filter_predicates:
        df_filtered = df_filtered[df_filtered['predicate_id'].isin(filter_predicates)]
        logging.info(f"Filtered {len(df_filtered)} from an original set of {len(df)} mappings using predicates: {filter_predicates}")

    # Step 4. Write the filtered mappings to a file.
    count_mappings = 0
    with open(concord_filename, 'w') as outf:
        # Walk the three columns in lockstep instead of looking each cell up by index label:
        # label lookups return a Series rather than a scalar when index labels repeat.
        rows = zip(df_filtered['subject_id'], df_filtered['predicate_id'], df_filtered['object_id'])
        for subject_id, predicate_id, object_id in rows:
            # sssom:NoTermFound marks a term with no counterpart, so there is nothing to concord.
            if subject_id == 'sssom:NoTermFound' or object_id == 'sssom:NoTermFound':
                continue

            # File objects have write(), not print().
            outf.write(f"{subject_id}\t{predicate_id}\t{object_id}\n")
            count_mappings += 1

    logging.info(f"Extracted {count_mappings} mappings to {concord_filename}")