From 57efd45a5e9ba3179aa49e1edf08ed631138c09c Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 16 Dec 2024 18:03:48 -0500 Subject: [PATCH 01/10] First stab at a simple comparator script. --- requirements.txt | 2 ++ scripts/comparator.py | 30 ++++++++++++++++++++++++++++++ 2 files changed, 32 insertions(+) create mode 100644 scripts/comparator.py diff --git a/requirements.txt b/requirements.txt index 31d87eb7..fb634550 100644 --- a/requirements.txt +++ b/requirements.txt @@ -27,3 +27,5 @@ curies duckdb # Added by Gaurav, Jul 2024 sssom +# Added by Gaurav, Dec 2024, to support command-line applications +click \ No newline at end of file diff --git a/scripts/comparator.py b/scripts/comparator.py new file mode 100644 index 00000000..1cc79d4e --- /dev/null +++ b/scripts/comparator.py @@ -0,0 +1,30 @@ +#!/usr/bin/env python3 + +# +# comparator.py - A script for comparing Babel files from different runs +# + +import click + +@click.command() +@click.option('--file-type', type=click.Choice(['compendium']), default='compendium') +@click.argument('file1', type=click.File('r'), required=True) +@click.argument('file2', type=click.File('r'), required=True) +def comparator(file_type, file1, file2): + """ + Compares two compendium or synonym files. + + :param file_type: Specifies the type of the files to compare. + Options are 'compendium' or 'synonyms' (not yet supported). + Defaults to 'compendium'. + :param file1: First file to compare. + :param file2: Second file to compare. + """ + with open(file1, 'r') as f1, open(file2, 'r') as f2: + for line1, line2 in zip(f1, f2): + # We can't really process them by-line, alas. + pass + return True + +if __name__ == "__main__": + comparator() \ No newline at end of file From 9c38f821366b9638ea8e279a9ffbb0eadac07be0 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 16 Dec 2024 18:38:22 -0500 Subject: [PATCH 02/10] Basic functioning diff method. 
--- scripts/comparator.py | 107 +++++++++++++++++++++++++++++++++++++----- 1 file changed, 94 insertions(+), 13 deletions(-) diff --git a/scripts/comparator.py b/scripts/comparator.py index 1cc79d4e..65639995 100644 --- a/scripts/comparator.py +++ b/scripts/comparator.py @@ -1,30 +1,111 @@ #!/usr/bin/env python3 - # # comparator.py - A script for comparing Babel files from different runs # +import json +import os +import logging +from collections import defaultdict import click +logging.basicConfig(level=logging.INFO) + +class CompendiumFile: + def __init__(self, path): + self.path = path + + # TODO: replace with DuckDB or something else more memory efficient. + self.curie_to_preferred_id = dict() + self.curie_to_label = dict() + self.curie_to_description = defaultdict(set) + self.curie_to_taxa = defaultdict(set) + self.preferred_id_to_type = defaultdict() + self.preferred_id_to_preferred_name = defaultdict() + self.preferred_id_to_ic = dict() + self.row_count = 0 + + def load(self): + with open(self.path, "r") as f: + for row in f: + self.row_count += 1 + if self.row_count % 1000000 == 0: + logging.info(f"Now loading line {self.row_count:,} from {self.path}") + + clique = json.loads(row) + + preferred_curie = clique['identifiers'][0]['i'] + self.preferred_id_to_type[preferred_curie] = clique['type'] + self.preferred_id_to_preferred_name[preferred_curie] = clique['preferred_name'] + self.preferred_id_to_ic = clique['ic'] + + for identifier in clique['identifiers']: + curie = identifier['i'] + self.curie_to_preferred_id[curie] = preferred_curie + self.curie_to_label[curie] = identifier.get('l', '') + self.curie_to_description[curie].update(identifier.get('d', [])) + self.curie_to_taxa[curie].update(identifier.get('t', [])) + + logging.info(f"Loaded {self.row_count:,} lines from {self.path}.") + + +def compare_compendium_files(path1, path2): + """ Compare two compendium files. + + @param path1: First path to compare. + @param path2: Second path to compare. 
+ @return A comparison between the two compendium files as a dictionary. + """ + + compendium1 = CompendiumFile(path1) + compendium2 = CompendiumFile(path2) + + # TODO: Figure out how to do this in parallel. + compendium1.load() + compendium2.load() + + # Craft results and return. + return { + 'compendium1': { + 'path': path1, + 'curie_count': len(compendium1.curie_to_preferred_id), + 'clique_count': len(compendium1.preferred_id_to_type), + 'types': list(sorted(set(compendium1.preferred_id_to_type.values()))), + }, + 'compendium2': { + 'path': path2, + 'curie_count': len(compendium2.curie_to_preferred_id), + 'clique_count': len(compendium2.preferred_id_to_type), + 'types': list(set(sorted(compendium2.preferred_id_to_type.values()))), + }, + } + + @click.command() -@click.option('--file-type', type=click.Choice(['compendium']), default='compendium') -@click.argument('file1', type=click.File('r'), required=True) -@click.argument('file2', type=click.File('r'), required=True) -def comparator(file_type, file1, file2): +@click.option('--input-type', type=click.Choice(['compendium', 'synonyms']), default='compendium') +@click.argument('input1', type=click.Path(exists=True, file_okay=True, dir_okay=True), required=True) +@click.argument('input2', type=click.Path(exists=True, file_okay=True, dir_okay=True), required=True) +def comparator(input_type, input1, input2): """ Compares two compendium or synonym files. - :param file_type: Specifies the type of the files to compare. + :param input_type: Specifies the type of the files to compare. Options are 'compendium' or 'synonyms' (not yet supported). Defaults to 'compendium'. - :param file1: First file to compare. - :param file2: Second file to compare. + :param input1: First path (file or directory) to compare. + :param input2: Second file (file or directory) to compare. """ - with open(file1, 'r') as f1, open(file2, 'r') as f2: - for line1, line2 in zip(f1, f2): - # We can't really process them by-line, alas. 
- pass - return True + + # Some features haven't been implemented yet. + if input_type != 'compendium': + raise NotImplementedError(f"Input type '{input_type}' is not yet supported.") + if not os.path.isfile(input1) or not os.path.isfile(input2): + raise NotImplementedError(f"Only file-based comparisons are currently supported.") + + # Do the comparison. + results = compare_compendium_files(input1, input2) + print(json.dumps(results, indent=2)) + if __name__ == "__main__": comparator() \ No newline at end of file From 55c3cec209b122943f792213941db64f9bef69d4 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 16 Dec 2024 18:57:45 -0500 Subject: [PATCH 03/10] Added support for directory comparisons. --- scripts/comparator.py | 48 +++++++++++++++++++++++++++++++++++++++---- 1 file changed, 44 insertions(+), 4 deletions(-) diff --git a/scripts/comparator.py b/scripts/comparator.py index 65639995..cf0977ac 100644 --- a/scripts/comparator.py +++ b/scripts/comparator.py @@ -15,6 +15,9 @@ class CompendiumFile: def __init__(self, path): self.path = path + self.file_exists = os.path.exists(self.path) + self.row_count = 0 + # TODO: replace with DuckDB or something else more memory efficient. 
self.curie_to_preferred_id = dict() self.curie_to_label = dict() @@ -23,9 +26,13 @@ def __init__(self, path): self.preferred_id_to_type = defaultdict() self.preferred_id_to_preferred_name = defaultdict() self.preferred_id_to_ic = dict() - self.row_count = 0 + def load(self): + if not os.path.exists(self.path): + logging.warning(f"Compendium file {self.path} does not exist.") + return + with open(self.path, "r") as f: for row in f: self.row_count += 1 @@ -68,12 +75,16 @@ def compare_compendium_files(path1, path2): return { 'compendium1': { 'path': path1, + 'file_exists': compendium1.file_exists, + 'row_count': compendium1.row_count, 'curie_count': len(compendium1.curie_to_preferred_id), 'clique_count': len(compendium1.preferred_id_to_type), 'types': list(sorted(set(compendium1.preferred_id_to_type.values()))), }, 'compendium2': { 'path': path2, + 'file_exists': compendium2.file_exists, + 'row_count': compendium2.row_count, 'curie_count': len(compendium2.curie_to_preferred_id), 'clique_count': len(compendium2.preferred_id_to_type), 'types': list(set(sorted(compendium2.preferred_id_to_type.values()))), @@ -99,11 +110,40 @@ def comparator(input_type, input1, input2): # Some features haven't been implemented yet. if input_type != 'compendium': raise NotImplementedError(f"Input type '{input_type}' is not yet supported.") - if not os.path.isfile(input1) or not os.path.isfile(input2): - raise NotImplementedError(f"Only file-based comparisons are currently supported.") # Do the comparison. - results = compare_compendium_files(input1, input2) + if os.path.isfile(input1) and os.path.isfile(input2): + results = compare_compendium_files(input1, input2) + elif os.path.isdir(input1) and os.path.isdir(input2): + results = { + 'directory1': {'path': input1}, + 'directory2': {'path': input2}, + 'comparisons': [], + } + + # Make a list of all the files in the directories input1 and input2. 
+ files1 = os.listdir(input1) + files2 = os.listdir(input2) + all_filenames = set(files1 + files2) + for filename in sorted(all_filenames): + if filename.startswith('.'): + continue + path1 = os.path.join(input1, filename) + path2 = os.path.join(input2, filename) + + if os.path.isdir(path1): + logging.warning(f"Skipping directory {path1} in comparison.") + continue + + if os.path.isdir(path2): + logging.warning(f"Skipping directory {path2} in comparison.") + continue + + result = compare_compendium_files(path1, path2) + results['comparisons'].append(result) + else: + raise RuntimeError(f"Cannot compare a file to a directory or vice versa: {input1} and {input2}.") + print(json.dumps(results, indent=2)) From acc98500692449802edf0158d79f46aa0eac2e9c Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Thu, 19 Dec 2024 15:12:32 -0500 Subject: [PATCH 04/10] Added some documentation. --- scripts/comparator.py | 51 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/scripts/comparator.py b/scripts/comparator.py index cf0977ac..22a26dc8 100644 --- a/scripts/comparator.py +++ b/scripts/comparator.py @@ -12,7 +12,43 @@ logging.basicConfig(level=logging.INFO) class CompendiumFile: + """ + CompendiumFile represents a data handler for managing and processing a compendium + file. It is used to load compendium data from a file, map identifiers to their + preferred IDs, extract associated labels, descriptions, taxonomic information, + and other metadata. + + The class provides methods to load data from a specified file path and maintains + the mappings and metadata in memory for further processing. + + :ivar path: The path to the compendium file. + :type path: str + :ivar file_exists: A boolean indicating if the compendium file exists at the specified path. + :type file_exists: bool + :ivar row_count: The number of rows processed from the compendium file. 
+ :type row_count: int + :ivar curie_to_preferred_id: A dictionary mapping CURIEs to their preferred identifiers. + :type curie_to_preferred_id: dict + :ivar curie_to_label: A dictionary mapping CURIEs to their associated labels. + :type curie_to_label: dict + :ivar curie_to_description: A defaultdict mapping CURIEs to sets of descriptions. + :type curie_to_description: defaultdict + :ivar curie_to_taxa: A defaultdict mapping CURIEs to sets of taxonomic identifiers. + :type curie_to_taxa: defaultdict + :ivar preferred_id_to_type: A defaultdict mapping preferred identifiers to their types. + :type preferred_id_to_type: defaultdict + :ivar preferred_id_to_preferred_name: A defaultdict mapping preferred identifiers to their preferred names. + :type preferred_id_to_preferred_name: defaultdict + :ivar preferred_id_to_ic: A dictionary mapping preferred identifiers to their information content scores. + :type preferred_id_to_ic: dict + """ + def __init__(self, path): + """ + Initialize a CompendiumFile object with the specified path. We don't load the file until load() is called. + + :param path: File path to initialize and load metadata from. + """ self.path = path self.file_exists = os.path.exists(self.path) @@ -29,6 +65,21 @@ def __init__(self, path): def load(self): + """ + Loads compendium data from the specified file path into various mappings. + + This method reads data from a JSON lines file located at the path specified + by the instance attribute `path`. Each line in the file should represent a + clique object in JSON format. The method populates multiple mappings + based on the contents of the file, including mappings between CURIEs and + their preferred identifiers, labels, descriptions, taxa, types, and + information content (IC). + + The method tracks and logs the progress of the file loading process. It will + log a warning if the specified file path does not exist, and progress + information is logged for every million lines processed. 
At the end, the + method logs the total number of lines read. + """ if not os.path.exists(self.path): logging.warning(f"Compendium file {self.path} does not exist.") return From a936a5765b7743f0f5b325ed6a8ec7359cc0d7e3 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Thu, 19 Dec 2024 15:31:04 -0500 Subject: [PATCH 05/10] Woo parallelization. --- scripts/comparator.py | 61 +++++++++++++++++++++++++++++-------------- 1 file changed, 41 insertions(+), 20 deletions(-) diff --git a/scripts/comparator.py b/scripts/comparator.py index 22a26dc8..949a4337 100644 --- a/scripts/comparator.py +++ b/scripts/comparator.py @@ -2,10 +2,13 @@ # # comparator.py - A script for comparing Babel files from different runs # +import concurrent import json import os import logging +import threading from collections import defaultdict +from concurrent.futures import ThreadPoolExecutor import click @@ -118,9 +121,13 @@ def compare_compendium_files(path1, path2): compendium1 = CompendiumFile(path1) compendium2 = CompendiumFile(path2) - # TODO: Figure out how to do this in parallel. - compendium1.load() - compendium2.load() + # Load the two files in parallel. + thread_compendium1 = threading.Thread(target=compendium1.load) + thread_compendium2 = threading.Thread(target=compendium2.load) + thread_compendium1.start() + thread_compendium2.start() + thread_compendium1.join() + thread_compendium2.join() # Craft results and return. 
return { @@ -147,7 +154,8 @@ def compare_compendium_files(path1, path2): @click.option('--input-type', type=click.Choice(['compendium', 'synonyms']), default='compendium') @click.argument('input1', type=click.Path(exists=True, file_okay=True, dir_okay=True), required=True) @click.argument('input2', type=click.Path(exists=True, file_okay=True, dir_okay=True), required=True) -def comparator(input_type, input1, input2): +@click.option('--max-workers', '-j', type=int, default=None, help='Maximum number of workers to use for parallel processing.') +def comparator(input_type, input1, input2, max_workers): """ Compares two compendium or synonym files. @@ -156,6 +164,7 @@ def comparator(input_type, input1, input2): Defaults to 'compendium'. :param input1: First path (file or directory) to compare. :param input2: Second file (file or directory) to compare. + :param max_workers: Maximum number of workers to use for parallel processing. """ # Some features haven't been implemented yet. @@ -176,22 +185,34 @@ def comparator(input_type, input1, input2): files1 = os.listdir(input1) files2 = os.listdir(input2) all_filenames = set(files1 + files2) - for filename in sorted(all_filenames): - if filename.startswith('.'): - continue - path1 = os.path.join(input1, filename) - path2 = os.path.join(input2, filename) - - if os.path.isdir(path1): - logging.warning(f"Skipping directory {path1} in comparison.") - continue - - if os.path.isdir(path2): - logging.warning(f"Skipping directory {path2} in comparison.") - continue - - result = compare_compendium_files(path1, path2) - results['comparisons'].append(result) + + with ThreadPoolExecutor(max_workers=max_workers) as executor: + futures = [] + for filename in sorted(all_filenames): + if filename.startswith('.'): + continue + path1 = os.path.join(input1, filename) + path2 = os.path.join(input2, filename) + + if os.path.isdir(path1): + logging.warning(f"Skipping directory {path1} in comparison.") + continue + + if os.path.isdir(path2): + 
logging.warning(f"Skipping directory {path2} in comparison.") + continue + + futures.append(executor.submit(compare_compendium_files, path1, path2)) + + for future in concurrent.futures.as_completed(futures): + try: + results['comparisons'].append(future.result()) + except Exception as exc: + logging.error(f"Error comparing files: {exc}") + raise exc + + print(json.dumps(results, indent=2)) + else: raise RuntimeError(f"Cannot compare a file to a directory or vice versa: {input1} and {input2}.") From 02adb2cfb58cab0333cf883d3765f93d2db82738 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Thu, 19 Dec 2024 15:52:51 -0500 Subject: [PATCH 06/10] Added labels to cliques. --- scripts/comparator.py | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/scripts/comparator.py b/scripts/comparator.py index 949a4337..9cc652e4 100644 --- a/scripts/comparator.py +++ b/scripts/comparator.py @@ -109,6 +109,46 @@ def load(self): logging.info(f"Loaded {self.row_count:,} lines from {self.path}.") + def diffs_to(self, older_compendium_file: 'CompendiumFile'): + """ + Generate diff counts between this compendium file and the older compendium file. + + :param older_compendium_file: A CompendiumFile object representing the older compendium file. + :return: A dictionary. + """ + + # Step 1. Figure out which identifiers have changed cliques between these two compendia. 
+ identifiers_added = set() + identifiers_not_changed = set() + identifiers_changed = set() + identifiers_deleted = set() + for curie, preferred_curie in self.curie_to_preferred_id.items(): + if curie not in older_compendium_file.curie_to_preferred_id: + identifiers_added.add((curie, self.curie_to_label[curie], None, '', preferred_curie, self.preferred_id_to_preferred_name[preferred_curie])) + else: + old_preferred_curie = older_compendium_file.curie_to_preferred_id.get(curie) + if preferred_curie == old_preferred_curie: + identifiers_not_changed.add((curie, self.curie_to_label[curie], old_preferred_curie, older_compendium_file.preferred_id_to_preferred_name[old_preferred_curie], preferred_curie, self.preferred_id_to_preferred_name[preferred_curie])) + else: + identifiers_changed.add((curie, self.curie_to_label[curie], old_preferred_curie, older_compendium_file.preferred_id_to_preferred_name[old_preferred_curie], preferred_curie, self.preferred_id_to_preferred_name[preferred_curie])) + + for old_curie, old_preferred_curie in older_compendium_file.curie_to_preferred_id.items(): + if old_curie not in self.curie_to_preferred_id: + identifiers_deleted.add((old_curie, older_compendium_file.curie_to_label[old_curie], old_preferred_curie, older_compendium_file.preferred_id_to_preferred_name[old_preferred_curie], None, '')) + + # Step 2. Figure out the clique change. + clique_count = len(self.preferred_id_to_type.keys()) + old_clique_count = len(older_compendium_file.preferred_id_to_type.keys()) + + # Step 3. Report on all the identifiers. 
+ return { + 'net_identifier_change': len(identifiers_added) - len(identifiers_deleted), + 'net_clique_change': (clique_count - old_clique_count), + 'additions': sorted(map(lambda x: f"{x[0]} '{x[1]}' (to clique {x[4]} '{x[5]}')", identifiers_added)), + 'deletions': sorted(map(lambda x: f"{x[0]} '{x[1]}' (from clique {x[2]} '{x[3]}')", identifiers_deleted)), + 'changes': sorted(map(lambda x: f"{x[0]} '{x[1]}' moved from {x[2]} '{x[3]}' to {x[4]} '{x[5]}'", identifiers_changed)), + } + def compare_compendium_files(path1, path2): """ Compare two compendium files. @@ -147,6 +187,7 @@ def compare_compendium_files(path1, path2): 'clique_count': len(compendium2.preferred_id_to_type), 'types': list(set(sorted(compendium2.preferred_id_to_type.values()))), }, + 'diffs': compendium2.diffs_to(compendium1), } From 3a610850e362085c8e0f3095dff7f2ceee0f36ec Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Thu, 19 Dec 2024 16:20:40 -0500 Subject: [PATCH 07/10] Clique diffs. --- scripts/comparator.py | 106 ++++++++++++++++++++++++++++++++++++++---- 1 file changed, 96 insertions(+), 10 deletions(-) diff --git a/scripts/comparator.py b/scripts/comparator.py index 9cc652e4..73f9196e 100644 --- a/scripts/comparator.py +++ b/scripts/comparator.py @@ -7,6 +7,7 @@ import os import logging import threading +import time from collections import defaultdict from concurrent.futures import ThreadPoolExecutor @@ -38,8 +39,8 @@ class CompendiumFile: :type curie_to_description: defaultdict :ivar curie_to_taxa: A defaultdict mapping CURIEs to sets of taxonomic identifiers. :type curie_to_taxa: defaultdict - :ivar preferred_id_to_type: A defaultdict mapping preferred identifiers to their types. - :type preferred_id_to_type: defaultdict + :ivar preferred_id_to_type: A dict mapping preferred identifiers to their types. + :type preferred_id_to_type: dict :ivar preferred_id_to_preferred_name: A defaultdict mapping preferred identifiers to their preferred names. 
:type preferred_id_to_preferred_name: defaultdict :ivar preferred_id_to_ic: A dictionary mapping preferred identifiers to their information content scores. @@ -58,11 +59,12 @@ def __init__(self, path): self.row_count = 0 # TODO: replace with DuckDB or something else more memory efficient. + self.preferred_id_to_clique = defaultdict(list) self.curie_to_preferred_id = dict() self.curie_to_label = dict() self.curie_to_description = defaultdict(set) self.curie_to_taxa = defaultdict(set) - self.preferred_id_to_type = defaultdict() + self.preferred_id_to_type = dict() self.preferred_id_to_preferred_name = defaultdict() self.preferred_id_to_ic = dict() @@ -83,6 +85,9 @@ def load(self): information is logged for every million lines processed. At the end, the method logs the total number of lines read. """ + + time_started = time.time_ns() + if not os.path.exists(self.path): logging.warning(f"Compendium file {self.path} does not exist.") return @@ -98,7 +103,8 @@ def load(self): preferred_curie = clique['identifiers'][0]['i'] self.preferred_id_to_type[preferred_curie] = clique['type'] self.preferred_id_to_preferred_name[preferred_curie] = clique['preferred_name'] - self.preferred_id_to_ic = clique['ic'] + self.preferred_id_to_ic[preferred_curie] = clique['ic'] + self.preferred_id_to_clique[preferred_curie] = list(map(lambda x: x['i'], clique['identifiers'])) for identifier in clique['identifiers']: curie = identifier['i'] @@ -107,7 +113,8 @@ def load(self): self.curie_to_description[curie].update(identifier.get('d', [])) self.curie_to_taxa[curie].update(identifier.get('t', [])) - logging.info(f"Loaded {self.row_count:,} lines from {self.path}.") + time_ended = time.time_ns() + logging.info(f"Loaded {self.row_count:,} lines from {self.path} in {(time_ended - time_started) / 1_000_000_000:.2f} seconds.") def diffs_to(self, older_compendium_file: 'CompendiumFile'): """ @@ -122,6 +129,7 @@ def diffs_to(self, older_compendium_file: 'CompendiumFile'): identifiers_not_changed = 
set() identifiers_changed = set() identifiers_deleted = set() + for curie, preferred_curie in self.curie_to_preferred_id.items(): if curie not in older_compendium_file.curie_to_preferred_id: identifiers_added.add((curie, self.curie_to_label[curie], None, '', preferred_curie, self.preferred_id_to_preferred_name[preferred_curie])) @@ -136,17 +144,88 @@ def diffs_to(self, older_compendium_file: 'CompendiumFile'): if old_curie not in self.curie_to_preferred_id: identifiers_deleted.add((old_curie, older_compendium_file.curie_to_label[old_curie], old_preferred_curie, older_compendium_file.preferred_id_to_preferred_name[old_preferred_curie], None, '')) - # Step 2. Figure out the clique change. + # Step 2. Figure out the clique changes. clique_count = len(self.preferred_id_to_type.keys()) old_clique_count = len(older_compendium_file.preferred_id_to_type.keys()) + cliques_additions = {} + cliques_deletions = {} + clique_changes = {} + for preferred_curie, typ in self.preferred_id_to_type.items(): + if preferred_curie not in older_compendium_file.preferred_id_to_type: + # Addition. + cliques_additions[preferred_curie] = { + 'type': typ, + 'preferred_curie': preferred_curie, + 'preferred_name': self.preferred_id_to_preferred_name[preferred_curie], + 'identifiers': self.preferred_id_to_clique[preferred_curie], + } + else: + clique_change = { + 'type': typ, + 'preferred_curie': preferred_curie, + 'preferred_name': self.preferred_id_to_preferred_name[preferred_curie], + 'identifiers': self.preferred_id_to_clique[preferred_curie], + } + + # But did anything actually change? 
+ flag_actually_changed = False + + old_typ = older_compendium_file.preferred_id_to_type[preferred_curie] + if old_typ != typ: + flag_actually_changed = True + clique_change['type'] = { + 'old': old_typ, + 'new': typ, + } + + clique_label = self.preferred_id_to_preferred_name[preferred_curie] + old_clique_label = older_compendium_file.preferred_id_to_preferred_name[preferred_curie] + if clique_label != old_clique_label: + flag_actually_changed = True + clique_change['preferred_name'] = { + 'old': old_clique_label, + 'new': clique_label, + } + + ids = self.preferred_id_to_clique[preferred_curie] + old_ids = older_compendium_file.preferred_id_to_clique[preferred_curie] + if ids != old_ids: + flag_actually_changed = True + clique_change['identifiers'] = { + 'old': old_ids, + 'new': ids, + 'added': sorted(set(ids) - set(old_ids)), + 'deleted': sorted(set(old_ids) - set(ids)), + } + + if flag_actually_changed: + clique_changes[preferred_curie] = clique_change + + for old_preferred_curie, typ in older_compendium_file.preferred_id_to_type.items(): + if old_preferred_curie not in self.preferred_id_to_type: + # Deletion. + cliques_deletions[old_preferred_curie] = { + 'type': typ, + 'preferred_curie': old_preferred_curie, + 'preferred_name': older_compendium_file.preferred_id_to_preferred_name[old_preferred_curie], + 'identifiers': older_compendium_file.preferred_id_to_clique[old_preferred_curie], + } + # Step 3. Report on all the identifiers. 
return { 'net_identifier_change': len(identifiers_added) - len(identifiers_deleted), 'net_clique_change': (clique_count - old_clique_count), - 'additions': sorted(map(lambda x: f"{x[0]} '{x[1]}' (to clique {x[4]} '{x[5]}')", identifiers_added)), - 'deletions': sorted(map(lambda x: f"{x[0]} '{x[1]}' (from clique {x[2]} '{x[3]}')", identifiers_deleted)), - 'changes': sorted(map(lambda x: f"{x[0]} '{x[1]}' moved from {x[2]} '{x[3]}' to {x[4]} '{x[5]}'", identifiers_changed)), + 'identifiers': { + 'additions': sorted(map(lambda x: f"{x[0]} '{x[1]}' (to clique {x[4]} '{x[5]}')", identifiers_added)), + 'deletions': sorted(map(lambda x: f"{x[0]} '{x[1]}' (from clique {x[2]} '{x[3]}')", identifiers_deleted)), + 'changes': sorted(map(lambda x: f"{x[0]} '{x[1]}' moved from {x[2]} '{x[3]}' to {x[4]} '{x[5]}'", identifiers_changed)), + }, + 'cliques': { + 'additions': cliques_additions, + 'deletions': cliques_deletions, + 'changes': clique_changes, + }, } @@ -158,6 +237,8 @@ def compare_compendium_files(path1, path2): @return A comparison between the two compendium files as a dictionary. """ + time_started = time.time_ns() + compendium1 = CompendiumFile(path1) compendium2 = CompendiumFile(path2) @@ -170,7 +251,7 @@ def compare_compendium_files(path1, path2): thread_compendium2.join() # Craft results and return. - return { + result = { 'compendium1': { 'path': path1, 'file_exists': compendium1.file_exists, @@ -190,6 +271,11 @@ def compare_compendium_files(path1, path2): 'diffs': compendium2.diffs_to(compendium1), } + time_ended = time.time_ns() + logging.info(f"Comparison of {path1} and {path2} took {(time_ended - time_started) / 1_000_000_000:.2f} seconds.") + + return result + @click.command() @click.option('--input-type', type=click.Choice(['compendium', 'synonyms']), default='compendium') From 3c47597881fe68738f7a244ecd287d0a68c828fa Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Thu, 19 Dec 2024 16:34:10 -0500 Subject: [PATCH 08/10] Cleaned up clique outputs. 
--- scripts/comparator.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/scripts/comparator.py b/scripts/comparator.py index 73f9196e..3e2f3350 100644 --- a/scripts/comparator.py +++ b/scripts/comparator.py @@ -116,6 +116,15 @@ def load(self): time_ended = time.time_ns() logging.info(f"Loaded {self.row_count:,} lines from {self.path} in {(time_ended - time_started) / 1_000_000_000:.2f} seconds.") + def add_labels(self, ids: list[str]): + """ + Return a list of labels for the IDs in ids. + + :param ids: A list of identifiers. + :return: A list of labels. + """ + return list(map(lambda x: self.curie_to_label.get(x, ''), ids)) + def diffs_to(self, older_compendium_file: 'CompendiumFile'): """ Generate diff counts between this compendium file and the older compendium file. @@ -174,7 +183,7 @@ def diffs_to(self, older_compendium_file: 'CompendiumFile'): old_typ = older_compendium_file.preferred_id_to_type[preferred_curie] if old_typ != typ: flag_actually_changed = True - clique_change['type'] = { + clique_change['type_changed'] = { 'old': old_typ, 'new': typ, } @@ -183,7 +192,7 @@ def diffs_to(self, older_compendium_file: 'CompendiumFile'): old_clique_label = older_compendium_file.preferred_id_to_preferred_name[preferred_curie] if clique_label != old_clique_label: flag_actually_changed = True - clique_change['preferred_name'] = { + clique_change['preferred_name_changed'] = { 'old': old_clique_label, 'new': clique_label, } @@ -192,9 +201,11 @@ def diffs_to(self, older_compendium_file: 'CompendiumFile'): old_ids = older_compendium_file.preferred_id_to_clique[preferred_curie] if ids != old_ids: flag_actually_changed = True - clique_change['identifiers'] = { + clique_change['identifiers_changed'] = { 'old': old_ids, + 'old_with_labels': list(map(lambda x: f"{x[0]} '{x[1]}'", zip(old_ids, older_compendium_file.add_labels(old_ids)))), 'new': ids, + 'new_with_labels': list(map(lambda x: f"{x[0]} '{x[1]}'", zip(ids, self.add_labels(ids)))), 
'added': sorted(set(ids) - set(old_ids)), 'deleted': sorted(set(old_ids) - set(ids)), } From 47ea74ae04f56e942cd24d20b5c26eeff3291daa Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Thu, 19 Dec 2024 16:40:02 -0500 Subject: [PATCH 09/10] Cleaned up code and documentation a bit. --- scripts/comparator.py | 48 +++++++++++++++---------------------------- 1 file changed, 17 insertions(+), 31 deletions(-) diff --git a/scripts/comparator.py b/scripts/comparator.py index 3e2f3350..4c6897cd 100644 --- a/scripts/comparator.py +++ b/scripts/comparator.py @@ -2,6 +2,14 @@ # # comparator.py - A script for comparing Babel files from different runs # +# You can run this script on a single compendium file: +# python comparator.py dir1/compendia/Disease.txt dir2/compendia/Disease.txt +# Or on an entire directory: +# python comparator.py dir1/compendia dir2/compendia +# +# It currently only writes out a JSON document to STDOUT, but in the future we might add a TSV output as well. +# + import concurrent import json import os @@ -17,34 +25,8 @@ class CompendiumFile: """ - CompendiumFile represents a data handler for managing and processing a compendium - file. It is used to load compendium data from a file, map identifiers to their - preferred IDs, extract associated labels, descriptions, taxonomic information, - and other metadata. - - The class provides methods to load data from a specified file path and maintains - the mappings and metadata in memory for further processing. - - :ivar path: The path to the compendium file. - :type path: str - :ivar file_exists: A boolean indicating if the compendium file exists at the specified path. - :type file_exists: bool - :ivar row_count: The number of rows processed from the compendium file. - :type row_count: int - :ivar curie_to_preferred_id: A dictionary mapping CURIEs to their preferred identifiers. - :type curie_to_preferred_id: dict - :ivar curie_to_label: A dictionary mapping CURIEs to their associated labels. 
- :type curie_to_label: dict - :ivar curie_to_description: A defaultdict mapping CURIEs to sets of descriptions. - :type curie_to_description: defaultdict - :ivar curie_to_taxa: A defaultdict mapping CURIEs to sets of taxonomic identifiers. - :type curie_to_taxa: defaultdict - :ivar preferred_id_to_type: A dict mapping preferred identifiers to their types. - :type preferred_id_to_type: dict - :ivar preferred_id_to_preferred_name: A defaultdict mapping preferred identifiers to their preferred names. - :type preferred_id_to_preferred_name: defaultdict - :ivar preferred_id_to_ic: A dictionary mapping preferred identifiers to their information content scores. - :type preferred_id_to_ic: dict + Represents a compendium file at a particular path. The load() method will load the file into a series of in-memory + dictionaries, and the diffs_to() method will generate a diff between this compendium file and another compendium file. """ def __init__(self, path): @@ -159,6 +141,7 @@ def diffs_to(self, older_compendium_file: 'CompendiumFile'): cliques_additions = {} cliques_deletions = {} + cliques_unchanged = {} clique_changes = {} for preferred_curie, typ in self.preferred_id_to_type.items(): if preferred_curie not in older_compendium_file.preferred_id_to_type: @@ -170,6 +153,8 @@ def diffs_to(self, older_compendium_file: 'CompendiumFile'): 'identifiers': self.preferred_id_to_clique[preferred_curie], } else: + # The clique is present in both self and older_compendium_file, so we need to determine if it's + # changed or not. clique_change = { 'type': typ, 'preferred_curie': preferred_curie, @@ -210,8 +195,11 @@ def diffs_to(self, older_compendium_file: 'CompendiumFile'): 'deleted': sorted(set(old_ids) - set(ids)), } + # If something actually changed, add it to the clique changes list. 
if flag_actually_changed: clique_changes[preferred_curie] = clique_change + else: + cliques_unchanged[preferred_curie] = clique_change for old_preferred_curie, typ in older_compendium_file.preferred_id_to_type.items(): if old_preferred_curie not in self.preferred_id_to_type: @@ -223,7 +211,7 @@ def diffs_to(self, older_compendium_file: 'CompendiumFile'): 'identifiers': older_compendium_file.preferred_id_to_clique[old_preferred_curie], } - # Step 3. Report on all the identifiers. + # Step 3. Report on all the identifiers and cliques. return { 'net_identifier_change': len(identifiers_added) - len(identifiers_deleted), 'net_clique_change': (clique_count - old_clique_count), @@ -349,8 +337,6 @@ def comparator(input_type, input1, input2, max_workers): logging.error(f"Error comparing files: {exc}") raise exc - print(json.dumps(results, indent=2)) - else: raise RuntimeError(f"Cannot compare a file to a directory or vice versa: {input1} and {input2}.") From 4c10fcbf42d757b4fbffb5ba86f7152e2c59c991 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Fri, 3 Jan 2025 14:14:51 -0500 Subject: [PATCH 10/10] Clarified old and new paths. --- scripts/comparator.py | 81 ++++++++++++++++++++++--------------------- 1 file changed, 41 insertions(+), 40 deletions(-) diff --git a/scripts/comparator.py b/scripts/comparator.py index 4c6897cd..641784b5 100644 --- a/scripts/comparator.py +++ b/scripts/comparator.py @@ -228,22 +228,22 @@ def diffs_to(self, older_compendium_file: 'CompendiumFile'): } -def compare_compendium_files(path1, path2): +def compare_compendium_files(path_old, path_new): """ Compare two compendium files. - @param path1: First path to compare. - @param path2: Second path to compare. - @return A comparison between the two compendium files as a dictionary. + :param path_old: The older compendium file to compare. + :param path_new: The newer compendium file to compare. + :return: A comparison between the two compendium files as a dictionary.
""" time_started = time.time_ns() - compendium1 = CompendiumFile(path1) - compendium2 = CompendiumFile(path2) + compendium_old = CompendiumFile(path_old) + compendium_new = CompendiumFile(path_new) # Load the two files in parallel. - thread_compendium1 = threading.Thread(target=compendium1.load) - thread_compendium2 = threading.Thread(target=compendium2.load) + thread_compendium1 = threading.Thread(target=compendium_old.load) + thread_compendium2 = threading.Thread(target=compendium_new.load) thread_compendium1.start() thread_compendium2.start() thread_compendium1.join() @@ -251,45 +251,46 @@ def compare_compendium_files(path1, path2): # Craft results and return. result = { - 'compendium1': { - 'path': path1, - 'file_exists': compendium1.file_exists, - 'row_count': compendium1.row_count, - 'curie_count': len(compendium1.curie_to_preferred_id), - 'clique_count': len(compendium1.preferred_id_to_type), - 'types': list(sorted(set(compendium1.preferred_id_to_type.values()))), + 'compendium_old': { + 'path': path_old, + 'file_exists': compendium_old.file_exists, + 'row_count': compendium_old.row_count, + 'curie_count': len(compendium_old.curie_to_preferred_id), + 'clique_count': len(compendium_old.preferred_id_to_type), + 'types': list(sorted(set(compendium_old.preferred_id_to_type.values()))), }, - 'compendium2': { - 'path': path2, - 'file_exists': compendium2.file_exists, - 'row_count': compendium2.row_count, - 'curie_count': len(compendium2.curie_to_preferred_id), - 'clique_count': len(compendium2.preferred_id_to_type), - 'types': list(set(sorted(compendium2.preferred_id_to_type.values()))), + 'compendium_new': { + 'path': path_new, + 'file_exists': compendium_new.file_exists, + 'row_count': compendium_new.row_count, + 'curie_count': len(compendium_new.curie_to_preferred_id), + 'clique_count': len(compendium_new.preferred_id_to_type), + 'types': list(sorted(set(compendium_new.preferred_id_to_type.values()))), }, - 'diffs': compendium2.diffs_to(compendium1), + 'diffs':
compendium_new.diffs_to(compendium_old), } time_ended = time.time_ns() - logging.info(f"Comparison of {path1} and {path2} took {(time_ended - time_started) / 1_000_000_000:.2f} seconds.") + logging.info(f"Comparison of {path_old} to {path_new} took {(time_ended - time_started) / 1_000_000_000:.2f} seconds.") return result @click.command() @click.option('--input-type', type=click.Choice(['compendium', 'synonyms']), default='compendium') -@click.argument('input1', type=click.Path(exists=True, file_okay=True, dir_okay=True), required=True) -@click.argument('input2', type=click.Path(exists=True, file_okay=True, dir_okay=True), required=True) +@click.argument('input_old', type=click.Path(exists=True, file_okay=True, dir_okay=True), required=True) +@click.argument('input_new', type=click.Path(exists=True, file_okay=True, dir_okay=True), required=True) @click.option('--max-workers', '-j', type=int, default=None, help='Maximum number of workers to use for parallel processing.') -def comparator(input_type, input1, input2, max_workers): +def comparator(input_type, input_old, input_new, max_workers): """ - Compares two compendium or synonym files. + Compares either two compendium files or two directories containing compendium files. + \f :param input_type: Specifies the type of the files to compare. Options are 'compendium' or 'synonyms' (not yet supported). Defaults to 'compendium'. - :param input1: First path (file or directory) to compare. - :param input2: Second file (file or directory) to compare. + :param input_old: Older path (file or directory) to compare. + :param input_new: Newer path (file or directory) to compare. :param max_workers: Maximum number of workers to use for parallel processing. """ @@ -298,18 +299,18 @@ def comparator(input_type, input1, input2, max_workers): raise NotImplementedError(f"Input type '{input_type}' is not yet supported.") # Do the comparison. 
- if os.path.isfile(input1) and os.path.isfile(input2): - results = compare_compendium_files(input1, input2) - elif os.path.isdir(input1) and os.path.isdir(input2): + if os.path.isfile(input_old) and os.path.isfile(input_new): + results = compare_compendium_files(input_old, input_new) + elif os.path.isdir(input_old) and os.path.isdir(input_new): results = { - 'directory1': {'path': input1}, - 'directory2': {'path': input2}, + 'directory1': {'path': input_old}, + 'directory2': {'path': input_new}, 'comparisons': [], } # Make a list of all the files in the directories input1 and input2. - files1 = os.listdir(input1) - files2 = os.listdir(input2) + files1 = os.listdir(input_old) + files2 = os.listdir(input_new) all_filenames = set(files1 + files2) with ThreadPoolExecutor(max_workers=max_workers) as executor: @@ -317,8 +318,8 @@ def comparator(input_type, input1, input2, max_workers): for filename in sorted(all_filenames): if filename.startswith('.'): continue - path1 = os.path.join(input1, filename) - path2 = os.path.join(input2, filename) + path1 = os.path.join(input_old, filename) + path2 = os.path.join(input_new, filename) if os.path.isdir(path1): logging.warning(f"Skipping directory {path1} in comparison.") @@ -338,7 +339,7 @@ def comparator(input_type, input1, input2, max_workers): raise exc else: - raise RuntimeError(f"Cannot compare a file to a directory or vice versa: {input1} and {input2}.") + raise RuntimeError(f"Cannot compare a file to a directory or vice versa: {input_old} and {input_new}.") print(json.dumps(results, indent=2))