From 57efd45a5e9ba3179aa49e1edf08ed631138c09c Mon Sep 17 00:00:00 2001
From: Gaurav Vaidya <gaurav@renci.org>
Date: Mon, 16 Dec 2024 18:03:48 -0500
Subject: [PATCH 01/10] First stab at a simple comparator script.

---
 requirements.txt      |  2 ++
 scripts/comparator.py | 30 ++++++++++++++++++++++++++++++
 2 files changed, 32 insertions(+)
 create mode 100644 scripts/comparator.py

diff --git a/requirements.txt b/requirements.txt
index 31d87eb7..fb634550 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -27,3 +27,5 @@ curies
 duckdb
 # Added by Gaurav, Jul 2024
 sssom
+# Added by Gaurav, Dec 2024, to support command-line applications
+click
\ No newline at end of file
diff --git a/scripts/comparator.py b/scripts/comparator.py
new file mode 100644
index 00000000..1cc79d4e
--- /dev/null
+++ b/scripts/comparator.py
@@ -0,0 +1,30 @@
+#!/usr/bin/env python3
+
+#
+# comparator.py - A script for comparing Babel files from different runs
+#
+
+import click
+
+@click.command()
+@click.option('--file-type', type=click.Choice(['compendium']), default='compendium')
+@click.argument('file1', type=click.File('r'), required=True)
+@click.argument('file2', type=click.File('r'), required=True)
+def comparator(file_type, file1, file2):
+    """
+    Compares two compendium or synonym files.
+
+    :param file_type: Specifies the type of the files to compare.
+        Options are 'compendium' or 'synonyms' (not yet supported).
+        Defaults to 'compendium'.
+    :param file1: First file to compare.
+    :param file2: Second file to compare.
+    """
+    with open(file1, 'r') as f1, open(file2, 'r') as f2:
+        for line1, line2 in zip(f1, f2):
+            # We can't really process them by-line, alas.
+            pass
+    return True
+
+if __name__ == "__main__":
+    comparator()
\ No newline at end of file

From 9c38f821366b9638ea8e279a9ffbb0eadac07be0 Mon Sep 17 00:00:00 2001
From: Gaurav Vaidya <gaurav@renci.org>
Date: Mon, 16 Dec 2024 18:38:22 -0500
Subject: [PATCH 02/10] Basic functioning diff method.

---
 scripts/comparator.py | 107 +++++++++++++++++++++++++++++++++++++-----
 1 file changed, 94 insertions(+), 13 deletions(-)

diff --git a/scripts/comparator.py b/scripts/comparator.py
index 1cc79d4e..65639995 100644
--- a/scripts/comparator.py
+++ b/scripts/comparator.py
@@ -1,30 +1,111 @@
 #!/usr/bin/env python3
-
 #
 # comparator.py - A script for comparing Babel files from different runs
 #
+import json
+import os
+import logging
+from collections import defaultdict
 
 import click
 
+logging.basicConfig(level=logging.INFO)
+
+class CompendiumFile:
+    def __init__(self, path):
+        self.path = path
+
+        # TODO: replace with DuckDB or something else more memory efficient.
+        self.curie_to_preferred_id = dict()
+        self.curie_to_label = dict()
+        self.curie_to_description = defaultdict(set)
+        self.curie_to_taxa = defaultdict(set)
+        self.preferred_id_to_type = defaultdict()
+        self.preferred_id_to_preferred_name = defaultdict()
+        self.preferred_id_to_ic = dict()
+        self.row_count = 0
+
+    def load(self):
+        with open(self.path, "r") as f:
+            for row in f:
+                self.row_count += 1
+                if self.row_count % 1000000 == 0:
+                    logging.info(f"Now loading line {self.row_count:,} from {self.path}")
+
+                clique = json.loads(row)
+
+                preferred_curie = clique['identifiers'][0]['i']
+                self.preferred_id_to_type[preferred_curie] = clique['type']
+                self.preferred_id_to_preferred_name[preferred_curie] = clique['preferred_name']
+                self.preferred_id_to_ic = clique['ic']
+
+                for identifier in clique['identifiers']:
+                    curie = identifier['i']
+                    self.curie_to_preferred_id[curie] = preferred_curie
+                    self.curie_to_label[curie] = identifier.get('l', '')
+                    self.curie_to_description[curie].update(identifier.get('d', []))
+                    self.curie_to_taxa[curie].update(identifier.get('t', []))
+
+        logging.info(f"Loaded {self.row_count:,} lines from {self.path}.")
+
+
+def compare_compendium_files(path1, path2):
+    """ Compare two compendium files.
+
+    @param path1: First path to compare.
+    @param path2: Second path to compare.
+    @return A comparison between the two compendium files as a dictionary.
+    """
+
+    compendium1 = CompendiumFile(path1)
+    compendium2 = CompendiumFile(path2)
+
+    # TODO: Figure out how to do this in parallel.
+    compendium1.load()
+    compendium2.load()
+
+    # Craft results and return.
+    return {
+        'compendium1': {
+            'path': path1,
+            'curie_count': len(compendium1.curie_to_preferred_id),
+            'clique_count': len(compendium1.preferred_id_to_type),
+            'types': list(sorted(set(compendium1.preferred_id_to_type.values()))),
+        },
+        'compendium2': {
+            'path': path2,
+            'curie_count': len(compendium2.curie_to_preferred_id),
+            'clique_count': len(compendium2.preferred_id_to_type),
+            'types': list(set(sorted(compendium2.preferred_id_to_type.values()))),
+        },
+    }
+
+
 @click.command()
-@click.option('--file-type', type=click.Choice(['compendium']), default='compendium')
-@click.argument('file1', type=click.File('r'), required=True)
-@click.argument('file2', type=click.File('r'), required=True)
-def comparator(file_type, file1, file2):
+@click.option('--input-type', type=click.Choice(['compendium', 'synonyms']), default='compendium')
+@click.argument('input1', type=click.Path(exists=True, file_okay=True, dir_okay=True), required=True)
+@click.argument('input2', type=click.Path(exists=True, file_okay=True, dir_okay=True), required=True)
+def comparator(input_type, input1, input2):
     """
     Compares two compendium or synonym files.
 
-    :param file_type: Specifies the type of the files to compare.
+    :param input_type: Specifies the type of the files to compare.
         Options are 'compendium' or 'synonyms' (not yet supported).
         Defaults to 'compendium'.
-    :param file1: First file to compare.
-    :param file2: Second file to compare.
+    :param input1: First path (file or directory) to compare.
+    :param input2: Second file (file or directory) to compare.
     """
-    with open(file1, 'r') as f1, open(file2, 'r') as f2:
-        for line1, line2 in zip(f1, f2):
-            # We can't really process them by-line, alas.
-            pass
-    return True
+
+    # Some features haven't been implemented yet.
+    if input_type != 'compendium':
+        raise NotImplementedError(f"Input type '{input_type}' is not yet supported.")
+    if not os.path.isfile(input1) or not os.path.isfile(input2):
+        raise NotImplementedError(f"Only file-based comparisons are currently supported.")
+
+    # Do the comparison.
+    results = compare_compendium_files(input1, input2)
+    print(json.dumps(results, indent=2))
+
 
 if __name__ == "__main__":
     comparator()
\ No newline at end of file

From 55c3cec209b122943f792213941db64f9bef69d4 Mon Sep 17 00:00:00 2001
From: Gaurav Vaidya <gaurav@renci.org>
Date: Mon, 16 Dec 2024 18:57:45 -0500
Subject: [PATCH 03/10] Added support for directory comparisons.

---
 scripts/comparator.py | 48 +++++++++++++++++++++++++++++++++++++++----
 1 file changed, 44 insertions(+), 4 deletions(-)

diff --git a/scripts/comparator.py b/scripts/comparator.py
index 65639995..cf0977ac 100644
--- a/scripts/comparator.py
+++ b/scripts/comparator.py
@@ -15,6 +15,9 @@ class CompendiumFile:
     def __init__(self, path):
         self.path = path
 
+        self.file_exists = os.path.exists(self.path)
+        self.row_count = 0
+
         # TODO: replace with DuckDB or something else more memory efficient.
         self.curie_to_preferred_id = dict()
         self.curie_to_label = dict()
@@ -23,9 +26,13 @@ def __init__(self, path):
         self.preferred_id_to_type = defaultdict()
         self.preferred_id_to_preferred_name = defaultdict()
         self.preferred_id_to_ic = dict()
-        self.row_count = 0
+
 
     def load(self):
+        if not os.path.exists(self.path):
+            logging.warning(f"Compendium file {self.path} does not exist.")
+            return
+
         with open(self.path, "r") as f:
             for row in f:
                 self.row_count += 1
@@ -68,12 +75,16 @@ def compare_compendium_files(path1, path2):
     return {
         'compendium1': {
             'path': path1,
+            'file_exists': compendium1.file_exists,
+            'row_count': compendium1.row_count,
             'curie_count': len(compendium1.curie_to_preferred_id),
             'clique_count': len(compendium1.preferred_id_to_type),
             'types': list(sorted(set(compendium1.preferred_id_to_type.values()))),
         },
         'compendium2': {
             'path': path2,
+            'file_exists': compendium2.file_exists,
+            'row_count': compendium2.row_count,
             'curie_count': len(compendium2.curie_to_preferred_id),
             'clique_count': len(compendium2.preferred_id_to_type),
             'types': list(set(sorted(compendium2.preferred_id_to_type.values()))),
@@ -99,11 +110,40 @@ def comparator(input_type, input1, input2):
     # Some features haven't been implemented yet.
     if input_type != 'compendium':
         raise NotImplementedError(f"Input type '{input_type}' is not yet supported.")
-    if not os.path.isfile(input1) or not os.path.isfile(input2):
-        raise NotImplementedError(f"Only file-based comparisons are currently supported.")
 
     # Do the comparison.
-    results = compare_compendium_files(input1, input2)
+    if os.path.isfile(input1) and os.path.isfile(input2):
+        results = compare_compendium_files(input1, input2)
+    elif os.path.isdir(input1) and os.path.isdir(input2):
+        results = {
+            'directory1': {'path': input1},
+            'directory2': {'path': input2},
+            'comparisons': [],
+        }
+
+        # Make a list of all the files in the directories input1 and input2.
+        files1 = os.listdir(input1)
+        files2 = os.listdir(input2)
+        all_filenames = set(files1 + files2)
+        for filename in sorted(all_filenames):
+            if filename.startswith('.'):
+                continue
+            path1 = os.path.join(input1, filename)
+            path2 = os.path.join(input2, filename)
+
+            if os.path.isdir(path1):
+                logging.warning(f"Skipping directory {path1} in comparison.")
+                continue
+
+            if os.path.isdir(path2):
+                logging.warning(f"Skipping directory {path2} in comparison.")
+                continue
+
+            result = compare_compendium_files(path1, path2)
+            results['comparisons'].append(result)
+    else:
+        raise RuntimeError(f"Cannot compare a file to a directory or vice versa: {input1} and {input2}.")
+    
     print(json.dumps(results, indent=2))
 
 

From acc98500692449802edf0158d79f46aa0eac2e9c Mon Sep 17 00:00:00 2001
From: Gaurav Vaidya <gaurav@renci.org>
Date: Thu, 19 Dec 2024 15:12:32 -0500
Subject: [PATCH 04/10] Added some documentation.

---
 scripts/comparator.py | 51 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 51 insertions(+)

diff --git a/scripts/comparator.py b/scripts/comparator.py
index cf0977ac..22a26dc8 100644
--- a/scripts/comparator.py
+++ b/scripts/comparator.py
@@ -12,7 +12,43 @@
 logging.basicConfig(level=logging.INFO)
 
 class CompendiumFile:
+    """
+    CompendiumFile represents a data handler for managing and processing a compendium
+    file. It is used to load compendium data from a file, map identifiers to their
+    preferred IDs, extract associated labels, descriptions, taxonomic information,
+    and other metadata.
+
+    The class provides methods to load data from a specified file path and maintains
+    the mappings and metadata in memory for further processing.
+
+    :ivar path: The path to the compendium file.
+    :type path: str
+    :ivar file_exists: A boolean indicating if the compendium file exists at the specified path.
+    :type file_exists: bool
+    :ivar row_count: The number of rows processed from the compendium file.
+    :type row_count: int
+    :ivar curie_to_preferred_id: A dictionary mapping CURIEs to their preferred identifiers.
+    :type curie_to_preferred_id: dict
+    :ivar curie_to_label: A dictionary mapping CURIEs to their associated labels.
+    :type curie_to_label: dict
+    :ivar curie_to_description: A defaultdict mapping CURIEs to sets of descriptions.
+    :type curie_to_description: defaultdict
+    :ivar curie_to_taxa: A defaultdict mapping CURIEs to sets of taxonomic identifiers.
+    :type curie_to_taxa: defaultdict
+    :ivar preferred_id_to_type: A defaultdict mapping preferred identifiers to their types.
+    :type preferred_id_to_type: defaultdict
+    :ivar preferred_id_to_preferred_name: A defaultdict mapping preferred identifiers to their preferred names.
+    :type preferred_id_to_preferred_name: defaultdict
+    :ivar preferred_id_to_ic: A dictionary mapping preferred identifiers to their information content scores.
+    :type preferred_id_to_ic: dict
+    """
+
     def __init__(self, path):
+        """
+        Initialize a CompendiumFile object with the specified path. We don't load the file until load() is called.
+
+        :param path: File path to initialize and load metadata from.
+        """
         self.path = path
 
         self.file_exists = os.path.exists(self.path)
@@ -29,6 +65,21 @@ def __init__(self, path):
 
 
     def load(self):
+        """
+        Loads compendium data from the specified file path into various mappings.
+
+        This method reads data from a JSON lines file located at the path specified
+        by the instance attribute `path`. Each line in the file should represent a
+        clique object in JSON format. The method populates multiple mappings
+        based on the contents of the file, including mappings between CURIEs and
+        their preferred identifiers, labels, descriptions, taxa, types, and
+        information content (IC).
+
+        The method tracks and logs the progress of the file loading process. It will
+        log a warning if the specified file path does not exist, and progress
+        information is logged for every million lines processed. At the end, the
+        method logs the total number of lines read.
+        """
         if not os.path.exists(self.path):
             logging.warning(f"Compendium file {self.path} does not exist.")
             return

From a936a5765b7743f0f5b325ed6a8ec7359cc0d7e3 Mon Sep 17 00:00:00 2001
From: Gaurav Vaidya <gaurav@renci.org>
Date: Thu, 19 Dec 2024 15:31:04 -0500
Subject: [PATCH 05/10] Parallelized file loading and directory comparisons.

---
 scripts/comparator.py | 61 +++++++++++++++++++++++++++++--------------
 1 file changed, 41 insertions(+), 20 deletions(-)

diff --git a/scripts/comparator.py b/scripts/comparator.py
index 22a26dc8..949a4337 100644
--- a/scripts/comparator.py
+++ b/scripts/comparator.py
@@ -2,10 +2,13 @@
 #
 # comparator.py - A script for comparing Babel files from different runs
 #
+import concurrent
 import json
 import os
 import logging
+import threading
 from collections import defaultdict
+from concurrent.futures import ThreadPoolExecutor
 
 import click
 
@@ -118,9 +121,13 @@ def compare_compendium_files(path1, path2):
     compendium1 = CompendiumFile(path1)
     compendium2 = CompendiumFile(path2)
 
-    # TODO: Figure out how to do this in parallel.
-    compendium1.load()
-    compendium2.load()
+    # Load the two files in parallel.
+    thread_compendium1 = threading.Thread(target=compendium1.load)
+    thread_compendium2 = threading.Thread(target=compendium2.load)
+    thread_compendium1.start()
+    thread_compendium2.start()
+    thread_compendium1.join()
+    thread_compendium2.join()
 
     # Craft results and return.
     return {
@@ -147,7 +154,8 @@ def compare_compendium_files(path1, path2):
 @click.option('--input-type', type=click.Choice(['compendium', 'synonyms']), default='compendium')
 @click.argument('input1', type=click.Path(exists=True, file_okay=True, dir_okay=True), required=True)
 @click.argument('input2', type=click.Path(exists=True, file_okay=True, dir_okay=True), required=True)
-def comparator(input_type, input1, input2):
+@click.option('--max-workers', '-j', type=int, default=None, help='Maximum number of workers to use for parallel processing.')
+def comparator(input_type, input1, input2, max_workers):
     """
     Compares two compendium or synonym files.
 
@@ -156,6 +164,7 @@ def comparator(input_type, input1, input2):
         Defaults to 'compendium'.
     :param input1: First path (file or directory) to compare.
     :param input2: Second file (file or directory) to compare.
+    :param max_workers: Maximum number of workers to use for parallel processing.
     """
 
     # Some features haven't been implemented yet.
@@ -176,22 +185,34 @@ def comparator(input_type, input1, input2):
         files1 = os.listdir(input1)
         files2 = os.listdir(input2)
         all_filenames = set(files1 + files2)
-        for filename in sorted(all_filenames):
-            if filename.startswith('.'):
-                continue
-            path1 = os.path.join(input1, filename)
-            path2 = os.path.join(input2, filename)
-
-            if os.path.isdir(path1):
-                logging.warning(f"Skipping directory {path1} in comparison.")
-                continue
-
-            if os.path.isdir(path2):
-                logging.warning(f"Skipping directory {path2} in comparison.")
-                continue
-
-            result = compare_compendium_files(path1, path2)
-            results['comparisons'].append(result)
+
+        with ThreadPoolExecutor(max_workers=max_workers) as executor:
+            futures = []
+            for filename in sorted(all_filenames):
+                if filename.startswith('.'):
+                    continue
+                path1 = os.path.join(input1, filename)
+                path2 = os.path.join(input2, filename)
+
+                if os.path.isdir(path1):
+                    logging.warning(f"Skipping directory {path1} in comparison.")
+                    continue
+
+                if os.path.isdir(path2):
+                    logging.warning(f"Skipping directory {path2} in comparison.")
+                    continue
+
+                futures.append(executor.submit(compare_compendium_files, path1, path2))
+
+            for future in concurrent.futures.as_completed(futures):
+                try:
+                    results['comparisons'].append(future.result())
+                except Exception as exc:
+                    logging.error(f"Error comparing files: {exc}")
+                    raise exc
+
+        print(json.dumps(results, indent=2))
+
     else:
         raise RuntimeError(f"Cannot compare a file to a directory or vice versa: {input1} and {input2}.")
     

From 02adb2cfb58cab0333cf883d3765f93d2db82738 Mon Sep 17 00:00:00 2001
From: Gaurav Vaidya <gaurav@renci.org>
Date: Thu, 19 Dec 2024 15:52:51 -0500
Subject: [PATCH 06/10] Added labelled identifier-level diffs between compendia.

---
 scripts/comparator.py | 41 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 41 insertions(+)

diff --git a/scripts/comparator.py b/scripts/comparator.py
index 949a4337..9cc652e4 100644
--- a/scripts/comparator.py
+++ b/scripts/comparator.py
@@ -109,6 +109,46 @@ def load(self):
 
         logging.info(f"Loaded {self.row_count:,} lines from {self.path}.")
 
+    def diffs_to(self, older_compendium_file: 'CompendiumFile'):
+        """
+        Generate diff counts between this compendium file and the older compendium file.
+
+        :param older_compendium_file: A CompendiumFile object representing the older compendium file.
+        :return: A dictionary.
+        """
+
+        # Step 1. Figure out which identifiers have changed cliques between these two compendia.
+        identifiers_added = set()
+        identifiers_not_changed = set()
+        identifiers_changed = set()
+        identifiers_deleted = set()
+        for curie, preferred_curie in self.curie_to_preferred_id.items():
+            if curie not in older_compendium_file.curie_to_preferred_id:
+                identifiers_added.add((curie, self.curie_to_label[curie], None, '', preferred_curie, self.preferred_id_to_preferred_name[preferred_curie]))
+            else:
+                old_preferred_curie = older_compendium_file.curie_to_preferred_id.get(curie)
+                if preferred_curie == old_preferred_curie:
+                    identifiers_not_changed.add((curie, self.curie_to_label[curie], old_preferred_curie, older_compendium_file.preferred_id_to_preferred_name[old_preferred_curie], preferred_curie, self.preferred_id_to_preferred_name[preferred_curie]))
+                else:
+                    identifiers_changed.add((curie, self.curie_to_label[curie], old_preferred_curie, older_compendium_file.preferred_id_to_preferred_name[old_preferred_curie], preferred_curie, self.preferred_id_to_preferred_name[preferred_curie]))
+
+        for old_curie, old_preferred_curie in older_compendium_file.curie_to_preferred_id.items():
+            if old_curie not in self.curie_to_preferred_id:
+                identifiers_deleted.add((old_curie, older_compendium_file.curie_to_label[old_curie], old_preferred_curie, older_compendium_file.preferred_id_to_preferred_name[old_preferred_curie], None, ''))
+
+        # Step 2. Figure out the clique change.
+        clique_count = len(self.preferred_id_to_type.keys())
+        old_clique_count = len(older_compendium_file.preferred_id_to_type.keys())
+
+        # Step 3. Report on all the identifiers.
+        return {
+            'net_identifier_change': len(identifiers_added) - len(identifiers_deleted),
+            'net_clique_change': (clique_count - old_clique_count),
+            'additions': sorted(map(lambda x: f"{x[0]} '{x[1]}' (to clique {x[4]} '{x[5]}')", identifiers_added)),
+            'deletions': sorted(map(lambda x: f"{x[0]} '{x[1]}' (from clique {x[2]} '{x[3]}')", identifiers_deleted)),
+            'changes': sorted(map(lambda x: f"{x[0]} '{x[1]}' moved from {x[2]} '{x[3]}' to {x[4]} '{x[5]}'", identifiers_changed)),
+        }
+
 
 def compare_compendium_files(path1, path2):
     """ Compare two compendium files.
@@ -147,6 +187,7 @@ def compare_compendium_files(path1, path2):
             'clique_count': len(compendium2.preferred_id_to_type),
             'types': list(set(sorted(compendium2.preferred_id_to_type.values()))),
         },
+        'diffs': compendium2.diffs_to(compendium1),
     }
 
 

From 3a610850e362085c8e0f3095dff7f2ceee0f36ec Mon Sep 17 00:00:00 2001
From: Gaurav Vaidya <gaurav@renci.org>
Date: Thu, 19 Dec 2024 16:20:40 -0500
Subject: [PATCH 07/10] Added clique-level diffs, timing logs, and fixed IC mapping.

---
 scripts/comparator.py | 106 ++++++++++++++++++++++++++++++++++++++----
 1 file changed, 96 insertions(+), 10 deletions(-)

diff --git a/scripts/comparator.py b/scripts/comparator.py
index 9cc652e4..73f9196e 100644
--- a/scripts/comparator.py
+++ b/scripts/comparator.py
@@ -7,6 +7,7 @@
 import os
 import logging
 import threading
+import time
 from collections import defaultdict
 from concurrent.futures import ThreadPoolExecutor
 
@@ -38,8 +39,8 @@ class CompendiumFile:
     :type curie_to_description: defaultdict
     :ivar curie_to_taxa: A defaultdict mapping CURIEs to sets of taxonomic identifiers.
     :type curie_to_taxa: defaultdict
-    :ivar preferred_id_to_type: A defaultdict mapping preferred identifiers to their types.
-    :type preferred_id_to_type: defaultdict
+    :ivar preferred_id_to_type: A dict mapping preferred identifiers to their types.
+    :type preferred_id_to_type: dict
     :ivar preferred_id_to_preferred_name: A defaultdict mapping preferred identifiers to their preferred names.
     :type preferred_id_to_preferred_name: defaultdict
     :ivar preferred_id_to_ic: A dictionary mapping preferred identifiers to their information content scores.
@@ -58,11 +59,12 @@ def __init__(self, path):
         self.row_count = 0
 
         # TODO: replace with DuckDB or something else more memory efficient.
+        self.preferred_id_to_clique = defaultdict(list)
         self.curie_to_preferred_id = dict()
         self.curie_to_label = dict()
         self.curie_to_description = defaultdict(set)
         self.curie_to_taxa = defaultdict(set)
-        self.preferred_id_to_type = defaultdict()
+        self.preferred_id_to_type = dict()
         self.preferred_id_to_preferred_name = defaultdict()
         self.preferred_id_to_ic = dict()
 
@@ -83,6 +85,9 @@ def load(self):
         information is logged for every million lines processed. At the end, the
         method logs the total number of lines read.
         """
+
+        time_started = time.time_ns()
+
         if not os.path.exists(self.path):
             logging.warning(f"Compendium file {self.path} does not exist.")
             return
@@ -98,7 +103,8 @@ def load(self):
                 preferred_curie = clique['identifiers'][0]['i']
                 self.preferred_id_to_type[preferred_curie] = clique['type']
                 self.preferred_id_to_preferred_name[preferred_curie] = clique['preferred_name']
-                self.preferred_id_to_ic = clique['ic']
+                self.preferred_id_to_ic[preferred_curie] = clique['ic']
+                self.preferred_id_to_clique[preferred_curie] = list(map(lambda x: x['i'], clique['identifiers']))
 
                 for identifier in clique['identifiers']:
                     curie = identifier['i']
@@ -107,7 +113,8 @@ def load(self):
                     self.curie_to_description[curie].update(identifier.get('d', []))
                     self.curie_to_taxa[curie].update(identifier.get('t', []))
 
-        logging.info(f"Loaded {self.row_count:,} lines from {self.path}.")
+        time_ended = time.time_ns()
+        logging.info(f"Loaded {self.row_count:,} lines from {self.path} in {(time_ended - time_started) / 1_000_000_000:.2f} seconds.")
 
     def diffs_to(self, older_compendium_file: 'CompendiumFile'):
         """
@@ -122,6 +129,7 @@ def diffs_to(self, older_compendium_file: 'CompendiumFile'):
         identifiers_not_changed = set()
         identifiers_changed = set()
         identifiers_deleted = set()
+
         for curie, preferred_curie in self.curie_to_preferred_id.items():
             if curie not in older_compendium_file.curie_to_preferred_id:
                 identifiers_added.add((curie, self.curie_to_label[curie], None, '', preferred_curie, self.preferred_id_to_preferred_name[preferred_curie]))
@@ -136,17 +144,88 @@ def diffs_to(self, older_compendium_file: 'CompendiumFile'):
             if old_curie not in self.curie_to_preferred_id:
                 identifiers_deleted.add((old_curie, older_compendium_file.curie_to_label[old_curie], old_preferred_curie, older_compendium_file.preferred_id_to_preferred_name[old_preferred_curie], None, ''))
 
-        # Step 2. Figure out the clique change.
+        # Step 2. Figure out the clique changes.
         clique_count = len(self.preferred_id_to_type.keys())
         old_clique_count = len(older_compendium_file.preferred_id_to_type.keys())
 
+        cliques_additions = {}
+        cliques_deletions = {}
+        clique_changes = {}
+        for preferred_curie, typ in self.preferred_id_to_type.items():
+            if preferred_curie not in older_compendium_file.preferred_id_to_type:
+                # Addition.
+                cliques_additions[preferred_curie] = {
+                    'type': typ,
+                    'preferred_curie': preferred_curie,
+                    'preferred_name': self.preferred_id_to_preferred_name[preferred_curie],
+                    'identifiers': self.preferred_id_to_clique[preferred_curie],
+                }
+            else:
+                clique_change = {
+                    'type': typ,
+                    'preferred_curie': preferred_curie,
+                    'preferred_name': self.preferred_id_to_preferred_name[preferred_curie],
+                    'identifiers': self.preferred_id_to_clique[preferred_curie],
+                }
+
+                # But did anything actually change?
+                flag_actually_changed = False
+
+                old_typ = older_compendium_file.preferred_id_to_type[preferred_curie]
+                if old_typ != typ:
+                    flag_actually_changed = True
+                    clique_change['type'] = {
+                        'old': old_typ,
+                        'new': typ,
+                    }
+
+                clique_label = self.preferred_id_to_preferred_name[preferred_curie]
+                old_clique_label = older_compendium_file.preferred_id_to_preferred_name[preferred_curie]
+                if clique_label != old_clique_label:
+                    flag_actually_changed = True
+                    clique_change['preferred_name'] = {
+                        'old': old_clique_label,
+                        'new': clique_label,
+                    }
+
+                ids = self.preferred_id_to_clique[preferred_curie]
+                old_ids = older_compendium_file.preferred_id_to_clique[preferred_curie]
+                if ids != old_ids:
+                    flag_actually_changed = True
+                    clique_change['identifiers'] = {
+                        'old': old_ids,
+                        'new': ids,
+                        'added': sorted(set(ids) - set(old_ids)),
+                        'deleted': sorted(set(old_ids) - set(ids)),
+                    }
+
+                if flag_actually_changed:
+                    clique_changes[preferred_curie] = clique_change
+
+        for old_preferred_curie, typ in older_compendium_file.preferred_id_to_type.items():
+            if old_preferred_curie not in self.preferred_id_to_type:
+                # Deletion.
+                cliques_deletions[old_preferred_curie] = {
+                    'type': typ,
+                    'preferred_curie': old_preferred_curie,
+                    'preferred_name': older_compendium_file.preferred_id_to_preferred_name[old_preferred_curie],
+                    'identifiers': older_compendium_file.preferred_id_to_clique[old_preferred_curie],
+                }
+
         # Step 3. Report on all the identifiers.
         return {
             'net_identifier_change': len(identifiers_added) - len(identifiers_deleted),
             'net_clique_change': (clique_count - old_clique_count),
-            'additions': sorted(map(lambda x: f"{x[0]} '{x[1]}' (to clique {x[4]} '{x[5]}')", identifiers_added)),
-            'deletions': sorted(map(lambda x: f"{x[0]} '{x[1]}' (from clique {x[2]} '{x[3]}')", identifiers_deleted)),
-            'changes': sorted(map(lambda x: f"{x[0]} '{x[1]}' moved from {x[2]} '{x[3]}' to {x[4]} '{x[5]}'", identifiers_changed)),
+            'identifiers': {
+                'additions': sorted(map(lambda x: f"{x[0]} '{x[1]}' (to clique {x[4]} '{x[5]}')", identifiers_added)),
+                'deletions': sorted(map(lambda x: f"{x[0]} '{x[1]}' (from clique {x[2]} '{x[3]}')", identifiers_deleted)),
+                'changes': sorted(map(lambda x: f"{x[0]} '{x[1]}' moved from {x[2]} '{x[3]}' to {x[4]} '{x[5]}'", identifiers_changed)),
+            },
+            'cliques': {
+                'additions': cliques_additions,
+                'deletions': cliques_deletions,
+                'changes': clique_changes,
+            },
         }
 
 
@@ -158,6 +237,8 @@ def compare_compendium_files(path1, path2):
     @return A comparison between the two compendium files as a dictionary.
     """
 
+    time_started = time.time_ns()
+
     compendium1 = CompendiumFile(path1)
     compendium2 = CompendiumFile(path2)
 
@@ -170,7 +251,7 @@ def compare_compendium_files(path1, path2):
     thread_compendium2.join()
 
     # Craft results and return.
-    return {
+    result = {
         'compendium1': {
             'path': path1,
             'file_exists': compendium1.file_exists,
@@ -190,6 +271,11 @@ def compare_compendium_files(path1, path2):
         'diffs': compendium2.diffs_to(compendium1),
     }
 
+    time_ended = time.time_ns()
+    logging.info(f"Comparison of {path1} and {path2} took {(time_ended - time_started) / 1_000_000_000:.2f} seconds.")
+
+    return result
+
 
 @click.command()
 @click.option('--input-type', type=click.Choice(['compendium', 'synonyms']), default='compendium')

From 3c47597881fe68738f7a244ecd287d0a68c828fa Mon Sep 17 00:00:00 2001
From: Gaurav Vaidya <gaurav@renci.org>
Date: Thu, 19 Dec 2024 16:34:10 -0500
Subject: [PATCH 08/10] Cleaned up clique outputs.

---
 scripts/comparator.py | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/scripts/comparator.py b/scripts/comparator.py
index 73f9196e..3e2f3350 100644
--- a/scripts/comparator.py
+++ b/scripts/comparator.py
@@ -116,6 +116,15 @@ def load(self):
         time_ended = time.time_ns()
         logging.info(f"Loaded {self.row_count:,} lines from {self.path} in {(time_ended - time_started) / 1_000_000_000:.2f} seconds.")
 
+    def add_labels(self, ids: list[str]):
+        """
+        Return a list of labels for the IDs in ids.
+
+        :param ids: A list of identifiers.
+        :return: A list of labels.
+        """
+        return list(map(lambda x: self.curie_to_label.get(x, ''), ids))
+
     def diffs_to(self, older_compendium_file: 'CompendiumFile'):
         """
         Generate diff counts between this compendium file and the older compendium file.
@@ -174,7 +183,7 @@ def diffs_to(self, older_compendium_file: 'CompendiumFile'):
                 old_typ = older_compendium_file.preferred_id_to_type[preferred_curie]
                 if old_typ != typ:
                     flag_actually_changed = True
-                    clique_change['type'] = {
+                    clique_change['type_changed'] = {
                         'old': old_typ,
                         'new': typ,
                     }
@@ -183,7 +192,7 @@ def diffs_to(self, older_compendium_file: 'CompendiumFile'):
                 old_clique_label = older_compendium_file.preferred_id_to_preferred_name[preferred_curie]
                 if clique_label != old_clique_label:
                     flag_actually_changed = True
-                    clique_change['preferred_name'] = {
+                    clique_change['preferred_name_changed'] = {
                         'old': old_clique_label,
                         'new': clique_label,
                     }
@@ -192,9 +201,11 @@ def diffs_to(self, older_compendium_file: 'CompendiumFile'):
                 old_ids = older_compendium_file.preferred_id_to_clique[preferred_curie]
                 if ids != old_ids:
                     flag_actually_changed = True
-                    clique_change['identifiers'] = {
+                    clique_change['identifiers_changed'] = {
                         'old': old_ids,
+                        'old_with_labels': list(map(lambda x: f"{x[0]} '{x[1]}'", zip(old_ids, older_compendium_file.add_labels(old_ids)))),
                         'new': ids,
+                        'new_with_labels': list(map(lambda x: f"{x[0]} '{x[1]}'", zip(ids, self.add_labels(ids)))),
                         'added': sorted(set(ids) - set(old_ids)),
                         'deleted': sorted(set(old_ids) - set(ids)),
                     }

From 47ea74ae04f56e942cd24d20b5c26eeff3291daa Mon Sep 17 00:00:00 2001
From: Gaurav Vaidya <gaurav@renci.org>
Date: Thu, 19 Dec 2024 16:40:02 -0500
Subject: [PATCH 09/10] Cleaned up code and documentation a bit.

---
 scripts/comparator.py | 48 +++++++++++++++----------------------------
 1 file changed, 17 insertions(+), 31 deletions(-)

diff --git a/scripts/comparator.py b/scripts/comparator.py
index 3e2f3350..4c6897cd 100644
--- a/scripts/comparator.py
+++ b/scripts/comparator.py
@@ -2,6 +2,14 @@
 #
 # comparator.py - A script for comparing Babel files from different runs
 #
+# You can run this script on a pair of compendium files (older file first):
+#   python comparator.py dir1/compendia/Disease.txt dir2/compendia/Disease.txt
+# Or on a pair of directories containing compendium files (older directory first):
+#   python comparator.py dir1/compendia dir2/compendia
+#
+# It currently only writes out a JSON document to STDOUT, but in the future we might add a TSV output as well.
+#
+
 import concurrent
 import json
 import os
@@ -17,34 +25,8 @@
 
 class CompendiumFile:
     """
-    CompendiumFile represents a data handler for managing and processing a compendium
-    file. It is used to load compendium data from a file, map identifiers to their
-    preferred IDs, extract associated labels, descriptions, taxonomic information,
-    and other metadata.
-
-    The class provides methods to load data from a specified file path and maintains
-    the mappings and metadata in memory for further processing.
-
-    :ivar path: The path to the compendium file.
-    :type path: str
-    :ivar file_exists: A boolean indicating if the compendium file exists at the specified path.
-    :type file_exists: bool
-    :ivar row_count: The number of rows processed from the compendium file.
-    :type row_count: int
-    :ivar curie_to_preferred_id: A dictionary mapping CURIEs to their preferred identifiers.
-    :type curie_to_preferred_id: dict
-    :ivar curie_to_label: A dictionary mapping CURIEs to their associated labels.
-    :type curie_to_label: dict
-    :ivar curie_to_description: A defaultdict mapping CURIEs to sets of descriptions.
-    :type curie_to_description: defaultdict
-    :ivar curie_to_taxa: A defaultdict mapping CURIEs to sets of taxonomic identifiers.
-    :type curie_to_taxa: defaultdict
-    :ivar preferred_id_to_type: A dict mapping preferred identifiers to their types.
-    :type preferred_id_to_type: dict
-    :ivar preferred_id_to_preferred_name: A defaultdict mapping preferred identifiers to their preferred names.
-    :type preferred_id_to_preferred_name: defaultdict
-    :ivar preferred_id_to_ic: A dictionary mapping preferred identifiers to their information content scores.
-    :type preferred_id_to_ic: dict
+    Represents a compendium file at a particular path. The load() method will load the file into a series of in-memory
+    dictionaries, and the diffs_to() method will generate a diff between this compendium file and another compendium file.
     """
 
     def __init__(self, path):
@@ -159,6 +141,7 @@ def diffs_to(self, older_compendium_file: 'CompendiumFile'):
 
         cliques_additions = {}
         cliques_deletions = {}
+        cliques_unchanged = {}
         clique_changes = {}
         for preferred_curie, typ in self.preferred_id_to_type.items():
             if preferred_curie not in older_compendium_file.preferred_id_to_type:
@@ -170,6 +153,8 @@ def diffs_to(self, older_compendium_file: 'CompendiumFile'):
                     'identifiers': self.preferred_id_to_clique[preferred_curie],
                 }
             else:
+                # The clique is present in both self and older_compendium_file, so we need to determine if it's
+                # changed or not.
                 clique_change = {
                     'type': typ,
                     'preferred_curie': preferred_curie,
@@ -210,8 +195,11 @@ def diffs_to(self, older_compendium_file: 'CompendiumFile'):
                         'deleted': sorted(set(old_ids) - set(ids)),
                     }
 
+                # If something actually changed, add it to the clique changes list.
                 if flag_actually_changed:
                     clique_changes[preferred_curie] = clique_change
+                else:
+                    cliques_unchanged[preferred_curie] = clique_change
 
         for old_preferred_curie, typ in older_compendium_file.preferred_id_to_type.items():
             if old_preferred_curie not in self.preferred_id_to_type:
@@ -223,7 +211,7 @@ def diffs_to(self, older_compendium_file: 'CompendiumFile'):
                     'identifiers': older_compendium_file.preferred_id_to_clique[old_preferred_curie],
                 }
 
-        # Step 3. Report on all the identifiers.
+        # Step 3. Report on all the identifiers and cliques.
         return {
             'net_identifier_change': len(identifiers_added) - len(identifiers_deleted),
             'net_clique_change': (clique_count - old_clique_count),
@@ -349,8 +337,6 @@ def comparator(input_type, input1, input2, max_workers):
                     logging.error(f"Error comparing files: {exc}")
                     raise exc
 
-        print(json.dumps(results, indent=2))
-
     else:
         raise RuntimeError(f"Cannot compare a file to a directory or vice versa: {input1} and {input2}.")
     

From 4c10fcbf42d757b4fbffb5ba86f7152e2c59c991 Mon Sep 17 00:00:00 2001
From: Gaurav Vaidya <gaurav@renci.org>
Date: Fri, 3 Jan 2025 14:14:51 -0500
Subject: [PATCH 10/10] Clarified old and new paths.

---
 scripts/comparator.py | 81 ++++++++++++++++++++++---------------------
 1 file changed, 41 insertions(+), 40 deletions(-)

diff --git a/scripts/comparator.py b/scripts/comparator.py
index 4c6897cd..641784b5 100644
--- a/scripts/comparator.py
+++ b/scripts/comparator.py
@@ -228,22 +228,22 @@ def diffs_to(self, older_compendium_file: 'CompendiumFile'):
         }
 
 
-def compare_compendium_files(path1, path2):
+def compare_compendium_files(path_old, path_new):
     """ Compare two compendium files.
 
-    @param path1: First path to compare.
-    @param path2: Second path to compare.
-    @return A comparison between the two compendium files as a dictionary.
+    :param path_old: Path to the older compendium file to compare.
+    :param path_new: Path to the newer compendium file to compare.
+    :return: A comparison between the two compendium files as a dictionary.
     """
 
     time_started = time.time_ns()
 
-    compendium1 = CompendiumFile(path1)
-    compendium2 = CompendiumFile(path2)
+    compendium_old = CompendiumFile(path_old)
+    compendium_new = CompendiumFile(path_new)
 
     # Load the two files in parallel.
-    thread_compendium1 = threading.Thread(target=compendium1.load)
-    thread_compendium2 = threading.Thread(target=compendium2.load)
+    thread_compendium1 = threading.Thread(target=compendium_old.load)
+    thread_compendium2 = threading.Thread(target=compendium_new.load)
     thread_compendium1.start()
     thread_compendium2.start()
     thread_compendium1.join()
@@ -251,45 +251,46 @@ def compare_compendium_files(path1, path2):
 
     # Craft results and return.
     result = {
-        'compendium1': {
-            'path': path1,
-            'file_exists': compendium1.file_exists,
-            'row_count': compendium1.row_count,
-            'curie_count': len(compendium1.curie_to_preferred_id),
-            'clique_count': len(compendium1.preferred_id_to_type),
-            'types': list(sorted(set(compendium1.preferred_id_to_type.values()))),
+        'compendium_old': {
+            'path': path_old,
+            'file_exists': compendium_old.file_exists,
+            'row_count': compendium_old.row_count,
+            'curie_count': len(compendium_old.curie_to_preferred_id),
+            'clique_count': len(compendium_old.preferred_id_to_type),
+            'types': list(sorted(set(compendium_old.preferred_id_to_type.values()))),
         },
-        'compendium2': {
-            'path': path2,
-            'file_exists': compendium2.file_exists,
-            'row_count': compendium2.row_count,
-            'curie_count': len(compendium2.curie_to_preferred_id),
-            'clique_count': len(compendium2.preferred_id_to_type),
-            'types': list(set(sorted(compendium2.preferred_id_to_type.values()))),
+        'compendium_new': {
+            'path': path_new,
+            'file_exists': compendium_new.file_exists,
+            'row_count': compendium_new.row_count,
+            'curie_count': len(compendium_new.curie_to_preferred_id),
+            'clique_count': len(compendium_new.preferred_id_to_type),
+            'types': list(sorted(set(compendium_new.preferred_id_to_type.values()))),  # De-duplicate first, then sort, so the order is deterministic (matches 'compendium_old').
         },
-        'diffs': compendium2.diffs_to(compendium1),
+        'diffs': compendium_new.diffs_to(compendium_old),
     }
 
     time_ended = time.time_ns()
-    logging.info(f"Comparison of {path1} and {path2} took {(time_ended - time_started) / 1_000_000_000:.2f} seconds.")
+    logging.info(f"Comparison of {path_old} to {path_new} took {(time_ended - time_started) / 1_000_000_000:.2f} seconds.")
 
     return result
 
 
 @click.command()
 @click.option('--input-type', type=click.Choice(['compendium', 'synonyms']), default='compendium')
-@click.argument('input1', type=click.Path(exists=True, file_okay=True, dir_okay=True), required=True)
-@click.argument('input2', type=click.Path(exists=True, file_okay=True, dir_okay=True), required=True)
+@click.argument('input_old', type=click.Path(exists=True, file_okay=True, dir_okay=True), required=True)
+@click.argument('input_new', type=click.Path(exists=True, file_okay=True, dir_okay=True), required=True)
 @click.option('--max-workers', '-j', type=int, default=None, help='Maximum number of workers to use for parallel processing.')
-def comparator(input_type, input1, input2, max_workers):
+def comparator(input_type, input_old, input_new, max_workers):
     """
-    Compares two compendium or synonym files.
+    Compares either two compendium files or two directories containing compendium files.
+    \f
 
     :param input_type: Specifies the type of the files to compare.
         Options are 'compendium' or 'synonyms' (not yet supported).
         Defaults to 'compendium'.
-    :param input1: First path (file or directory) to compare.
-    :param input2: Second file (file or directory) to compare.
+    :param input_old: Older path (file or directory) to compare.
+    :param input_new: Newer path (file or directory) to compare.
     :param max_workers: Maximum number of workers to use for parallel processing.
     """
 
@@ -298,18 +299,18 @@ def comparator(input_type, input1, input2, max_workers):
         raise NotImplementedError(f"Input type '{input_type}' is not yet supported.")
 
     # Do the comparison.
-    if os.path.isfile(input1) and os.path.isfile(input2):
-        results = compare_compendium_files(input1, input2)
-    elif os.path.isdir(input1) and os.path.isdir(input2):
+    if os.path.isfile(input_old) and os.path.isfile(input_new):
+        results = compare_compendium_files(input_old, input_new)
+    elif os.path.isdir(input_old) and os.path.isdir(input_new):
         results = {
-            'directory1': {'path': input1},
-            'directory2': {'path': input2},
+            'directory1': {'path': input_old},
+            'directory2': {'path': input_new},
             'comparisons': [],
         }
 
         # Make a list of all the files in the directories input1 and input2.
-        files1 = os.listdir(input1)
-        files2 = os.listdir(input2)
+        files1 = os.listdir(input_old)
+        files2 = os.listdir(input_new)
         all_filenames = set(files1 + files2)
 
         with ThreadPoolExecutor(max_workers=max_workers) as executor:
@@ -317,8 +318,8 @@ def comparator(input_type, input1, input2, max_workers):
             for filename in sorted(all_filenames):
                 if filename.startswith('.'):
                     continue
-                path1 = os.path.join(input1, filename)
-                path2 = os.path.join(input2, filename)
+                path1 = os.path.join(input_old, filename)
+                path2 = os.path.join(input_new, filename)
 
                 if os.path.isdir(path1):
                     logging.warning(f"Skipping directory {path1} in comparison.")
@@ -338,7 +339,7 @@ def comparator(input_type, input1, input2, max_workers):
                     raise exc
 
     else:
-        raise RuntimeError(f"Cannot compare a file to a directory or vice versa: {input1} and {input2}.")
+        raise RuntimeError(f"Cannot compare a file to a directory or vice versa: {input_old} and {input_new}.")
     
     print(json.dumps(results, indent=2))