Reed-CompBio · sumedhars · Mar 30, 2025 · Mar 30, 2025 · Apr 2, 2025 · Apr 8, 2025
diff --git a/Snakefile b/Snakefile
@@ -93,6 +93,8 @@ def make_final_input(wildcards):
         final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}hac-horizontal.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params))
         final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}hac-clusters-horizontal.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params))
         final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}ensemble-pathway.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params))
+        final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}jaccard-matrix.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params))
+        final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}jaccard-heatmap.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params))
 
     if _config.config.analysis_include_ml_aggregate_algo:
         final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-pca.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos))
@@ -103,6 +105,7 @@ def make_final_input(wildcards):
         final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-hac-horizontal.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos))
         final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-hac-clusters-horizontal.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos))
         final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-ensemble-pathway.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms))
+        #TODO: add algo similarity outputs per algo
 
     if _config.config.analysis_include_evaluation:
         final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-evaluation.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs,algorithm_params=algorithms_with_params))
@@ -317,6 +320,20 @@ rule ml_analysis:
         ml.hac_vertical(summary_df, output.hac_image_vertical, output.hac_clusters_vertical, **hac_params)
         ml.hac_horizontal(summary_df, output.hac_image_horizontal, output.hac_clusters_horizontal, **hac_params)
 
+# Algorithm similarity evaluation - jaccard similarity
+#TODO: will have to add visualiztion - start with jaccard eval though
+rule algorithm_similarity_eval:
+    input:
+        pathways = expand('{out_dir}{sep}{{dataset}}-{algorithm_params}{sep}pathway.txt',
+                          out_dir=out_dir, sep=SEP, algorithm_params=algorithms_with_params)
+    output:
+        algo_similarity_matrix = SEP.join([out_dir, '{dataset}-ml', 'jaccard-matrix.txt']),
+        algo_similarity_heatmap = SEP.join([out_dir, '{dataset}-ml', 'jaccard-heatmap.png'])
+    run:
+        summary_df = ml.summarize_networks(input.pathways)
+        jaccard = ml.jaccard_similarity_eval(summary_df, output.algo_similarity_matrix, output.algo_similarity_heatmap)
+
+
 # Ensemble the output pathways for each dataset
 rule ensemble: 
     input:

diff --git a/config/config.yaml b/config/config.yaml
@@ -47,13 +47,13 @@ container_registry:
 algorithms:
       - name: "pathlinker"
         params:
-              include: true
+              include: false
               run1:
                   k: range(100,201,100)
 
       - name: "omicsintegrator1"
         params:
-              include: true
+              include: false
               run1:
                   b: [5, 6]
                   w: np.linspace(0,5,2)
@@ -80,7 +80,7 @@ algorithms:
 
       - name: "mincostflow"
         params:
-              include: true
+              include: false
               run1:
                   flow: [1] # The flow must be an int
                   capacity: [1]
@@ -91,7 +91,7 @@ algorithms:
 
       - name: "domino"
         params:
-              include: true
+              include: false
               run1:
                   slice_threshold: [0.3]
                   module_threshold: [0.05]

diff --git a/spras/analysis/ml.py b/spras/analysis/ml.py
@@ -11,6 +11,7 @@
 from sklearn.cluster import AgglomerativeClustering
 from sklearn.decomposition import PCA
 from sklearn.preprocessing import StandardScaler
+from sklearn.metrics import jaccard_score
 
 from spras.util import make_required_dirs
 
@@ -86,6 +87,7 @@ def summarize_networks(file_paths: Iterable[Union[str, PathLike]]) -> pd.DataFra
     concated_df = concated_df.fillna(0)
     concated_df = concated_df.astype('int64')
 
+    print(concated_df)
     return concated_df
 
 
@@ -347,3 +349,40 @@ def ensemble_network(dataframe: pd.DataFrame, output_file: str):
 
     make_required_dirs(output_file)
     row_means[['Node1', 'Node2', 'Frequency', "Direction"]].to_csv(output_file, sep='\t', index=False, header=True)
+
+
+def jaccard_similarity_eval(summary_df: pd.DataFrame, output_file: str, output_png: str):
+    """
+    Calculates the pairwise Jaccard similarity matrix from the binary representation of `summary_df`.
+    Save the resulting similarity matrix as a tab-delimited file and generates and save a heatmap 
+    visualization of the similarities.
+    @param summary_df: pandas dataframe with algorithm-parameter summary information
+    @param output_file: the filename to save the ensemble network
+    @param output_png: the file name to save the heatmap image
+    """
+    # calculate the jaccard similarity between all combinations of algorithms & parameters
+    binarized_df = summary_df.applymap(lambda x: 1 if x !=0 else 0)
+    algorithms = binarized_df.columns
+    jaccard_matrix = pd.DataFrame(np.identity(len(algorithms)), index=algorithms, columns=algorithms)
+    for i, alg1 in enumerate(algorithms):
+        for _j, alg2 in enumerate(algorithms[i+1:], start=i+1):
+            sim_value = jaccard_score(binarized_df[alg1], binarized_df[alg2])
+            jaccard_matrix.loc[alg1, alg2] = sim_value
+            jaccard_matrix.loc[alg2, alg1] = sim_value
+    # save the jaccard matrix as a csv
+    jaccard_matrix.to_csv(output_file, sep='\t', index=True, header=True)
+    # make a heatmap from the jaccard matrix
+    fig, ax = plt.subplots(figsize=(8, 6))
+    cax = ax.imshow(jaccard_matrix.values, interpolation='nearest', cmap='viridis', vmin=0, vmax=1)
+    ax.set_title("Jaccard similarity heatmap")
+    # set tick labels with algorithm names
+    ax.set_xticks(np.arange(len(algorithms)))
+    ax.set_yticks(np.arange(len(algorithms)))
+    ax.set_xticklabels(algorithms)
+    ax.set_yticklabels(algorithms)
+    plt.colorbar(cax, ax=ax)
+    # annotate each cell with the corresponding similarity value
+    for i in range(len(algorithms)):
+        for j in range(len(algorithms)):
+            ax.text(j, i, f'{jaccard_matrix.values[i, j]:.2f}', ha='center', va='center', color='white')
+    plt.savefig(output_png, bbox_inches="tight", dpi=DPI)
diff --git a/test/ml/expected/expected-jaccard-heatmap.png b/test/ml/expected/expected-jaccard-heatmap.png
diff --git a/test/ml/expected/expected-jaccard-matrix.txt b/test/ml/expected/expected-jaccard-matrix.txt
@@ -0,0 +1,2 @@
+	data
+data	1.0
diff --git a/test/ml/test_ml.py b/test/ml/test_ml.py
@@ -1,4 +1,5 @@
 import filecmp
+import shutil
 from pathlib import Path
 
 import pandas as pd
@@ -145,3 +146,12 @@ def test_ensemble_network_empty(self):
         expected = pd.read_table(EXPECT_DIR + 'expected-ensemble-network-empty.tsv')
 
         assert en.equals(expected)
+
+    # NOTE: binary comparison of generated binary files may be fragile across matplotlib versions
+    # if this test begins failing unexpectedly, consider loosening or disabling test
+    def test_jaccard_similarity_eval(self):
+        dataframe = ml.summarize_networks([INPUT_DIR + 'test-data-s1/s1.txt', INPUT_DIR + 'test-data-s2/s2.txt', INPUT_DIR + 'test-data-s3/s3.txt'])
+        ml.jaccard_similarity_eval(dataframe, OUT_DIR + 'jaccard-matrix.txt', OUT_DIR + 'jaccard-heatmap.png')
+
+        assert filecmp.cmp(OUT_DIR + 'jaccard-matrix.txt', EXPECT_DIR + 'expected-jaccard-matrix.txt', shallow=False)
+        assert filecmp.cmp(OUT_DIR + 'jaccard-heatmap.png', EXPECT_DIR + 'expected-jaccard-heatmap.png', shallow=False)