
Param tuning code integration: no param tuning #208

Open · wants to merge 11 commits into main
42 changes: 35 additions & 7 deletions Snakefile
@@ -105,8 +105,11 @@ def make_final_input(wildcards):
         final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-ensemble-pathway.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms))

     if _config.config.analysis_include_evaluation:
-        final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-evaluation.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs,algorithm_params=algorithms_with_params))
-
+        final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}pr-per-pathway.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs,algorithm_params=algorithms_with_params))
+        final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}pr-per-pathway.png',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs))
+    if _config.config.analysis_include_evaluation_aggregate_algo:
+        final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}pr-per-pathway-per-{algorithm}.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs,algorithm=algorithms))
+        final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}pr-per-pathway-per-{algorithm}.png',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs,algorithm=algorithms))
     if len(final_input) == 0:
         # No analysis added yet, so add reconstruction output files if they exist.
         # (if analysis is specified, these should be implicitly run).
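For orientation (not part of the diff), a minimal sketch of the evaluation targets these `expand()` calls request. The output directory, separator, pair label, and algorithm names below are made up; in a real run they come from the user's config.

```python
# Sketch only: out_dir, SEP, the dataset-gold standard pair, and algorithm names are hypothetical.
from snakemake.io import expand

out_dir, SEP = 'output', '/'
dataset_gold_standard_pairs = ['data1-gs1']
algorithms = ['pathlinker', 'omicsintegrator1']

# Per-dataset targets requested when analysis_include_evaluation is set
expand('{out_dir}{sep}{pair}-eval{sep}pr-per-pathway.txt', out_dir=out_dir, sep=SEP, pair=dataset_gold_standard_pairs)
# -> ['output/data1-gs1-eval/pr-per-pathway.txt'] (plus the matching .png target)

# Per-algorithm targets requested when analysis_include_evaluation_aggregate_algo is also set
expand('{out_dir}{sep}{pair}-eval{sep}pr-per-pathway-per-{algorithm}.txt', out_dir=out_dir, sep=SEP, pair=dataset_gold_standard_pairs, algorithm=algorithms)
# -> one file per algorithm, e.g. 'output/data1-gs1-eval/pr-per-pathway-per-pathlinker.txt'
```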
@@ -372,15 +375,40 @@ def get_dataset_label(wildcards):
     dataset = parts[0]
     return dataset

-# Run evaluation code for a specific dataset's pathway outputs against its paired gold standard
-rule evaluation:
+# Returns all pathways for a specific dataset
+def collect_pathways_per_dataset(wildcards):
+    dataset_label = get_dataset_label(wildcards)
+    return expand('{out_dir}{sep}{dataset_label}-{algorithm_params}{sep}pathway.txt', out_dir=out_dir, sep=SEP, algorithm_params=algorithms_with_params, dataset_label=dataset_label)
+
+# Run precision and recall for all pathway outputs for a dataset against its paired gold standard
+rule evaluation_pr_per_pathways:
Comment on lines +383 to +384

Collaborator: Is this now the behavior you switched to in #212?

Collaborator (Author): No, this one is slightly different. This rule gives the precision and recall for each output subnetwork/pathway. The one in #212 will use the frequencies of the nodes from all of the outputs per algorithm and create PR curves using that.
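To make the distinction concrete, a toy sketch with made-up node sets; the frequency-based approach is only paraphrased from this comment, not taken from #212.

```python
# Sketch only: toy gold standard and one hypothetical output pathway.
gold = {'A', 'B', 'Q'}

# This PR: each output pathway yields a single precision/recall point (no thresholds).
pathway_nodes = {'A', 'B', 'C'}
tp = len(pathway_nodes & gold)
precision = tp / len(pathway_nodes)  # 2/3
recall = tp / len(gold)              # 2/3

# #212 (paraphrased): node frequencies across all of an algorithm's outputs act as scores,
# and sweeping a frequency threshold traces out a full precision-recall curve.
node_frequencies = {'A': 1.0, 'B': 0.75, 'C': 0.5, 'D': 0.25}  # hypothetical ensemble frequencies
```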

+    input:
+        gold_standard_file = get_gold_standard_pickle_file,
+        pathways = collect_pathways_per_dataset
+    output:
+        pr_file = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', "pr-per-pathway.txt"]),
+        pr_png = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', 'pr-per-pathway.png']),
+    run:
+        node_table = Evaluation.from_file(input.gold_standard_file).node_table
+        Evaluation.precision_and_recall(input.pathways, node_table, algorithms, output.pr_file, output.pr_png)
+
+# Returns all pathways for a specific algorithm and dataset
+def collect_pathways_per_algo_per_dataset(wildcards):
+    dataset_label = get_dataset_label(wildcards)
+    filtered_algo_params = [algo_param for algo_param in algorithms_with_params if wildcards.algorithm in algo_param]
+    return expand('{out_dir}{sep}{dataset_label}-{algorithm_params}{sep}pathway.txt', out_dir=out_dir, sep=SEP, algorithm_params=filtered_algo_params, dataset_label= dataset_label)
+
+# Run precision and recall per algorithm for all pathway outputs for a dataset against its paired gold standard
+rule evaluation_per_algo_pr_per_pathways:
     input:
         gold_standard_file = get_gold_standard_pickle_file,
-        pathways = expand('{out_dir}{sep}{dataset_label}-{algorithm_params}{sep}pathway.txt', out_dir=out_dir, sep=SEP, algorithm_params=algorithms_with_params, dataset_label=get_dataset_label),
-    output: eval_file = SEP.join([out_dir, "{dataset_gold_standard_pairs}-evaluation.txt"])
+        pathways = collect_pathways_per_algo_per_dataset,
+    output:
+        pr_file = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', "pr-per-pathway-per-{algorithm}.txt"]),
+        pr_png = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', 'pr-per-pathway-per-{algorithm}.png']),
     run:
         node_table = Evaluation.from_file(input.gold_standard_file).node_table
-        Evaluation.precision(input.pathways, node_table, output.eval_file)
+        Evaluation.precision_and_recall(input.pathways, node_table, algorithms, output.pr_file, output.pr_png)

 # Remove the output directory
 rule clean:
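A quick sketch (outside the diff) of how the per-algorithm input function narrows the pathway list. The wildcard values and parameter labels are hypothetical, and the dataset-label parsing assumes the pair label starts with the dataset name, mirroring `get_dataset_label`.

```python
# Sketch only: stand-ins for Snakemake wildcards and the global algorithms_with_params list.
algorithms_with_params = ['pathlinker-params-ABC123', 'pathlinker-params-DEF456', 'omicsintegrator1-params-XYZ789']

class FakeWildcards:
    dataset_gold_standard_pairs = 'data1-gs1'
    algorithm = 'pathlinker'

wildcards = FakeWildcards()

dataset_label = wildcards.dataset_gold_standard_pairs.split('-')[0]  # 'data1'
# Same substring filter used by collect_pathways_per_algo_per_dataset
filtered = [p for p in algorithms_with_params if wildcards.algorithm in p]
pathways = [f"output/{dataset_label}-{p}/pathway.txt" for p in filtered]
# ['output/data1-pathlinker-params-ABC123/pathway.txt', 'output/data1-pathlinker-params-DEF456/pathway.txt']
```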
70 changes: 59 additions & 11 deletions spras/evaluation.py
@@ -3,8 +3,15 @@
 from pathlib import Path
 from typing import Dict, Iterable

+import matplotlib.pyplot as plt
+import numpy as np
 import pandas as pd
-from sklearn.metrics import precision_score
+from sklearn.metrics import (
+    precision_score,
+    recall_score,
+)
+
+from spras.analysis.ml import create_palette


 class Evaluation:
@@ -72,29 +79,70 @@ def load_files_from_dict(self, gold_standard_dict: Dict):
         # TODO: later iteration - chose between node and edge file, or allow both

     @staticmethod
-    def precision(file_paths: Iterable[Path], node_table: pd.DataFrame, output_file: str):
+    def precision_and_recall(file_paths: Iterable[Path], node_table: pd.DataFrame, algorithms: list, output_file: str, output_png:str=None):
         """
         Takes in file paths for a specific dataset and an associated gold standard node table.
-        Calculates precision for each pathway file
+        Calculates precision and recall for each pathway file
         Returns output back to output_file
         @param file_paths: file paths of pathway reconstruction algorithm outputs
         @param node_table: the gold standard nodes
-        @param output_file: the filename to save the precision of each pathway
+        @param algorithms: list of algorithms used in current run of SPRAS
+        @param output_file: the filename to save the precision and recall of each pathway
+        @param output_png (optional): the filename to plot the precision and recall of each pathway (not a PRC)
         """
         y_true = set(node_table['NODEID'])
         results = []

-        for file in file_paths:
-            df = pd.read_table(file, sep="\t", header=0, usecols=["Node1", "Node2"])
+        for f in file_paths:
+            df = pd.read_table(f, sep="\t", header=0, usecols=["Node1", "Node2"])
             y_pred = set(df['Node1']).union(set(df['Node2']))
             all_nodes = y_true.union(y_pred)
             y_true_binary = [1 if node in y_true else 0 for node in all_nodes]
             y_pred_binary = [1 if node in y_pred else 0 for node in all_nodes]

             # default to 0.0 if there is a divide by 0 error
+            # not using precision_recall_curve because thresholds are binary (0 or 1); rather we are directly calculating precision and recall per pathway
             precision = precision_score(y_true_binary, y_pred_binary, zero_division=0.0)
+            recall = recall_score(y_true_binary, y_pred_binary, zero_division=0.0)
+            results.append({"Pathway": f, "Precision": precision, "Recall": recall})
+
+        pr_df = pd.DataFrame(results)
+
+        if not pr_df.empty:
+            pr_df.sort_values(by=["Recall", "Pathway"], axis=0, ascending=True, inplace=True)
+            pr_df.to_csv(output_file, sep="\t", index=False)
+            if output_png is not None:
+                plt.figure(figsize=(10, 7))
+                color_palette = create_palette(algorithms)
+                # plot a line per algorithm
+                for algorithm in algorithms:
+                    subset = pr_df[pr_df["Pathway"].str.contains(algorithm)]
+                    if not subset.empty:
+                        plt.plot(
+                            subset["Recall"],
+                            subset["Precision"],
+                            color = color_palette[algorithm],
+                            marker='o',
+                            linestyle='',
+                            label=f"{algorithm}"
+                        )
+
+                plt.xlabel("Recall")
+                plt.ylabel("Precision")
+                plt.xlim(-0.05, 1.05)
+                plt.ylim(-0.05, 1.05)
+                plt.title("Precision and Recall Plot")
+                plt.legend()
+                plt.grid(True)
+                plt.savefig(output_png)
+                plt.close()
+        else: # TODO: I don't think this case will ever hit
+            pr_df.to_csv(output_file, sep="\t", index=False)
+            if output_png is not None:
+                plt.figure(figsize=(10, 7))
+                plt.plot([], [], label="No Pathways Given")
+                plt.title("Precision and Recall Plot")
+                plt.legend()
+                plt.savefig(output_png)
+                plt.close()
+
-
-            results.append({"Pathway": file, "Precision": precision})
-
-        precision_df = pd.DataFrame(results)
-        precision_df.to_csv(output_file, sep="\t", index=False)
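For reference, a minimal standalone call to the new method; the file paths and algorithm name are placeholders, and the tests below exercise it the same way.

```python
import pandas as pd

from spras.evaluation import Evaluation

# Placeholder gold standard nodes and pathway files from a single hypothetical run
node_table = pd.DataFrame({'NODEID': ['A', 'B', 'Q']})
pathway_files = ['output/data1-pathlinker-params-ABC123/pathway.txt',
                 'output/data1-pathlinker-params-DEF456/pathway.txt']

Evaluation.precision_and_recall(pathway_files, node_table, ['pathlinker'],
                                'pr-per-pathway.txt', 'pr-per-pathway.png')
```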
2 changes: 2 additions & 0 deletions test/evaluate/expected/expected-precision-recall-per-pathway-empty.txt
@@ -0,0 +1,2 @@
Pathway Precision Recall
test/evaluate/input/data-test-params-empty/pathway.txt 0.0 0.0
1 change: 1 addition & 0 deletions test/evaluate/expected/expected-precision-recall-per-pathway-nothing.txt
@@ -0,0 +1 @@

5 changes: 5 additions & 0 deletions test/evaluate/expected/expected-precision-recall-per-pathway.txt
@@ -0,0 +1,5 @@
Pathway Precision Recall
test/evaluate/input/data-test-params-456/pathway.txt 0.0 0.0
test/evaluate/input/data-test-params-empty/pathway.txt 0.0 0.0
test/evaluate/input/data-test-params-123/pathway.txt 0.6666666666666666 0.6666666666666666
test/evaluate/input/data-test-params-789/pathway.txt 1.0 1.0
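For context, these expected values follow directly from the gold standard nodes {A, B, Q} in test/evaluate/input/node_table.csv and the node set of each test pathway, e.g. for data-test-params-123:

```python
gold = {'A', 'B', 'Q'}       # test/evaluate/input/node_table.csv
predicted = {'A', 'B', 'C'}  # nodes of edges A-B and B-C in data-test-params-123/pathway.txt

tp = len(predicted & gold)       # 2 (A and B)
precision = tp / len(predicted)  # 2/3 = 0.666...
recall = tp / len(gold)          # 2/3 = 0.666...
# data-test-params-789 recovers exactly {A, B, Q}, so it scores 1.0/1.0;
# data-test-params-456 ({F, L}) and the empty pathway share no nodes with the gold standard, so 0.0/0.0.
```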
3 changes: 3 additions & 0 deletions test/evaluate/input/data-test-params-123/pathway.txt
@@ -0,0 +1,3 @@
Node1 Node2 Rank Direction
A B 1 U
B C 1 U
2 changes: 2 additions & 0 deletions test/evaluate/input/data-test-params-456/pathway.txt
@@ -0,0 +1,2 @@
Node1 Node2 Rank Direction
F L 1 U
3 changes: 3 additions & 0 deletions test/evaluate/input/data-test-params-789/pathway.txt
@@ -0,0 +1,3 @@
Node1 Node2 Rank Direction
A B 1 U
B Q 1 U
1 change: 1 addition & 0 deletions test/evaluate/input/data-test-params-empty/pathway.txt
@@ -0,0 +1 @@
Node1 Node2 Rank Direction
4 changes: 4 additions & 0 deletions test/evaluate/input/node_table.csv
@@ -0,0 +1,4 @@
NODEID
A
B
Q
49 changes: 49 additions & 0 deletions test/evaluate/test_evaluate.py
@@ -0,0 +1,49 @@
import filecmp
from pathlib import Path

import pandas as pd
import pytest

import spras.analysis.ml as ml
from spras.evaluation import Evaluation

INPUT_DIR = 'test/evaluate/input/'
OUT_DIR = 'test/evaluate/output/'
EXPECT_DIR = 'test/evaluate/expected/'
NODE_TABLE = pd.read_csv(INPUT_DIR + "node_table.csv", header=0)
class TestEvaluate:
    @classmethod
    def setup_class(cls):
        """
        Create the expected output directory
        """
        Path(OUT_DIR).mkdir(parents=True, exist_ok=True)

    def test_precision_recall_per_pathway(self):
        file_paths = [INPUT_DIR + "data-test-params-123/pathway.txt", INPUT_DIR + "data-test-params-456/pathway.txt", INPUT_DIR + "data-test-params-789/pathway.txt", INPUT_DIR + "data-test-params-empty/pathway.txt"]
        algorithms = ["test"]
        output_file = OUT_DIR + "test-precision-recall-per-pathway.txt"
        output_png = OUT_DIR + "test-precision-recall-per-pathway.png"

        Evaluation.precision_and_recall(file_paths, NODE_TABLE, algorithms, output_file, output_png)
        assert filecmp.cmp(output_file, EXPECT_DIR + 'expected-precision-recall-per-pathway.txt', shallow=False)

    def test_precision_recall_per_pathway_empty(self):

        file_paths = [INPUT_DIR + "data-test-params-empty/pathway.txt"]
        algorithms = ["test"]
        output_file = OUT_DIR + "test-precision-recall-per-pathway-empty.txt"
        output_png = OUT_DIR + "test-precision-recall-per-pathway-empty.png"

        Evaluation.precision_and_recall(file_paths, NODE_TABLE, algorithms, output_file, output_png)
        assert filecmp.cmp(output_file, EXPECT_DIR + 'expected-precision-recall-per-pathway-empty.txt', shallow=False)

    def test_precision_recall_per_pathway_nothing(self):
Collaborator (Author): I don't think this case should ever happen in the wild, but if it does, something is wrong.


        file_paths = []
        algorithms = []
        output_file = OUT_DIR + "test-precision-recall-per-pathway-nothing.txt"
        output_png = OUT_DIR + "test-precision-recall-per-pathway-nothing.png"

        Evaluation.precision_and_recall(file_paths, NODE_TABLE, algorithms, output_file, output_png)
        assert filecmp.cmp(output_file, EXPECT_DIR + 'expected-precision-recall-per-pathway-nothing.txt', shallow=False)
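Since the assertions use filecmp.cmp(..., shallow=False), the generated files must match the expected files byte for byte. One way to run just this module from the repository root (a standard pytest invocation, not something the PR adds):

```python
import pytest

# The relative INPUT_DIR/OUT_DIR/EXPECT_DIR paths assume the working directory is the repo root.
pytest.main(["-v", "test/evaluate/test_evaluate.py"])
```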