From d2da12b009c621eb6e1507ddd0d5ee3c944ad9b2 Mon Sep 17 00:00:00 2001
From: ntalluri <nehatalluri@live.com>
Date: Mon, 3 Mar 2025 15:35:29 -0600
Subject: [PATCH 01/10] reverted ml.py to its original version

---
 spras/analysis/ml.py | 24 +++++-------------------
 1 file changed, 5 insertions(+), 19 deletions(-)

diff --git a/spras/analysis/ml.py b/spras/analysis/ml.py
index 7d45e091..0bab3f21 100644
--- a/spras/analysis/ml.py
+++ b/spras/analysis/ml.py
@@ -10,7 +10,7 @@
 from scipy.cluster.hierarchy import dendrogram, fcluster
 from sklearn.cluster import AgglomerativeClustering
 from sklearn.decomposition import PCA
-from sklearn.preprocessing import MinMaxScaler, StandardScaler
+from sklearn.preprocessing import StandardScaler
 
 from spras.util import make_required_dirs
 
@@ -142,14 +142,8 @@ def pca(dataframe: pd.DataFrame, output_png: str, output_var: str, output_coord:
     if not isinstance(labels, bool):
         raise ValueError(f"labels={labels} must be True or False")
 
-    #TODO: MinMaxScaler changes nothing about the data
-    # scaler = MinMaxScaler()
-    # scaler.fit(X)  # calc mean and standard deviation
-    # X_scaled = scaler.transform(X)
-
-    scaler = StandardScaler()  # TODO: StandardScalar doesn't make sense on binary data because the mean and variance lead to values outside the binary range
+    scaler = StandardScaler()
     scaler.fit(X)  # calc mean and standard deviation
-    scaler.transform(X)
     X_scaled = scaler.transform(X)
 
     # choosing the PCA
@@ -158,28 +152,20 @@ def pca(dataframe: pd.DataFrame, output_png: str, output_var: str, output_coord:
     X_pca = pca_instance.transform(X_scaled)
     variance = pca_instance.explained_variance_ratio_ * 100
 
-    # calculating the centroid
-    centroid = np.mean(X_pca, axis=0) # mean of each principal component across all samples
-
     # making the plot
     label_color_map = create_palette(column_names)
     plt.figure(figsize=(10, 7))
-    sns.scatterplot(x=X_pca[:, 0], y=X_pca[:, 1], s=70, hue=column_names, palette=label_color_map)
-    plt.scatter(centroid[0], centroid[1], color='red', marker='X', s=100, label='Centroid')
+    sns.scatterplot(x=X_pca[:, 0], y=X_pca[:, 1], s=70, hue=column_names, legend=True, palette=label_color_map)
     plt.title("PCA")
-    plt.legend()
     plt.xlabel(f"PC1 ({variance[0]:.1f}% variance)")
     plt.ylabel(f"PC2 ({variance[1]:.1f}% variance)")
 
     # saving the coordinates of each algorithm
     make_required_dirs(output_coord)
     coordinates_df = pd.DataFrame(X_pca, columns=['PC' + str(i) for i in range(1, components+1)])
-    coordinates_df.insert(0, 'datapoint_labels', columns.tolist())
-    centroid_row = ['centroid'] + centroid.tolist()
-    coordinates_df.loc[len(coordinates_df)] = centroid_row
+    coordinates_df.insert(0, 'algorithm', columns.tolist())
     coordinates_df.to_csv(output_coord, sep='\t', index=False)
 
-
     # saving the principal components
     make_required_dirs(output_var)
     with open(output_var, "w") as f:
@@ -360,4 +346,4 @@ def ensemble_network(dataframe: pd.DataFrame, output_file: str):
         lambda edge: edge.split(DIR_CONST)[1] if DIR_CONST in edge else edge.split(UNDIR_CONST)[1])
 
     make_required_dirs(output_file)
-    row_means[['Node1', 'Node2', 'Frequency', "Direction"]].to_csv(output_file, sep='\t', index=False, header=True)
+    row_means[['Node1', 'Node2', 'Frequency', "Direction"]].to_csv(output_file, sep='\t', index=False, header=True)
\ No newline at end of file

From e80d1242edc6e207dd068dd6aee0ea4da50f4f46 Mon Sep 17 00:00:00 2001
From: ntalluri <nehatalluri@live.com>
Date: Mon, 3 Mar 2025 15:39:19 -0600
Subject: [PATCH 02/10] updated Snakefile to exclude parameter tuning

---
 Snakefile | 51 ++-------------------------------------------------
 1 file changed, 2 insertions(+), 49 deletions(-)

diff --git a/Snakefile b/Snakefile
index caee3428..a4322f5e 100644
--- a/Snakefile
+++ b/Snakefile
@@ -106,15 +106,9 @@ def make_final_input(wildcards):
     if _config.config.analysis_include_evaluation:
         final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}precision-recall-per-pathway.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs,algorithm_params=algorithms_with_params))
         final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}precision-recall-per-pathway.png',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs)) 
-        final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}precision-recall-pca-chosen-pathway.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs))
-        final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}precision-recall-curve-ensemble-nodes.png',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs,algorithm_params=algorithms_with_params))
-        # TODO: should we provide the node ensemble frequencies       
     if _config.config.analysis_include_evaluation_aggregate_algo:
         final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}{algorithm}-precision-recall-per-pathway.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs,algorithm=algorithms))
         final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}{algorithm}-precision-recall-per-pathway.png',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs,algorithm=algorithms))
-        final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}{algorithm}-precision-recall-pca-chosen-pathway.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs,algorithm=algorithms_mult_param_combos))
-        final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}{algorithm}-precision-recall-curve-ensemble-nodes.png',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs,algorithm=algorithms))
-        # TODO: should we provide the node ensemble frequencies per algortihm
     if len(final_input) == 0:
         # No analysis added yet, so add reconstruction output files if they exist.
         # (if analysis is specified, these should be implicitly run).
@@ -380,44 +374,25 @@ def get_dataset_label(wildcards):
     dataset = parts[0]
     return dataset
 
-
 # Run evaluation for all pathway outputs, ensemble.txt, and pca_coordinates.txt for a dataset against its paired gold standard
 rule evaluation:
     input: 
         gold_standard_file = get_gold_standard_pickle_file,
         pathways = expand('{out_dir}{sep}{dataset_label}-{algorithm_params}{sep}pathway.txt', out_dir=out_dir, sep=SEP, algorithm_params=algorithms_with_params, dataset_label=get_dataset_label),
-        ensemble_file=lambda wildcards: f"{out_dir}{SEP}{get_dataset_label(wildcards)}-ml{SEP}ensemble-pathway.txt",
-        pca_coordinates_file =lambda wildcards: f"{out_dir}{SEP}{get_dataset_label(wildcards)}-ml{SEP}pca-coordinates.txt"
     output: 
         pr_file = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', "precision-recall-per-pathway.txt"]),
         pr_png = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', 'precision-recall-per-pathway.png']),
-        pr_curve_png = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', 'precision-recall-curve-ensemble-nodes.png']),
-        pca_chosen_pr_file = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', "precision-recall-pca-chosen-pathway.txt"]),
     run:
         node_table = Evaluation.from_file(input.gold_standard_file).node_table
         Evaluation.precision_and_recall(input.pathways, node_table, algorithms, output.pr_file, output.pr_png)
-        node_ensemble = Evaluation.edge_frequency_node_ensemble(input.ensemble_file)
-        Evaluation.precision_recall_curve_node_ensemble(node_ensemble, node_table, output.pr_curve_png)
-        pca_chosen_pathway = Evaluation.pca_chosen_pathway(input.pca_coordinates_file, out_dir)
-        Evaluation.precision_and_recall(pca_chosen_pathway, node_table, algorithms, output.pca_chosen_pr_file)
-
+        
 # Returns all pathways for a specific algorithm and dataset
 def collect_pathways_per_algo_per_dataset(wildcards):
     dataset_label = get_dataset_label(wildcards)
     filtered_algo_params = [algo_param for algo_param in algorithms_with_params if wildcards.algorithm in algo_param]
     return expand('{out_dir}{sep}{dataset_label}-{algorithm_params}{sep}pathway.txt', out_dir=out_dir, sep=SEP, algorithm_params=filtered_algo_params, dataset_label= dataset_label)
 
-# Returns ensemble file for a specific algorithm and dataset
-def collect_ensemble_per_algo_per_dataset(wildcards):
-    dataset_label = get_dataset_label(wildcards)
-    return f"{out_dir}{SEP}{dataset_label}-ml{SEP}{wildcards.algorithm}-ensemble-pathway.txt"
-
-# Returns pca coordinates for a specific algorithm and dataset
-def collect_pca_coordinates_per_algo_per_dataset(wildcards):
-    dataset_label = get_dataset_label(wildcards)
-    return f"{out_dir}{SEP}{dataset_label}-ml{SEP}{wildcards.algorithm}-pca-coordinates.txt"
-
-# Run evaluation per algortihm for all associated pathway outputs, ensemble.txt, and pca_coordinates.txt for a dataset against its paired gold standard
+# Run evaluation per algorithm for all associated pathway outputs for a dataset against its paired gold standard
 rule evaluation_per_algo_pathways:
     input: 
         gold_standard_file = get_gold_standard_pickle_file,
@@ -429,28 +404,6 @@ rule evaluation_per_algo_pathways:
         node_table = Evaluation.from_file(input.gold_standard_file).node_table
         Evaluation.precision_and_recall(input.pathways, node_table, algorithms, output.pr_file, output.pr_png)
 
-rule evaluation_per_algo_ensemble_pr_curve:
-    input: 
-        gold_standard_file = get_gold_standard_pickle_file,
-        ensemble_file = collect_ensemble_per_algo_per_dataset,
-    output: 
-        pr_curve_png = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', '{algorithm}-precision-recall-curve-ensemble-nodes.png']),
-    run:
-        node_table = Evaluation.from_file(input.gold_standard_file).node_table
-        node_ensemble = Evaluation.edge_frequency_node_ensemble(input.ensemble_file)
-        Evaluation.precision_recall_curve_node_ensemble(node_ensemble, node_table, output.pr_curve_png)
-
-rule evaluation_per_algo_pca_chosen:
-    input: 
-        gold_standard_file = get_gold_standard_pickle_file,
-        pca_coordinates_file = collect_pca_coordinates_per_algo_per_dataset
-    output: 
-        pca_chosen_pr_file = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', "{algorithm}-precision-recall-pca-chosen-pathway.txt"]),
-    run:
-        node_table = Evaluation.from_file(input.gold_standard_file).node_table
-        pca_chosen_pathway = Evaluation.pca_chosen_pathway(input.pca_coordinates_file, out_dir)
-        Evaluation.precision_and_recall(pca_chosen_pathway, node_table, algorithms, output.pca_chosen_pr_file)
-
 # Remove the output directory
 rule clean:
     shell: f'rm -rf {out_dir}'

From a1e69db1cedad33dff946c94f2b6ad13ecf83c97 Mon Sep 17 00:00:00 2001
From: ntalluri <nehatalluri@live.com>
Date: Mon, 3 Mar 2025 15:46:35 -0600
Subject: [PATCH 03/10] updated test cases to exclude parameter tuning

---
 .../expected/expected-node-ensemble.csv       | 13 -------
 ...recision-recall-per-pathway-pca-chosen.txt |  2 --
 test/evaluate/input/ensemble-network.tsv      | 10 ------
 test/evaluate/input/node-ensemble-empty.csv   |  2 --
 test/evaluate/input/node-ensemble.csv         | 13 -------
 test/evaluate/input/pca-coordinates.tsv       |  6 ----
 test/evaluate/test_evaluate.py                | 36 +------------------
 test/ml/expected/expected-pca-coordinates.tsv |  3 +-
 test/ml/test_ml.py                            |  9 +++--
 9 files changed, 6 insertions(+), 88 deletions(-)
 delete mode 100644 test/evaluate/expected/expected-node-ensemble.csv
 delete mode 100644 test/evaluate/expected/expected-precision-recall-per-pathway-pca-chosen.txt
 delete mode 100644 test/evaluate/input/ensemble-network.tsv
 delete mode 100644 test/evaluate/input/node-ensemble-empty.csv
 delete mode 100644 test/evaluate/input/node-ensemble.csv
 delete mode 100644 test/evaluate/input/pca-coordinates.tsv

diff --git a/test/evaluate/expected/expected-node-ensemble.csv b/test/evaluate/expected/expected-node-ensemble.csv
deleted file mode 100644
index ba467d55..00000000
--- a/test/evaluate/expected/expected-node-ensemble.csv
+++ /dev/null
@@ -1,13 +0,0 @@
-Node	max_freq
-C	0.75
-E	0.75
-D	0.75
-F	0.75
-A	0.5
-B	0.5
-L	0.5
-M	0.5
-O	0.25
-P	0.25
-N	0.25
-Q	0.25
diff --git a/test/evaluate/expected/expected-precision-recall-per-pathway-pca-chosen.txt b/test/evaluate/expected/expected-precision-recall-per-pathway-pca-chosen.txt
deleted file mode 100644
index 6c97ff7e..00000000
--- a/test/evaluate/expected/expected-precision-recall-per-pathway-pca-chosen.txt
+++ /dev/null
@@ -1,2 +0,0 @@
-Pathway	Precision	Recall
-test/evaluate/input/data-test-params-empty/pathway.txt	0.0	0.0
diff --git a/test/evaluate/input/ensemble-network.tsv b/test/evaluate/input/ensemble-network.tsv
deleted file mode 100644
index 293ec3f5..00000000
--- a/test/evaluate/input/ensemble-network.tsv
+++ /dev/null
@@ -1,10 +0,0 @@
-Node1	Node2	Frequency	Direction
-A	B	0.5	U
-C	D	0.75	U
-E	F	0.75	U
-L	M	0.5	U
-M	N	0.25	U
-O	P	0.25	U
-P	Q	0.25	U
-A	B	0.25	D
-B	A	0.25	D
\ No newline at end of file
diff --git a/test/evaluate/input/node-ensemble-empty.csv b/test/evaluate/input/node-ensemble-empty.csv
deleted file mode 100644
index e488f56a..00000000
--- a/test/evaluate/input/node-ensemble-empty.csv
+++ /dev/null
@@ -1,2 +0,0 @@
-Node	max_freq
-
diff --git a/test/evaluate/input/node-ensemble.csv b/test/evaluate/input/node-ensemble.csv
deleted file mode 100644
index ba467d55..00000000
--- a/test/evaluate/input/node-ensemble.csv
+++ /dev/null
@@ -1,13 +0,0 @@
-Node	max_freq
-C	0.75
-E	0.75
-D	0.75
-F	0.75
-A	0.5
-B	0.5
-L	0.5
-M	0.5
-O	0.25
-P	0.25
-N	0.25
-Q	0.25
diff --git a/test/evaluate/input/pca-coordinates.tsv b/test/evaluate/input/pca-coordinates.tsv
deleted file mode 100644
index 92fc6b3d..00000000
--- a/test/evaluate/input/pca-coordinates.tsv
+++ /dev/null
@@ -1,6 +0,0 @@
-datapoint_labels	PC1	PC2
-data-test-params-123	-1.3973472526239425	-1.632993161855452
-data-test-params-456	2.025440509784659	1.9566080710032526e-16
-data-test-params-789	-1.3973472526239425	1.632993161855452
-data-test-params-empty	0.7692539954632259	-4.1496185644351084e-16
-centroid	-2.7755575615628914e-17	-4.822931287961988e-17
diff --git a/test/evaluate/test_evaluate.py b/test/evaluate/test_evaluate.py
index 5dc0b8f3..59705989 100644
--- a/test/evaluate/test_evaluate.py
+++ b/test/evaluate/test_evaluate.py
@@ -19,26 +19,6 @@ def setup_class(cls):
         """
         Path(OUT_DIR).mkdir(parents=True, exist_ok=True)
 
-    def test_node_ensemble(self):
-        ensemble_file = INPUT_DIR + 'ensemble-network.tsv'
-        edge_freq = Evaluation.edge_frequency_node_ensemble(ensemble_file)
-        edge_freq.to_csv(OUT_DIR + 'node-ensemble.csv', sep="\t", index=False)
-        assert filecmp.cmp(OUT_DIR + 'node-ensemble.csv', EXPECT_DIR + 'expected-node-ensemble.csv', shallow=False)
-
-    def test_precision_recal_curve_ensemble_nodes(self):
-        out_path = Path(OUT_DIR+"test-precision-recall-curve-ensemble-nodes.png")
-        out_path.unlink(missing_ok=True)
-        ensemble_file = pd.read_csv(INPUT_DIR + 'node-ensemble.csv', sep="\t", header=0)
-        Evaluation.precision_recall_curve_node_ensemble(ensemble_file, NODE_TABLE, out_path)
-        assert out_path.exists()
-
-    def test_precision_recal_curve_ensemble_nodes_empty(self):
-        out_path = Path(OUT_DIR+"test-precision-recall-curve-ensemble-nodes-empty.png")
-        out_path.unlink(missing_ok=True)
-        ensemble_file = pd.read_csv(INPUT_DIR + 'node-ensemble-empty.csv', sep="\t", header=0)
-        Evaluation.precision_recall_curve_node_ensemble(ensemble_file, NODE_TABLE, out_path)
-        assert out_path.exists()
-
     def test_precision_recall_per_pathway(self):
         file_paths = [INPUT_DIR + "data-test-params-123/pathway.txt", INPUT_DIR + "data-test-params-456/pathway.txt",  INPUT_DIR + "data-test-params-789/pathway.txt",  INPUT_DIR + "data-test-params-empty/pathway.txt"]
         algorithms = ["test"]
@@ -56,18 +36,4 @@ def test_precision_recall_per_pathway_empty(self):
         output_png = OUT_DIR + "test-precision-recall-per-pathway-empty.png"
 
         Evaluation.precision_and_recall(file_paths, NODE_TABLE, algorithms, output_file, output_png)
-        assert filecmp.cmp(output_file, EXPECT_DIR + 'expected-precision-recall-per-pathway-empty.txt', shallow=False)
-
-
-    def  test_precision_recall_pca_chosen_pathway(self):
-        file_paths = [INPUT_DIR + "data-test-params-123/pathway.txt", INPUT_DIR + "data-test-params-456/pathway.txt",  INPUT_DIR + "data-test-params-789/pathway.txt",  INPUT_DIR + "data-test-params-empty/pathway.txt"]
-        algorithms = ["test"]
-        output_file = OUT_DIR +"test-precision-recall-per-pathway-pca-chosen.txt"
-        output_png = OUT_DIR + "test-precision-recall-per-pathway-pca-chosen.png"
-
-        dataframe = ml.summarize_networks(file_paths)
-        ml.pca(dataframe, OUT_DIR + 'pca.png', OUT_DIR + 'pca-variance.txt', OUT_DIR + 'pca-coordinates.tsv')
-
-        pathway = Evaluation.pca_chosen_pathway(OUT_DIR + 'pca-coordinates.tsv', INPUT_DIR)
-        Evaluation.precision_and_recall(pathway, NODE_TABLE, algorithms, output_file, output_png)
-        assert filecmp.cmp(output_file, EXPECT_DIR + 'expected-precision-recall-per-pathway-pca-chosen.txt', shallow=False)
+        assert filecmp.cmp(output_file, EXPECT_DIR + 'expected-precision-recall-per-pathway-empty.txt', shallow=False)
\ No newline at end of file
diff --git a/test/ml/expected/expected-pca-coordinates.tsv b/test/ml/expected/expected-pca-coordinates.tsv
index ac10f2db..288be9c1 100644
--- a/test/ml/expected/expected-pca-coordinates.tsv
+++ b/test/ml/expected/expected-pca-coordinates.tsv
@@ -1,5 +1,4 @@
-datapoint_labels	PC1	PC2
+algorithm	PC1	PC2
 test-data-s1	-2.0066502104820323	-0.9865875190637746
 test-data-s2	-1.5276508866841985	1.0799457247533233
 test-data-s3	3.5343010971662308	-0.09335820568954915
-centroid	0.0	-1.6190752442450199e-16
diff --git a/test/ml/test_ml.py b/test/ml/test_ml.py
index b9ca69ca..cb9370ef 100644
--- a/test/ml/test_ml.py
+++ b/test/ml/test_ml.py
@@ -77,15 +77,14 @@ def test_pca_robustness(self):
         dataframe = ml.summarize_networks([INPUT_DIR + 'test-data-s1/s1.txt', INPUT_DIR + 'test-data-s2/s2.txt', INPUT_DIR + 'test-data-s3/s3.txt'])
         expected = pd.read_table(EXPECT_DIR + 'expected-pca-coordinates.tsv')
         expected = expected.round(5)
-        expected.sort_values(by='datapoint_labels', ignore_index=True, inplace=True)
-
         for _ in range(5):
             dataframe_shuffled = dataframe.sample(frac=1, axis=1)  # permute the columns
             ml.pca(dataframe_shuffled, OUT_DIR + 'pca-shuffled-columns.png', OUT_DIR + 'pca-shuffled-columns-variance.txt',
                 OUT_DIR + 'pca-shuffled-columns-coordinates.tsv')
             coord = pd.read_table(OUT_DIR + 'pca-shuffled-columns-coordinates.tsv')
             coord = coord.round(5)  # round values to 5 digits to account for numeric differences across machines
-            coord.sort_values(by='datapoint_labels', ignore_index=True, inplace=True)
+            coord.sort_values(by='algorithm', ignore_index=True, inplace=True)
+
             assert coord.equals(expected)
 
         for _ in range(5):
@@ -94,7 +93,7 @@ def test_pca_robustness(self):
                     OUT_DIR + 'pca-shuffled-rows-coordinates.tsv')
             coord = pd.read_table(OUT_DIR + 'pca-shuffled-rows-coordinates.tsv')
             coord = coord.round(5)  # round values to 5 digits to account for numeric differences across machines
-            coord.sort_values(by='datapoint_labels', ignore_index=True, inplace=True)
+            coord.sort_values(by='algorithm', ignore_index=True, inplace=True)
 
             assert coord.equals(expected)
 
@@ -139,4 +138,4 @@ def test_ensemble_network_empty(self):
         en = pd.read_table(OUT_DIR + 'ensemble-network-empty.tsv')
         expected = pd.read_table(EXPECT_DIR + 'expected-ensemble-network-empty.tsv')
 
-        assert en.equals(expected)
+        assert en.equals(expected)
\ No newline at end of file

From 38b6f549aa380f7a659e5e8e9d8d77bc55866774 Mon Sep 17 00:00:00 2001
From: ntalluri <nehatalluri@live.com>
Date: Mon, 3 Mar 2025 15:49:27 -0600
Subject: [PATCH 04/10] keep precision and recall only in evaluation.py

---
 spras/evaluation.py | 98 ---------------------------------------------
 1 file changed, 98 deletions(-)

diff --git a/spras/evaluation.py b/spras/evaluation.py
index e6f60c0b..71bb1013 100644
--- a/spras/evaluation.py
+++ b/spras/evaluation.py
@@ -136,101 +136,3 @@ def precision_and_recall(file_paths: Iterable[Path], node_table: pd.DataFrame, a
                 plt.plot([], [])
                 plt.title("Empty Pathway Files")
                 plt.savefig(output_png)
-
-
-    def select_max_freq_and_node(row: pd.Series):
-        """
-        Selects the node and frequency with the highest frequency value from two potential nodes in a row.
-        Handles cases where one of the nodes or frequencies may be missing and returns the node associated with the maximum frequency.
-        """
-        max_freq = 0
-        node = ""
-        if pd.isna(row['Node2']) and pd.isna(row['Freq2']):
-            max_freq = row['Freq1']
-            node = row['Node1']
-        elif pd.isna(row['Node1']) and pd.isna(row['Freq1']):
-            max_freq = row['Freq2']
-            node = row['Node2']
-        else:
-            max_freq = max(row['Freq1'], row['Freq2'])
-            node = row['Node1']
-        return node, max_freq
-
-    def edge_frequency_node_ensemble(ensemble_file: str):
-        """
-        Processes an ensemble of edge frequencies to identify the highest frequency associated with each node
-        Reads ensemble_file, separates frequencies by node, and then calculates the maximum frequency for each node.
-        Returns a DataFrame of nodes with their respective maximum frequencies, or an empty DataFrame if ensemble_file is empty.
-        @param ensemble_file: the pre-computed node_ensemble
-        """
-        ensemble_df = pd.read_table(ensemble_file, sep="\t", header=0)
-
-        if not ensemble_df.empty:
-            node1_freq = ensemble_df.drop(columns = ['Node2', 'Direction'])
-            node2_freq = ensemble_df.drop(columns = ['Node1', 'Direction'])
-
-            max_node1_freq = node1_freq.groupby(['Node1']).max().reset_index()
-            max_node1_freq.rename(columns = {'Frequency': 'Freq1'}, inplace = True)
-            max_node2_freq = node2_freq.groupby(['Node2']).max().reset_index()
-            max_node2_freq.rename(columns = {'Frequency': 'Freq2'}, inplace = True)
-
-            node_ensemble = max_node1_freq.merge(max_node2_freq, left_on='Node1', right_on='Node2', how='outer')
-            node_ensemble[['Node', 'max_freq']] = node_ensemble.apply(Evaluation.select_max_freq_and_node, axis=1, result_type='expand')
-            node_ensemble.drop(columns = ['Node1', 'Node2', 'Freq1', 'Freq2'], inplace = True)
-            node_ensemble.sort_values('max_freq', ascending= False, inplace = True)
-            return node_ensemble
-        else:
-            return pd.DataFrame(columns = ['Node', 'max_freq'])
-
-    def precision_recall_curve_node_ensemble(node_ensemble:pd.DataFrame, node_table:pd.DataFrame, output_png: str):
-        """
-        Takes in an node ensemble for specific dataset or specific algorithm in a dataset, and an associated gold standard node table.
-        Plots a precision and recall curve for the node ensemble against its associated gold standard node table
-        Returns output back to output_png
-        @param node_ensemble: the pre-computed node_ensemble
-        @param node_table: the gold standard nodes
-        @param output_file: the filename to save the precision and recall curves
-        """
-        gold_standard_nodes = set(node_table['NODEID'])
-
-        if not node_ensemble.empty:
-            y_true = [1 if node in gold_standard_nodes else 0 for node in node_ensemble['Node']]
-            y_scores = node_ensemble['max_freq'].tolist()
-            precision, recall, thresholds = precision_recall_curve(y_true, y_scores)
-            auc_precision_recall = average_precision_score(y_true, y_scores)
-
-            plt.figure()
-            plt.plot(recall, precision, marker='o', label='Precision-Recall curve')
-            plt.axhline(y=auc_precision_recall, color='r', linestyle='--', label=f'Avg Precision: {auc_precision_recall:.4f}')
-            plt.xlabel('Recall')
-            plt.ylabel('Precision')
-            plt.title('Precision-Recall Curve')
-            plt.legend()
-            plt.grid(True)
-            plt.savefig(output_png)
-        else:
-            plt.figure()
-            plt.plot([], [])
-            plt.title("Empty Ensemble File")
-            plt.savefig(output_png)
-
-    def pca_chosen_pathway(coordinates_file: str, output_dir:str):
-        """
-        Identifies the pathway closest to a specified centroid based on PCA coordinates
-        Calculates the Euclidean distance from each data point to the centroid, then selects the closest pathway.
-        Returns the file path for the representative pathway associated with the closest data point.
-        @param coordinates_file: the pca coordinates file for a dataset or specific algorithm in a datset
-        @param output_dir: the main reconstruction directory
-        """
-        coord_df = pd.read_csv(coordinates_file, delimiter="\t", header=0)
-
-        centroid_row = coord_df[coord_df['datapoint_labels'] == 'centroid']
-        centroid = centroid_row.iloc[0, 1:].tolist()
-        coord_df = coord_df[coord_df['datapoint_labels'] != 'centroid']
-
-        pc_columns = [col for col in coord_df.columns if col.startswith('PC')]
-        coord_df['Distance To Centroid'] = np.sqrt(sum((coord_df[pc] - centroid[i]) ** 2 for i, pc in enumerate(pc_columns)))
-        closest_to_centroid = coord_df.sort_values(by='Distance To Centroid').iloc[0]
-        rep_pathway = [os.path.join(output_dir, f"{closest_to_centroid['datapoint_labels']}", "pathway.txt")]
-
-        return rep_pathway

From 00e3182363d99149097f063ceb679b0344dccd0f Mon Sep 17 00:00:00 2001
From: ntalluri <nehatalluri@live.com>
Date: Tue, 11 Mar 2025 13:20:04 -0500
Subject: [PATCH 05/10] reverted expected pca coordinates and precommit

---
 spras/analysis/ml.py                          | 2 +-
 test/evaluate/test_evaluate.py                | 2 +-
 test/ml/expected/expected-pca-coordinates.tsv | 6 +++---
 test/ml/test_ml.py                            | 2 +-
 4 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/spras/analysis/ml.py b/spras/analysis/ml.py
index 0bab3f21..3dad8775 100644
--- a/spras/analysis/ml.py
+++ b/spras/analysis/ml.py
@@ -346,4 +346,4 @@ def ensemble_network(dataframe: pd.DataFrame, output_file: str):
         lambda edge: edge.split(DIR_CONST)[1] if DIR_CONST in edge else edge.split(UNDIR_CONST)[1])
 
     make_required_dirs(output_file)
-    row_means[['Node1', 'Node2', 'Frequency', "Direction"]].to_csv(output_file, sep='\t', index=False, header=True)
\ No newline at end of file
+    row_means[['Node1', 'Node2', 'Frequency', "Direction"]].to_csv(output_file, sep='\t', index=False, header=True)
diff --git a/test/evaluate/test_evaluate.py b/test/evaluate/test_evaluate.py
index 59705989..413d2683 100644
--- a/test/evaluate/test_evaluate.py
+++ b/test/evaluate/test_evaluate.py
@@ -36,4 +36,4 @@ def test_precision_recall_per_pathway_empty(self):
         output_png = OUT_DIR + "test-precision-recall-per-pathway-empty.png"
 
         Evaluation.precision_and_recall(file_paths, NODE_TABLE, algorithms, output_file, output_png)
-        assert filecmp.cmp(output_file, EXPECT_DIR + 'expected-precision-recall-per-pathway-empty.txt', shallow=False)
\ No newline at end of file
+        assert filecmp.cmp(output_file, EXPECT_DIR + 'expected-precision-recall-per-pathway-empty.txt', shallow=False)
diff --git a/test/ml/expected/expected-pca-coordinates.tsv b/test/ml/expected/expected-pca-coordinates.tsv
index 288be9c1..b6371c84 100644
--- a/test/ml/expected/expected-pca-coordinates.tsv
+++ b/test/ml/expected/expected-pca-coordinates.tsv
@@ -1,4 +1,4 @@
 algorithm	PC1	PC2
-test-data-s1	-2.0066502104820323	-0.9865875190637746
-test-data-s2	-1.5276508866841985	1.0799457247533233
-test-data-s3	3.5343010971662308	-0.09335820568954915
+test-data-s1	-2.006650210482033	-0.9865875190637743
+test-data-s2	-1.5276508866841987	1.0799457247533237
+test-data-s3	3.534301097166232	-0.0933582056895495
\ No newline at end of file
diff --git a/test/ml/test_ml.py b/test/ml/test_ml.py
index cb9370ef..2b5720ae 100644
--- a/test/ml/test_ml.py
+++ b/test/ml/test_ml.py
@@ -138,4 +138,4 @@ def test_ensemble_network_empty(self):
         en = pd.read_table(OUT_DIR + 'ensemble-network-empty.tsv')
         expected = pd.read_table(EXPECT_DIR + 'expected-ensemble-network-empty.tsv')
 
-        assert en.equals(expected)
\ No newline at end of file
+        assert en.equals(expected)

From 0ca49646aa88380407defc92c2b6a5a2b8fb221d Mon Sep 17 00:00:00 2001
From: ntalluri <nehatalluri@live.com>
Date: Mon, 24 Mar 2025 13:49:09 -0500
Subject: [PATCH 06/10] update evaluation.py to be up to date

---
 spras/evaluation.py | 102 +++-----------------------------------------
 1 file changed, 6 insertions(+), 96 deletions(-)

diff --git a/spras/evaluation.py b/spras/evaluation.py
index 7734902b..3879db54 100644
--- a/spras/evaluation.py
+++ b/spras/evaluation.py
@@ -3,8 +3,13 @@
 from pathlib import Path
 from typing import Dict, Iterable
 
+import matplotlib.pyplot as plt
+import numpy as np
 import pandas as pd
-from sklearn.metrics import precision_score
+from sklearn.metrics import (
+    precision_score,
+    recall_score,
+)
 
 
 class Evaluation:
@@ -131,99 +136,4 @@ def precision_and_recall(file_paths: Iterable[Path], node_table: pd.DataFrame, a
                 plt.savefig(output_png)
 
 
-    def select_max_freq_and_node(row: pd.Series):
-        """
-        Selects the node and frequency with the highest frequency value from two potential nodes in a row.
-        Handles cases where one of the nodes or frequencies may be missing and returns the node associated with the maximum frequency.
-        """
-        max_freq = 0
-        node = ""
-        if pd.isna(row['Node2']) and pd.isna(row['Freq2']):
-            max_freq = row['Freq1']
-            node = row['Node1']
-        elif pd.isna(row['Node1']) and pd.isna(row['Freq1']):
-            max_freq = row['Freq2']
-            node = row['Node2']
-        else:
-            max_freq = max(row['Freq1'], row['Freq2'])
-            node = row['Node1']
-        return node, max_freq
-
-    def edge_frequency_node_ensemble(ensemble_file: str):
-        """
-        Processes an ensemble of edge frequencies to identify the highest frequency associated with each node
-        Reads ensemble_file, separates frequencies by node, and then calculates the maximum frequency for each node.
-        Returns a DataFrame of nodes with their respective maximum frequencies, or an empty DataFrame if ensemble_file is empty.
-        @param ensemble_file: the pre-computed node_ensemble
-        """
-        ensemble_df = pd.read_table(ensemble_file, sep="\t", header=0)
-
-        if not ensemble_df.empty:
-            node1_freq = ensemble_df.drop(columns = ['Node2', 'Direction'])
-            node2_freq = ensemble_df.drop(columns = ['Node1', 'Direction'])
-
-            max_node1_freq = node1_freq.groupby(['Node1']).max().reset_index()
-            max_node1_freq.rename(columns = {'Frequency': 'Freq1'}, inplace = True)
-            max_node2_freq = node2_freq.groupby(['Node2']).max().reset_index()
-            max_node2_freq.rename(columns = {'Frequency': 'Freq2'}, inplace = True)
-
-            node_ensemble = max_node1_freq.merge(max_node2_freq, left_on='Node1', right_on='Node2', how='outer')
-            node_ensemble[['Node', 'max_freq']] = node_ensemble.apply(Evaluation.select_max_freq_and_node, axis=1, result_type='expand')
-            node_ensemble.drop(columns = ['Node1', 'Node2', 'Freq1', 'Freq2'], inplace = True)
-            node_ensemble.sort_values('max_freq', ascending= False, inplace = True)
-            return node_ensemble
-        else:
-            return pd.DataFrame(columns = ['Node', 'max_freq'])
-
-    def precision_recall_curve_node_ensemble(node_ensemble:pd.DataFrame, node_table:pd.DataFrame, output_png: str):
-        """
-        Takes in an node ensemble for specific dataset or specific algorithm in a dataset, and an associated gold standard node table.
-        Plots a precision and recall curve for the node ensemble against its associated gold standard node table
-        Returns output back to output_png
-        @param node_ensemble: the pre-computed node_ensemble
-        @param node_table: the gold standard nodes
-        @param output_file: the filename to save the precision and recall curves
-        """
-        gold_standard_nodes = set(node_table['NODEID'])
-
-        if not node_ensemble.empty:
-            y_true = [1 if node in gold_standard_nodes else 0 for node in node_ensemble['Node']]
-            y_scores = node_ensemble['max_freq'].tolist()
-            precision, recall, thresholds = precision_recall_curve(y_true, y_scores)
-            auc_precision_recall = average_precision_score(y_true, y_scores)
-
-            plt.figure()
-            plt.plot(recall, precision, marker='o', label='Precision-Recall curve')
-            plt.axhline(y=auc_precision_recall, color='r', linestyle='--', label=f'Avg Precision: {auc_precision_recall:.4f}')
-            plt.xlabel('Recall')
-            plt.ylabel('Precision')
-            plt.title('Precision-Recall Curve')
-            plt.legend()
-            plt.grid(True)
-            plt.savefig(output_png)
-        else:
-            plt.figure()
-            plt.plot([], [])
-            plt.title("Empty Ensemble File")
-            plt.savefig(output_png)
-
-    def pca_chosen_pathway(coordinates_file: str, output_dir:str):
-        """
-        Identifies the pathway closest to a specified centroid based on PCA coordinates
-        Calculates the Euclidean distance from each data point to the centroid, then selects the closest pathway.
-        Returns the file path for the representative pathway associated with the closest data point.
-        @param coordinates_file: the pca coordinates file for a dataset or specific algorithm in a datset
-        @param output_dir: the main reconstruction directory
-        """
-        coord_df = pd.read_csv(coordinates_file, delimiter="\t", header=0)
-
-        centroid_row = coord_df[coord_df['datapoint_labels'] == 'centroid']
-        centroid = centroid_row.iloc[0, 1:].tolist()
-        coord_df = coord_df[coord_df['datapoint_labels'] != 'centroid']
-
-        pc_columns = [col for col in coord_df.columns if col.startswith('PC')]
-        coord_df['Distance To Centroid'] = np.sqrt(sum((coord_df[pc] - centroid[i]) ** 2 for i, pc in enumerate(pc_columns)))
-        closest_to_centroid = coord_df.sort_values(by='Distance To Centroid').iloc[0]
-        rep_pathway = [os.path.join(output_dir, f"{closest_to_centroid['datapoint_labels']}", "pathway.txt")]
 
-        return rep_pathway

From b483be73fcf97b59653823f92915df26cd47b016 Mon Sep 17 00:00:00 2001
From: ntalluri <nehatalluri@live.com>
Date: Mon, 24 Mar 2025 13:49:28 -0500
Subject: [PATCH 07/10] re-add evaluation tests

---
 ...ted-precision-recall-per-pathway-empty.txt |  2 ++
 .../expected-precision-recall-per-pathway.txt |  5 +++++
 .../input/data-test-params-123/pathway.txt    |  3 +++
 .../input/data-test-params-456/pathway.txt    |  2 ++
 .../input/data-test-params-789/pathway.txt    |  3 +++
 .../input/data-test-params-empty/pathway.txt  |  1 +
 test/evaluate/input/node_table.csv            |  4 ++++
 test/evaluate/test_evaluate.py                | 22 ++++++++++++-------
 8 files changed, 34 insertions(+), 8 deletions(-)
 create mode 100644 test/evaluate/expected/expected-precision-recall-per-pathway-empty.txt
 create mode 100644 test/evaluate/expected/expected-precision-recall-per-pathway.txt
 create mode 100644 test/evaluate/input/data-test-params-123/pathway.txt
 create mode 100644 test/evaluate/input/data-test-params-456/pathway.txt
 create mode 100644 test/evaluate/input/data-test-params-789/pathway.txt
 create mode 100644 test/evaluate/input/data-test-params-empty/pathway.txt
 create mode 100644 test/evaluate/input/node_table.csv

diff --git a/test/evaluate/expected/expected-precision-recall-per-pathway-empty.txt b/test/evaluate/expected/expected-precision-recall-per-pathway-empty.txt
new file mode 100644
index 00000000..6c97ff7e
--- /dev/null
+++ b/test/evaluate/expected/expected-precision-recall-per-pathway-empty.txt
@@ -0,0 +1,2 @@
+Pathway	Precision	Recall
+test/evaluate/input/data-test-params-empty/pathway.txt	0.0	0.0
diff --git a/test/evaluate/expected/expected-precision-recall-per-pathway.txt b/test/evaluate/expected/expected-precision-recall-per-pathway.txt
new file mode 100644
index 00000000..02e17a7c
--- /dev/null
+++ b/test/evaluate/expected/expected-precision-recall-per-pathway.txt
@@ -0,0 +1,5 @@
+Pathway	Precision	Recall
+test/evaluate/input/data-test-params-456/pathway.txt	0.0	0.0
+test/evaluate/input/data-test-params-empty/pathway.txt	0.0	0.0
+test/evaluate/input/data-test-params-123/pathway.txt	0.6666666666666666	0.6666666666666666
+test/evaluate/input/data-test-params-789/pathway.txt	1.0	1.0
diff --git a/test/evaluate/input/data-test-params-123/pathway.txt b/test/evaluate/input/data-test-params-123/pathway.txt
new file mode 100644
index 00000000..21768464
--- /dev/null
+++ b/test/evaluate/input/data-test-params-123/pathway.txt
@@ -0,0 +1,3 @@
+Node1	Node2	Rank	Direction
+A	B	1	U
+B	C	1	U
diff --git a/test/evaluate/input/data-test-params-456/pathway.txt b/test/evaluate/input/data-test-params-456/pathway.txt
new file mode 100644
index 00000000..d445d80f
--- /dev/null
+++ b/test/evaluate/input/data-test-params-456/pathway.txt
@@ -0,0 +1,2 @@
+Node1	Node2	Rank	Direction
+F	L	1	U
diff --git a/test/evaluate/input/data-test-params-789/pathway.txt b/test/evaluate/input/data-test-params-789/pathway.txt
new file mode 100644
index 00000000..352698a0
--- /dev/null
+++ b/test/evaluate/input/data-test-params-789/pathway.txt
@@ -0,0 +1,3 @@
+Node1	Node2	Rank	Direction
+A	B	1	U
+B	Q	1	U
diff --git a/test/evaluate/input/data-test-params-empty/pathway.txt b/test/evaluate/input/data-test-params-empty/pathway.txt
new file mode 100644
index 00000000..63fda2b1
--- /dev/null
+++ b/test/evaluate/input/data-test-params-empty/pathway.txt
@@ -0,0 +1 @@
+Node1	Node2	Rank	Direction
\ No newline at end of file
diff --git a/test/evaluate/input/node_table.csv b/test/evaluate/input/node_table.csv
new file mode 100644
index 00000000..5b9cd41b
--- /dev/null
+++ b/test/evaluate/input/node_table.csv
@@ -0,0 +1,4 @@
+NODEID
+A
+B
+Q
\ No newline at end of file
diff --git a/test/evaluate/test_evaluate.py b/test/evaluate/test_evaluate.py
index 73acfaf1..413d2683 100644
--- a/test/evaluate/test_evaluate.py
+++ b/test/evaluate/test_evaluate.py
@@ -19,15 +19,21 @@ def setup_class(cls):
         """
         Path(OUT_DIR).mkdir(parents=True, exist_ok=True)
 
-    def  test_precision_recall_pca_chosen_pathway(self):
+    def test_precision_recall_per_pathway(self):
         file_paths = [INPUT_DIR + "data-test-params-123/pathway.txt", INPUT_DIR + "data-test-params-456/pathway.txt",  INPUT_DIR + "data-test-params-789/pathway.txt",  INPUT_DIR + "data-test-params-empty/pathway.txt"]
         algorithms = ["test"]
-        output_file = OUT_DIR +"test-precision-recall-per-pathway-pca-chosen.txt"
-        output_png = OUT_DIR + "test-precision-recall-per-pathway-pca-chosen.png"
+        output_file = OUT_DIR + "test-precision-recall-per-pathway.txt"
+        output_png = OUT_DIR + "test-precision-recall-per-pathway.png"
 
-        dataframe = ml.summarize_networks(file_paths)
-        ml.pca(dataframe, OUT_DIR + 'pca.png', OUT_DIR + 'pca-variance.txt', OUT_DIR + 'pca-coordinates.tsv')
+        Evaluation.precision_and_recall(file_paths, NODE_TABLE, algorithms, output_file, output_png)
+        assert filecmp.cmp(output_file, EXPECT_DIR + 'expected-precision-recall-per-pathway.txt', shallow=False)
 
-        pathway = Evaluation.pca_chosen_pathway(OUT_DIR + 'pca-coordinates.tsv', INPUT_DIR)
-        Evaluation.precision_and_recall(pathway, NODE_TABLE, algorithms, output_file, output_png)
-        assert filecmp.cmp(output_file, EXPECT_DIR + 'expected-precision-recall-per-pathway-pca-chosen.txt', shallow=False)
+    def test_precision_recall_per_pathway_empty(self):
+
+        file_paths = [INPUT_DIR + "data-test-params-empty/pathway.txt"]
+        algorithms = ["test"]
+        output_file = OUT_DIR +"test-precision-recall-per-pathway-empty.txt"
+        output_png = OUT_DIR + "test-precision-recall-per-pathway-empty.png"
+
+        Evaluation.precision_and_recall(file_paths, NODE_TABLE, algorithms, output_file, output_png)
+        assert filecmp.cmp(output_file, EXPECT_DIR + 'expected-precision-recall-per-pathway-empty.txt', shallow=False)

From 6a3e533667c2ccfa098a6c41937c0992be077bcd Mon Sep 17 00:00:00 2001
From: ntalluri <nehatalluri@live.com>
Date: Thu, 5 Jun 2025 11:32:44 -0500
Subject: [PATCH 08/10] update snakefile and updated test cases

---
 Snakefile                                     | 31 +++++++++--------
 spras/evaluation.py                           | 33 ++++++++++---------
 ...d-precision-recall-per-pathway-nothing.txt |  1 +
 test/evaluate/test_evaluate.py                | 10 ++++++
 4 files changed, 47 insertions(+), 28 deletions(-)
 create mode 100644 test/evaluate/expected/expected-precision-recall-per-pathway-nothing.txt

diff --git a/Snakefile b/Snakefile
index 4088e8cf..b34bfe31 100644
--- a/Snakefile
+++ b/Snakefile
@@ -105,11 +105,11 @@ def make_final_input(wildcards):
         final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-ensemble-pathway.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms))
 
     if _config.config.analysis_include_evaluation:
-        final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}precision-recall-per-pathway.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs,algorithm_params=algorithms_with_params))
-        final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}precision-recall-per-pathway.png',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs)) 
+        final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}pr-per-pathway.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs,algorithm_params=algorithms_with_params))
+        final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}pr-per-pathway.png',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs)) 
     if _config.config.analysis_include_evaluation_aggregate_algo:
-        final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}{algorithm}-precision-recall-per-pathway.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs,algorithm=algorithms))
-        final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}{algorithm}-precision-recall-per-pathway.png',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs,algorithm=algorithms))
+        final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}pr-per-pathway-per-{algorithm}.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs,algorithm=algorithms))
+        final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}pr-per-pathway-per-{algorithm}.png',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs,algorithm=algorithms))
     if len(final_input) == 0:
         # No analysis added yet, so add reconstruction output files if they exist.
         # (if analysis is specified, these should be implicitly run).
@@ -375,14 +375,19 @@ def get_dataset_label(wildcards):
     dataset = parts[0]
     return dataset
 
-# Run evaluation for all pathway outputs, ensemble.txt, and pca_coordinates.txt for a dataset against its paired gold standard
-rule evaluation:
+# Returns all pathways for a specific dataset
+def collect_pathways_per_dataset(wildcards):
+    dataset_label = get_dataset_label(wildcards)
+    return expand('{out_dir}{sep}{dataset_label}-{algorithm_params}{sep}pathway.txt', out_dir=out_dir, sep=SEP, algorithm_params=algorithms_with_params, dataset_label=dataset_label),
+
+# Run precision and recall for all pathway outputs for a dataset against its paired gold standard
+rule evaluation_pr_per_pathways:
     input: 
         gold_standard_file = get_gold_standard_pickle_file,
-        pathways = expand('{out_dir}{sep}{dataset_label}-{algorithm_params}{sep}pathway.txt', out_dir=out_dir, sep=SEP, algorithm_params=algorithms_with_params, dataset_label=get_dataset_label),
+        pathways = collect_pathways_per_dataset
     output: 
-        pr_file = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', "precision-recall-per-pathway.txt"]),
-        pr_png = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', 'precision-recall-per-pathway.png']),
+        pr_file = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', "pr-per-pathway.txt"]),
+        pr_png = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', 'pr-per-pathway.png']),
     run:
         node_table = Evaluation.from_file(input.gold_standard_file).node_table
         Evaluation.precision_and_recall(input.pathways, node_table, algorithms, output.pr_file, output.pr_png)
@@ -393,14 +398,14 @@ def collect_pathways_per_algo_per_dataset(wildcards):
     filtered_algo_params = [algo_param for algo_param in algorithms_with_params if wildcards.algorithm in algo_param]
     return expand('{out_dir}{sep}{dataset_label}-{algorithm_params}{sep}pathway.txt', out_dir=out_dir, sep=SEP, algorithm_params=filtered_algo_params, dataset_label= dataset_label)
 
-# Run evaluation per algortihm for all associated pathway outputs for a dataset against its paired gold standard
-rule evaluation_per_algo_pathways:
+# Run precision and recall per algorithm for all pathway outputs for a dataset against its paired gold standard
+rule evaluation_per_algo_pr_per_pathways:
     input: 
         gold_standard_file = get_gold_standard_pickle_file,
         pathways =  collect_pathways_per_algo_per_dataset,
     output: 
-        pr_file = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', "{algorithm}-precision-recall-per-pathway.txt"]),
-        pr_png = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', '{algorithm}-precision-recall-per-pathway.png']),
+        pr_file = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', "pr-per-pathway-per-{algorithm}.txt"]),
+        pr_png = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', 'pr-per-pathway-per-{algorithm}.png']),
     run:
         node_table = Evaluation.from_file(input.gold_standard_file).node_table
         Evaluation.precision_and_recall(input.pathways, node_table, algorithms, output.pr_file, output.pr_png)
diff --git a/spras/evaluation.py b/spras/evaluation.py
index 3879db54..aace5a6b 100644
--- a/spras/evaluation.py
+++ b/spras/evaluation.py
@@ -90,9 +90,8 @@ def precision_and_recall(file_paths: Iterable[Path], node_table: pd.DataFrame, a
         """
         y_true = set(node_table['NODEID'])
         results = []
-        for file in file_paths:
-            df = pd.read_table(file, sep="\t", header=0, usecols=["Node1", "Node2"])
-            # TODO: do we want to include the pathways that are empty for evaluation / in the pr_df?
+        for f in file_paths:
+            df = pd.read_table(f, sep="\t", header=0, usecols=["Node1", "Node2"])
             y_pred = set(df['Node1']).union(set(df['Node2']))
             all_nodes = y_true.union(y_pred)
             y_true_binary = [1 if node in y_true else 0 for node in all_nodes]
@@ -101,15 +100,15 @@ def precision_and_recall(file_paths: Iterable[Path], node_table: pd.DataFrame, a
             # not using precision_recall_curve because thresholds are binary (0 or 1); rather we are directly calculating precision and recall per pathway
             precision = precision_score(y_true_binary, y_pred_binary, zero_division=0.0)
             recall = recall_score(y_true_binary, y_pred_binary, zero_division=0.0)
-            results.append({"Pathway": file, "Precision": precision, "Recall": recall})
+            results.append({"Pathway": f, "Precision": precision, "Recall": recall})
 
         pr_df = pd.DataFrame(results)
-        pr_df.sort_values(by=["Recall", "Pathway"], axis=0, ascending=True, inplace=True)
-        pr_df.to_csv(output_file, sep="\t", index=False)
 
-        if output_png is not None:
-            if not pr_df.empty:
-                plt.figure(figsize=(8, 6))
+        if not pr_df.empty:
+            pr_df.sort_values(by=["Recall", "Pathway"], axis=0, ascending=True, inplace=True)
+            pr_df.to_csv(output_file, sep="\t", index=False)
+            if output_png is not None:
+                plt.figure(figsize=(10, 7))
                 # plot a line per algorithm
                 for algorithm in algorithms:
                     subset = pr_df[pr_df["Pathway"].str.contains(algorithm)]
@@ -122,17 +121,21 @@ def precision_and_recall(file_paths: Iterable[Path], node_table: pd.DataFrame, a
                             label=f"{algorithm}"
                         )
 
-
                 plt.xlabel("Recall")
                 plt.ylabel("Precision")
-                plt.title(f"Precision and Recall Plot")
+                plt.xlim(-0.05, 1.05)
+                plt.ylim(-0.05, 1.05)
+                plt.title("Precision and Recall Plot")
                 plt.legend()
                 plt.grid(True)
                 plt.savefig(output_png)
-            else:
-                plt.figure()
-                plt.plot([], [])
-                plt.title("Empty Pathway Files")
+        else: # TODO: I don't think this case will ever hit
+            pr_df.to_csv(output_file, sep="\t", index=False)
+            if output_png is not None:
+                plt.figure(figsize=(10, 7))
+                plt.plot([], [], label="No Pathways Given")
+                plt.title("Precision and Recall Plot")
+                plt.legend()
                 plt.savefig(output_png)
 
 
diff --git a/test/evaluate/expected/expected-precision-recall-per-pathway-nothing.txt b/test/evaluate/expected/expected-precision-recall-per-pathway-nothing.txt
new file mode 100644
index 00000000..8b137891
--- /dev/null
+++ b/test/evaluate/expected/expected-precision-recall-per-pathway-nothing.txt
@@ -0,0 +1 @@
+
diff --git a/test/evaluate/test_evaluate.py b/test/evaluate/test_evaluate.py
index 413d2683..3006ce2b 100644
--- a/test/evaluate/test_evaluate.py
+++ b/test/evaluate/test_evaluate.py
@@ -37,3 +37,13 @@ def test_precision_recall_per_pathway_empty(self):
 
         Evaluation.precision_and_recall(file_paths, NODE_TABLE, algorithms, output_file, output_png)
         assert filecmp.cmp(output_file, EXPECT_DIR + 'expected-precision-recall-per-pathway-empty.txt', shallow=False)
+
+    def test_precision_recall_per_pathway_nothing(self):
+
+        file_paths = []
+        algorithms = []
+        output_file = OUT_DIR +"test-precision-recall-per-pathway-nothing.txt"
+        output_png = OUT_DIR + "test-precision-recall-per-pathway-nothing.png"
+
+        Evaluation.precision_and_recall(file_paths, NODE_TABLE, algorithms, output_file, output_png)
+        assert filecmp.cmp(output_file, EXPECT_DIR + 'expected-precision-recall-per-pathway-nothing.txt', shallow=False)

From 61e783987ebcb851f89bd6ce0345f2b8be22817d Mon Sep 17 00:00:00 2001
From: ntalluri <nehatalluri@live.com>
Date: Thu, 5 Jun 2025 12:12:15 -0500
Subject: [PATCH 09/10] fix error, add plt.close

---
 Snakefile           | 2 +-
 spras/evaluation.py | 2 ++
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/Snakefile b/Snakefile
index b34bfe31..a86b79b9 100644
--- a/Snakefile
+++ b/Snakefile
@@ -378,7 +378,7 @@ def get_dataset_label(wildcards):
 # Returns all pathways for a specific dataset
 def collect_pathways_per_dataset(wildcards):
     dataset_label = get_dataset_label(wildcards)
-    return expand('{out_dir}{sep}{dataset_label}-{algorithm_params}{sep}pathway.txt', out_dir=out_dir, sep=SEP, algorithm_params=algorithms_with_params, dataset_label=dataset_label),
+    return expand('{out_dir}{sep}{dataset_label}-{algorithm_params}{sep}pathway.txt', out_dir=out_dir, sep=SEP, algorithm_params=algorithms_with_params, dataset_label=dataset_label)
 
 # Run precision and recall for all pathway outputs for a dataset against its paired gold standard
 rule evaluation_pr_per_pathways:
diff --git a/spras/evaluation.py b/spras/evaluation.py
index aace5a6b..17b9c30e 100644
--- a/spras/evaluation.py
+++ b/spras/evaluation.py
@@ -129,6 +129,7 @@ def precision_and_recall(file_paths: Iterable[Path], node_table: pd.DataFrame, a
                 plt.legend()
                 plt.grid(True)
                 plt.savefig(output_png)
+                plt.close()
         else: # TODO: I don't think this case will ever hit
             pr_df.to_csv(output_file, sep="\t", index=False)
             if output_png is not None:
@@ -137,6 +138,7 @@ def precision_and_recall(file_paths: Iterable[Path], node_table: pd.DataFrame, a
                 plt.title("Precision and Recall Plot")
                 plt.legend()
                 plt.savefig(output_png)
+                plt.close()
 
 
 

From 7be0c78eab273a147e2941cd930b86e9a5289b62 Mon Sep 17 00:00:00 2001
From: ntalluri <nehatalluri@live.com>
Date: Thu, 5 Jun 2025 15:24:46 -0500
Subject: [PATCH 10/10] add in color palette

---
 spras/evaluation.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/spras/evaluation.py b/spras/evaluation.py
index 17b9c30e..116d9b40 100644
--- a/spras/evaluation.py
+++ b/spras/evaluation.py
@@ -11,6 +11,8 @@
     recall_score,
 )
 
+from spras.analysis.ml import create_palette
+
 
 class Evaluation:
     NODE_ID = "NODEID"
@@ -109,6 +111,7 @@ def precision_and_recall(file_paths: Iterable[Path], node_table: pd.DataFrame, a
             pr_df.to_csv(output_file, sep="\t", index=False)
             if output_png is not None:
                 plt.figure(figsize=(10, 7))
+                color_palette = create_palette(algorithms)
                 # plot a line per algorithm
                 for algorithm in algorithms:
                     subset = pr_df[pr_df["Pathway"].str.contains(algorithm)]
@@ -116,6 +119,7 @@ def precision_and_recall(file_paths: Iterable[Path], node_table: pd.DataFrame, a
                         plt.plot(
                             subset["Recall"],
                             subset["Precision"],
+                            color = color_palette[algorithm],
                             marker='o',
                             linestyle='',
                             label=f"{algorithm}"