From d2da12b009c621eb6e1507ddd0d5ee3c944ad9b2 Mon Sep 17 00:00:00 2001 From: ntalluri Date: Mon, 3 Mar 2025 15:35:29 -0600 Subject: [PATCH 1/7] updated ml back to og --- spras/analysis/ml.py | 24 +++++------------------- 1 file changed, 5 insertions(+), 19 deletions(-) diff --git a/spras/analysis/ml.py b/spras/analysis/ml.py index 7d45e091..0bab3f21 100644 --- a/spras/analysis/ml.py +++ b/spras/analysis/ml.py @@ -10,7 +10,7 @@ from scipy.cluster.hierarchy import dendrogram, fcluster from sklearn.cluster import AgglomerativeClustering from sklearn.decomposition import PCA -from sklearn.preprocessing import MinMaxScaler, StandardScaler +from sklearn.preprocessing import StandardScaler from spras.util import make_required_dirs @@ -142,14 +142,8 @@ def pca(dataframe: pd.DataFrame, output_png: str, output_var: str, output_coord: if not isinstance(labels, bool): raise ValueError(f"labels={labels} must be True or False") - #TODO: MinMaxScaler changes nothing about the data - # scaler = MinMaxScaler() - # scaler.fit(X) # calc mean and standard deviation - # X_scaled = scaler.transform(X) - - scaler = StandardScaler() # TODO: StandardScalar doesn't make sense on binary data because the mean and variance lead to values outside the binary range + scaler = StandardScaler() scaler.fit(X) # calc mean and standard deviation - scaler.transform(X) X_scaled = scaler.transform(X) # choosing the PCA @@ -158,28 +152,20 @@ def pca(dataframe: pd.DataFrame, output_png: str, output_var: str, output_coord: X_pca = pca_instance.transform(X_scaled) variance = pca_instance.explained_variance_ratio_ * 100 - # calculating the centroid - centroid = np.mean(X_pca, axis=0) # mean of each principal component across all samples - # making the plot label_color_map = create_palette(column_names) plt.figure(figsize=(10, 7)) - sns.scatterplot(x=X_pca[:, 0], y=X_pca[:, 1], s=70, hue=column_names, palette=label_color_map) - plt.scatter(centroid[0], centroid[1], color='red', marker='X', s=100, label='Centroid') + sns.scatterplot(x=X_pca[:, 0], y=X_pca[:, 1], s=70, hue=column_names, legend=True, palette=label_color_map) plt.title("PCA") - plt.legend() plt.xlabel(f"PC1 ({variance[0]:.1f}% variance)") plt.ylabel(f"PC2 ({variance[1]:.1f}% variance)") # saving the coordinates of each algorithm make_required_dirs(output_coord) coordinates_df = pd.DataFrame(X_pca, columns=['PC' + str(i) for i in range(1, components+1)]) - coordinates_df.insert(0, 'datapoint_labels', columns.tolist()) - centroid_row = ['centroid'] + centroid.tolist() - coordinates_df.loc[len(coordinates_df)] = centroid_row + coordinates_df.insert(0, 'algorithm', columns.tolist()) coordinates_df.to_csv(output_coord, sep='\t', index=False) - # saving the principal components make_required_dirs(output_var) with open(output_var, "w") as f: @@ -360,4 +346,4 @@ def ensemble_network(dataframe: pd.DataFrame, output_file: str): lambda edge: edge.split(DIR_CONST)[1] if DIR_CONST in edge else edge.split(UNDIR_CONST)[1]) make_required_dirs(output_file) - row_means[['Node1', 'Node2', 'Frequency', "Direction"]].to_csv(output_file, sep='\t', index=False, header=True) + row_means[['Node1', 'Node2', 'Frequency', "Direction"]].to_csv(output_file, sep='\t', index=False, header=True) \ No newline at end of file From e80d1242edc6e207dd068dd6aee0ea4da50f4f46 Mon Sep 17 00:00:00 2001 From: ntalluri Date: Mon, 3 Mar 2025 15:39:19 -0600 Subject: [PATCH 2/7] updated snakemake file to only include no param tuning --- Snakefile | 51 ++------------------------------------------------- 1 file changed, 2 insertions(+), 49 deletions(-) diff --git a/Snakefile b/Snakefile index caee3428..a4322f5e 100644 --- a/Snakefile +++ b/Snakefile @@ -106,15 +106,9 @@ def make_final_input(wildcards): if _config.config.analysis_include_evaluation: final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}precision-recall-per-pathway.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs,algorithm_params=algorithms_with_params)) final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}precision-recall-per-pathway.png',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs)) - final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}precision-recall-pca-chosen-pathway.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs)) - final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}precision-recall-curve-ensemble-nodes.png',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs,algorithm_params=algorithms_with_params)) - # TODO: should we provide the node ensemble frequencies if _config.config.analysis_include_evaluation_aggregate_algo: final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}{algorithm}-precision-recall-per-pathway.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs,algorithm=algorithms)) final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}{algorithm}-precision-recall-per-pathway.png',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs,algorithm=algorithms)) - final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}{algorithm}-precision-recall-pca-chosen-pathway.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs,algorithm=algorithms_mult_param_combos)) - final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}{algorithm}-precision-recall-curve-ensemble-nodes.png',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs,algorithm=algorithms)) - # TODO: should we provide the node ensemble frequencies per algortihm if len(final_input) == 0: # No analysis added yet, so add reconstruction output files if they exist. # (if analysis is specified, these should be implicitly run). @@ -380,44 +374,25 @@ def get_dataset_label(wildcards): dataset = parts[0] return dataset - # Run evaluation for all pathway outputs, ensemble.txt, and pca_coordinates.txt for a dataset against its paired gold standard rule evaluation: input: gold_standard_file = get_gold_standard_pickle_file, pathways = expand('{out_dir}{sep}{dataset_label}-{algorithm_params}{sep}pathway.txt', out_dir=out_dir, sep=SEP, algorithm_params=algorithms_with_params, dataset_label=get_dataset_label), - ensemble_file=lambda wildcards: f"{out_dir}{SEP}{get_dataset_label(wildcards)}-ml{SEP}ensemble-pathway.txt", - pca_coordinates_file =lambda wildcards: f"{out_dir}{SEP}{get_dataset_label(wildcards)}-ml{SEP}pca-coordinates.txt" output: pr_file = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', "precision-recall-per-pathway.txt"]), pr_png = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', 'precision-recall-per-pathway.png']), - pr_curve_png = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', 'precision-recall-curve-ensemble-nodes.png']), - pca_chosen_pr_file = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', "precision-recall-pca-chosen-pathway.txt"]), run: node_table = Evaluation.from_file(input.gold_standard_file).node_table Evaluation.precision_and_recall(input.pathways, node_table, algorithms, output.pr_file, output.pr_png) - node_ensemble = Evaluation.edge_frequency_node_ensemble(input.ensemble_file) - Evaluation.precision_recall_curve_node_ensemble(node_ensemble, node_table, output.pr_curve_png) - pca_chosen_pathway = Evaluation.pca_chosen_pathway(input.pca_coordinates_file, out_dir) - Evaluation.precision_and_recall(pca_chosen_pathway, node_table, algorithms, output.pca_chosen_pr_file) - + # Returns all pathways for a specific algorithm and dataset def collect_pathways_per_algo_per_dataset(wildcards): dataset_label = get_dataset_label(wildcards) filtered_algo_params = [algo_param for algo_param in algorithms_with_params if wildcards.algorithm in algo_param] return expand('{out_dir}{sep}{dataset_label}-{algorithm_params}{sep}pathway.txt', out_dir=out_dir, sep=SEP, algorithm_params=filtered_algo_params, dataset_label= dataset_label) -# Returns ensemble file for a specific algorithm and dataset -def collect_ensemble_per_algo_per_dataset(wildcards): - dataset_label = get_dataset_label(wildcards) - return f"{out_dir}{SEP}{dataset_label}-ml{SEP}{wildcards.algorithm}-ensemble-pathway.txt" - -# Returns pca coordinates for a specific algorithm and dataset -def collect_pca_coordinates_per_algo_per_dataset(wildcards): - dataset_label = get_dataset_label(wildcards) - return f"{out_dir}{SEP}{dataset_label}-ml{SEP}{wildcards.algorithm}-pca-coordinates.txt" - -# Run evaluation per algortihm for all associated pathway outputs, ensemble.txt, and pca_coordinates.txt for a dataset against its paired gold standard +# Run evaluation per algortihm for all associated pathway outputs for a dataset against its paired gold standard rule evaluation_per_algo_pathways: input: gold_standard_file = get_gold_standard_pickle_file, @@ -429,28 +404,6 @@ rule evaluation_per_algo_pathways: node_table = Evaluation.from_file(input.gold_standard_file).node_table Evaluation.precision_and_recall(input.pathways, node_table, algorithms, output.pr_file, output.pr_png) -rule evaluation_per_algo_ensemble_pr_curve: - input: - gold_standard_file = get_gold_standard_pickle_file, - ensemble_file = collect_ensemble_per_algo_per_dataset, - output: - pr_curve_png = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', '{algorithm}-precision-recall-curve-ensemble-nodes.png']), - run: - node_table = Evaluation.from_file(input.gold_standard_file).node_table - node_ensemble = Evaluation.edge_frequency_node_ensemble(input.ensemble_file) - Evaluation.precision_recall_curve_node_ensemble(node_ensemble, node_table, output.pr_curve_png) - -rule evaluation_per_algo_pca_chosen: - input: - gold_standard_file = get_gold_standard_pickle_file, - pca_coordinates_file = collect_pca_coordinates_per_algo_per_dataset - output: - pca_chosen_pr_file = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', "{algorithm}-precision-recall-pca-chosen-pathway.txt"]), - run: - node_table = Evaluation.from_file(input.gold_standard_file).node_table - pca_chosen_pathway = Evaluation.pca_chosen_pathway(input.pca_coordinates_file, out_dir) - Evaluation.precision_and_recall(pca_chosen_pathway, node_table, algorithms, output.pca_chosen_pr_file) - # Remove the output directory rule clean: shell: f'rm -rf {out_dir}' From a1e69db1cedad33dff946c94f2b6ad13ecf83c97 Mon Sep 17 00:00:00 2001 From: ntalluri Date: Mon, 3 Mar 2025 15:46:35 -0600 Subject: [PATCH 3/7] updated test cases to be for no param tuning --- .../expected/expected-node-ensemble.csv | 13 ------- ...recision-recall-per-pathway-pca-chosen.txt | 2 -- test/evaluate/input/ensemble-network.tsv | 10 ------ test/evaluate/input/node-ensemble-empty.csv | 2 -- test/evaluate/input/node-ensemble.csv | 13 ------- test/evaluate/input/pca-coordinates.tsv | 6 ---- test/evaluate/test_evaluate.py | 36 +------------------ test/ml/expected/expected-pca-coordinates.tsv | 3 +- test/ml/test_ml.py | 9 +++-- 9 files changed, 6 insertions(+), 88 deletions(-) delete mode 100644 test/evaluate/expected/expected-node-ensemble.csv delete mode 100644 test/evaluate/expected/expected-precision-recall-per-pathway-pca-chosen.txt delete mode 100644 test/evaluate/input/ensemble-network.tsv delete mode 100644 test/evaluate/input/node-ensemble-empty.csv delete mode 100644 test/evaluate/input/node-ensemble.csv delete mode 100644 test/evaluate/input/pca-coordinates.tsv diff --git a/test/evaluate/expected/expected-node-ensemble.csv b/test/evaluate/expected/expected-node-ensemble.csv deleted file mode 100644 index ba467d55..00000000 --- a/test/evaluate/expected/expected-node-ensemble.csv +++ /dev/null @@ -1,13 +0,0 @@ -Node max_freq -C 0.75 -E 0.75 -D 0.75 -F 0.75 -A 0.5 -B 0.5 -L 0.5 -M 0.5 -O 0.25 -P 0.25 -N 0.25 -Q 0.25 diff --git a/test/evaluate/expected/expected-precision-recall-per-pathway-pca-chosen.txt b/test/evaluate/expected/expected-precision-recall-per-pathway-pca-chosen.txt deleted file mode 100644 index 6c97ff7e..00000000 --- a/test/evaluate/expected/expected-precision-recall-per-pathway-pca-chosen.txt +++ /dev/null @@ -1,2 +0,0 @@ -Pathway Precision Recall -test/evaluate/input/data-test-params-empty/pathway.txt 0.0 0.0 diff --git a/test/evaluate/input/ensemble-network.tsv b/test/evaluate/input/ensemble-network.tsv deleted file mode 100644 index 293ec3f5..00000000 --- a/test/evaluate/input/ensemble-network.tsv +++ /dev/null @@ -1,10 +0,0 @@ -Node1 Node2 Frequency Direction -A B 0.5 U -C D 0.75 U -E F 0.75 U -L M 0.5 U -M N 0.25 U -O P 0.25 U -P Q 0.25 U -A B 0.25 D -B A 0.25 D \ No newline at end of file diff --git a/test/evaluate/input/node-ensemble-empty.csv b/test/evaluate/input/node-ensemble-empty.csv deleted file mode 100644 index e488f56a..00000000 --- a/test/evaluate/input/node-ensemble-empty.csv +++ /dev/null @@ -1,2 +0,0 @@ -Node max_freq - diff --git a/test/evaluate/input/node-ensemble.csv b/test/evaluate/input/node-ensemble.csv deleted file mode 100644 index ba467d55..00000000 --- a/test/evaluate/input/node-ensemble.csv +++ /dev/null @@ -1,13 +0,0 @@ -Node max_freq -C 0.75 -E 0.75 -D 0.75 -F 0.75 -A 0.5 -B 0.5 -L 0.5 -M 0.5 -O 0.25 -P 0.25 -N 0.25 -Q 0.25 diff --git a/test/evaluate/input/pca-coordinates.tsv b/test/evaluate/input/pca-coordinates.tsv deleted file mode 100644 index 92fc6b3d..00000000 --- a/test/evaluate/input/pca-coordinates.tsv +++ /dev/null @@ -1,6 +0,0 @@ -datapoint_labels PC1 PC2 -data-test-params-123 -1.3973472526239425 -1.632993161855452 -data-test-params-456 2.025440509784659 1.9566080710032526e-16 -data-test-params-789 -1.3973472526239425 1.632993161855452 -data-test-params-empty 0.7692539954632259 -4.1496185644351084e-16 -centroid -2.7755575615628914e-17 -4.822931287961988e-17 diff --git a/test/evaluate/test_evaluate.py b/test/evaluate/test_evaluate.py index 5dc0b8f3..59705989 100644 --- a/test/evaluate/test_evaluate.py +++ b/test/evaluate/test_evaluate.py @@ -19,26 +19,6 @@ def setup_class(cls): """ Path(OUT_DIR).mkdir(parents=True, exist_ok=True) - def test_node_ensemble(self): - ensemble_file = INPUT_DIR + 'ensemble-network.tsv' - edge_freq = Evaluation.edge_frequency_node_ensemble(ensemble_file) - edge_freq.to_csv(OUT_DIR + 'node-ensemble.csv', sep="\t", index=False) - assert filecmp.cmp(OUT_DIR + 'node-ensemble.csv', EXPECT_DIR + 'expected-node-ensemble.csv', shallow=False) - - def test_precision_recal_curve_ensemble_nodes(self): - out_path = Path(OUT_DIR+"test-precision-recall-curve-ensemble-nodes.png") - out_path.unlink(missing_ok=True) - ensemble_file = pd.read_csv(INPUT_DIR + 'node-ensemble.csv', sep="\t", header=0) - Evaluation.precision_recall_curve_node_ensemble(ensemble_file, NODE_TABLE, out_path) - assert out_path.exists() - - def test_precision_recal_curve_ensemble_nodes_empty(self): - out_path = Path(OUT_DIR+"test-precision-recall-curve-ensemble-nodes-empty.png") - out_path.unlink(missing_ok=True) - ensemble_file = pd.read_csv(INPUT_DIR + 'node-ensemble-empty.csv', sep="\t", header=0) - Evaluation.precision_recall_curve_node_ensemble(ensemble_file, NODE_TABLE, out_path) - assert out_path.exists() - def test_precision_recall_per_pathway(self): file_paths = [INPUT_DIR + "data-test-params-123/pathway.txt", INPUT_DIR + "data-test-params-456/pathway.txt", INPUT_DIR + "data-test-params-789/pathway.txt", INPUT_DIR + "data-test-params-empty/pathway.txt"] algorithms = ["test"] @@ -56,18 +36,4 @@ def test_precision_recall_per_pathway_empty(self): output_png = OUT_DIR + "test-precision-recall-per-pathway-empty.png" Evaluation.precision_and_recall(file_paths, NODE_TABLE, algorithms, output_file, output_png) - assert filecmp.cmp(output_file, EXPECT_DIR + 'expected-precision-recall-per-pathway-empty.txt', shallow=False) - - - def test_precision_recall_pca_chosen_pathway(self): - file_paths = [INPUT_DIR + "data-test-params-123/pathway.txt", INPUT_DIR + "data-test-params-456/pathway.txt", INPUT_DIR + "data-test-params-789/pathway.txt", INPUT_DIR + "data-test-params-empty/pathway.txt"] - algorithms = ["test"] - output_file = OUT_DIR +"test-precision-recall-per-pathway-pca-chosen.txt" - output_png = OUT_DIR + "test-precision-recall-per-pathway-pca-chosen.png" - - dataframe = ml.summarize_networks(file_paths) - ml.pca(dataframe, OUT_DIR + 'pca.png', OUT_DIR + 'pca-variance.txt', OUT_DIR + 'pca-coordinates.tsv') - - pathway = Evaluation.pca_chosen_pathway(OUT_DIR + 'pca-coordinates.tsv', INPUT_DIR) - Evaluation.precision_and_recall(pathway, NODE_TABLE, algorithms, output_file, output_png) - assert filecmp.cmp(output_file, EXPECT_DIR + 'expected-precision-recall-per-pathway-pca-chosen.txt', shallow=False) + assert filecmp.cmp(output_file, EXPECT_DIR + 'expected-precision-recall-per-pathway-empty.txt', shallow=False) \ No newline at end of file diff --git a/test/ml/expected/expected-pca-coordinates.tsv b/test/ml/expected/expected-pca-coordinates.tsv index ac10f2db..288be9c1 100644 --- a/test/ml/expected/expected-pca-coordinates.tsv +++ b/test/ml/expected/expected-pca-coordinates.tsv @@ -1,5 +1,4 @@ -datapoint_labels PC1 PC2 +algorithm PC1 PC2 test-data-s1 -2.0066502104820323 -0.9865875190637746 test-data-s2 -1.5276508866841985 1.0799457247533233 test-data-s3 3.5343010971662308 -0.09335820568954915 -centroid 0.0 -1.6190752442450199e-16 diff --git a/test/ml/test_ml.py b/test/ml/test_ml.py index b9ca69ca..cb9370ef 100644 --- a/test/ml/test_ml.py +++ b/test/ml/test_ml.py @@ -77,15 +77,14 @@ def test_pca_robustness(self): dataframe = ml.summarize_networks([INPUT_DIR + 'test-data-s1/s1.txt', INPUT_DIR + 'test-data-s2/s2.txt', INPUT_DIR + 'test-data-s3/s3.txt']) expected = pd.read_table(EXPECT_DIR + 'expected-pca-coordinates.tsv') expected = expected.round(5) - expected.sort_values(by='datapoint_labels', ignore_index=True, inplace=True) - for _ in range(5): dataframe_shuffled = dataframe.sample(frac=1, axis=1) # permute the columns ml.pca(dataframe_shuffled, OUT_DIR + 'pca-shuffled-columns.png', OUT_DIR + 'pca-shuffled-columns-variance.txt', OUT_DIR + 'pca-shuffled-columns-coordinates.tsv') coord = pd.read_table(OUT_DIR + 'pca-shuffled-columns-coordinates.tsv') coord = coord.round(5) # round values to 5 digits to account for numeric differences across machines - coord.sort_values(by='datapoint_labels', ignore_index=True, inplace=True) + coord.sort_values(by='algorithm', ignore_index=True, inplace=True) + assert coord.equals(expected) for _ in range(5): @@ -94,7 +93,7 @@ def test_pca_robustness(self): OUT_DIR + 'pca-shuffled-rows-coordinates.tsv') coord = pd.read_table(OUT_DIR + 'pca-shuffled-rows-coordinates.tsv') coord = coord.round(5) # round values to 5 digits to account for numeric differences across machines - coord.sort_values(by='datapoint_labels', ignore_index=True, inplace=True) + coord.sort_values(by='algorithm', ignore_index=True, inplace=True) assert coord.equals(expected) @@ -139,4 +138,4 @@ def test_ensemble_network_empty(self): en = pd.read_table(OUT_DIR + 'ensemble-network-empty.tsv') expected = pd.read_table(EXPECT_DIR + 'expected-ensemble-network-empty.tsv') - assert en.equals(expected) + assert en.equals(expected) \ No newline at end of file From 38b6f549aa380f7a659e5e8e9d8d77bc55866774 Mon Sep 17 00:00:00 2001 From: ntalluri Date: Mon, 3 Mar 2025 15:49:27 -0600 Subject: [PATCH 4/7] keep precision and recall only in evaluate.py --- spras/evaluation.py | 98 --------------------------------------------- 1 file changed, 98 deletions(-) diff --git a/spras/evaluation.py b/spras/evaluation.py index e6f60c0b..71bb1013 100644 --- a/spras/evaluation.py +++ b/spras/evaluation.py @@ -136,101 +136,3 @@ def precision_and_recall(file_paths: Iterable[Path], node_table: pd.DataFrame, a plt.plot([], []) plt.title("Empty Pathway Files") plt.savefig(output_png) - - - def select_max_freq_and_node(row: pd.Series): - """ - Selects the node and frequency with the highest frequency value from two potential nodes in a row. - Handles cases where one of the nodes or frequencies may be missing and returns the node associated with the maximum frequency. - """ - max_freq = 0 - node = "" - if pd.isna(row['Node2']) and pd.isna(row['Freq2']): - max_freq = row['Freq1'] - node = row['Node1'] - elif pd.isna(row['Node1']) and pd.isna(row['Freq1']): - max_freq = row['Freq2'] - node = row['Node2'] - else: - max_freq = max(row['Freq1'], row['Freq2']) - node = row['Node1'] - return node, max_freq - - def edge_frequency_node_ensemble(ensemble_file: str): - """ - Processes an ensemble of edge frequencies to identify the highest frequency associated with each node - Reads ensemble_file, separates frequencies by node, and then calculates the maximum frequency for each node. - Returns a DataFrame of nodes with their respective maximum frequencies, or an empty DataFrame if ensemble_file is empty. - @param ensemble_file: the pre-computed node_ensemble - """ - ensemble_df = pd.read_table(ensemble_file, sep="\t", header=0) - - if not ensemble_df.empty: - node1_freq = ensemble_df.drop(columns = ['Node2', 'Direction']) - node2_freq = ensemble_df.drop(columns = ['Node1', 'Direction']) - - max_node1_freq = node1_freq.groupby(['Node1']).max().reset_index() - max_node1_freq.rename(columns = {'Frequency': 'Freq1'}, inplace = True) - max_node2_freq = node2_freq.groupby(['Node2']).max().reset_index() - max_node2_freq.rename(columns = {'Frequency': 'Freq2'}, inplace = True) - - node_ensemble = max_node1_freq.merge(max_node2_freq, left_on='Node1', right_on='Node2', how='outer') - node_ensemble[['Node', 'max_freq']] = node_ensemble.apply(Evaluation.select_max_freq_and_node, axis=1, result_type='expand') - node_ensemble.drop(columns = ['Node1', 'Node2', 'Freq1', 'Freq2'], inplace = True) - node_ensemble.sort_values('max_freq', ascending= False, inplace = True) - return node_ensemble - else: - return pd.DataFrame(columns = ['Node', 'max_freq']) - - def precision_recall_curve_node_ensemble(node_ensemble:pd.DataFrame, node_table:pd.DataFrame, output_png: str): - """ - Takes in an node ensemble for specific dataset or specific algorithm in a dataset, and an associated gold standard node table. - Plots a precision and recall curve for the node ensemble against its associated gold standard node table - Returns output back to output_png - @param node_ensemble: the pre-computed node_ensemble - @param node_table: the gold standard nodes - @param output_file: the filename to save the precision and recall curves - """ - gold_standard_nodes = set(node_table['NODEID']) - - if not node_ensemble.empty: - y_true = [1 if node in gold_standard_nodes else 0 for node in node_ensemble['Node']] - y_scores = node_ensemble['max_freq'].tolist() - precision, recall, thresholds = precision_recall_curve(y_true, y_scores) - auc_precision_recall = average_precision_score(y_true, y_scores) - - plt.figure() - plt.plot(recall, precision, marker='o', label='Precision-Recall curve') - plt.axhline(y=auc_precision_recall, color='r', linestyle='--', label=f'Avg Precision: {auc_precision_recall:.4f}') - plt.xlabel('Recall') - plt.ylabel('Precision') - plt.title('Precision-Recall Curve') - plt.legend() - plt.grid(True) - plt.savefig(output_png) - else: - plt.figure() - plt.plot([], []) - plt.title("Empty Ensemble File") - plt.savefig(output_png) - - def pca_chosen_pathway(coordinates_file: str, output_dir:str): - """ - Identifies the pathway closest to a specified centroid based on PCA coordinates - Calculates the Euclidean distance from each data point to the centroid, then selects the closest pathway. - Returns the file path for the representative pathway associated with the closest data point. - @param coordinates_file: the pca coordinates file for a dataset or specific algorithm in a datset - @param output_dir: the main reconstruction directory - """ - coord_df = pd.read_csv(coordinates_file, delimiter="\t", header=0) - - centroid_row = coord_df[coord_df['datapoint_labels'] == 'centroid'] - centroid = centroid_row.iloc[0, 1:].tolist() - coord_df = coord_df[coord_df['datapoint_labels'] != 'centroid'] - - pc_columns = [col for col in coord_df.columns if col.startswith('PC')] - coord_df['Distance To Centroid'] = np.sqrt(sum((coord_df[pc] - centroid[i]) ** 2 for i, pc in enumerate(pc_columns))) - closest_to_centroid = coord_df.sort_values(by='Distance To Centroid').iloc[0] - rep_pathway = [os.path.join(output_dir, f"{closest_to_centroid['datapoint_labels']}", "pathway.txt")] - - return rep_pathway From 00e3182363d99149097f063ceb679b0344dccd0f Mon Sep 17 00:00:00 2001 From: ntalluri Date: Tue, 11 Mar 2025 13:20:04 -0500 Subject: [PATCH 5/7] reverted expected pca coordinates and precommit --- spras/analysis/ml.py | 2 +- test/evaluate/test_evaluate.py | 2 +- test/ml/expected/expected-pca-coordinates.tsv | 6 +++--- test/ml/test_ml.py | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/spras/analysis/ml.py b/spras/analysis/ml.py index 0bab3f21..3dad8775 100644 --- a/spras/analysis/ml.py +++ b/spras/analysis/ml.py @@ -346,4 +346,4 @@ def ensemble_network(dataframe: pd.DataFrame, output_file: str): lambda edge: edge.split(DIR_CONST)[1] if DIR_CONST in edge else edge.split(UNDIR_CONST)[1]) make_required_dirs(output_file) - row_means[['Node1', 'Node2', 'Frequency', "Direction"]].to_csv(output_file, sep='\t', index=False, header=True) \ No newline at end of file + row_means[['Node1', 'Node2', 'Frequency', "Direction"]].to_csv(output_file, sep='\t', index=False, header=True) diff --git a/test/evaluate/test_evaluate.py b/test/evaluate/test_evaluate.py index 59705989..413d2683 100644 --- a/test/evaluate/test_evaluate.py +++ b/test/evaluate/test_evaluate.py @@ -36,4 +36,4 @@ def test_precision_recall_per_pathway_empty(self): output_png = OUT_DIR + "test-precision-recall-per-pathway-empty.png" Evaluation.precision_and_recall(file_paths, NODE_TABLE, algorithms, output_file, output_png) - assert filecmp.cmp(output_file, EXPECT_DIR + 'expected-precision-recall-per-pathway-empty.txt', shallow=False) \ No newline at end of file + assert filecmp.cmp(output_file, EXPECT_DIR + 'expected-precision-recall-per-pathway-empty.txt', shallow=False) diff --git a/test/ml/expected/expected-pca-coordinates.tsv b/test/ml/expected/expected-pca-coordinates.tsv index 288be9c1..b6371c84 100644 --- a/test/ml/expected/expected-pca-coordinates.tsv +++ b/test/ml/expected/expected-pca-coordinates.tsv @@ -1,4 +1,4 @@ algorithm PC1 PC2 -test-data-s1 -2.0066502104820323 -0.9865875190637746 -test-data-s2 -1.5276508866841985 1.0799457247533233 -test-data-s3 3.5343010971662308 -0.09335820568954915 +test-data-s1 -2.006650210482033 -0.9865875190637743 +test-data-s2 -1.5276508866841987 1.0799457247533237 +test-data-s3 3.534301097166232 -0.0933582056895495 \ No newline at end of file diff --git a/test/ml/test_ml.py b/test/ml/test_ml.py index cb9370ef..2b5720ae 100644 --- a/test/ml/test_ml.py +++ b/test/ml/test_ml.py @@ -138,4 +138,4 @@ def test_ensemble_network_empty(self): en = pd.read_table(OUT_DIR + 'ensemble-network-empty.tsv') expected = pd.read_table(EXPECT_DIR + 'expected-ensemble-network-empty.tsv') - assert en.equals(expected) \ No newline at end of file + assert en.equals(expected) From 0ca49646aa88380407defc92c2b6a5a2b8fb221d Mon Sep 17 00:00:00 2001 From: ntalluri Date: Mon, 24 Mar 2025 13:49:09 -0500 Subject: [PATCH 6/7] update evaluation.py to be up to date --- spras/evaluation.py | 102 +++----------------------------------------- 1 file changed, 6 insertions(+), 96 deletions(-) diff --git a/spras/evaluation.py b/spras/evaluation.py index 7734902b..3879db54 100644 --- a/spras/evaluation.py +++ b/spras/evaluation.py @@ -3,8 +3,13 @@ from pathlib import Path from typing import Dict, Iterable +import matplotlib.pyplot as plt +import numpy as np import pandas as pd -from sklearn.metrics import precision_score +from sklearn.metrics import ( + precision_score, + recall_score, +) class Evaluation: @@ -131,99 +136,4 @@ def precision_and_recall(file_paths: Iterable[Path], node_table: pd.DataFrame, a plt.savefig(output_png) - def select_max_freq_and_node(row: pd.Series): - """ - Selects the node and frequency with the highest frequency value from two potential nodes in a row. - Handles cases where one of the nodes or frequencies may be missing and returns the node associated with the maximum frequency. - """ - max_freq = 0 - node = "" - if pd.isna(row['Node2']) and pd.isna(row['Freq2']): - max_freq = row['Freq1'] - node = row['Node1'] - elif pd.isna(row['Node1']) and pd.isna(row['Freq1']): - max_freq = row['Freq2'] - node = row['Node2'] - else: - max_freq = max(row['Freq1'], row['Freq2']) - node = row['Node1'] - return node, max_freq - - def edge_frequency_node_ensemble(ensemble_file: str): - """ - Processes an ensemble of edge frequencies to identify the highest frequency associated with each node - Reads ensemble_file, separates frequencies by node, and then calculates the maximum frequency for each node. - Returns a DataFrame of nodes with their respective maximum frequencies, or an empty DataFrame if ensemble_file is empty. - @param ensemble_file: the pre-computed node_ensemble - """ - ensemble_df = pd.read_table(ensemble_file, sep="\t", header=0) - - if not ensemble_df.empty: - node1_freq = ensemble_df.drop(columns = ['Node2', 'Direction']) - node2_freq = ensemble_df.drop(columns = ['Node1', 'Direction']) - - max_node1_freq = node1_freq.groupby(['Node1']).max().reset_index() - max_node1_freq.rename(columns = {'Frequency': 'Freq1'}, inplace = True) - max_node2_freq = node2_freq.groupby(['Node2']).max().reset_index() - max_node2_freq.rename(columns = {'Frequency': 'Freq2'}, inplace = True) - - node_ensemble = max_node1_freq.merge(max_node2_freq, left_on='Node1', right_on='Node2', how='outer') - node_ensemble[['Node', 'max_freq']] = node_ensemble.apply(Evaluation.select_max_freq_and_node, axis=1, result_type='expand') - node_ensemble.drop(columns = ['Node1', 'Node2', 'Freq1', 'Freq2'], inplace = True) - node_ensemble.sort_values('max_freq', ascending= False, inplace = True) - return node_ensemble - else: - return pd.DataFrame(columns = ['Node', 'max_freq']) - - def precision_recall_curve_node_ensemble(node_ensemble:pd.DataFrame, node_table:pd.DataFrame, output_png: str): - """ - Takes in an node ensemble for specific dataset or specific algorithm in a dataset, and an associated gold standard node table. - Plots a precision and recall curve for the node ensemble against its associated gold standard node table - Returns output back to output_png - @param node_ensemble: the pre-computed node_ensemble - @param node_table: the gold standard nodes - @param output_file: the filename to save the precision and recall curves - """ - gold_standard_nodes = set(node_table['NODEID']) - - if not node_ensemble.empty: - y_true = [1 if node in gold_standard_nodes else 0 for node in node_ensemble['Node']] - y_scores = node_ensemble['max_freq'].tolist() - precision, recall, thresholds = precision_recall_curve(y_true, y_scores) - auc_precision_recall = average_precision_score(y_true, y_scores) - - plt.figure() - plt.plot(recall, precision, marker='o', label='Precision-Recall curve') - plt.axhline(y=auc_precision_recall, color='r', linestyle='--', label=f'Avg Precision: {auc_precision_recall:.4f}') - plt.xlabel('Recall') - plt.ylabel('Precision') - plt.title('Precision-Recall Curve') - plt.legend() - plt.grid(True) - plt.savefig(output_png) - else: - plt.figure() - plt.plot([], []) - plt.title("Empty Ensemble File") - plt.savefig(output_png) - - def pca_chosen_pathway(coordinates_file: str, output_dir:str): - """ - Identifies the pathway closest to a specified centroid based on PCA coordinates - Calculates the Euclidean distance from each data point to the centroid, then selects the closest pathway. - Returns the file path for the representative pathway associated with the closest data point. - @param coordinates_file: the pca coordinates file for a dataset or specific algorithm in a datset - @param output_dir: the main reconstruction directory - """ - coord_df = pd.read_csv(coordinates_file, delimiter="\t", header=0) - - centroid_row = coord_df[coord_df['datapoint_labels'] == 'centroid'] - centroid = centroid_row.iloc[0, 1:].tolist() - coord_df = coord_df[coord_df['datapoint_labels'] != 'centroid'] - - pc_columns = [col for col in coord_df.columns if col.startswith('PC')] - coord_df['Distance To Centroid'] = np.sqrt(sum((coord_df[pc] - centroid[i]) ** 2 for i, pc in enumerate(pc_columns))) - closest_to_centroid = coord_df.sort_values(by='Distance To Centroid').iloc[0] - rep_pathway = [os.path.join(output_dir, f"{closest_to_centroid['datapoint_labels']}", "pathway.txt")] - return rep_pathway From b483be73fcf97b59653823f92915df26cd47b016 Mon Sep 17 00:00:00 2001 From: ntalluri Date: Mon, 24 Mar 2025 13:49:28 -0500 Subject: [PATCH 7/7] readd evaluation tests --- ...ted-precision-recall-per-pathway-empty.txt | 2 ++ .../expected-precision-recall-per-pathway.txt | 5 +++++ .../input/data-test-params-123/pathway.txt | 3 +++ .../input/data-test-params-456/pathway.txt | 2 ++ .../input/data-test-params-789/pathway.txt | 3 +++ .../input/data-test-params-empty/pathway.txt | 1 + test/evaluate/input/node_table.csv | 4 ++++ test/evaluate/test_evaluate.py | 22 ++++++++++++------- 8 files changed, 34 insertions(+), 8 deletions(-) create mode 100644 test/evaluate/expected/expected-precision-recall-per-pathway-empty.txt create mode 100644 test/evaluate/expected/expected-precision-recall-per-pathway.txt create mode 100644 test/evaluate/input/data-test-params-123/pathway.txt create mode 100644 test/evaluate/input/data-test-params-456/pathway.txt create mode 100644 test/evaluate/input/data-test-params-789/pathway.txt create mode 100644 test/evaluate/input/data-test-params-empty/pathway.txt create mode 100644 test/evaluate/input/node_table.csv diff --git a/test/evaluate/expected/expected-precision-recall-per-pathway-empty.txt b/test/evaluate/expected/expected-precision-recall-per-pathway-empty.txt new file mode 100644 index 00000000..6c97ff7e --- /dev/null +++ b/test/evaluate/expected/expected-precision-recall-per-pathway-empty.txt @@ -0,0 +1,2 @@ +Pathway Precision Recall +test/evaluate/input/data-test-params-empty/pathway.txt 0.0 0.0 diff --git a/test/evaluate/expected/expected-precision-recall-per-pathway.txt b/test/evaluate/expected/expected-precision-recall-per-pathway.txt new file mode 100644 index 00000000..02e17a7c --- /dev/null +++ b/test/evaluate/expected/expected-precision-recall-per-pathway.txt @@ -0,0 +1,5 @@ +Pathway Precision Recall +test/evaluate/input/data-test-params-456/pathway.txt 0.0 0.0 +test/evaluate/input/data-test-params-empty/pathway.txt 0.0 0.0 +test/evaluate/input/data-test-params-123/pathway.txt 0.6666666666666666 0.6666666666666666 +test/evaluate/input/data-test-params-789/pathway.txt 1.0 1.0 diff --git a/test/evaluate/input/data-test-params-123/pathway.txt b/test/evaluate/input/data-test-params-123/pathway.txt new file mode 100644 index 00000000..21768464 --- /dev/null +++ b/test/evaluate/input/data-test-params-123/pathway.txt @@ -0,0 +1,3 @@ +Node1 Node2 Rank Direction +A B 1 U +B C 1 U diff --git a/test/evaluate/input/data-test-params-456/pathway.txt b/test/evaluate/input/data-test-params-456/pathway.txt new file mode 100644 index 00000000..d445d80f --- /dev/null +++ b/test/evaluate/input/data-test-params-456/pathway.txt @@ -0,0 +1,2 @@ +Node1 Node2 Rank Direction +F L 1 U diff --git a/test/evaluate/input/data-test-params-789/pathway.txt b/test/evaluate/input/data-test-params-789/pathway.txt new file mode 100644 index 00000000..352698a0 --- /dev/null +++ b/test/evaluate/input/data-test-params-789/pathway.txt @@ -0,0 +1,3 @@ +Node1 Node2 Rank Direction +A B 1 U +B Q 1 U diff --git a/test/evaluate/input/data-test-params-empty/pathway.txt b/test/evaluate/input/data-test-params-empty/pathway.txt new file mode 100644 index 00000000..63fda2b1 --- /dev/null +++ b/test/evaluate/input/data-test-params-empty/pathway.txt @@ -0,0 +1 @@ +Node1 Node2 Rank Direction \ No newline at end of file diff --git a/test/evaluate/input/node_table.csv b/test/evaluate/input/node_table.csv new file mode 100644 index 00000000..5b9cd41b --- /dev/null +++ b/test/evaluate/input/node_table.csv @@ -0,0 +1,4 @@ +NODEID +A +B +Q \ No newline at end of file diff --git a/test/evaluate/test_evaluate.py b/test/evaluate/test_evaluate.py index 73acfaf1..413d2683 100644 --- a/test/evaluate/test_evaluate.py +++ b/test/evaluate/test_evaluate.py @@ -19,15 +19,21 @@ def setup_class(cls): """ Path(OUT_DIR).mkdir(parents=True, exist_ok=True) - def test_precision_recall_pca_chosen_pathway(self): + def test_precision_recall_per_pathway(self): file_paths = [INPUT_DIR + "data-test-params-123/pathway.txt", INPUT_DIR + "data-test-params-456/pathway.txt", INPUT_DIR + "data-test-params-789/pathway.txt", INPUT_DIR + "data-test-params-empty/pathway.txt"] algorithms = ["test"] - output_file = OUT_DIR +"test-precision-recall-per-pathway-pca-chosen.txt" - output_png = OUT_DIR + "test-precision-recall-per-pathway-pca-chosen.png" + output_file = OUT_DIR + "test-precision-recall-per-pathway.txt" + output_png = OUT_DIR + "test-precision-recall-per-pathway.png" - dataframe = ml.summarize_networks(file_paths) - ml.pca(dataframe, OUT_DIR + 'pca.png', OUT_DIR + 'pca-variance.txt', OUT_DIR + 'pca-coordinates.tsv') + Evaluation.precision_and_recall(file_paths, NODE_TABLE, algorithms, output_file, output_png) + assert filecmp.cmp(output_file, EXPECT_DIR + 'expected-precision-recall-per-pathway.txt', shallow=False) - pathway = Evaluation.pca_chosen_pathway(OUT_DIR + 'pca-coordinates.tsv', INPUT_DIR) - Evaluation.precision_and_recall(pathway, NODE_TABLE, algorithms, output_file, output_png) - assert filecmp.cmp(output_file, EXPECT_DIR + 'expected-precision-recall-per-pathway-pca-chosen.txt', shallow=False) + def test_precision_recall_per_pathway_empty(self): + + file_paths = [INPUT_DIR + "data-test-params-empty/pathway.txt"] + algorithms = ["test"] + output_file = OUT_DIR +"test-precision-recall-per-pathway-empty.txt" + output_png = OUT_DIR + "test-precision-recall-per-pathway-empty.png" + + Evaluation.precision_and_recall(file_paths, NODE_TABLE, algorithms, output_file, output_png) + assert filecmp.cmp(output_file, EXPECT_DIR + 'expected-precision-recall-per-pathway-empty.txt', shallow=False)