
Param tuning code integration: no param tuning #208

Open · wants to merge 11 commits into main
42 changes: 35 additions & 7 deletions Snakefile
@@ -105,8 +105,11 @@ def make_final_input(wildcards):
         final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-ensemble-pathway.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms))

     if _config.config.analysis_include_evaluation:
-        final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-evaluation.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs,algorithm_params=algorithms_with_params))
-
+        final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}pr-per-pathway.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs,algorithm_params=algorithms_with_params))
+        final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}pr-per-pathway.png',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs))
+    if _config.config.analysis_include_evaluation_aggregate_algo:
+        final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}pr-per-pathway-per-{algorithm}.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs,algorithm=algorithms))
+        final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}pr-per-pathway-per-{algorithm}.png',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs,algorithm=algorithms))
     if len(final_input) == 0:
         # No analysis added yet, so add reconstruction output files if they exist.
         # (if analysis is specified, these should be implicitly run).
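For orientation (not part of the diff), a minimal sketch of the evaluation targets these `expand()` calls request. The output directory, separator, pair label, and algorithm names below are made up; in a real run they come from the user's config.

```python
# Sketch only: out_dir, SEP, the dataset-gold standard pair, and algorithm names are hypothetical.
from snakemake.io import expand

out_dir, SEP = 'output', '/'
dataset_gold_standard_pairs = ['data1-gs1']
algorithms = ['pathlinker', 'omicsintegrator1']

# Per-dataset targets requested when analysis_include_evaluation is set
expand('{out_dir}{sep}{pair}-eval{sep}pr-per-pathway.txt', out_dir=out_dir, sep=SEP, pair=dataset_gold_standard_pairs)
# -> ['output/data1-gs1-eval/pr-per-pathway.txt'] (plus the matching .png target)

# Per-algorithm targets requested when analysis_include_evaluation_aggregate_algo is also set
expand('{out_dir}{sep}{pair}-eval{sep}pr-per-pathway-per-{algorithm}.txt', out_dir=out_dir, sep=SEP, pair=dataset_gold_standard_pairs, algorithm=algorithms)
# -> one file per algorithm, e.g. 'output/data1-gs1-eval/pr-per-pathway-per-pathlinker.txt'
```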
@@ -372,15 +375,40 @@ def get_dataset_label(wildcards):
     dataset = parts[0]
     return dataset

-# Run evaluation code for a specific dataset's pathway outputs against its paired gold standard
-rule evaluation:
+# Returns all pathways for a specific dataset
+def collect_pathways_per_dataset(wildcards):
+    dataset_label = get_dataset_label(wildcards)
+    return expand('{out_dir}{sep}{dataset_label}-{algorithm_params}{sep}pathway.txt', out_dir=out_dir, sep=SEP, algorithm_params=algorithms_with_params, dataset_label=dataset_label)
+
+# Run precision and recall for all pathway outputs for a dataset against its paired gold standard
+rule evaluation_pr_per_pathways:
Comment on lines +383 to +384

Collaborator: Is this now the behavior you switched to in #212?

Collaborator (Author): No, this one is slightly different. This rule gives the precision and recall for each output subnetwork/pathway. The one in #212 will use the frequencies of the nodes from all of the outputs per algorithm and create PR curves using that.
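To make the distinction concrete, a toy sketch with made-up node sets; the frequency-based approach is only paraphrased from this comment, not taken from #212.

```python
# Sketch only: toy gold standard and one hypothetical output pathway.
gold = {'A', 'B', 'Q'}

# This PR: each output pathway yields a single precision/recall point (no thresholds).
pathway_nodes = {'A', 'B', 'C'}
tp = len(pathway_nodes & gold)
precision = tp / len(pathway_nodes)  # 2/3
recall = tp / len(gold)              # 2/3

# #212 (paraphrased): node frequencies across all of an algorithm's outputs act as scores,
# and sweeping a frequency threshold traces out a full precision-recall curve.
node_frequencies = {'A': 1.0, 'B': 0.75, 'C': 0.5, 'D': 0.25}  # hypothetical ensemble frequencies
```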

+    input:
+        gold_standard_file = get_gold_standard_pickle_file,
+        pathways = collect_pathways_per_dataset
+    output:
+        pr_file = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', "pr-per-pathway.txt"]),
+        pr_png = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', 'pr-per-pathway.png']),
+    run:
+        node_table = Evaluation.from_file(input.gold_standard_file).node_table
+        Evaluation.precision_and_recall(input.pathways, node_table, algorithms, output.pr_file, output.pr_png)
+
+# Returns all pathways for a specific algorithm and dataset
+def collect_pathways_per_algo_per_dataset(wildcards):
+    dataset_label = get_dataset_label(wildcards)
+    filtered_algo_params = [algo_param for algo_param in algorithms_with_params if wildcards.algorithm in algo_param]
+    return expand('{out_dir}{sep}{dataset_label}-{algorithm_params}{sep}pathway.txt', out_dir=out_dir, sep=SEP, algorithm_params=filtered_algo_params, dataset_label= dataset_label)
+
+# Run precision and recall per algorithm for all pathway outputs for a dataset against its paired gold standard
+rule evaluation_per_algo_pr_per_pathways:
     input:
         gold_standard_file = get_gold_standard_pickle_file,
-        pathways = expand('{out_dir}{sep}{dataset_label}-{algorithm_params}{sep}pathway.txt', out_dir=out_dir, sep=SEP, algorithm_params=algorithms_with_params, dataset_label=get_dataset_label),
-    output: eval_file = SEP.join([out_dir, "{dataset_gold_standard_pairs}-evaluation.txt"])
+        pathways = collect_pathways_per_algo_per_dataset,
+    output:
+        pr_file = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', "pr-per-pathway-per-{algorithm}.txt"]),
+        pr_png = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', 'pr-per-pathway-per-{algorithm}.png']),
     run:
         node_table = Evaluation.from_file(input.gold_standard_file).node_table
-        Evaluation.precision(input.pathways, node_table, output.eval_file)
+        Evaluation.precision_and_recall(input.pathways, node_table, algorithms, output.pr_file, output.pr_png)

 # Remove the output directory
 rule clean:
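A quick sketch (outside the diff) of how the per-algorithm input function narrows the pathway list. The wildcard values and parameter labels are hypothetical, and the dataset-label parsing assumes the pair label starts with the dataset name, mirroring `get_dataset_label`.

```python
# Sketch only: stand-ins for Snakemake wildcards and the global algorithms_with_params list.
algorithms_with_params = ['pathlinker-params-ABC123', 'pathlinker-params-DEF456', 'omicsintegrator1-params-XYZ789']

class FakeWildcards:
    dataset_gold_standard_pairs = 'data1-gs1'
    algorithm = 'pathlinker'

wildcards = FakeWildcards()

dataset_label = wildcards.dataset_gold_standard_pairs.split('-')[0]  # 'data1'
# Same substring filter used by collect_pathways_per_algo_per_dataset
filtered = [p for p in algorithms_with_params if wildcards.algorithm in p]
pathways = [f"output/{dataset_label}-{p}/pathway.txt" for p in filtered]
# ['output/data1-pathlinker-params-ABC123/pathway.txt', 'output/data1-pathlinker-params-DEF456/pathway.txt']
```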
70 changes: 59 additions & 11 deletions spras/evaluation.py
@@ -3,8 +3,15 @@
 from pathlib import Path
 from typing import Dict, Iterable

+import matplotlib.pyplot as plt
+import numpy as np
 import pandas as pd
-from sklearn.metrics import precision_score
+from sklearn.metrics import (
+    precision_score,
+    recall_score,
+)
+
+from spras.analysis.ml import create_palette


 class Evaluation:
@@ -72,29 +79,70 @@ def load_files_from_dict(self, gold_standard_dict: Dict):
         # TODO: later iteration - chose between node and edge file, or allow both

     @staticmethod
-    def precision(file_paths: Iterable[Path], node_table: pd.DataFrame, output_file: str):
+    def precision_and_recall(file_paths: Iterable[Path], node_table: pd.DataFrame, algorithms: list, output_file: str, output_png:str=None):
         """
         Takes in file paths for a specific dataset and an associated gold standard node table.
-        Calculates precision for each pathway file
+        Calculates precision and recall for each pathway file
         Returns output back to output_file
         @param file_paths: file paths of pathway reconstruction algorithm outputs
         @param node_table: the gold standard nodes
-        @param output_file: the filename to save the precision of each pathway
+        @param algorithms: list of algorithms used in current run of SPRAS
+        @param output_file: the filename to save the precision and recall of each pathway
+        @param output_png (optional): the filename to plot the precision and recall of each pathway (not a PRC)
         """
         y_true = set(node_table['NODEID'])
         results = []

-        for file in file_paths:
-            df = pd.read_table(file, sep="\t", header=0, usecols=["Node1", "Node2"])
+        for f in file_paths:
+            df = pd.read_table(f, sep="\t", header=0, usecols=["Node1", "Node2"])
             y_pred = set(df['Node1']).union(set(df['Node2']))
             all_nodes = y_true.union(y_pred)
             y_true_binary = [1 if node in y_true else 0 for node in all_nodes]
             y_pred_binary = [1 if node in y_pred else 0 for node in all_nodes]

             # default to 0.0 if there is a divide by 0 error
+            # not using precision_recall_curve because thresholds are binary (0 or 1); rather we are directly calculating precision and recall per pathway
             precision = precision_score(y_true_binary, y_pred_binary, zero_division=0.0)
+            recall = recall_score(y_true_binary, y_pred_binary, zero_division=0.0)
+            results.append({"Pathway": f, "Precision": precision, "Recall": recall})
+
+        pr_df = pd.DataFrame(results)
+
+        if not pr_df.empty:
+            pr_df.sort_values(by=["Recall", "Pathway"], axis=0, ascending=True, inplace=True)
+            pr_df.to_csv(output_file, sep="\t", index=False)
+            if output_png is not None:
+                plt.figure(figsize=(10, 7))
+                color_palette = create_palette(algorithms)
+                # plot a line per algorithm
+                for algorithm in algorithms:
+                    subset = pr_df[pr_df["Pathway"].str.contains(algorithm)]
+                    if not subset.empty:
+                        plt.plot(
+                            subset["Recall"],
+                            subset["Precision"],
+                            color = color_palette[algorithm],
+                            marker='o',
+                            linestyle='',
+                            label=f"{algorithm}"
+                        )
+
+                plt.xlabel("Recall")
+                plt.ylabel("Precision")
+                plt.xlim(-0.05, 1.05)
+                plt.ylim(-0.05, 1.05)
+                plt.title("Precision and Recall Plot")
+                plt.legend()
+                plt.grid(True)
+                plt.savefig(output_png)
+                plt.close()
+        else: # TODO: I don't think this case will ever hit
+            pr_df.to_csv(output_file, sep="\t", index=False)
+            if output_png is not None:
+                plt.figure(figsize=(10, 7))
+                plt.plot([], [], label="No Pathways Given")
+                plt.title("Precision and Recall Plot")
+                plt.legend()
+                plt.savefig(output_png)
+                plt.close()
+
-
-            results.append({"Pathway": file, "Precision": precision})
-
-        precision_df = pd.DataFrame(results)
-        precision_df.to_csv(output_file, sep="\t", index=False)
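For reference, a minimal standalone call to the new method; the file paths and algorithm name are placeholders, and the tests below exercise it the same way.

```python
import pandas as pd

from spras.evaluation import Evaluation

# Placeholder gold standard nodes and pathway files from a single hypothetical run
node_table = pd.DataFrame({'NODEID': ['A', 'B', 'Q']})
pathway_files = ['output/data1-pathlinker-params-ABC123/pathway.txt',
                 'output/data1-pathlinker-params-DEF456/pathway.txt']

Evaluation.precision_and_recall(pathway_files, node_table, ['pathlinker'],
                                'pr-per-pathway.txt', 'pr-per-pathway.png')
```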
2 changes: 2 additions & 0 deletions test/evaluate/expected/expected-precision-recall-per-pathway-empty.txt
@@ -0,0 +1,2 @@
Pathway Precision Recall
test/evaluate/input/data-test-params-empty/pathway.txt 0.0 0.0
1 change: 1 addition & 0 deletions test/evaluate/expected/expected-precision-recall-per-pathway-nothing.txt
@@ -0,0 +1 @@

5 changes: 5 additions & 0 deletions test/evaluate/expected/expected-precision-recall-per-pathway.txt
@@ -0,0 +1,5 @@
Pathway Precision Recall
test/evaluate/input/data-test-params-456/pathway.txt 0.0 0.0
test/evaluate/input/data-test-params-empty/pathway.txt 0.0 0.0
test/evaluate/input/data-test-params-123/pathway.txt 0.6666666666666666 0.6666666666666666
test/evaluate/input/data-test-params-789/pathway.txt 1.0 1.0
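For context, these expected values follow directly from the gold standard nodes {A, B, Q} in test/evaluate/input/node_table.csv and the node set of each test pathway, e.g. for data-test-params-123:

```python
gold = {'A', 'B', 'Q'}       # test/evaluate/input/node_table.csv
predicted = {'A', 'B', 'C'}  # nodes of edges A-B and B-C in data-test-params-123/pathway.txt

tp = len(predicted & gold)       # 2 (A and B)
precision = tp / len(predicted)  # 2/3 = 0.666...
recall = tp / len(gold)          # 2/3 = 0.666...
# data-test-params-789 recovers exactly {A, B, Q}, so it scores 1.0/1.0;
# data-test-params-456 ({F, L}) and the empty pathway share no nodes with the gold standard, so 0.0/0.0.
```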
3 changes: 3 additions & 0 deletions test/evaluate/input/data-test-params-123/pathway.txt
@@ -0,0 +1,3 @@
Node1 Node2 Rank Direction
A B 1 U
B C 1 U
2 changes: 2 additions & 0 deletions test/evaluate/input/data-test-params-456/pathway.txt
@@ -0,0 +1,2 @@
Node1 Node2 Rank Direction
F L 1 U
3 changes: 3 additions & 0 deletions test/evaluate/input/data-test-params-789/pathway.txt
@@ -0,0 +1,3 @@
Node1 Node2 Rank Direction
A B 1 U
B Q 1 U
1 change: 1 addition & 0 deletions test/evaluate/input/data-test-params-empty/pathway.txt
@@ -0,0 +1 @@
Node1 Node2 Rank Direction
4 changes: 4 additions & 0 deletions test/evaluate/input/node_table.csv
@@ -0,0 +1,4 @@
NODEID
A
B
Q
49 changes: 49 additions & 0 deletions test/evaluate/test_evaluate.py
@@ -0,0 +1,49 @@
import filecmp
from pathlib import Path

import pandas as pd
import pytest

import spras.analysis.ml as ml
from spras.evaluation import Evaluation

INPUT_DIR = 'test/evaluate/input/'
OUT_DIR = 'test/evaluate/output/'
EXPECT_DIR = 'test/evaluate/expected/'
NODE_TABLE = pd.read_csv(INPUT_DIR + "node_table.csv", header=0)
class TestEvaluate:
    @classmethod
    def setup_class(cls):
        """
        Create the expected output directory
        """
        Path(OUT_DIR).mkdir(parents=True, exist_ok=True)

    def test_precision_recall_per_pathway(self):
        file_paths = [INPUT_DIR + "data-test-params-123/pathway.txt", INPUT_DIR + "data-test-params-456/pathway.txt", INPUT_DIR + "data-test-params-789/pathway.txt", INPUT_DIR + "data-test-params-empty/pathway.txt"]
        algorithms = ["test"]
        output_file = OUT_DIR + "test-precision-recall-per-pathway.txt"
        output_png = OUT_DIR + "test-precision-recall-per-pathway.png"

        Evaluation.precision_and_recall(file_paths, NODE_TABLE, algorithms, output_file, output_png)
        assert filecmp.cmp(output_file, EXPECT_DIR + 'expected-precision-recall-per-pathway.txt', shallow=False)

    def test_precision_recall_per_pathway_empty(self):

        file_paths = [INPUT_DIR + "data-test-params-empty/pathway.txt"]
        algorithms = ["test"]
        output_file = OUT_DIR + "test-precision-recall-per-pathway-empty.txt"
        output_png = OUT_DIR + "test-precision-recall-per-pathway-empty.png"

        Evaluation.precision_and_recall(file_paths, NODE_TABLE, algorithms, output_file, output_png)
        assert filecmp.cmp(output_file, EXPECT_DIR + 'expected-precision-recall-per-pathway-empty.txt', shallow=False)

    def test_precision_recall_per_pathway_nothing(self):
Collaborator (Author): I don't think this case should ever happen in the wild, but if it does, something is wrong.


        file_paths = []
        algorithms = []
        output_file = OUT_DIR + "test-precision-recall-per-pathway-nothing.txt"
        output_png = OUT_DIR + "test-precision-recall-per-pathway-nothing.png"

        Evaluation.precision_and_recall(file_paths, NODE_TABLE, algorithms, output_file, output_png)
        assert filecmp.cmp(output_file, EXPECT_DIR + 'expected-precision-recall-per-pathway-nothing.txt', shallow=False)
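Since the assertions use filecmp.cmp(..., shallow=False), the generated files must match the expected files byte for byte. One way to run just this module from the repository root (a standard pytest invocation, not something the PR adds):

```python
import pytest

# The relative INPUT_DIR/OUT_DIR/EXPECT_DIR paths assume the working directory is the repo root.
pytest.main(["-v", "test/evaluate/test_evaluate.py"])
```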