Param tuning code integration: no param tuning #208

Open · wants to merge 8 commits into master
33 changes: 28 additions & 5 deletions Snakefile
@@ -105,8 +105,11 @@ def make_final_input(wildcards):
        final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-ensemble-pathway.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms))

    if _config.config.analysis_include_evaluation:
        final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-evaluation.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs,algorithm_params=algorithms_with_params))

        final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}precision-recall-per-pathway.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs,algorithm_params=algorithms_with_params))
        final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}precision-recall-per-pathway.png',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs))
    if _config.config.analysis_include_evaluation_aggregate_algo:
        final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}{algorithm}-precision-recall-per-pathway.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs,algorithm=algorithms))
        final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-eval{sep}{algorithm}-precision-recall-per-pathway.png',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs,algorithm=algorithms))

    if len(final_input) == 0:
        # No analysis added yet, so add reconstruction output files if they exist.
        # (if analysis is specified, these should be implicitly run).
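For reference, Snakemake's expand takes the cross product of its keyword arguments, so each dataset-gold-standard pair gets a fixed set of evaluation targets. A sketch of the resulting paths (names here are hypothetical, assuming out_dir='output', SEP='/', one pair 'data0-gs0', and algorithms=['pathlinker']):

# Hypothetical targets added by the two evaluation blocks above:
#   output/data0-gs0-eval/precision-recall-per-pathway.txt
#   output/data0-gs0-eval/precision-recall-per-pathway.png
#   output/data0-gs0-eval/pathlinker-precision-recall-per-pathway.txt
#   output/data0-gs0-eval/pathlinker-precision-recall-per-pathway.png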
@@ -372,15 +375,35 @@ def get_dataset_label(wildcards):
    dataset = parts[0]
    return dataset

# Run evaluation code for a specific dataset's pathway outputs against its paired gold standard
# Run evaluation for all pathway outputs, ensemble.txt, and pca_coordinates.txt for a dataset against its paired gold standard
rule evaluation:
    input:
        gold_standard_file = get_gold_standard_pickle_file,
        pathways = expand('{out_dir}{sep}{dataset_label}-{algorithm_params}{sep}pathway.txt', out_dir=out_dir, sep=SEP, algorithm_params=algorithms_with_params, dataset_label=get_dataset_label),
    output: eval_file = SEP.join([out_dir, "{dataset_gold_standard_pairs}-evaluation.txt"])
    output:
        pr_file = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', "precision-recall-per-pathway.txt"]),
        pr_png = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', 'precision-recall-per-pathway.png']),
    run:
        node_table = Evaluation.from_file(input.gold_standard_file).node_table
        Evaluation.precision_and_recall(input.pathways, node_table, algorithms, output.pr_file, output.pr_png)

# Returns all pathways for a specific algorithm and dataset
def collect_pathways_per_algo_per_dataset(wildcards):
    dataset_label = get_dataset_label(wildcards)
    filtered_algo_params = [algo_param for algo_param in algorithms_with_params if wildcards.algorithm in algo_param]
    return expand('{out_dir}{sep}{dataset_label}-{algorithm_params}{sep}pathway.txt', out_dir=out_dir, sep=SEP, algorithm_params=filtered_algo_params, dataset_label=dataset_label)
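The filter above relies on a substring match between the algorithm wildcard and each algorithm-parameter label. A minimal sketch of that behavior (algorithm names and parameter hashes are hypothetical):

# Hypothetical illustration of the substring filter:
algorithms_with_params = ['pathlinker-params-ABC123', 'omicsintegrator1-params-DEF456']
algorithm = 'pathlinker'
filtered = [ap for ap in algorithms_with_params if algorithm in ap]
print(filtered)  # ['pathlinker-params-ABC123']
# Caveat: a plain substring test would also match an algorithm whose name
# contains another algorithm's name, so prefix matching would be stricter.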

# Run evaluation per algorithm for all associated pathway outputs for a dataset against its paired gold standard
rule evaluation_per_algo_pathways:
    input:
        gold_standard_file = get_gold_standard_pickle_file,
        pathways = collect_pathways_per_algo_per_dataset,
    output:
        pr_file = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', "{algorithm}-precision-recall-per-pathway.txt"]),
        pr_png = SEP.join([out_dir, '{dataset_gold_standard_pairs}-eval', '{algorithm}-precision-recall-per-pathway.png']),
    run:
        node_table = Evaluation.from_file(input.gold_standard_file).node_table
        Evaluation.precision(input.pathways, node_table, output.eval_file)
        Evaluation.precision_and_recall(input.pathways, node_table, algorithms, output.pr_file, output.pr_png)

# Remove the output directory
rule clean:
57 changes: 48 additions & 9 deletions spras/evaluation.py
@@ -3,8 +3,13 @@
from pathlib import Path
from typing import Dict, Iterable

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import precision_score
from sklearn.metrics import (
    precision_score,
    recall_score,
)


class Evaluation:
@@ -72,29 +77,63 @@ def load_files_from_dict(self, gold_standard_dict: Dict):
    # TODO: later iteration - choose between node and edge file, or allow both

    @staticmethod
    def precision(file_paths: Iterable[Path], node_table: pd.DataFrame, output_file: str):
    def precision_and_recall(file_paths: Iterable[Path], node_table: pd.DataFrame, algorithms: list, output_file: str, output_png: str = None):
        """
        Takes in file paths for a specific dataset and an associated gold standard node table.
        Calculates precision for each pathway file
        Calculates precision and recall for each pathway file
        Returns output back to output_file
        @param file_paths: file paths of pathway reconstruction algorithm outputs
        @param node_table: the gold standard nodes
        @param output_file: the filename to save the precision of each pathway
        @param algorithms: list of algorithms used in current run of SPRAS
        @param output_file: the filename to save the precision and recall of each pathway
        @param output_png (optional): the filename to plot the precision and recall of each pathway (not a PRC)
        """
        y_true = set(node_table['NODEID'])
        results = []

        for file in file_paths:
            df = pd.read_table(file, sep="\t", header=0, usecols=["Node1", "Node2"])
            # TODO: do we want to include the pathways that are empty for evaluation / in the pr_df?
            y_pred = set(df['Node1']).union(set(df['Node2']))
            all_nodes = y_true.union(y_pred)
            y_true_binary = [1 if node in y_true else 0 for node in all_nodes]
            y_pred_binary = [1 if node in y_pred else 0 for node in all_nodes]
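            # Over the union of gold-standard and predicted nodes, these binary vectors
            # give precision = |y_true ∩ y_pred| / |y_pred| and recall = |y_true ∩ y_pred| / |y_true|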

            # default to 0.0 if there is a divide by 0 error
            # not using precision_recall_curve because thresholds are binary (0 or 1); instead we directly calculate precision and recall per pathway
            precision = precision_score(y_true_binary, y_pred_binary, zero_division=0.0)
            recall = recall_score(y_true_binary, y_pred_binary, zero_division=0.0)
            results.append({"Pathway": file, "Precision": precision, "Recall": recall})

        pr_df = pd.DataFrame(results)
        pr_df.sort_values(by=["Recall", "Pathway"], axis=0, ascending=True, inplace=True)
        pr_df.to_csv(output_file, sep="\t", index=False)

        if output_png is not None:
            if not pr_df.empty:
                plt.figure(figsize=(8, 6))
                # plot a line per algorithm
                for algorithm in algorithms:
                    subset = pr_df[pr_df["Pathway"].str.contains(algorithm)]
                    if not subset.empty:
                        plt.plot(
                            subset["Recall"],
                            subset["Precision"],
                            marker='o',
                            linestyle='',
                            label=f"{algorithm}"
                        )

                plt.xlabel("Recall")
                plt.ylabel("Precision")
                plt.title("Precision and Recall Plot")
                plt.legend()
                plt.grid(True)
                plt.savefig(output_png)
            else:
                plt.figure()
                plt.plot([], [])
                plt.title("Empty Pathway Files")
                plt.savefig(output_png)

        results.append({"Pathway": file, "Precision": precision})

        precision_df = pd.DataFrame(results)
        precision_df.to_csv(output_file, sep="\t", index=False)
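To make the metric concrete, here is a worked example using the test fixtures below (gold standard nodes A, B, Q; data-test-params-123 contains edges A-B and B-C), sketched in plain Python:

# Worked example mirroring the expected test output below:
y_true = {'A', 'B', 'Q'}       # gold standard node table
y_pred = {'A', 'B', 'C'}       # nodes in data-test-params-123/pathway.txt
tp = len(y_true & y_pred)      # 2 true positives (A and B)
precision = tp / len(y_pred)   # 2/3 = 0.666...
recall = tp / len(y_true)      # 2/3 = 0.666...
# matches the 0.6666666666666666 row in expected-precision-recall-per-pathway.txt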
2 changes: 2 additions & 0 deletions test/evaluate/expected/expected-precision-recall-per-pathway-empty.txt
@@ -0,0 +1,2 @@
Pathway Precision Recall
test/evaluate/input/data-test-params-empty/pathway.txt 0.0 0.0
5 changes: 5 additions & 0 deletions test/evaluate/expected/expected-precision-recall-per-pathway.txt
@@ -0,0 +1,5 @@
Pathway Precision Recall
test/evaluate/input/data-test-params-456/pathway.txt 0.0 0.0
test/evaluate/input/data-test-params-empty/pathway.txt 0.0 0.0
test/evaluate/input/data-test-params-123/pathway.txt 0.6666666666666666 0.6666666666666666
test/evaluate/input/data-test-params-789/pathway.txt 1.0 1.0
3 changes: 3 additions & 0 deletions test/evaluate/input/data-test-params-123/pathway.txt
@@ -0,0 +1,3 @@
Node1 Node2 Rank Direction
A B 1 U
B C 1 U
2 changes: 2 additions & 0 deletions test/evaluate/input/data-test-params-456/pathway.txt
@@ -0,0 +1,2 @@
Node1 Node2 Rank Direction
F L 1 U
3 changes: 3 additions & 0 deletions test/evaluate/input/data-test-params-789/pathway.txt
@@ -0,0 +1,3 @@
Node1 Node2 Rank Direction
A B 1 U
B Q 1 U
1 change: 1 addition & 0 deletions test/evaluate/input/data-test-params-empty/pathway.txt
@@ -0,0 +1 @@
Node1 Node2 Rank Direction
4 changes: 4 additions & 0 deletions test/evaluate/input/node_table.csv
@@ -0,0 +1,4 @@
NODEID
A
B
Q
39 changes: 39 additions & 0 deletions test/evaluate/test_evaluate.py
@@ -0,0 +1,39 @@
import filecmp
from pathlib import Path

import pandas as pd
import pytest

import spras.analysis.ml as ml
from spras.evaluation import Evaluation

INPUT_DIR = 'test/evaluate/input/'
OUT_DIR = 'test/evaluate/output/'
EXPECT_DIR = 'test/evaluate/expected/'
NODE_TABLE = pd.read_csv(INPUT_DIR + "node_table.csv", header=0)


class TestEvaluate:
    @classmethod
    def setup_class(cls):
        """
        Create the expected output directory
        """
        Path(OUT_DIR).mkdir(parents=True, exist_ok=True)

    def test_precision_recall_per_pathway(self):
        file_paths = [INPUT_DIR + "data-test-params-123/pathway.txt", INPUT_DIR + "data-test-params-456/pathway.txt", INPUT_DIR + "data-test-params-789/pathway.txt", INPUT_DIR + "data-test-params-empty/pathway.txt"]
        algorithms = ["test"]
        output_file = OUT_DIR + "test-precision-recall-per-pathway.txt"
        output_png = OUT_DIR + "test-precision-recall-per-pathway.png"

        Evaluation.precision_and_recall(file_paths, NODE_TABLE, algorithms, output_file, output_png)
        assert filecmp.cmp(output_file, EXPECT_DIR + 'expected-precision-recall-per-pathway.txt', shallow=False)

    def test_precision_recall_per_pathway_empty(self):
        file_paths = [INPUT_DIR + "data-test-params-empty/pathway.txt"]
        algorithms = ["test"]
        output_file = OUT_DIR + "test-precision-recall-per-pathway-empty.txt"
        output_png = OUT_DIR + "test-precision-recall-per-pathway-empty.png"

        Evaluation.precision_and_recall(file_paths, NODE_TABLE, algorithms, output_file, output_png)
        assert filecmp.cmp(output_file, EXPECT_DIR + 'expected-precision-recall-per-pathway-empty.txt', shallow=False)
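To run just these tests locally (assuming a SPRAS environment with pytest installed):

pytest test/evaluate/test_evaluate.py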