Skip to content

Algorithm similarity eval - general implementation #214

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 12 commits into
base: master
Choose a base branch
from
17 changes: 17 additions & 0 deletions Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,8 @@ def make_final_input(wildcards):
final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}hac-horizontal.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params))
final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}hac-clusters-horizontal.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params))
final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}ensemble-pathway.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params))
final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}jaccard-matrix.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params))
final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}jaccard-heatmap.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params))

if _config.config.analysis_include_ml_aggregate_algo:
final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-pca.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos))
Expand All @@ -103,6 +105,7 @@ def make_final_input(wildcards):
final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-hac-horizontal.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos))
final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-hac-clusters-horizontal.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos))
final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-ensemble-pathway.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms))
#TODO: add algo similarity outputs per algo

if _config.config.analysis_include_evaluation:
final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-evaluation.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs,algorithm_params=algorithms_with_params))
Expand Down Expand Up @@ -317,6 +320,20 @@ rule ml_analysis:
ml.hac_vertical(summary_df, output.hac_image_vertical, output.hac_clusters_vertical, **hac_params)
ml.hac_horizontal(summary_df, output.hac_image_horizontal, output.hac_clusters_horizontal, **hac_params)

# Algorithm similarity evaluation - jaccard similarity
#TODO: will have to add visualiztion - start with jaccard eval though
rule algorithm_similarity_eval:
input:
pathways = expand('{out_dir}{sep}{{dataset}}-{algorithm_params}{sep}pathway.txt',
out_dir=out_dir, sep=SEP, algorithm_params=algorithms_with_params)
output:
algo_similarity_matrix = SEP.join([out_dir, '{dataset}-ml', 'jaccard-matrix.txt']),
algo_similarity_heatmap = SEP.join([out_dir, '{dataset}-ml', 'jaccard-heatmap.png'])
run:
summary_df = ml.summarize_networks(input.pathways)
jaccard = ml.jaccard_similarity_eval(summary_df, output.algo_similarity_matrix, output.algo_similarity_heatmap)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Don't need to store the output if it isn't used

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just to make sure I understand correctly - should I be changing line 33 & 34 to:
jaccard = ml.jaccard_similarity_eval(ml.summarize_networks(input.pathways), output.algo_similarity_matrix, output.algo_similarity_heatmap)



# Ensemble the output pathways for each dataset
rule ensemble:
input:
Expand Down
8 changes: 4 additions & 4 deletions config/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -47,13 +47,13 @@ container_registry:
algorithms:
- name: "pathlinker"
params:
include: true
include: false
run1:
k: range(100,201,100)

- name: "omicsintegrator1"
params:
include: true
include: false
run1:
b: [5, 6]
w: np.linspace(0,5,2)
Expand All @@ -80,7 +80,7 @@ algorithms:

- name: "mincostflow"
params:
include: true
include: false
run1:
flow: [1] # The flow must be an int
capacity: [1]
Expand All @@ -91,7 +91,7 @@ algorithms:

- name: "domino"
params:
include: true
include: false
run1:
slice_threshold: [0.3]
module_threshold: [0.05]
Expand Down
39 changes: 39 additions & 0 deletions spras/analysis/ml.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from sklearn.cluster import AgglomerativeClustering
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import jaccard_score

from spras.util import make_required_dirs

Expand Down Expand Up @@ -86,6 +87,7 @@ def summarize_networks(file_paths: Iterable[Union[str, PathLike]]) -> pd.DataFra
concated_df = concated_df.fillna(0)
concated_df = concated_df.astype('int64')

print(concated_df)
return concated_df


Expand Down Expand Up @@ -347,3 +349,40 @@ def ensemble_network(dataframe: pd.DataFrame, output_file: str):

make_required_dirs(output_file)
row_means[['Node1', 'Node2', 'Frequency', "Direction"]].to_csv(output_file, sep='\t', index=False, header=True)


def jaccard_similarity_eval(summary_df: pd.DataFrame, output_file: str, output_png: str):
"""
Calculates the pairwise Jaccard similarity matrix from the binary representation of `summary_df`.
Save the resulting similarity matrix as a tab-delimited file and generates and save a heatmap
visualization of the similarities.
@param summary_df: pandas dataframe with algorithm-parameter summary information
@param output_file: the filename to save the ensemble network
@param output_png: the file name to save the heatmap image
"""
# calculate the jaccard similarity between all combinations of algorithms & parameters
binarized_df = summary_df.applymap(lambda x: 1 if x !=0 else 0)
algorithms = binarized_df.columns
jaccard_matrix = pd.DataFrame(np.identity(len(algorithms)), index=algorithms, columns=algorithms)
for i, alg1 in enumerate(algorithms):
for _j, alg2 in enumerate(algorithms[i+1:], start=i+1):
sim_value = jaccard_score(binarized_df[alg1], binarized_df[alg2])
jaccard_matrix.loc[alg1, alg2] = sim_value
jaccard_matrix.loc[alg2, alg1] = sim_value
# save the jaccard matrix as a csv
jaccard_matrix.to_csv(output_file, sep='\t', index=True, header=True)
# make a heatmap from the jaccard matrix
fig, ax = plt.subplots(figsize=(8, 6))
cax = ax.imshow(jaccard_matrix.values, interpolation='nearest', cmap='viridis', vmin=0, vmax=1)
ax.set_title("Jaccard similarity heatmap")
# set tick labels with algorithm names
ax.set_xticks(np.arange(len(algorithms)))
ax.set_yticks(np.arange(len(algorithms)))
ax.set_xticklabels(algorithms)
ax.set_yticklabels(algorithms)
plt.colorbar(cax, ax=ax)
# annotate each cell with the corresponding similarity value
for i in range(len(algorithms)):
for j in range(len(algorithms)):
ax.text(j, i, f'{jaccard_matrix.values[i, j]:.2f}', ha='center', va='center', color='white')
plt.savefig(output_png, bbox_inches="tight", dpi=DPI)
Binary file added test/ml/expected/expected-jaccard-heatmap.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
2 changes: 2 additions & 0 deletions test/ml/expected/expected-jaccard-matrix.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
data
data 1.0
10 changes: 10 additions & 0 deletions test/ml/test_ml.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import filecmp
import shutil
from pathlib import Path

import pandas as pd
Expand Down Expand Up @@ -145,3 +146,12 @@ def test_ensemble_network_empty(self):
expected = pd.read_table(EXPECT_DIR + 'expected-ensemble-network-empty.tsv')

assert en.equals(expected)

# NOTE: binary comparison of generated binary files may be fragile across matplotlib versions
# if this test begins failing unexpectedly, consider loosening or disabling test
def test_jaccard_similarity_eval(self):
dataframe = ml.summarize_networks([INPUT_DIR + 'test-data-s1/s1.txt', INPUT_DIR + 'test-data-s2/s2.txt', INPUT_DIR + 'test-data-s3/s3.txt'])
ml.jaccard_similarity_eval(dataframe, OUT_DIR + 'jaccard-matrix.txt', OUT_DIR + 'jaccard-heatmap.png')

assert filecmp.cmp(OUT_DIR + 'jaccard-matrix.txt', EXPECT_DIR + 'expected-jaccard-matrix.txt', shallow=False)
assert filecmp.cmp(OUT_DIR + 'jaccard-heatmap.png', EXPECT_DIR + 'expected-jaccard-heatmap.png', shallow=False)
Loading