diff --git a/Snakefile b/Snakefile index 1825b6d1..eae887dd 100644 --- a/Snakefile +++ b/Snakefile @@ -93,6 +93,8 @@ def make_final_input(wildcards): final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}hac-horizontal.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params)) final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}hac-clusters-horizontal.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params)) final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}ensemble-pathway.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params)) + final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}jaccard-matrix.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params)) + final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}jaccard-heatmap.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params)) if _config.config.analysis_include_ml_aggregate_algo: final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-pca.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos)) @@ -103,6 +105,7 @@ def make_final_input(wildcards): final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-hac-horizontal.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos)) final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-hac-clusters-horizontal.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos)) final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-ensemble-pathway.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms)) + #TODO: add algo similarity outputs per algo if _config.config.analysis_include_evaluation: 
final_input.extend(expand('{out_dir}{sep}{dataset_gold_standard_pair}-evaluation.txt',out_dir=out_dir,sep=SEP,dataset_gold_standard_pair=dataset_gold_standard_pairs,algorithm_params=algorithms_with_params)) @@ -317,6 +320,20 @@ rule ml_analysis: ml.hac_vertical(summary_df, output.hac_image_vertical, output.hac_clusters_vertical, **hac_params) ml.hac_horizontal(summary_df, output.hac_image_horizontal, output.hac_clusters_horizontal, **hac_params) +# Algorithm similarity evaluation - jaccard similarity +#TODO: will have to add visualization - start with jaccard eval though +rule algorithm_similarity_eval: + input: + pathways = expand('{out_dir}{sep}{{dataset}}-{algorithm_params}{sep}pathway.txt', + out_dir=out_dir, sep=SEP, algorithm_params=algorithms_with_params) + output: + algo_similarity_matrix = SEP.join([out_dir, '{dataset}-ml', 'jaccard-matrix.txt']), + algo_similarity_heatmap = SEP.join([out_dir, '{dataset}-ml', 'jaccard-heatmap.png']) + run: + summary_df = ml.summarize_networks(input.pathways) + jaccard = ml.jaccard_similarity_eval(summary_df, output.algo_similarity_matrix, output.algo_similarity_heatmap) + + # Ensemble the output pathways for each dataset rule ensemble: input: diff --git a/config/config.yaml b/config/config.yaml index 3fa8e542..fd8a5a06 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -47,13 +47,13 @@ container_registry: algorithms: - name: "pathlinker" params: - include: true + include: false run1: k: range(100,201,100) - name: "omicsintegrator1" params: - include: true + include: false run1: b: [5, 6] w: np.linspace(0,5,2) @@ -80,7 +80,7 @@ algorithms: - name: "mincostflow" params: - include: true + include: false run1: flow: [1] # The flow must be an int capacity: [1] @@ -91,7 +91,7 @@ algorithms: - name: "domino" params: - include: true + include: false run1: slice_threshold: [0.3] module_threshold: [0.05] diff --git a/spras/analysis/ml.py b/spras/analysis/ml.py index 3dad8775..19dde88e 100644 --- a/spras/analysis/ml.py +++ 
b/spras/analysis/ml.py @@ -11,6 +11,7 @@ from sklearn.cluster import AgglomerativeClustering from sklearn.decomposition import PCA from sklearn.preprocessing import StandardScaler +from sklearn.metrics import jaccard_score from spras.util import make_required_dirs @@ -86,6 +87,7 @@ def summarize_networks(file_paths: Iterable[Union[str, PathLike]]) -> pd.DataFra concated_df = concated_df.fillna(0) concated_df = concated_df.astype('int64') + print(concated_df) return concated_df @@ -347,3 +349,40 @@ def ensemble_network(dataframe: pd.DataFrame, output_file: str): make_required_dirs(output_file) row_means[['Node1', 'Node2', 'Frequency', "Direction"]].to_csv(output_file, sep='\t', index=False, header=True) + + +def jaccard_similarity_eval(summary_df: pd.DataFrame, output_file: str, output_png: str): + """ + Calculates the pairwise Jaccard similarity matrix from the binary representation of `summary_df`. + Saves the resulting similarity matrix as a tab-delimited file and generates and saves a heatmap + visualization of the similarities. 
+ @param summary_df: pandas dataframe with algorithm-parameter summary information + @param output_file: the filename to save the Jaccard similarity matrix + @param output_png: the file name to save the heatmap image + """ + # calculate the jaccard similarity between all combinations of algorithms & parameters + binarized_df = summary_df.applymap(lambda x: 1 if x !=0 else 0) + algorithms = binarized_df.columns + jaccard_matrix = pd.DataFrame(np.identity(len(algorithms)), index=algorithms, columns=algorithms) + for i, alg1 in enumerate(algorithms): + for _j, alg2 in enumerate(algorithms[i+1:], start=i+1): + sim_value = jaccard_score(binarized_df[alg1], binarized_df[alg2]) + jaccard_matrix.loc[alg1, alg2] = sim_value + jaccard_matrix.loc[alg2, alg1] = sim_value + # save the jaccard matrix as a tab-delimited file + jaccard_matrix.to_csv(output_file, sep='\t', index=True, header=True) + # make a heatmap from the jaccard matrix + fig, ax = plt.subplots(figsize=(8, 6)) + cax = ax.imshow(jaccard_matrix.values, interpolation='nearest', cmap='viridis', vmin=0, vmax=1) + ax.set_title("Jaccard similarity heatmap") + # set tick labels with algorithm names + ax.set_xticks(np.arange(len(algorithms))) + ax.set_yticks(np.arange(len(algorithms))) + ax.set_xticklabels(algorithms) + ax.set_yticklabels(algorithms) + plt.colorbar(cax, ax=ax) + # annotate each cell with the corresponding similarity value + for i in range(len(algorithms)): + for j in range(len(algorithms)): + ax.text(j, i, f'{jaccard_matrix.values[i, j]:.2f}', ha='center', va='center', color='white') + plt.savefig(output_png, bbox_inches="tight", dpi=DPI) diff --git a/test/ml/expected/expected-jaccard-heatmap.png b/test/ml/expected/expected-jaccard-heatmap.png new file mode 100644 index 00000000..f3df84b7 Binary files /dev/null and b/test/ml/expected/expected-jaccard-heatmap.png differ diff --git a/test/ml/expected/expected-jaccard-matrix.txt b/test/ml/expected/expected-jaccard-matrix.txt new file mode 100644 index 00000000..139b162f --- 
/dev/null +++ b/test/ml/expected/expected-jaccard-matrix.txt @@ -0,0 +1,2 @@ + data +data 1.0 diff --git a/test/ml/test_ml.py b/test/ml/test_ml.py index 1dbb0399..b1483860 100644 --- a/test/ml/test_ml.py +++ b/test/ml/test_ml.py @@ -1,4 +1,5 @@ import filecmp +import shutil from pathlib import Path import pandas as pd @@ -145,3 +146,12 @@ def test_ensemble_network_empty(self): expected = pd.read_table(EXPECT_DIR + 'expected-ensemble-network-empty.tsv') assert en.equals(expected) + + # NOTE: binary comparison of generated binary files may be fragile across matplotlib versions + # if this test begins failing unexpectedly, consider loosening or disabling test + def test_jaccard_similarity_eval(self): + dataframe = ml.summarize_networks([INPUT_DIR + 'test-data-s1/s1.txt', INPUT_DIR + 'test-data-s2/s2.txt', INPUT_DIR + 'test-data-s3/s3.txt']) + ml.jaccard_similarity_eval(dataframe, OUT_DIR + 'jaccard-matrix.txt', OUT_DIR + 'jaccard-heatmap.png') + + assert filecmp.cmp(OUT_DIR + 'jaccard-matrix.txt', EXPECT_DIR + 'expected-jaccard-matrix.txt', shallow=False) + assert filecmp.cmp(OUT_DIR + 'jaccard-heatmap.png', EXPECT_DIR + 'expected-jaccard-heatmap.png', shallow=False)