Reed-CompBio · cgsze · Nov 11, 2024 · Nov 11, 2024 · Nov 11, 2024 · Dec 2, 2024
diff --git a/Snakefile b/Snakefile
@@ -296,7 +296,7 @@ rule summary_table:
     run:
         # Load the node table from the pickled dataset file
         node_table = Dataset.from_file(input.dataset_file).node_table
-        summary_df = summary.summarize_networks(input.pathways, node_table)
+        summary_df = summary.summarize_networks(input.pathways, node_table, algorithm_params, algorithms_with_params)
         summary_df.to_csv(output.summary_table, sep='\t', index=False)
 
 # Cluster the output pathways for each dataset

diff --git a/spras/analysis/summary.py b/spras/analysis/summary.py
@@ -1,3 +1,4 @@
+import json
 import os
 import sys
 from pathlib import Path
@@ -7,7 +8,7 @@
 import pandas as pd
 
 
-def summarize_networks(file_paths: Iterable[Path], node_table: pd.DataFrame) -> pd.DataFrame:
+def summarize_networks(file_paths: Iterable[Path], node_table: pd.DataFrame, algo_params, algo_with_params) -> pd.DataFrame:
     """
     Generate a table that aggregates summary information about networks in file_paths,
     including which nodes are present in node_table columns.
@@ -31,41 +32,64 @@ def summarize_networks(file_paths: Iterable[Path], node_table: pd.DataFrame) ->
     # Initialize list to store network summary data
     nw_info = []
 
+    index = 0
+    col_names = []
+
+    root_dir = Path(__file__).resolve().parents[2]
+
     # Iterate through each network file path
     for file_path in sorted(file_paths):
-
+        file_path = Path(file_path)
+        file_path = (root_dir / file_path).resolve()
+        rel_path = file_path.relative_to(root_dir)
         with open(file_path, 'r') as f:
             lines = f.readlines()[1:]  # skip the header line
 
+        # Directed or mixed graphs are parsed and summarized as undirected graphs
         nw = nx.read_edgelist(lines, data=(('weight', float), ('Direction', str)))
 
         # Save the network name, number of nodes, number edges, and number of connected components
-        nw_name = str(file_path)
+        nw_name = str(rel_path).replace("\\\\", "\\") #replace \\ in filepath with \
         number_nodes = nw.number_of_nodes()
         number_edges = nw.number_of_edges()
         ncc = nx.number_connected_components(nw)
+
         # Initialize list to store current network information
         cur_nw_info = [nw_name, number_nodes, number_edges, ncc]
+
         # Iterate through each node property and save the intersection with the current network
         for node_list in nodes_by_col:
             num_nodes = len(set(nw).intersection(node_list))
             cur_nw_info.append(num_nodes)
+
+        # String split name to access algorithm and hashcode from filepath
+        # Name of filepath follows format "output/.../data#-algo-params-hashcode/pathway.txt"
+        # algorithm parameters have format { algo : { hashcode : { parameter combos } } }
+        filename = sorted(algo_with_params)[index].split("-")
+        algo = filename[0]
+        hashcode = filename[2]
+        index = index + 1
+
+        param_combo = algo_params[algo][hashcode]
+        params = json.dumps(param_combo)
+        params = params.replace("\"", "") #removes extra double quotes from string
+        cur_nw_info.append(params)
+
         # Save the current network information to the network summary list
         nw_info.append(cur_nw_info)
 
+    # Prepare column names
+    col_names = ["Name", "Number of nodes", "Number of edges", "Number of connected components"]
+    col_names.extend(nodes_by_col_labs)
+    col_names.append("Parameter combination")
+
     # Convert the network summary data to pandas dataframe
     # Could refactor to create the dataframe line by line instead of storing data as lists and then converting
     nw_info = pd.DataFrame(
         nw_info,
-        columns=[
-                    "Name",
-                    "Number of nodes",
-                    "Number of undirected edges",
-                    "Number of connected components"
-                ]
-                +
-                nodes_by_col_labs
+        columns=col_names
     )
+    # print(nw_info) #debug
     return nw_info
 
 

diff --git a/test/analysis/expected_output/expected_node_table.txt b/test/analysis/expected_output/expected_node_table.txt
@@ -0,0 +1,4 @@
+NODEID	prize	active	dummy	sources	targets
+0	C	5.7	True	NaN	NaN	True
+1	A	2.0	True	NaN	True	NaN
+2	B	NaN	NaN	NaN	NaN	NaN
diff --git a/test/analysis/expected_output/test_egfr_summary.txt b/test/analysis/expected_output/test_egfr_summary.txt
@@ -0,0 +1,10 @@
+Name	Number of nodes	Number of edges	Number of connected components	Nodes in prize	Nodes in sources	Nodes in targets	Nodes in active	Nodes in dummy	Parameter combination
+test\analysis\input\egfr\tps-egfr-domino-params-V3X4RW7_pathway.txt	48	45	3	27	0	27	27	0	{slice_threshold: 0.3, module_threshold: 0.05}
+test\analysis\input\egfr\tps-egfr-meo-params-GKEDDFZ_pathway.txt	1877	12845	1	621	1	620	621	0	{local_search: Yes, max_path_length: 3, rand_restarts: 10}
+test\analysis\input\egfr\tps-egfr-omicsintegrator1-params-3THRXWW_pathway.txt	28	20	8	28	1	27	28	0	{b: 2, d: 10, g: 1e-3, r: 0.01, w: 0.1, mu: 0.008, dummy_mode: file}
+test\analysis\input\egfr\tps-egfr-omicsintegrator1-params-5QH767V_pathway.txt	39	31	8	39	1	38	39	0	{b: 10, d: 10, g: 1e-3, r: 0.01, w: 0.1, mu: 0.008, dummy_mode: file}
+test\analysis\input\egfr\tps-egfr-omicsintegrator1-params-ITO5EQS_pathway.txt	14	9	5	14	0	14	14	0	{b: 0.55, d: 10, g: 1e-3, r: 0.01, w: 0.1, mu: 0.008, dummy_mode: file}
+test\analysis\input\egfr\tps-egfr-omicsintegrator2-params-EHHWPMD_pathway.txt	593	591	2	531	1	530	531	0	{b: 2, g: 3}
+test\analysis\input\egfr\tps-egfr-omicsintegrator2-params-IV3IPCJ_pathway.txt	704	702	2	616	1	615	616	0	{b: 4, g: 0}
+test\analysis\input\egfr\tps-egfr-pathlinker-params-7S4SLU6_pathway.txt	14	17	1	6	1	5	6	0	{k: 10}
+test\analysis\input\egfr\tps-egfr-pathlinker-params-TCEMRS7_pathway.txt	25	32	1	11	1	10	11	0	{k: 20}
diff --git a/test/analysis/expected_output/test_example_summary.txt b/test/analysis/expected_output/test_example_summary.txt
@@ -0,0 +1,13 @@
+Name	Number of nodes	Number of edges	Number of connected components	Nodes in prize	Nodes in active	Nodes in dummy	Nodes in sources	Nodes in targets	Parameter combination
+test\analysis\input\example\data0-allpairs-params-BEH6YB2_pathway.txt	3	2	1	2	2	0	1	1	{spras_placeholder: no parameters}
+test\analysis\input\example\data0-domino-params-V3X4RW7_pathway.txt	0	0	0	0	0	0	0	0	{slice_threshold: 0.3, module_threshold: 0.05}
+test\analysis\input\example\data0-meo-params-GKEDDFZ_pathway.txt	3	2	1	2	2	0	1	1	{max_path_length: 3, local_search: Yes, rand_restarts: 10}
+test\analysis\input\example\data0-mincostflow-params-SZPZVU6_pathway.txt	3	2	1	2	2	0	1	1	{flow: 1, capacity: 1}
+test\analysis\input\example\data0-omicsintegrator1-params-E3LSEZQ_pathway.txt	3	2	1	2	2	0	1	1	{b: 6, w: 5.0, d: 10, dummy_mode: file}
+test\analysis\input\example\data0-omicsintegrator1-params-NFIPHUX_pathway.txt	0	0	0	0	0	0	0	0	{b: 6, w: 0.0, d: 10, dummy_mode: file}
+test\analysis\input\example\data0-omicsintegrator1-params-SU2S63Y_pathway.txt	3	2	1	2	2	0	1	1	{b: 5, w: 0.0, d: 10, dummy_mode: file}
+test\analysis\input\example\data0-omicsintegrator1-params-V26JBGX_pathway.txt	0	0	0	0	0	0	0	0	{b: 5, w: 5.0, d: 10, dummy_mode: file}
+test\analysis\input\example\data0-omicsintegrator2-params-EHHWPMD_pathway.txt	0	0	0	0	0	0	0	0	{b: 2, g: 3}
+test\analysis\input\example\data0-omicsintegrator2-params-IV3IPCJ_pathway.txt	3	2	1	2	2	0	1	1	{b: 4, g: 0}
+test\analysis\input\example\data0-pathlinker-params-6SWY7JS_pathway.txt	3	2	1	2	2	0	1	1	{k: 200}
+test\analysis\input\example\data0-pathlinker-params-VQL7BDZ_pathway.txt	3	2	1	2	2	0	1	1	{k: 100}
diff --git a/test/analysis/input/config.yaml b/test/analysis/input/config.yaml
@@ -0,0 +1,177 @@
+# Global workflow control
+
+# The length of the hash used to identify a parameter combination
+hash_length: 7
+
+# Specify the container framework. Currently supported frameworks include 'docker' and
+# 'singularity'. If container_framework is not specified, SPRAS will default to docker.
+container_framework: docker
+
+# Only used if container_framework is set to singularity, this will unpack the singularity containers
+# to the local filesystem. This is useful when PRM containers need to run inside another container,
+# such as would be the case in an HTCondor/OSPool environment.
+# NOTE: This unpacks singularity containers to the local filesystem, which will take up space in a way
+# that persists after the workflow is complete. To clean up the unpacked containers, the user must
+# manually delete them.
+unpack_singularity: false
+
+# Allow the user to configure which container registry containers should be pulled from
+# Note that this assumes container names are consistent across registries, and that the
+# registry being passed doesn't require authentication for pull actions
+container_registry:
+   base_url: docker.io
+   # The owner or project of the registry
+   # For example, "reedcompbio" if the image is available as docker.io/reedcompbio/allpairs
+   owner: reedcompbio
+
+# This list of algorithms should be generated by a script which checks the filesystem for installs.
+# It shouldn't be changed by mere mortals. (alternatively, we could add a path to executable for each algorithm
+# in the list to reduce the number of assumptions of the program at the cost of making the config a little more involved)
+# Each algorithm has an 'include' parameter. By toggling 'include' to true/false the user can change
+# which algorithms are run in a given experiment.
+#
+# Algorithm-specific parameters are embedded in lists so that users can specify multiple values. If multiple
+# values are specified then the algorithm will be run as many times as needed to cover all parameter
+# combinations. For instance, if we have the following:
+# - name: "myAlg"
+#   params:
+#         include: true
+#         a: [1,2]
+#         b: [0.5,0.75]
+#
+# then myAlg will be run on (a=1,b=0.5),(a=1,b=0.75),(a=2,b=0.5), and (a=2,b=0.75). Pretty neat, but be
+# careful: too many parameters might make your runs take a long time.
+
+algorithms:
+      - name: "pathlinker"
+        params:
+              include: true
+              run1:
+                  k: range(100,201,100)
+
+      - name: "omicsintegrator1"
+        params:
+              include: true
+              run1:
+                  b: [5, 6]
+                  w: np.linspace(0,5,2)
+                  d: [10]
+                  dummy_mode: ["file"]
+
+      - name: "omicsintegrator2"
+        params:
+              include: true
+              run1:
+                  b: [4]
+                  g: [0]
+              run2:
+                  b: [2]
+                  g: [3]
+
+      - name: "meo"
+        params:
+              include: true
+              run1:
+                  max_path_length: [3]
+                  local_search: ["Yes"]
+                  rand_restarts: [10]
+
+      - name: "mincostflow"
+        params:
+              include: true
+              run1:
+                  flow: [1] # The flow must be an int
+                  capacity: [1]
+
+      - name: "allpairs"
+        params:
+              include: true
+
+      - name: "domino"
+        params:
+              include: true
+              run1:
+                  slice_threshold: [0.3]
+                  module_threshold: [0.05]
+
+
+# Here we specify which pathways to run and other file location information.
+# DataLoader.py can currently only load a single dataset
+# Assume that if a dataset label does not change, the lists of associated input files do not change
+datasets:
+    -
+      # Labels can only contain letters, numbers, or underscores
+      label: data0
+      node_files: ["node-prizes.txt", "sources.txt", "targets.txt"]
+      # DataLoader.py can currently only load a single edge file, which is the primary network
+      edge_files: ["network.txt"]
+      # Placeholder
+      other_files: []
+      # Relative path from the spras directory
+      data_dir: "input"
+    #-
+    #label: data1
+      # Reuse some of the same sources file as 'data0' but different network and targets
+      # node_files: ["node-prizes.txt", "sources.txt", "alternative-targets.txt"]
+      # edge_files: ["alternative-network.txt"]
+      # other_files: []
+      # Relative path from the spras directory
+      # data_dir: "input"
+
+gold_standards:
+    -
+      # Labels can only contain letters, numbers, or underscores
+      label: gs0
+      node_files: ["gs_nodes0.txt"]
+      # edge_files: [] TODO: later iteration
+      data_dir: "input"
+      # List of dataset labels to compare with the specific gold standard dataset
+      dataset_labels: ["data0"]
+    -
+    #label: gs1
+    # node_files: ["gs_nodes1.txt"]
+    # data_dir: "input"
+    # dataset_labels: ["data1", "data0"]
+
+# If we want to reconstruct then we should set run to true.
+# TODO: if include is true above but run is false here, algs are not run.
+# is this the behavior we want?
+reconstruction_settings:
+
+        #set where everything is saved
+        locations:
+
+              #place the save path here
+              # TODO move to global
+              reconstruction_dir: "output"
+
+        run: true
+
+analysis:
+      # Create one summary per pathway file and a single summary table for all pathways for each dataset
+      summary:
+        include: true
+      # Create output files for each pathway that can be visualized with GraphSpace
+      graphspace:
+        include: false
+      # Create Cytoscape session file with all pathway graphs for each dataset
+      cytoscape:
+        include: true
+      # Machine learning analysis (e.g. clustering) of the pathway output files for each dataset
+      ml:
+        # ml analysis per dataset
+        include: false
+        # adds ml analysis per algorithm output
+        # only runs for algorithms with multiple parameter combinations chosen
+        aggregate_per_algorithm: true
+        # specify how many principal components to calculate
+        components: 2
+        # boolean to show the labels on the pca graph
+        labels: true
+        # 'ward', 'complete', 'average', 'single'
+        # if linkage: ward, must use metric: euclidean
+        linkage: 'ward'
+        # 'euclidean', 'manhattan', 'cosine'
+        metric: 'euclidean'
+      evaluation:
+        include: false