Reed-CompBio · Lyce24 · Feb 18, 2024 · Feb 20, 2024 · Feb 21, 2024 · annaritz
diff --git a/.github/workflows/test-spras.yml b/.github/workflows/test-spras.yml
@@ -146,6 +146,15 @@ jobs:
         tags: latest
         cache_froms: reedcompbio/domino:latest
         push: false
+    - name: Build RWR Docker image
+      uses: docker/build-push-action@v1
+      with:
+        path: docker-wrappers/RWR/.
+        dockerfile: docker-wrappers/RWR/Dockerfile
+        repository: reedcompbio/random-walk-with-restart
+        tags: latest
+        cache_froms: reedcompbio/random-walk-with-restart:latest
+        push: false
     - name: Build Cytoscape Docker image
       uses: docker/build-push-action@v1
       with:

diff --git a/config/config.yaml b/config/config.yaml
@@ -87,6 +87,10 @@ algorithms:
                   slice_threshold: [0.3]
                   module_threshold: [0.05]
 
+      - name: "rwr"
+        params:
+              include: true
+
 
 # Here we specify which pathways to run and other file location information.
 # DataLoader.py can currently only load a single dataset

diff --git a/config/egfr.yaml b/config/egfr.yaml
@@ -67,6 +67,21 @@ algorithms:
           - 0.3
         module_threshold:
           - 0.05
+
+  -
+    name: rwr
+    params:
+      include: true
+      run1:
+        single_source:
+          - 0
+        w:
+          - 0.20
+        df:
+          - 0.85
+        threshold:
+          - 0.0002
+
 datasets:
   -
     data_dir: input

diff --git a/docker-wrappers/RWR/Dockerfile b/docker-wrappers/RWR/Dockerfile
@@ -0,0 +1,7 @@
+# Base image providing Python 3.10.7
+FROM python:3.10.7
+
+# Later commands run in /RWR, so the script downloaded below is saved as /RWR/random_walk.py
+WORKDIR /RWR
+
+# Install pinned versions of the Python dependencies
+RUN pip install networkx==2.8 numpy==1.24.3 scipy==1.10.1
+
+# Download random_walk.py from a fixed commit of the Reed-CompBio/random-walk-with-restart repository
+RUN wget https://raw.githubusercontent.com/Reed-CompBio/random-walk-with-restart/8ca6969fb2fc744edd544535e2ebd67217b0606c/random_walk.py
diff --git a/docker-wrappers/RWR/README.md b/docker-wrappers/RWR/README.md
@@ -0,0 +1,34 @@
+# RWR Docker image
+
+A Docker image for the random-walk-with-restart algorithm that is available on [DockerHub](https://hub.docker.com/repository/docker/reedcompbio/random-walk-with-restart).
+
+To create the Docker image run:
+
+```
+docker build -t reedcompbio/random-walk-with-restart -f Dockerfile .
+```
+
+from this directory.
+
+To inspect the installed Python packages:
+
+```
+winpty docker run reedcompbio/random-walk-with-restart pip list
+```
+
+The `winpty` prefix is only needed on Windows.
+
+## Testing
+Test code is located in `test/RWR`.
+The `input` subdirectory contains test files `edges.txt` and `prizes.txt`.
+The Docker wrapper can be tested with `pytest` or a unit test with `pytest -k test_rwr.py`.
+
+Alternatively, to test the Docker image directly, run the following command from the root of the `spras` repository
+
+```
+docker run -w /data --mount type=bind,source=/${PWD},target=/data reedcompbio/random-walk-with-restart python /RWR/random_walk.py \
+  --edges_file /data/test/RWR/input/edges.txt --prizes_file /data/test/RWR/input/prizes.txt --damping_factor 0.85 --selection_function min --threshold 0.001 --w 0.0001 --output_file /data/test/RWR/output/output.txt
+```
+
+This will run RWR on the test input files and write the output to the `test/RWR/output` directory of the `spras` repository.
+Windows users may need to escape the absolute paths so that `/data` becomes `//data`, etc.
diff --git a/spras/runner.py b/spras/runner.py
@@ -7,6 +7,7 @@
 from spras.omicsintegrator1 import OmicsIntegrator1 as omicsintegrator1
 from spras.omicsintegrator2 import OmicsIntegrator2 as omicsintegrator2
 from spras.pathlinker import PathLinker as pathlinker
+from spras.rwr import RWR as rwr
 
 
 def run(algorithm, params):

diff --git a/spras/rwr.py b/spras/rwr.py
@@ -0,0 +1,200 @@
+import warnings
+from pathlib import Path
+
+import pandas as pd
+
+from spras.containers import prepare_volume, run_container
+from spras.interactome import (
+    convert_undirected_to_directed,
+    reinsert_direction_col_directed,
+)
+from spras.prm import PRM
+from spras.util import add_rank_column
+
+__all__ = ['RWR']
+
+"""
+RWR will construct a directed graph from the provided input file
+- an edge is represented with a head and tail node, which represents the direction of the interaction between two nodes
+- uses networkx Digraph() object
+
+Expected raw input format:
+Node1	Node2	Edge Flux	Weight	InNetwork	Type
+- the expected raw input file should have node pairs in the 1st and 2nd columns, with an edge flux in the 3rd column, a weight in the 4th column, and a boolean in the 5th column to indicate if the edge/node is in the network
+- the 'type' column should be 1 for edges, 2 for nodes, and 3 for pathways as we want to keep information about nodes, edges, and pathways.
+- it can include repeated and bidirectional edges
+
+Expected raw input format for prizes:
+NODEID  prizes  Node type
+- the expected raw input file should have the node ID in the 1st column, the node's prize in the 2nd column, and the node type (source or target) in the 3rd column
+- it can include repeated and bidirectional edges
+- if there are no prizes, the algorithm will assume that all nodes have a prize of 1.0
+"""
+
+class RWR(PRM):
+    # we need edges (weighted), source set (with prizes), and target set (with prizes).
+    required_inputs = ['edges', 'prizes']
+
+    @staticmethod
+    def generate_inputs(data, filename_map):
+        """
+        Access fields from the dataset and write the required input files
+        @param data: dataset
+        @param filename_map: a dict mapping file types in the required_inputs to the filename for that type
+        """
+
+        # ensures the required input are within the filename_map
+        for input_type in RWR.required_inputs:
+            if input_type not in filename_map:
+                raise ValueError(f"{input_type} filename is missing")
+
+        sources_targets = data.request_node_columns(["sources", "targets"])
+        if sources_targets is None:
+            if data.contains_node_columns('prize'):
+                sources_targets = data.request_node_columns(['prize'])
+                input_df = sources_targets[["NODEID"]].copy()
+                input_df["Node type"] = "source"
+            else:
+                raise ValueError("No sources, targets, or prizes found in dataset")
+        else:
+            both_series = sources_targets.sources & sources_targets.targets
+            for _index,row in sources_targets[both_series].iterrows():
+                warn_msg = row.NODEID+" has been labeled as both a source and a target."
+                # Only use stacklevel 1 because this is due to the data not the code context
+                warnings.warn(warn_msg, stacklevel=1)
+
+            #Create nodetype file
+            input_df = sources_targets[["NODEID"]].copy()
+            input_df.loc[sources_targets["sources"] == True,"Node type"]="source"
+            input_df.loc[sources_targets["targets"] == True,"Node type"]="target"
+
+            if data.contains_node_columns('prize'):
+                node_df = data.request_node_columns(['prize'])
+                input_df = pd.merge(input_df, node_df, on='NODEID')
+            else:
+                #If there aren't prizes but are sources and targets, make prizes based on them
+                input_df['prize'] = 1.0
+
+        input_df.to_csv(filename_map["prizes"],sep="\t",index=False,columns=["NODEID", "prize", "Node type"])
+
+        # create the network of edges
+        edges = data.get_interactome()
+
+        edges = convert_undirected_to_directed(edges)
+
+        # creates the edges files that contains the head and tail nodes and the weights after them
+        edges.to_csv(filename_map['edges'], sep="\t", index=False, columns=["Interactor1","Interactor2","Weight"])
+
+
+    # Skips parameter validation step
+    @staticmethod
+    def run(edges=None, prizes = None, output_file = None, single_source = None, df = None, w = None, f = None, threshold = None, container_framework="docker"):
+        """
+        Run RandomWalk with Docker
+        @param edges:  input network file (required)
+        @param prizes:  input node prizes with sources and targets (required)
+        @param output_file: path to the output pathway file (required)
+        @param df: damping factor for restarting (default 0.85) (optional)
+        @param single_source: 1 for single source, 0 for source-target (default 1) (optional)
+        @param w: lower bound to filter the edges based on the edge confidence (default 0.00) (optional)
+        @param f: selection function (default 'min') (optional)
+        @param threshold: threshold for constructing the final pathway (default 0.0001) (optional)
+        @param container_framework: choose the container runtime framework, currently supports "docker" or "singularity" (optional)
+        """
+
+        if not edges or not prizes or not output_file:
+            raise ValueError('Required RWR arguments are missing')
+
+        work_dir = '/spras'
+
+        # Each volume is a tuple (src, dest) - data generated by Docker
+        volumes = list()
+
+        bind_path, edges_file = prepare_volume(edges, work_dir)
+        volumes.append(bind_path)
+
+        bind_path, prizes_file = prepare_volume(prizes, work_dir)
+        volumes.append(bind_path)
+
+
+        out_dir = Path(output_file).parent
+
+        # RWR requires that the output directory exist
+        out_dir.mkdir(parents=True, exist_ok=True)
+        bind_path, mapped_out_dir = prepare_volume(str(out_dir), work_dir)
+        volumes.append(bind_path)
+        mapped_out_prefix= mapped_out_dir + '/out'  # Use posix path inside the container
+
+
+        command = ['python',
+                   '/RWR/random_walk.py',
+                   '--edges_file', edges_file,
+                   '--prizes_file', prizes_file,
+                   '--output_file', mapped_out_prefix]
+
+        if single_source is not None:
+            command.extend(['--single_source', str(single_source)])
+        if df is not None:
+            command.extend(['--damping_factor', str(df)])
+        if f is not None:
+            command.extend(['--selection_function', str(f)])
+        if w is not None:
+            command.extend(['--w', str(w)])
+        if threshold is not None:
+            command.extend(['--threshold', str(threshold)])
+
+        print('Running RWR with arguments: {}'.format(' '.join(command)), flush=True)
+
+        container_suffix = "random-walk-with-restart"
+        out = run_container(container_framework,
+                            container_suffix,
+                            command,
+                            volumes,
+                            work_dir)
+        print(out)
+
+        output = Path(out_dir, 'out')
+        output.rename(output_file)
+
+
+    @staticmethod
+    def parse_output(raw_pathway_file, standardized_pathway_file):
+        """
+        Convert a predicted pathway into the universal format
+        @param raw_pathway_file: pathway file produced by an algorithm's run function
+        @param standardized_pathway_file: the same pathway written in the universal format
+        """
+
+        df = pd.read_csv(raw_pathway_file, sep="\t")
+
+        # add a rank column to the dataframe
+        df = add_rank_column(df)
+
+        pathway_output_file = standardized_pathway_file
+        edge_output_file = standardized_pathway_file.replace('.txt', '') + '_edges.txt'
+        node_output_file = standardized_pathway_file.replace('.txt', '') + '_nodes.txt'
+
+        # get all rows where type is 1
+        df_edge = df.loc[df["Type"] == 1]
+
+        # get rid of the placeholder column and output it to a file
+        df_edge = df_edge.drop(columns=['Type'])
+        df_edge = df_edge.drop(columns=['Rank'])
+        df_edge.to_csv(edge_output_file, sep="\t", index=False, header=True)
+
+        # locate the first place where placeholder is not Nan
+        df_node = df.loc[df['Type'] == 2]
+        # rename the header to Node, Pr, R_Pr, Final_Pr
+        df_node = df_node.drop(columns=['Type'])
+        df_node = df_node.drop(columns=['Rank'])
+        df_node = df_node.rename(columns={'Node1': 'Node', 'Node2': 'Pr', 'Edge Flux': 'R_Pr', 'Weight': 'Final_Pr', 'InNetwork' : 'InNetwork'})
+        df_node.to_csv(node_output_file, sep="\t", index=False, header=True)
+
+        df_pathway = df.loc[df['Type'] == 3]
+        df_pathway = df_pathway.drop(columns=['InNetwork'])
+        df_pathway = df_pathway.drop(columns=['Type'])
+        df_pathway = df_pathway.drop(columns=['Weight'])
+        df_pathway = df_pathway.drop(columns=['Edge Flux'])
+
+        df_pathway = reinsert_direction_col_directed(df_pathway)
+        df_pathway.to_csv(pathway_output_file, sep="\t", index=False, header=False)
diff --git a/test/RWR/input/edges.txt b/test/RWR/input/edges.txt
@@ -0,0 +1,7 @@
+Node1	Node2	Weight
+A	D	5
+B	D	1.3
+C	D	0.4
+D	E	4.5
+D	F	2
+D	G	3.2
diff --git a/test/RWR/input/prizes.txt b/test/RWR/input/prizes.txt
@@ -0,0 +1,7 @@
+NODEID	prizes	Node type
+A	1	source
+B	1	source
+C	1	source
+E	1	target
+F	1	target
+G	1	target
diff --git a/test/RWR/test_rwr.py b/test/RWR/test_rwr.py
@@ -0,0 +1,69 @@
+import shutil
+from pathlib import Path
+
+import pytest
+
+import spras.config as config
+from spras.rwr import RWR
+
+# Initialize the SPRAS configuration when the test module is imported, before any test calls RWR.run
+# NOTE(review): presumably required by the container helpers used in RWR.run - confirm against spras.config
+config.init_from_file("config/config.yaml")
+
+# Relative paths, so the tests appear to assume they are run from the repository root
+TEST_DIR = 'test/RWR/'
+# Output path for the tests that pass only the required arguments (Docker and Singularity)
+OUT_FILE_DEFAULT = TEST_DIR+'output/rwr-edges.txt'
+# Output path for the tests that also pass optional arguments
+OUT_FILE_OPTIONAL = TEST_DIR+'output/rwr-edges-optional.txt'
+
+
+class TestRWR:
+    """
+    Run RWR tests in the Docker image
+    """
+    def test_rwr(self):
+        out_path = Path(OUT_FILE_DEFAULT)
+        out_path.unlink(missing_ok=True)
+        # Only include required arguments
+        RWR.run(
+            edges=TEST_DIR+'input/edges.txt',
+            prizes=TEST_DIR+'input/prizes.txt',
+            output_file=OUT_FILE_DEFAULT
+        )
+        assert out_path.exists()
+
+    def test_rwr_optional(self):
+        out_path = Path(OUT_FILE_OPTIONAL)
+        out_path.unlink(missing_ok=True)
+        # Include optional argument - single_source, df, w, f, threshold,
+        RWR.run(
+            edges=TEST_DIR+'input/edges.txt',
+            prizes=TEST_DIR+'input/prizes.txt',
+            output_file=OUT_FILE_OPTIONAL,
+            single_source=1,
+            df=0.85,
+            w=0.00,
+            f='min',
+            threshold=0.0001
+        )
+        assert out_path.exists()
+
+    def test_rwr_missing(self):
+        # Test the expected error is raised when required arguments are missing
+        with pytest.raises(ValueError):
+            # No nodetypes
+            RWR.run(
+                edges=TEST_DIR + 'input/edges.txt',
+                output_file=OUT_FILE_OPTIONAL,
+                single_source=1,
+                df=0.85)
+
+    # Only run Singularity test if the binary is available on the system
+    # spython is only available on Unix, but do not explicitly skip non-Unix platforms
+    @pytest.mark.skipif(not shutil.which('singularity'), reason='Singularity not found on system')
+    def test_rwr_singularity(self):
+        out_path = Path(OUT_FILE_DEFAULT)
+        out_path.unlink(missing_ok=True)
+        # Only include required arguments and run with Singularity
+        RWR.run(
+            edges=TEST_DIR+'input/edges.txt',
+            prizes=TEST_DIR+'input/prizes.txt',
+            output_file=OUT_FILE_DEFAULT,
+            container_framework="singularity")
+        assert out_path.exists()