diff --git a/.github/workflows/test-spras.yml b/.github/workflows/test-spras.yml index 3dc2ab85..5ef2ce1b 100644 --- a/.github/workflows/test-spras.yml +++ b/.github/workflows/test-spras.yml @@ -146,6 +146,15 @@ jobs: tags: latest cache_froms: reedcompbio/domino:latest push: false + - name: Build RWR Docker image + uses: docker/build-push-action@v1 + with: + path: docker-wrappers/RWR/. + dockerfile: docker-wrappers/RWR/Dockerfile + repository: reedcompbio/random-walk-with-restart + tags: latest + cache_froms: reedcompbio/random-walk-with-restart:latest + push: false - name: Build Cytoscape Docker image uses: docker/build-push-action@v1 with: diff --git a/config/config.yaml b/config/config.yaml index b85c599b..5b1973c5 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -87,6 +87,10 @@ algorithms: slice_threshold: [0.3] module_threshold: [0.05] + - name: "rwr" + params: + include: true + # Here we specify which pathways to run and other file location information. # DataLoader.py can currently only load a single dataset diff --git a/config/egfr.yaml b/config/egfr.yaml index 71282905..2478e451 100644 --- a/config/egfr.yaml +++ b/config/egfr.yaml @@ -67,6 +67,21 @@ algorithms: - 0.3 module_threshold: - 0.05 + + - + name: rwr + params: + include: true + run1: + single_source: + - 0 + w: + - 0.20 + df: + - 0.85 + threshold: + - 0.0002 + datasets: - data_dir: input diff --git a/docker-wrappers/RWR/Dockerfile b/docker-wrappers/RWR/Dockerfile new file mode 100644 index 00000000..c67ae2e3 --- /dev/null +++ b/docker-wrappers/RWR/Dockerfile @@ -0,0 +1,7 @@ +FROM python:3.10.7 + +WORKDIR /RWR + +RUN pip install networkx==2.8 numpy==1.24.3 scipy==1.10.1 + +RUN wget https://raw.githubusercontent.com/Reed-CompBio/random-walk-with-restart/8ca6969fb2fc744edd544535e2ebd67217b0606c/random_walk.py \ No newline at end of file diff --git a/docker-wrappers/RWR/README.md b/docker-wrappers/RWR/README.md new file mode 100644 index 00000000..09ba6b42 --- /dev/null +++ 
b/docker-wrappers/RWR/README.md @@ -0,0 +1,34 @@ +# RWR Docker image + +A Docker image for the random-walk-with-restart (RWR) algorithm that is available on [DockerHub](https://hub.docker.com/repository/docker/reedcompbio/random-walk-with-restart). + +To create the Docker image run: + +``` +docker build -t reedcompbio/random-walk-with-restart -f Dockerfile . +``` + +from this directory. + +To inspect the installed Python packages: + +``` +winpty docker run reedcompbio/random-walk-with-restart pip list +``` + +The `winpty` prefix is only needed on Windows. + +## Testing +Test code is located in `test/RWR`. +The `input` subdirectory contains the test files `edges.txt` and `prizes.txt`. +The Docker wrapper can be tested with `pytest` or a unit test with `pytest -k test_rwr.py`. + +Alternatively, to test the Docker image directly, run the following command from the root of the `spras` repository: + +``` +docker run -w /data --mount type=bind,source=/${PWD},target=/data reedcompbio/random-walk-with-restart python /RWR/random_walk.py \ + --edges_file /data/test/RWR/input/edges.txt --prizes_file /data/test/RWR/input/prizes.txt --damping_factor 0.85 --selection_function min --threshold 0.001 --w 0.0001 --output_file /data/test/RWR/output/output.txt +``` + +This will run RWR on the test input files and write the output to `test/RWR/output/output.txt` (the `test/RWR/output` directory must already exist). +Windows users may need to escape the absolute paths so that `/data` becomes `//data`, etc. 
"""
Wrapper that runs the random walk with restart (RWR) algorithm in a container.

RWR will construct a directed graph from the provided input file
- an edge is represented with a head and tail node, which represents the direction of the interaction between two nodes
- uses networkx Digraph() object

Expected raw pathway file format (the file read by RWR.parse_output):
Node1 Node2 Edge Flux Weight InNetwork Type
- node pairs are in the 1st and 2nd columns, an edge flux in the 3rd column, a weight in the 4th column,
  a boolean in the 5th column to indicate if the edge/node is in the network, and the row type in the 6th column
- the 'Type' column should be 1 for edges, 2 for nodes, and 3 for pathways as we want to keep information about nodes, edges, and pathways.
- it can include repeated and bidirectional edges

Expected format of the prizes file written by RWR.generate_inputs:
NODEID prize Node type
- the node identifier is in the 1st column, its prize in the 2nd column, and 'source' or 'target' in the 3rd column
- if the dataset has sources/targets but no prizes, every listed node is given a prize of 1.0
- NOTE(review): test/RWR/input/prizes.txt names the 2nd column 'prizes' while generate_inputs writes 'prize';
  confirm which header random_walk.py expects
"""
import warnings
from pathlib import Path

import pandas as pd

from spras.containers import prepare_volume, run_container
from spras.interactome import (
    convert_undirected_to_directed,
    reinsert_direction_col_directed,
)
from spras.prm import PRM
from spras.util import add_rank_column

__all__ = ['RWR']


class RWR(PRM):
    # we need edges (weighted), source set (with prizes), and target set (with prizes).
    required_inputs = ['edges', 'prizes']

    @staticmethod
    def generate_inputs(data, filename_map):
        """
        Access fields from the dataset and write the required input files
        @param data: dataset; must provide request_node_columns, contains_node_columns and get_interactome
        @param filename_map: a dict mapping file types in the required_inputs to the filename for that type
        @raise ValueError: if a required filename is missing, or the dataset has no sources, targets, or prizes
        """

        # ensures the required input are within the filename_map
        for input_type in RWR.required_inputs:
            if input_type not in filename_map:
                raise ValueError(f"{input_type} filename is missing")

        sources_targets = data.request_node_columns(["sources", "targets"])
        if sources_targets is None:
            if not data.contains_node_columns('prize'):
                raise ValueError("No sources, targets, or prizes found in dataset")
            # Prize-only dataset: every node that has a prize is labeled as a source
            prize_df = data.request_node_columns(['prize'])
            # The prize column must be kept here because it is one of the columns written below;
            # selecting only NODEID made to_csv fail with a KeyError
            input_df = prize_df[["NODEID", "prize"]].copy()
            input_df["Node type"] = "source"
        else:
            both_series = sources_targets.sources & sources_targets.targets
            for node_id in sources_targets[both_series]["NODEID"]:
                warn_msg = node_id+" has been labeled as both a source and a target."
                # Only use stacklevel 1 because this is due to the data not the code context
                warnings.warn(warn_msg, stacklevel=1)

            # Create nodetype file
            # The target assignment runs last, so a node flagged as both ends up labeled 'target'
            input_df = sources_targets[["NODEID"]].copy()
            input_df.loc[sources_targets["sources"].eq(True), "Node type"] = "source"
            input_df.loc[sources_targets["targets"].eq(True), "Node type"] = "target"

            if data.contains_node_columns('prize'):
                node_df = data.request_node_columns(['prize'])
                # NOTE(review): the default inner merge keeps only NODEIDs present in both tables,
                # so a source/target missing from node_df is dropped - confirm this is intended
                input_df = pd.merge(input_df, node_df, on='NODEID')
            else:
                # If there aren't prizes but are sources and targets, make prizes based on them
                input_df['prize'] = 1.0

        input_df.to_csv(filename_map["prizes"], sep="\t", index=False, columns=["NODEID", "prize", "Node type"])

        # create the network of edges
        edges = data.get_interactome()

        edges = convert_undirected_to_directed(edges)

        # creates the edges files that contains the head and tail nodes and the weights after them
        edges.to_csv(filename_map['edges'], sep="\t", index=False, columns=["Interactor1", "Interactor2", "Weight"])

    # Skips parameter validation step
    @staticmethod
    def run(edges=None, prizes = None, output_file = None, single_source = None, df = None, w = None, f = None, threshold = None, container_framework="docker"):
        """
        Run RandomWalk with Docker
        Optional arguments left as None are not passed on the command line, so random_walk.py falls back to
        its own defaults for them (the default values quoted below are the ones documented for that script).
        @param edges: input network file (required)
        @param prizes: input node prizes with sources and targets (required)
        @param output_file: path to the output pathway file (required)
        @param df: damping factor for restarting (default 0.85) (optional)
        @param single_source: 1 for single source, 0 for source-target (default 1) (optional)
        @param w: lower bound to filter the edges based on the edge confidence (default 0.00) (optional)
        @param f: selection function (default 'min') (optional)
        @param threshold: threshold for constructing the final pathway (default 0.0001) (optional)
        @param container_framework: choose the container runtime framework, currently supports "docker" or "singularity" (optional)
        @raise ValueError: if edges, prizes, or output_file is missing
        """

        if not edges or not prizes or not output_file:
            raise ValueError('Required RWR arguments are missing')

        work_dir = '/spras'

        # Each volume is a tuple (src, dest) - data generated by Docker
        volumes = []

        bind_path, edges_file = prepare_volume(edges, work_dir)
        volumes.append(bind_path)

        bind_path, prizes_file = prepare_volume(prizes, work_dir)
        volumes.append(bind_path)

        out_dir = Path(output_file).parent

        # RWR requires that the output directory exist
        out_dir.mkdir(parents=True, exist_ok=True)
        bind_path, mapped_out_dir = prepare_volume(str(out_dir), work_dir)
        volumes.append(bind_path)
        mapped_out_prefix = mapped_out_dir + '/out'  # Use posix path inside the container

        command = ['python',
                   '/RWR/random_walk.py',
                   '--edges_file', edges_file,
                   '--prizes_file', prizes_file,
                   '--output_file', mapped_out_prefix]

        # Only forward the optional arguments that the caller actually set
        optional_flags = [('--single_source', single_source),
                          ('--damping_factor', df),
                          ('--selection_function', f),
                          ('--w', w),
                          ('--threshold', threshold)]
        for flag, value in optional_flags:
            if value is not None:
                command.extend([flag, str(value)])

        print(f"Running RWR with arguments: {' '.join(command)}", flush=True)

        container_suffix = "random-walk-with-restart"
        out = run_container(container_framework,
                            container_suffix,
                            command,
                            volumes,
                            work_dir)
        print(out)

        # The container writes to '<out_dir>/out'; move it to the path the caller asked for
        output = Path(out_dir, 'out')
        output.rename(output_file)

    @staticmethod
    def parse_output(raw_pathway_file, standardized_pathway_file):
        """
        Convert a predicted pathway into the universal format
        Rows of the raw file are split by their 'Type' column: type 1 rows are written to '<base>_edges.txt',
        type 2 rows to '<base>_nodes.txt', and type 3 rows to standardized_pathway_file (no header),
        where <base> is standardized_pathway_file without a trailing '.txt'.
        @param raw_pathway_file: pathway file produced by an algorithm's run function
        @param standardized_pathway_file: the same pathway written in the universal format
        """

        df = pd.read_csv(raw_pathway_file, sep="\t")

        # add a rank column to the dataframe
        df = add_rank_column(df)

        # Strip only a trailing '.txt' so that '.txt' appearing elsewhere in the path is left alone
        base_name = str(standardized_pathway_file)
        if base_name.endswith('.txt'):
            base_name = base_name[:-len('.txt')]
        edge_output_file = base_name + '_edges.txt'
        node_output_file = base_name + '_nodes.txt'

        # get all rows where type is 1 and get rid of the bookkeeping columns before writing them
        df_edge = df.loc[df["Type"] == 1].drop(columns=['Type', 'Rank'])
        df_edge.to_csv(edge_output_file, sep="\t", index=False, header=True)

        # node rows reuse the edge columns, so rename the header to Node, Pr, R_Pr, Final_Pr
        df_node = df.loc[df['Type'] == 2].drop(columns=['Type', 'Rank'])
        df_node = df_node.rename(columns={'Node1': 'Node', 'Node2': 'Pr', 'Edge Flux': 'R_Pr', 'Weight': 'Final_Pr'})
        df_node.to_csv(node_output_file, sep="\t", index=False, header=True)

        # pathway rows keep only the node pair and the rank before the direction column is added
        df_pathway = df.loc[df['Type'] == 3].drop(columns=['InNetwork', 'Type', 'Weight', 'Edge Flux'])
        df_pathway = reinsert_direction_col_directed(df_pathway)
        df_pathway.to_csv(standardized_pathway_file, sep="\t", index=False, header=False)
import shutil
from pathlib import Path

import pytest

import spras.config as config
from spras.rwr import RWR

config.init_from_file("config/config.yaml")

TEST_DIR = 'test/RWR/'
OUT_FILE_DEFAULT = TEST_DIR+'output/rwr-edges.txt'
OUT_FILE_OPTIONAL = TEST_DIR+'output/rwr-edges-optional.txt'


class TestRWR:
    """
    Exercise the RWR wrapper by running it in the container image
    """

    @staticmethod
    def _reset(output_file):
        """
        Remove any output left over from an earlier run
        @param output_file: path of the output file the test is about to produce
        @return: the output path as a Path object
        """
        target = Path(output_file)
        target.unlink(missing_ok=True)
        return target

    def test_rwr(self):
        result_path = self._reset(OUT_FILE_DEFAULT)
        # Required arguments only
        RWR.run(edges=TEST_DIR+'input/edges.txt',
                prizes=TEST_DIR+'input/prizes.txt',
                output_file=OUT_FILE_DEFAULT)
        assert result_path.exists()

    def test_rwr_optional(self):
        result_path = self._reset(OUT_FILE_OPTIONAL)
        # Every optional argument is supplied: single_source, df, w, f, threshold
        optional_args = {
            'single_source': 1,
            'df': 0.85,
            'w': 0.00,
            'f': 'min',
            'threshold': 0.0001,
        }
        RWR.run(edges=TEST_DIR+'input/edges.txt',
                prizes=TEST_DIR+'input/prizes.txt',
                output_file=OUT_FILE_OPTIONAL,
                **optional_args)
        assert result_path.exists()

    def test_rwr_missing(self):
        # The prizes argument is deliberately left out, so run must raise a ValueError
        incomplete_args = {
            'edges': TEST_DIR + 'input/edges.txt',
            'output_file': OUT_FILE_OPTIONAL,
            'single_source': 1,
            'df': 0.85,
        }
        with pytest.raises(ValueError):
            RWR.run(**incomplete_args)

    # Only run Singularity test if the binary is available on the system
    # spython is only available on Unix, but do not explicitly skip non-Unix platforms
    @pytest.mark.skipif(shutil.which('singularity') is None, reason='Singularity not found on system')
    def test_rwr_singularity(self):
        result_path = self._reset(OUT_FILE_DEFAULT)
        # Required arguments only, executed through Singularity instead of Docker
        RWR.run(edges=TEST_DIR+'input/edges.txt',
                prizes=TEST_DIR+'input/prizes.txt',
                output_file=OUT_FILE_DEFAULT,
                container_framework="singularity")
        assert result_path.exists()