Skip to content

Commit b254d46

Browse files
authored
merge data workflow to main (#48)
Added data workflow for fastfold
1 parent a37c8b4 commit b254d46

File tree

14 files changed

+432
-17
lines changed

14 files changed

+432
-17
lines changed

README.md

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,14 +18,16 @@ FastFold provides a **high-performance implementation of Evoformer** with the fo
1818
3. Ease of use
1919
* Huge performance gains with a few lines changes
2020
* You don't need to care about how the parallel part is implemented
21+
4. Faster data processing, about 3x faster than the original way
2122

2223
## Installation
2324

2425
To install and use FastFold, you will need:
25-
+ Python 3.8 or later
26+
+ Python 3.8 or 3.9.
2627
+ [NVIDIA CUDA](https://developer.nvidia.com/cuda-downloads) 11.1 or above
2728
+ PyTorch 1.10 or above
2829

30+
2931
For now, You can install FastFold:
3032
### Using Conda (Recommended)
3133

@@ -116,6 +118,32 @@ python inference.py target.fasta data/pdb_mmcif/mmcif_files/ \
116118
--hhsearch_binary_path `which hhsearch` \
117119
--kalign_binary_path `which kalign`
118120
```
121+
Or run the script `./inference.sh`; you can change the parameters in the script, especially the data paths.
122+
```shell
123+
./inference.sh
124+
```
125+
126+
#### inference with data workflow
127+
AlphaFold's data pre-processing takes a lot of time, so we speed it up with a [ray](https://docs.ray.io/en/latest/workflows/concepts.html) workflow, which achieves about a 3x speedup. To run inference with the ray workflow, install the packages below and add the parameter `--enable_workflow` to the command line or to the shell script `./inference.sh`.
128+
```shell
129+
pip install ray==1.13.0 pyarrow
130+
```
131+
```shell
132+
python inference.py target.fasta data/pdb_mmcif/mmcif_files/ \
133+
--output_dir ./ \
134+
--gpus 2 \
135+
--uniref90_database_path data/uniref90/uniref90.fasta \
136+
--mgnify_database_path data/mgnify/mgy_clusters_2018_12.fa \
137+
--pdb70_database_path data/pdb70/pdb70 \
138+
--uniclust30_database_path data/uniclust30/uniclust30_2018_08/uniclust30_2018_08 \
139+
--bfd_database_path data/bfd/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt \
140+
--jackhmmer_binary_path `which jackhmmer` \
141+
--hhblits_binary_path `which hhblits` \
142+
--hhsearch_binary_path `which hhsearch` \
143+
--kalign_binary_path `which kalign` \
144+
--enable_workflow
145+
```
146+
119147

120148
## Performance Benchmark
121149

docker/Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ RUN conda install pytorch==1.10.0 torchvision torchaudio cudatoolkit=11.3 -c pyt
99
&& conda install hmmer==3.3.2 hhsuite=3.3.0 kalign2=2.04 -c bioconda
1010

1111
RUN pip install biopython==1.79 dm-tree==0.1.6 ml-collections==0.1.0 numpy==1.21.2 \
12-
PyYAML==5.4.1 requests==2.26.0 scipy==1.7.1 tqdm==4.62.2 typing-extensions==3.10.0.2 einops
12+
PyYAML==5.4.1 requests==2.26.0 scipy==1.7.1 tqdm==4.62.2 typing-extensions==3.10.0.2 einops ray==1.13.0 pyarrow
1313

1414
RUN pip install colossalai==0.1.8+torch1.10cu11.3 -f https://release.colossalai.org
1515

fastfold/workflow/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
from .workflow_run import batch_run
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
from .task_factory import TaskFactory
2+
from .hhblits import HHBlitsFactory
3+
from .hhsearch import HHSearchFactory
4+
from .jackhmmer import JackHmmerFactory
5+
from .hhfilter import HHfilterFactory
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
from ray import workflow
2+
from typing import List
3+
from fastfold.workflow.factory import TaskFactory
4+
from ray.workflow.common import Workflow
5+
import fastfold.data.tools.hhblits as ffHHBlits
6+
7+
class HHBlitsFactory(TaskFactory):
    """Factory that emits ray workflow steps running HHBlits on a FASTA file."""

    # Config keys required before gen_task may be called (checked by isReady()).
    keywords = ['binary_path', 'databases', 'n_cpu']

    def gen_task(self, fasta_path: str, output_path: str, after: List[Workflow] = None) -> Workflow:
        """Return a workflow step that queries HHBlits and writes the a3m result.

        fasta_path: input FASTA file; output_path: destination for the a3m
        alignment; after: upstream workflow steps this step depends on.
        """
        self.isReady()

        # Build the HHBlits wrapper once; the step closure captures and reuses it.
        hhblits_runner = ffHHBlits.HHBlits(
            binary_path=self.config['binary_path'],
            databases=self.config['databases'],
            n_cpu=self.config['n_cpu'],
        )

        @workflow.step
        def hhblits_step(fasta_path: str, output_path: str, after: List[Workflow]) -> None:
            # Run the search and persist only the a3m alignment from the result.
            query_result = hhblits_runner.query(fasta_path)
            with open(output_path, "w") as out_file:
                out_file.write(query_result["a3m"])

        return hhblits_step.step(fasta_path, output_path, after)
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
import subprocess
2+
import logging
3+
from ray import workflow
4+
from typing import List
5+
from fastfold.workflow.factory import TaskFactory
6+
from ray.workflow.common import Workflow
7+
8+
class HHfilterFactory(TaskFactory):
    """Factory that emits ray workflow steps running the hhfilter binary."""

    # Only the binary path is mandatory; 'id' and 'cov' are optional extras.
    keywords = ['binary_path']

    def gen_task(self, fasta_path: str, output_path: str, after: List[Workflow] = None) -> Workflow:
        """Return a workflow step invoking hhfilter with optional -id/-cov flags."""
        self.isReady()

        @workflow.step
        def hhfilter_step(fasta_path: str, output_path: str, after: List[Workflow]) -> None:
            # Assemble the command line from the configured binary and options.
            command = [self.config.get('binary_path')]
            for flag in ('id', 'cov'):
                if flag in self.config:
                    command += [f'-{flag}', str(self.config.get(flag))]
            command += ['-i', fasta_path, '-o', output_path]

            logging.info(f"HHfilter start: {' '.join(command)}")

            # Best-effort invocation: the return code is not checked, matching
            # the other tool wrappers in this workflow.
            subprocess.run(command)

        return hhfilter_step.step(fasta_path, output_path, after)
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
from fastfold.workflow.factory import TaskFactory
2+
from ray import workflow
3+
from ray.workflow.common import Workflow
4+
import fastfold.data.tools.hhsearch as ffHHSearch
5+
from typing import List
6+
7+
class HHSearchFactory(TaskFactory):
    """Factory that emits ray workflow steps running HHSearch on an a3m file."""

    # Config keys required before gen_task may be called (checked by isReady()).
    keywords = ['binary_path', 'databases', 'n_cpu']

    def gen_task(self, a3m_path: str, output_path: str, after: List[Workflow] = None,
                 atab_path: str = None) -> Workflow:
        """Return a workflow step that queries HHSearch on an a3m alignment.

        a3m_path: input a3m alignment; output_path: destination for the
        HHSearch result; after: upstream workflow steps this step depends on;
        atab_path: optional destination for the atab output — when given,
        the runner is asked to generate it (gen_atab=True).

        Fix: the inner step already accepted atab_path, but gen_task never
        exposed or forwarded it, leaving the gen_atab branch unreachable.
        """
        self.isReady()

        # setup runner
        runner = ffHHSearch.HHSearch(
            binary_path=self.config['binary_path'],
            databases=self.config['databases'],
            n_cpu=self.config['n_cpu']
        )

        # generate step function
        @workflow.step
        def hhsearch_step(a3m_path: str, output_path: str, after: List[Workflow],
                          atab_path: str = None) -> None:
            with open(a3m_path, "r") as f:
                a3m = f.read()
            if atab_path:
                hhsearch_result, atab = runner.query(a3m, gen_atab=True)
            else:
                hhsearch_result = runner.query(a3m)
            with open(output_path, "w") as f:
                f.write(hhsearch_result)
            if atab_path:
                with open(atab_path, "w") as f:
                    f.write(atab)

        # Forward atab_path so callers can actually request atab generation.
        return hhsearch_step.step(a3m_path, output_path, after, atab_path)
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
from fastfold.workflow.factory import TaskFactory
2+
from ray import workflow
3+
from ray.workflow.common import Workflow
4+
import fastfold.data.tools.jackhmmer as ffJackHmmer
5+
from fastfold.data import parsers
6+
from typing import List
7+
8+
class JackHmmerFactory(TaskFactory):
    """Factory that emits ray workflow steps running Jackhmmer on a FASTA file."""

    # Config keys required before gen_task may be called (checked by isReady()).
    keywords = ['binary_path', 'database_path', 'n_cpu', 'uniref_max_hits']

    def gen_task(self, fasta_path: str, output_path: str, after: List[Workflow] = None) -> Workflow:
        """Return a workflow step that runs Jackhmmer and writes an a3m alignment.

        The Stockholm output of the first query result is converted to a3m,
        capped at the configured 'uniref_max_hits' sequence count.
        """
        self.isReady()

        # Build the Jackhmmer wrapper once; the step closure captures and reuses it.
        jackhmmer_runner = ffJackHmmer.Jackhmmer(
            binary_path=self.config['binary_path'],
            database_path=self.config['database_path'],
            n_cpu=self.config['n_cpu'],
        )

        @workflow.step
        def jackhmmer_step(fasta_path: str, output_path: str, after: List[Workflow]) -> None:
            # query() returns a list; only the first result is used here.
            hits = jackhmmer_runner.query(fasta_path)[0]
            a3m_text = parsers.convert_stockholm_to_a3m(
                hits['sto'],
                max_sequences=self.config['uniref_max_hits'],
            )
            with open(output_path, "w") as out_file:
                out_file.write(a3m_text)

        return jackhmmer_step.step(fasta_path, output_path, after)
Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
from ast import keyword
2+
import json
3+
from ray.workflow.common import Workflow
4+
from os import path
5+
from typing import List
6+
7+
class TaskFactory:
    """Base class for workflow task factories.

    Subclasses declare the config keys they require in ``keywords`` and
    implement ``gen_task`` to emit a ray workflow step.

    Fixes over the original:
    - ``configure`` was defined twice; the second definition silently
      shadowed the dict-based overload. The two are merged below.
    - ``__init__`` returned early when ``keywords`` was empty without ever
      setting ``self.config``, so a passed ``config`` was dropped and any
      later ``configure()`` raised AttributeError. ``self.config`` is now
      always initialized.
    """

    # Config keys a subclass requires; validated by isReady().
    keywords = []

    # Sentinel distinguishing "no value given" from an explicit None.
    _MISSING = object()

    def __init__(self, config: dict = None, config_path: str = None) -> None:
        """Initialize the factory config.

        Precedence: an explicit ``config`` dict wins; otherwise a JSON file
        at ``config_path`` (or ./config.json) is loaded — but only when the
        subclass actually requires config keys.
        """
        self.config = {}

        if config is not None:
            self.config = config
            return

        # No keywords required: skip touching the filesystem entirely.
        if not self.__class__.keywords:
            return

        if config_path is not None:
            self.loadConfig(config_path)
        else:
            self.loadConfig()

    def configure(self, config, value=_MISSING, purge=False) -> None:
        """Update the factory config.

        Two call forms (merging the original duplicate definitions):
        - ``configure({'k': v}, purge=...)`` — merge a dict into the config,
          or replace it entirely when ``purge`` is true.
        - ``configure('k', v)`` — set a single key.
        """
        if value is not self._MISSING:
            self.config[config] = value
        elif purge:
            self.config = config
        else:
            self.config.update(config)

    def gen_task(self, after: "List[Workflow]" = None, *args, **kwargs) -> "Workflow":
        """Produce a workflow step; must be overridden by subclasses."""
        raise NotImplementedError

    def isReady(self):
        """Raise KeyError if any required config keyword is missing."""
        for key in self.__class__.keywords:
            if key not in self.config:
                raise KeyError(f"{self.__class__.__name__} not ready: \"{key}\" not specified")

    def loadConfig(self, config_path='./config.json'):
        """Load this factory's section from the global JSON config file.

        The section name is the class name with the trailing "Factory"
        stripped, looked up under the top-level "tools" key.
        """
        with open(config_path) as configFile:
            globalConfig = json.load(configFile)
        if 'tools' not in globalConfig:
            raise KeyError("\"tools\" not found in global config file")
        # e.g. HHBlitsFactory -> HHBlits (strip the 7-char "Factory" suffix).
        factoryName = self.__class__.__name__[:-7]
        if factoryName not in globalConfig['tools']:
            raise KeyError(f"\"{factoryName}\" not found in the \"tools\" section in config")
        self.config = globalConfig['tools'][factoryName]
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
from .fastfold_data_workflow import FastFoldDataWorkFlow

0 commit comments

Comments
 (0)