Skip to content

Commit b254d46

Browse files
authored
merge data workflow to main (#48)
Added data workflow for fastfold
1 parent a37c8b4 commit b254d46

File tree

14 files changed

+432
-17
lines changed

14 files changed

+432
-17
lines changed

README.md

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,14 +18,16 @@ FastFold provides a **high-performance implementation of Evoformer** with the fo
1818
3. Ease of use
1919
* Huge performance gains with a few lines changes
2020
* You don't need to care about how the parallel part is implemented
21+
4. Faster data processing, about 3x faster than the original way
2122

2223
## Installation
2324

2425
To install and use FastFold, you will need:
25-
+ Python 3.8 or later
26+
+ Python 3.8 or 3.9.
2627
+ [NVIDIA CUDA](https://developer.nvidia.com/cuda-downloads) 11.1 or above
2728
+ PyTorch 1.10 or above
2829

30+
2931
For now, You can install FastFold:
3032
### Using Conda (Recommended)
3133

@@ -116,6 +118,32 @@ python inference.py target.fasta data/pdb_mmcif/mmcif_files/ \
116118
--hhsearch_binary_path `which hhsearch` \
117119
--kalign_binary_path `which kalign`
118120
```
121+
Or run the script `./inference.sh`; you can change the parameters in the script, especially the data paths.
122+
```shell
123+
./inference.sh
124+
```
125+
126+
#### inference with data workflow
127+
AlphaFold's data pre-processing takes a lot of time, so we speed it up with a [ray](https://docs.ray.io/en/latest/workflows/concepts.html) workflow, which achieves about a 3x speedup. To run inference with the ray workflow, install the packages below and add the parameter `--enable_workflow` to the command line or to the shell script `./inference.sh`.
128+
```shell
129+
pip install ray==1.13.0 pyarrow
130+
```
131+
```shell
132+
python inference.py target.fasta data/pdb_mmcif/mmcif_files/ \
133+
--output_dir ./ \
134+
--gpus 2 \
135+
--uniref90_database_path data/uniref90/uniref90.fasta \
136+
--mgnify_database_path data/mgnify/mgy_clusters_2018_12.fa \
137+
--pdb70_database_path data/pdb70/pdb70 \
138+
--uniclust30_database_path data/uniclust30/uniclust30_2018_08/uniclust30_2018_08 \
139+
--bfd_database_path data/bfd/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt \
140+
--jackhmmer_binary_path `which jackhmmer` \
141+
--hhblits_binary_path `which hhblits` \
142+
--hhsearch_binary_path `which hhsearch` \
143+
--kalign_binary_path `which kalign` \
144+
--enable_workflow
145+
```
146+
119147

120148
## Performance Benchmark
121149

docker/Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ RUN conda install pytorch==1.10.0 torchvision torchaudio cudatoolkit=11.3 -c pyt
99
&& conda install hmmer==3.3.2 hhsuite=3.3.0 kalign2=2.04 -c bioconda
1010

1111
RUN pip install biopython==1.79 dm-tree==0.1.6 ml-collections==0.1.0 numpy==1.21.2 \
12-
PyYAML==5.4.1 requests==2.26.0 scipy==1.7.1 tqdm==4.62.2 typing-extensions==3.10.0.2 einops
12+
PyYAML==5.4.1 requests==2.26.0 scipy==1.7.1 tqdm==4.62.2 typing-extensions==3.10.0.2 einops ray==1.13.0 pyarrow
1313

1414
RUN pip install colossalai==0.1.8+torch1.10cu11.3 -f https://release.colossalai.org
1515

fastfold/workflow/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
from .workflow_run import batch_run
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
from .task_factory import TaskFactory
2+
from .hhblits import HHBlitsFactory
3+
from .hhsearch import HHSearchFactory
4+
from .jackhmmer import JackHmmerFactory
5+
from .hhfilter import HHfilterFactory
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
from ray import workflow
2+
from typing import List
3+
from fastfold.workflow.factory import TaskFactory
4+
from ray.workflow.common import Workflow
5+
import fastfold.data.tools.hhblits as ffHHBlits
6+
7+
class HHBlitsFactory(TaskFactory):
    """Factory that emits ray workflow steps running HHBlits on a FASTA file."""

    # Config keys required before gen_task may be called (checked by isReady()).
    keywords = ['binary_path', 'databases', 'n_cpu']

    def gen_task(self, fasta_path: str, output_path: str, after: List[Workflow] = None) -> Workflow:
        """Return a workflow step that queries HHBlits and writes the a3m result.

        fasta_path: input FASTA file; output_path: destination for the a3m
        alignment; after: upstream workflow steps this step depends on.
        """
        self.isReady()

        # Build the HHBlits wrapper once; the step closure captures and reuses it.
        hhblits_runner = ffHHBlits.HHBlits(
            binary_path=self.config['binary_path'],
            databases=self.config['databases'],
            n_cpu=self.config['n_cpu'],
        )

        @workflow.step
        def hhblits_step(fasta_path: str, output_path: str, after: List[Workflow]) -> None:
            # Run the search and persist only the a3m alignment from the result.
            query_result = hhblits_runner.query(fasta_path)
            with open(output_path, "w") as out_file:
                out_file.write(query_result["a3m"])

        return hhblits_step.step(fasta_path, output_path, after)
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
import subprocess
2+
import logging
3+
from ray import workflow
4+
from typing import List
5+
from fastfold.workflow.factory import TaskFactory
6+
from ray.workflow.common import Workflow
7+
8+
class HHfilterFactory(TaskFactory):
    """Factory that emits ray workflow steps running the hhfilter binary."""

    # Only the binary path is mandatory; 'id' and 'cov' are optional extras.
    keywords = ['binary_path']

    def gen_task(self, fasta_path: str, output_path: str, after: List[Workflow] = None) -> Workflow:
        """Return a workflow step invoking hhfilter with optional -id/-cov flags."""
        self.isReady()

        @workflow.step
        def hhfilter_step(fasta_path: str, output_path: str, after: List[Workflow]) -> None:
            # Assemble the command line from the configured binary and options.
            command = [self.config.get('binary_path')]
            for flag in ('id', 'cov'):
                if flag in self.config:
                    command += [f'-{flag}', str(self.config.get(flag))]
            command += ['-i', fasta_path, '-o', output_path]

            logging.info(f"HHfilter start: {' '.join(command)}")

            # Best-effort invocation: the return code is not checked, matching
            # the other tool wrappers in this workflow.
            subprocess.run(command)

        return hhfilter_step.step(fasta_path, output_path, after)
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
from fastfold.workflow.factory import TaskFactory
2+
from ray import workflow
3+
from ray.workflow.common import Workflow
4+
import fastfold.data.tools.hhsearch as ffHHSearch
5+
from typing import List
6+
7+
class HHSearchFactory(TaskFactory):
    """Factory that emits ray workflow steps running HHSearch on an a3m file."""

    # Config keys required before gen_task may be called (checked by isReady()).
    keywords = ['binary_path', 'databases', 'n_cpu']

    def gen_task(self, a3m_path: str, output_path: str, after: List[Workflow] = None,
                 atab_path: str = None) -> Workflow:
        """Return a workflow step that queries HHSearch on an a3m alignment.

        a3m_path: input a3m alignment; output_path: destination for the
        HHSearch result; after: upstream workflow steps this step depends on;
        atab_path: optional destination for the atab output — when given,
        the runner is asked to generate it (gen_atab=True).

        Fix: the inner step already accepted atab_path, but gen_task never
        exposed or forwarded it, leaving the gen_atab branch unreachable.
        """
        self.isReady()

        # setup runner
        runner = ffHHSearch.HHSearch(
            binary_path=self.config['binary_path'],
            databases=self.config['databases'],
            n_cpu=self.config['n_cpu']
        )

        # generate step function
        @workflow.step
        def hhsearch_step(a3m_path: str, output_path: str, after: List[Workflow],
                          atab_path: str = None) -> None:
            with open(a3m_path, "r") as f:
                a3m = f.read()
            if atab_path:
                hhsearch_result, atab = runner.query(a3m, gen_atab=True)
            else:
                hhsearch_result = runner.query(a3m)
            with open(output_path, "w") as f:
                f.write(hhsearch_result)
            if atab_path:
                with open(atab_path, "w") as f:
                    f.write(atab)

        # Forward atab_path so callers can actually request atab generation.
        return hhsearch_step.step(a3m_path, output_path, after, atab_path)
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
from fastfold.workflow.factory import TaskFactory
2+
from ray import workflow
3+
from ray.workflow.common import Workflow
4+
import fastfold.data.tools.jackhmmer as ffJackHmmer
5+
from fastfold.data import parsers
6+
from typing import List
7+
8+
class JackHmmerFactory(TaskFactory):
    """Factory that emits ray workflow steps running Jackhmmer on a FASTA file."""

    # Config keys required before gen_task may be called (checked by isReady()).
    keywords = ['binary_path', 'database_path', 'n_cpu', 'uniref_max_hits']

    def gen_task(self, fasta_path: str, output_path: str, after: List[Workflow] = None) -> Workflow:
        """Return a workflow step that runs Jackhmmer and writes an a3m alignment.

        The Stockholm output of the first query result is converted to a3m,
        capped at the configured 'uniref_max_hits' sequence count.
        """
        self.isReady()

        # Build the Jackhmmer wrapper once; the step closure captures and reuses it.
        jackhmmer_runner = ffJackHmmer.Jackhmmer(
            binary_path=self.config['binary_path'],
            database_path=self.config['database_path'],
            n_cpu=self.config['n_cpu'],
        )

        @workflow.step
        def jackhmmer_step(fasta_path: str, output_path: str, after: List[Workflow]) -> None:
            # query() returns a list; only the first result is used here.
            hits = jackhmmer_runner.query(fasta_path)[0]
            a3m_text = parsers.convert_stockholm_to_a3m(
                hits['sto'],
                max_sequences=self.config['uniref_max_hits'],
            )
            with open(output_path, "w") as out_file:
                out_file.write(a3m_text)

        return jackhmmer_step.step(fasta_path, output_path, after)
Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
from ast import keyword
2+
import json
3+
from ray.workflow.common import Workflow
4+
from os import path
5+
from typing import List
6+
7+
class TaskFactory:
    """Base class for workflow task factories.

    Subclasses declare the config keys they require in ``keywords`` and
    implement ``gen_task`` to emit a ray workflow step.

    Fixes over the original:
    - ``configure`` was defined twice; the second definition silently
      shadowed the dict-based overload. The two are merged below.
    - ``__init__`` returned early when ``keywords`` was empty without ever
      setting ``self.config``, so a passed ``config`` was dropped and any
      later ``configure()`` raised AttributeError. ``self.config`` is now
      always initialized.
    """

    # Config keys a subclass requires; validated by isReady().
    keywords = []

    # Sentinel distinguishing "no value given" from an explicit None.
    _MISSING = object()

    def __init__(self, config: dict = None, config_path: str = None) -> None:
        """Initialize the factory config.

        Precedence: an explicit ``config`` dict wins; otherwise a JSON file
        at ``config_path`` (or ./config.json) is loaded — but only when the
        subclass actually requires config keys.
        """
        self.config = {}

        if config is not None:
            self.config = config
            return

        # No keywords required: skip touching the filesystem entirely.
        if not self.__class__.keywords:
            return

        if config_path is not None:
            self.loadConfig(config_path)
        else:
            self.loadConfig()

    def configure(self, config, value=_MISSING, purge=False) -> None:
        """Update the factory config.

        Two call forms (merging the original duplicate definitions):
        - ``configure({'k': v}, purge=...)`` — merge a dict into the config,
          or replace it entirely when ``purge`` is true.
        - ``configure('k', v)`` — set a single key.
        """
        if value is not self._MISSING:
            self.config[config] = value
        elif purge:
            self.config = config
        else:
            self.config.update(config)

    def gen_task(self, after: "List[Workflow]" = None, *args, **kwargs) -> "Workflow":
        """Produce a workflow step; must be overridden by subclasses."""
        raise NotImplementedError

    def isReady(self):
        """Raise KeyError if any required config keyword is missing."""
        for key in self.__class__.keywords:
            if key not in self.config:
                raise KeyError(f"{self.__class__.__name__} not ready: \"{key}\" not specified")

    def loadConfig(self, config_path='./config.json'):
        """Load this factory's section from the global JSON config file.

        The section name is the class name with the trailing "Factory"
        stripped, looked up under the top-level "tools" key.
        """
        with open(config_path) as configFile:
            globalConfig = json.load(configFile)
        if 'tools' not in globalConfig:
            raise KeyError("\"tools\" not found in global config file")
        # e.g. HHBlitsFactory -> HHBlits (strip the 7-char "Factory" suffix).
        factoryName = self.__class__.__name__[:-7]
        if factoryName not in globalConfig['tools']:
            raise KeyError(f"\"{factoryName}\" not found in the \"tools\" section in config")
        self.config = globalConfig['tools'][factoryName]
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
from .fastfold_data_workflow import FastFoldDataWorkFlow

0 commit comments

Comments
 (0)