-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcmscan_genome_against_rfam.py
executable file
·99 lines (81 loc) · 3.71 KB
/
cmscan_genome_against_rfam.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
#!/usr/bin/python
# Copyright (C) 2019 Shengwei Hou : housw2010 'at' gmail 'dot' com
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import os
import sys
import subprocess
import argparse
from Bio import SeqIO
def do_cmscan(input_fna, rfam_cm, rfam_clanin, output_dir, prefix):
""" this function will be used to scan ncRNAs based on input_cm database,
from input_fna sequences, the infernal v1.1.2 needs to be installed
"""
# output
output_prefix = os.path.join(output_dir, prefix)
# compute database size in Mb
# genomesize * 2 / 1,000,000
database_size = 0
for contig in SeqIO.parse(input_fna, "fasta"):
contig_size = len(contig.seq)
database_size += contig_size
database_size = (2 * database_size) / 1000000.0
# cmpress if not
if not os.path.exists(rfam_cm+".i1m"):
try:
cmpress_proc = subprocess.Popen(['cmpress', rfam_cm])
cmpress_proc.wait()
except Exception as e:
print("Can't run cmpress on input rfam.cm:\n\t{err}".format(err=e))
raise
# run cmscan
try:
cmscan_proc = subprocess.Popen(['cmscan',
"-Z", str(database_size),
"--cut_ga",
"--rfam",
"--nohmmonly",
"--tblout", output_prefix+".tblout",
"--fmt", "2",
"--clanin", rfam_clanin,
"--cpu", "20",
"-o", output_prefix+".cmscan",
rfam_cm,
input_fna
])
cmscan_proc.wait()
except Exception as e:
print("Can't run cmscan due to the following error:\n\t{err}".format(err=e))
raise
def main():
# main parser
parser = argparse.ArgumentParser(description="scan ncRNAs against rfam using cmscan (requires infernal v1.1.2) for input genome in fasta format")
parser.add_argument("input_fna", help="input genome in fasta format")
parser.add_argument("--rfam_cm", default="/home/shengwei/db/Rfam/current/Rfam.cm", help="absolute path to rfam.cm")
parser.add_argument("--rfam_clanin", default="/home/shengwei/db/Rfam/current/Rfam.clanin", help="absolute path to rfam.clanin")
parser.add_argument("-p", "--prefix", help="output prefix")
parser.add_argument("-o", "--output_dir", help="output directory")
parser.add_argument("-v", "--version", action="version", version="%(prog)s 1.0")
args = parser.parse_args()
# prefix handeling
if not args.prefix:
basename = os.path.basename(args.input_fna)
args.prefix = os.path.splitext(basename)[0]
# output_dir handeling
if not args.output_dir:
args.output_dir = os.path.dirname(os.path.abspath(args.input_fna))
# do cmscan
do_cmscan(args.input_fna, args.rfam_cm, args.rfam_clanin, args.output_dir, args.prefix)
if __name__ == "__main__":
main()