-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathGenbank_slicer.py
More file actions
161 lines (128 loc) · 4.89 KB
/
Genbank_slicer.py
File metadata and controls
161 lines (128 loc) · 4.89 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
#!/usr/bin/python
# This script is designed to take a genbank file and 'slice out'/'subset'
# regions (genes/operons etc.) and produce a separate file. This can be
# done explicitly by telling the script which base sites to use, or can
# 'decide' for itself by blasting a fasta of the sequence you're inter-
# ed in against the Genbank you want to slice a record out of.
# Note, the script (obviously) does not preseve the index number of the
# bases from the original
# Based upon the tutorial at:
# http://biopython.org/DIST/docs/tutorial/Tutorial.html#htoc44
# This script depends on BLAST and having the makeblastdb functionality
# available if BLAST_MODE is active. It also depends on Biopython.
# Set up and handle arguments:
from Bio import SeqIO
from Bio import SearchIO
import subprocess
import sys
import argparse
import traceback
import warnings
import os
def convert(basename, genbank):
'''Convert the provided genbank to a fasta to BLAST.'''
refFasta = "{}.fasta.tmp".format(basename)
SeqIO.convert(genbank, 'genbank', refFasta, 'fasta')
return refFasta
def runBlast(basename, refFasta, fasta):
'''Synthesise BLAST commands and make system calls'''
resultHandle = "{}.blastout.tmp".format(basename)
blastdb_cmd = 'makeblastdb -in {0} -dbtype nucl -title temp_blastdb'.format(refFasta)
blastn_cmd = 'blastn -query {0} -strand both -task blastn -db {1} -perc_identity 100 -outfmt 6 -out {2} -max_target_seqs 1'.format(fasta, refFasta, resultHandle)
print("Constructing BLAST Database: " + '\n' + blastdb_cmd)
print("BLASTing: " + '\n' + blastn_cmd)
DB_process = subprocess.Popen(blastdb_cmd,shell=True,stdin=subprocess.PIPE,stdout=subprocess.PIPE,stderr=subprocess.PIPE)
DB_process.wait()
BLAST_process = subprocess.Popen(blastn_cmd,shell=True,stdin=subprocess.PIPE,stdout=subprocess.PIPE,stderr=subprocess.PIPE)
BLAST_process.wait()
return resultHandle
def getIndices(resultHandle):
'''If not provided directly by the user, this function retrieves the best BLAST hit's indices.'''
blast_result = SearchIO.read(resultHandle, 'blast-tab')
print(blast_result[0][0])
start = blast_result[0][0].hit_start
end = blast_result[0][0].hit_end
return start, end
def slice(start, end, genbank):
'''Subset the provided genbank to return the sub record.'''
seqObj = SeqIO.read(genbank, 'genbank')
subRecord = seqObj[start:end]
return subRecord
def main():
###################################################################################################
# Parse command line arguments
try:
parser = argparse.ArgumentParser(
description='This script slices entries such as genes or operons out of a genbank, subsetting them as their own file.')
parser.add_argument(
'-g',
'--genbank',
action='store',
required=True,
help='The genbank file you wish to subset.')
parser.add_argument(
'-o',
'--outfile',
action='store',
help='If specifed, the script will write a file, otherwise redirect STDOUT for pipelines.')
parser.add_argument(
'-s',
'--start',
type=int,
help='Integer base index to slice from.')
parser.add_argument(
'-e',
'--end',
type=int,
help='Integer base index to slice to.')
parser.add_argument(
'-b',
'--blastmode',
action='store_true',
default=False,
help='(False if not specified- If flag is provided, you can provide an input fasta, and BLAST will be called to find your sequence indices in the genbank')
parser.add_argument(
'-f',
'--fasta',
action='store',
help='The operon fasta to pull annotations from the provided genbank.')
parser.add_argument(
'-t',
'--no_tidy',
action='store_false',
default=True,
help='Tell the script whether or not to remove the temporary files it generated during processing. On by default. WARNING: removes files based on the "tmp" string.')
args = parser.parse_args()
except:
print "An exception occured with argument parsing. Check your provided options."
traceback.print_exc()
genbank = args.genbank
fasta = args.fasta
split = os.path.splitext(args.genbank)
basename = os.path.basename(split[0])
start = args.start
end = args.end
blastMode = args.blastmode
outfile = args.outfile
fasta = args.fasta
tidy = args.no_tidy
# Main code:
if blastMode is not False and fasta is not None:
refFasta = convert(basename,genbank)
resultHandle = runBlast(basename, refFasta, fasta)
start, end = getIndices(resultHandle)
else:
if fasta is None:
print("No fasta was provided so BLAST mode cannot be used.")
if start is None or end is None:
print('No slice indices have been specified or retrieved from blastout')
sys.exit(1)
subRecord = slice(start, end, genbank)
if outfile is not None:
SeqIO.write(subRecord, outfile, "genbank")
else:
print(subRecord.format('genbank'))
if tidy is True:
subprocess.Popen("rm -v ./*tmp*",shell=True)
if __name__ == "__main__":
main()