Skip to content

Commit 9637cfb

Browse files
Jon PalmerJon Palmer
Jon Palmer
authored and
Jon Palmer
committed
subcommand remote added
1 parent 51c8788 commit 9637cfb

File tree

1 file changed

+290
-0
lines changed

1 file changed

+290
-0
lines changed

bin/funannotate-remote.py

+290
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,290 @@
1+
#!/usr/bin/env python
2+
from __future__ import division
3+
4+
import sys, os, subprocess, inspect, shutil, argparse, time, glob, requests, zipfile, urllib2
5+
currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
6+
parentdir = os.path.dirname(currentdir)
7+
sys.path.insert(0,parentdir)
8+
import lib.library as lib
9+
10+
RUNIPRSCAN = os.path.join(parentdir, 'util', 'runIPRscan.py')
11+
XMLCombine = os.path.join(parentdir, 'util', 'xmlcombine.py')
12+
13+
#setup menu with argparse
14+
class MyFormatter(argparse.ArgumentDefaultsHelpFormatter):
15+
def __init__(self,prog):
16+
super(MyFormatter,self).__init__(prog,max_help_position=48)
17+
parser=argparse.ArgumentParser(prog='funannotate-remote.py',
18+
description='''Script that adds functional annotation to a genome using remote searches.''',
19+
epilog="""Written by Jon Palmer (2016-2017) [email protected]""",
20+
formatter_class = MyFormatter)
21+
parser.add_argument('-i', '--input', help='Folder from funannotate predict.')
22+
parser.add_argument('-g', '--genbank', help='Annotated genome in GenBank format')
23+
parser.add_argument('-m', '--methods', required=True, nargs='+', choices=['all', 'phobius', 'antismash', 'interproscan'], help='Method to run')
24+
parser.add_argument('-o', '--out', help='Basename of output files')
25+
parser.add_argument('-e', '--email', required=True, help='Email address for IPRSCAN server')
26+
parser.add_argument('--force', action='store_true', help='Over-write output folder')
27+
args=parser.parse_args()
28+
29+
def runIPRpython(Input):
30+
base = Input.split('/')[-1]
31+
base = base.split('.fa')[0]
32+
OUTPATH = os.path.join(IPROUT, base)
33+
subprocess.call([sys.executable, RUNIPRSCAN, '--goterms','--email', args.email, '--outfile', OUTPATH, '--input', Input], stderr=FNULL, stdout=FNULL)
34+
#now rename output files, just keep xml and tsv files?
35+
time.sleep(3) #make sure there is time for all files to show up
36+
os.rename(OUTPATH+'.xml.xml', OUTPATH+'.xml')
37+
os.rename(OUTPATH+'.tsv.txt', OUTPATH+'.tsv')
38+
os.rename(OUTPATH+'.svg.svg', OUTPATH+'.svg')
39+
os.rename(OUTPATH+'.sequence.txt', OUTPATH+'.fa')
40+
os.remove(OUTPATH+'.gff.txt')
41+
os.remove(OUTPATH+'.htmltarball.html.tar.gz')
42+
os.remove(OUTPATH+'.log.txt')
43+
os.remove(OUTPATH+'.out.txt')
44+
45+
def download(url, name):
46+
file_name = name
47+
u = urllib2.urlopen(url)
48+
f = open(file_name, 'wb')
49+
meta = u.info()
50+
file_size = int(meta.getheaders("Content-Length")[0])
51+
print("Downloading: {0} Bytes: {1}".format(url, file_size))
52+
file_size_dl = 0
53+
block_sz = 8192
54+
while True:
55+
buffer = u.read(block_sz)
56+
if not buffer:
57+
break
58+
file_size_dl += len(buffer)
59+
f.write(buffer)
60+
p = float(file_size_dl) / file_size
61+
status = r"{0} [{1:.2%}]".format(file_size_dl, p)
62+
status = status + chr(8)*(len(status)+1)
63+
sys.stdout.write(status)
64+
f.close()
65+
66+
#create log file
67+
log_name = 'funannotate-remote.log'
68+
if os.path.isfile(log_name):
69+
os.remove(log_name)
70+
71+
#initialize script, log system info and cmd issue at runtime
72+
lib.setupLogging(log_name)
73+
FNULL = open(os.devnull, 'w')
74+
cmd_args = " ".join(sys.argv)+'\n'
75+
lib.log.debug(cmd_args)
76+
print "-------------------------------------------------------"
77+
lib.SystemInfo()
78+
79+
#get version of funannotate
80+
version = lib.get_version()
81+
lib.log.info("Running %s" % version)
82+
83+
#need to do some checks here of the input
84+
genbank = ''
85+
Proteins = ''
86+
if not args.input:
87+
#did not parse folder of funannotate results, so need either gb + gff or fasta + proteins, + gff and also need to have args.out for output folder
88+
if not args.out:
89+
lib.log.error("If you are not providing funannotate predict input folder, then you need to provide an output folder (--out)")
90+
sys.exit(1)
91+
else:
92+
outputdir = args.out
93+
#create outputdir and subdirs
94+
if not os.path.isdir(outputdir):
95+
os.makedirs(outputdir)
96+
os.makedirs(os.path.join(outputdir, 'annotate_misc'))
97+
os.makedirs(os.path.join(outputdir, 'annotate_results'))
98+
os.makedirs(os.path.join(outputdir, 'logfiles'))
99+
if not args.genbank:
100+
lib.log.error("You did not specifiy the apropriate input files, either: \n1) Funannotate input \n2) GenBank")
101+
sys.exit(1)
102+
else:
103+
#create output directories
104+
if not os.path.isdir(outputdir):
105+
os.makedirs(outputdir)
106+
os.makedirs(os.path.join(outputdir, 'annotate_misc'))
107+
os.makedirs(os.path.join(outputdir, 'annotate_results'))
108+
os.makedirs(os.path.join(outputdir, 'logfiles'))
109+
else:
110+
lib.log.error("Output directory %s already exists" % (outputdir))
111+
if not os.path.isdir(os.path.join(outputdir, 'annotate_misc')):
112+
os.makedirs(os.path.join(outputdir, 'annotate_misc'))
113+
if not os.path.isdir(os.path.join(outputdir, 'annotate_results')):
114+
os.makedirs(os.path.join(outputdir, 'annotate_results'))
115+
if not os.path.isdir(os.path.join(outputdir, 'logfiles')):
116+
os.makedirs(os.path.join(outputdir, 'logfiles'))
117+
genbank = args.genbank
118+
Scaffolds = os.path.join(outputdir, 'annotate_misc', 'genome.scaffolds.fasta')
119+
Proteins = os.path.join(outputdir, 'annotate_misc', 'genome.proteins.fasta')
120+
Transcripts = os.path.join(outputdir, 'annotate_misc', 'genome.transcripts.fasta')
121+
GFF = os.path.join(outputdir, 'annotate_misc', 'genome.gff3')
122+
lib.log.info("Checking GenBank file for annotation")
123+
if not lib.checkGenBank(genbank):
124+
lib.log.error("Found no annotation in GenBank file, exiting")
125+
sys.exit(1)
126+
lib.gb2allout(genbank, GFF, Proteins, Transcripts, Scaffolds)
127+
128+
else:
129+
#should be a folder, with funannotate files, thus store results there, no need to create output folder
130+
if not os.path.isdir(args.input):
131+
lib.log.error("%s directory does not exist" % args.input)
132+
sys.exit(1)
133+
if os.path.isdir(os.path.join(args.input, 'predict_results')): #funannotate results should be here
134+
inputdir = os.path.join(args.input, 'predict_results')
135+
outputdir = args.input
136+
else:
137+
inputdir = os.path.join(args.input) #here user specified the predict_results folder, or it is a custom folder
138+
139+
#get files that you need
140+
for file in os.listdir(inputdir):
141+
if file.endswith('.gbk'):
142+
genbank = os.path.join(inputdir, file)
143+
if file.endswith('.gff3'):
144+
GFF = os.path.join(inputdir, file)
145+
146+
#now create the files from genbank input file for consistency in gene naming, etc
147+
if not genbank or not GFF:
148+
lib.log.error("Properly formatted 'funannotate predict' files do no exist in this directory")
149+
sys.exit(1)
150+
else:
151+
if 'predict_results' in inputdir: #if user gave predict_results folder, then set output to up one directory
152+
outputdir = lib.get_parent_dir(inputdir)
153+
else:
154+
if not args.out:
155+
outputdir = inputdir #output the results in the input directory
156+
else:
157+
outputdir = args.out
158+
if not os.path.isdir(outputdir):
159+
os.makedirs(outputdir)
160+
#create output directories
161+
if not os.path.isdir(os.path.join(outputdir, 'annotate_misc')):
162+
os.makedirs(os.path.join(outputdir, 'annotate_misc'))
163+
os.makedirs(os.path.join(outputdir, 'annotate_results'))
164+
else:
165+
lib.log.error("Output directory %s already exists, will use any existing data. If this is not what you want, exit, and provide a unique name for output folder" % (outputdir))
166+
lib.log.info("Parsing input files")
167+
Scaffolds = os.path.join(outputdir, 'annotate_misc', 'genome.scaffolds.fasta')
168+
Proteins = os.path.join(outputdir, 'annotate_misc','genome.proteins.fasta')
169+
Transcripts = os.path.join(outputdir, 'annotate_misc', 'genome.transcripts.fasta')
170+
lib.gb2output(genbank, Proteins, Transcripts, Scaffolds)
171+
172+
#make sure logfiles directory is present, will need later
173+
if not os.path.isdir(os.path.join(outputdir, 'logfiles')):
174+
os.makedirs(os.path.join(outputdir, 'logfiles'))
175+
176+
#get absolute path for all input so there are no problems later, not using Transcripts yet could be error? so take out here
177+
Proteins = os.path.abspath(Proteins)
178+
genbank = os.path.abspath(genbank)
179+
180+
181+
if 'phobius' in args.methods or 'all' in args.methods:
182+
#run Phobius to predict secreted proteins and membrane, default is local if installed, otherwise remote
183+
phobius_out = os.path.join(outputdir, 'annotate_misc', 'phobius.results.txt')
184+
phobiusLog = os.path.join(outputdir, 'logfiles', 'phobius.log')
185+
lib.log.info("Predicting secreted and transmembrane proteins using Phobius")
186+
if not lib.checkannotations(phobius_out):
187+
if args.email:
188+
subprocess.call([os.path.join(parentdir, 'util', 'phobius-multiproc.py'), '-i', Proteins, '-o', phobius_out, '-e', str(args.email), '-l', phobiusLog])
189+
else:
190+
subprocess.call([os.path.join(parentdir, 'util', 'phobius-multiproc.py'), '-i', Proteins, '-o', phobius_out, '-l', phobiusLog])
191+
192+
if 'interproscan' in args.methods or 'all' in args.methods:
193+
IPRCombined = os.path.join(outputdir, 'annotate_misc', 'iprscan.xml')
194+
#run interpro scan
195+
IPROUT = os.path.join(outputdir, 'annotate_misc', 'iprscan')
196+
PROTS = os.path.join(outputdir, 'annotate_misc', 'protein_tmp')
197+
for i in IPROUT,PROTS:
198+
if not os.path.exists(i):
199+
os.makedirs(i)
200+
#now run interproscan
201+
#split input into individual files
202+
lib.splitFASTA(Proteins, PROTS)
203+
#now iterate over list using pool and up to 25 submissions at a time
204+
proteins = []
205+
for file in os.listdir(PROTS):
206+
if file.endswith('.fa'):
207+
file = os.path.join(PROTS, file)
208+
proteins.append(file)
209+
210+
num_files = len(glob.glob1(IPROUT,"*.xml"))
211+
num_prots = len(proteins)
212+
lib.log.info("Now running InterProScan search remotely using EBI servers on " + '{0:,}'.format(num_prots) + ' proteins')
213+
#build in a check before running (in case script gets stopped and needs to restart
214+
finished = []
215+
for file in os.listdir(IPROUT):
216+
if file.endswith('.xml'):
217+
base = file.split('.xml')[0]
218+
fasta_file = os.path.join(PROTS, base+'.fa')
219+
finished.append(fasta_file)
220+
221+
finished = set(finished) #make sure no duplicates
222+
runlist = [x for x in proteins if x not in finished]
223+
if len(runlist) < num_prots:
224+
lib.log.info("Results found, querying remaining %i proteins" % len(runlist))
225+
#start up the list, max 25 at a time
226+
lib.runMultiProgress(runIPRpython, runlist, 25)
227+
#clean up protein fasta files
228+
shutil.rmtree(PROTS)
229+
#now convert to single file and then clean up
230+
with open(IPRCombined, 'w') as output:
231+
subprocess.call([sys.executable, XMLCombine, IPROUT], stdout = output)
232+
if lib.checkannotations(IPRCombined):
233+
shutil.rmtree(IPROUT)
234+
235+
if 'antismash' in args.methods or 'all' in args.methods:
236+
version = requests.get("http://fungismash.secondarymetabolites.org/api/v1.0/version")
237+
as_vers = version.json()['antismash_generation']
238+
tax = version.json()['taxon']
239+
as_status = requests.get("http://fungismash.secondarymetabolites.org/api/v1.0/stats")
240+
queue = as_status.json()['queue_length']
241+
running = as_status.json()['running']
242+
lib.log.info("Connecting to antiSMASH %s v%s webserver" % (tax, as_vers))
243+
lib.log.info("Queue Length: %s; Jobs Running: %s" % (queue, running))
244+
lib.log.info("PLEASE to not abuse the webserver, be considerate!")
245+
if int(queue) > 10 and not args.force:
246+
lib.log.error("There are more than 10 antiSMASH jobs in queue, use --force to submit anyway")
247+
sys.exit(1)
248+
job_files = {'seq': open(genbank, 'rb')}
249+
job_parameters = {'email': args.email, 'smcogs': 'on', 'knownclusterblast': 'on', 'activesitefinder': 'on', 'subclusterblast': 'on'}
250+
lib.log.info("Uploading %s to webserver" % genbank)
251+
postjob = requests.post("http://fungismash.secondarymetabolites.org/api/v1.0/submit", files=job_files, data=job_parameters)
252+
jobid = postjob.json()['id']
253+
#now we can query the job every so often, not sure what is reasonable here, start with 2 minutes?
254+
lib.log.info("Waiting for results from job: %s" % jobid)
255+
while True:
256+
job_status = requests.get("http://fungismash.secondarymetabolites.org/api/v1.0/status/"+jobid)
257+
if job_status.json()['status'] == 'done':
258+
break
259+
time.sleep(120) #check every 2 minutes
260+
result_url = job_status.json()['result_url']
261+
base_url = result_url.replace('index.html', '')
262+
lib.log.info("antiSMASH v%s job finished" % (as_vers))
263+
lib.log.debug("%s" % job_status.json())
264+
#need to retrieve results, have to find link, seems like this might be first scaffold name?
265+
#after asking Kai Blin - there is no "easy" way to identify the output name, however, think I can grab the html file and parse it
266+
job_html = requests.get("http://fungismash.secondarymetabolites.org"+result_url)
267+
for line in job_html.iter_lines():
268+
if 'Download GenBank summary file' in line:
269+
cols = line.split('a href="')
270+
for x in cols:
271+
if '.zip' in x:
272+
link = x.split('"')[0]
273+
baselink = link.replace('.zip', '')
274+
download_url = "http://fungismash.secondarymetabolites.org"+base_url+link
275+
download(download_url, 'antiSMASH.zip')
276+
#now unzip and move folder
277+
zipref = zipfile.ZipFile('antiSMASH.zip', 'r')
278+
zipref.extractall(outputdir)
279+
zipref.close()
280+
os.remove('antiSMASH.zip')
281+
lib.log.info("Results folder: %s/%s" % (outputdir, jobid))
282+
#now grab the GBK files from folder as you will need just that for annotation, place in annotate_misc folder for auto-detection
283+
anti_GBK = os.path.join(outputdir, jobid, baselink+'.final.gbk')
284+
final = os.path.join(outputdir, 'annotate_misc', 'antiSMASH.results.gbk')
285+
shutil.copyfile(anti_GBK, final)
286+
lib.log.info("Results GBK: %s" % final)
287+
288+
lib.log.info("Remote searches complete")
289+
#move logfile
290+
os.rename(log_name, os.path.join(outputdir, 'logfiles', log_name))

0 commit comments

Comments
 (0)