1
+ #!/usr/bin/env python
2
+ from __future__ import division
3
+
4
+ import sys , os , subprocess , inspect , shutil , argparse , time , glob , requests , zipfile , urllib2
5
+ currentdir = os .path .dirname (os .path .abspath (inspect .getfile (inspect .currentframe ())))
6
+ parentdir = os .path .dirname (currentdir )
7
+ sys .path .insert (0 ,parentdir )
8
+ import lib .library as lib
9
+
10
+ RUNIPRSCAN = os .path .join (parentdir , 'util' , 'runIPRscan.py' )
11
+ XMLCombine = os .path .join (parentdir , 'util' , 'xmlcombine.py' )
12
+
13
+ #setup menu with argparse
14
+ class MyFormatter (argparse .ArgumentDefaultsHelpFormatter ):
15
+ def __init__ (self ,prog ):
16
+ super (MyFormatter ,self ).__init__ (prog ,max_help_position = 48 )
17
+ parser = argparse .ArgumentParser (prog = 'funannotate-remote.py' ,
18
+ description = '''Script that adds functional annotation to a genome using remote searches.''' ,
19
+ epilog = """Written by Jon Palmer (2016-2017) [email protected] """ ,
20
+ formatter_class = MyFormatter )
21
+ parser .add_argument ('-i' , '--input' , help = 'Folder from funannotate predict.' )
22
+ parser .add_argument ('-g' , '--genbank' , help = 'Annotated genome in GenBank format' )
23
+ parser .add_argument ('-m' , '--methods' , required = True , nargs = '+' , choices = ['all' , 'phobius' , 'antismash' , 'interproscan' ], help = 'Method to run' )
24
+ parser .add_argument ('-o' , '--out' , help = 'Basename of output files' )
25
+ parser .add_argument ('-e' , '--email' , required = True , help = 'Email address for IPRSCAN server' )
26
+ parser .add_argument ('--force' , action = 'store_true' , help = 'Over-write output folder' )
27
+ args = parser .parse_args ()
28
+
29
+ def runIPRpython (Input ):
30
+ base = Input .split ('/' )[- 1 ]
31
+ base = base .split ('.fa' )[0 ]
32
+ OUTPATH = os .path .join (IPROUT , base )
33
+ subprocess .call ([sys .executable , RUNIPRSCAN , '--goterms' ,'--email' , args .email , '--outfile' , OUTPATH , '--input' , Input ], stderr = FNULL , stdout = FNULL )
34
+ #now rename output files, just keep xml and tsv files?
35
+ time .sleep (3 ) #make sure there is time for all files to show up
36
+ os .rename (OUTPATH + '.xml.xml' , OUTPATH + '.xml' )
37
+ os .rename (OUTPATH + '.tsv.txt' , OUTPATH + '.tsv' )
38
+ os .rename (OUTPATH + '.svg.svg' , OUTPATH + '.svg' )
39
+ os .rename (OUTPATH + '.sequence.txt' , OUTPATH + '.fa' )
40
+ os .remove (OUTPATH + '.gff.txt' )
41
+ os .remove (OUTPATH + '.htmltarball.html.tar.gz' )
42
+ os .remove (OUTPATH + '.log.txt' )
43
+ os .remove (OUTPATH + '.out.txt' )
44
+
45
+ def download (url , name ):
46
+ file_name = name
47
+ u = urllib2 .urlopen (url )
48
+ f = open (file_name , 'wb' )
49
+ meta = u .info ()
50
+ file_size = int (meta .getheaders ("Content-Length" )[0 ])
51
+ print ("Downloading: {0} Bytes: {1}" .format (url , file_size ))
52
+ file_size_dl = 0
53
+ block_sz = 8192
54
+ while True :
55
+ buffer = u .read (block_sz )
56
+ if not buffer :
57
+ break
58
+ file_size_dl += len (buffer )
59
+ f .write (buffer )
60
+ p = float (file_size_dl ) / file_size
61
+ status = r"{0} [{1:.2%}]" .format (file_size_dl , p )
62
+ status = status + chr (8 )* (len (status )+ 1 )
63
+ sys .stdout .write (status )
64
+ f .close ()
65
+
66
+ #create log file
67
+ log_name = 'funannotate-remote.log'
68
+ if os .path .isfile (log_name ):
69
+ os .remove (log_name )
70
+
71
+ #initialize script, log system info and cmd issue at runtime
72
+ lib .setupLogging (log_name )
73
+ FNULL = open (os .devnull , 'w' )
74
+ cmd_args = " " .join (sys .argv )+ '\n '
75
+ lib .log .debug (cmd_args )
76
+ print "-------------------------------------------------------"
77
+ lib .SystemInfo ()
78
+
79
+ #get version of funannotate
80
+ version = lib .get_version ()
81
+ lib .log .info ("Running %s" % version )
82
+
83
+ #need to do some checks here of the input
84
+ genbank = ''
85
+ Proteins = ''
86
+ if not args .input :
87
+ #did not parse folder of funannotate results, so need either gb + gff or fasta + proteins, + gff and also need to have args.out for output folder
88
+ if not args .out :
89
+ lib .log .error ("If you are not providing funannotate predict input folder, then you need to provide an output folder (--out)" )
90
+ sys .exit (1 )
91
+ else :
92
+ outputdir = args .out
93
+ #create outputdir and subdirs
94
+ if not os .path .isdir (outputdir ):
95
+ os .makedirs (outputdir )
96
+ os .makedirs (os .path .join (outputdir , 'annotate_misc' ))
97
+ os .makedirs (os .path .join (outputdir , 'annotate_results' ))
98
+ os .makedirs (os .path .join (outputdir , 'logfiles' ))
99
+ if not args .genbank :
100
+ lib .log .error ("You did not specifiy the apropriate input files, either: \n 1) Funannotate input \n 2) GenBank" )
101
+ sys .exit (1 )
102
+ else :
103
+ #create output directories
104
+ if not os .path .isdir (outputdir ):
105
+ os .makedirs (outputdir )
106
+ os .makedirs (os .path .join (outputdir , 'annotate_misc' ))
107
+ os .makedirs (os .path .join (outputdir , 'annotate_results' ))
108
+ os .makedirs (os .path .join (outputdir , 'logfiles' ))
109
+ else :
110
+ lib .log .error ("Output directory %s already exists" % (outputdir ))
111
+ if not os .path .isdir (os .path .join (outputdir , 'annotate_misc' )):
112
+ os .makedirs (os .path .join (outputdir , 'annotate_misc' ))
113
+ if not os .path .isdir (os .path .join (outputdir , 'annotate_results' )):
114
+ os .makedirs (os .path .join (outputdir , 'annotate_results' ))
115
+ if not os .path .isdir (os .path .join (outputdir , 'logfiles' )):
116
+ os .makedirs (os .path .join (outputdir , 'logfiles' ))
117
+ genbank = args .genbank
118
+ Scaffolds = os .path .join (outputdir , 'annotate_misc' , 'genome.scaffolds.fasta' )
119
+ Proteins = os .path .join (outputdir , 'annotate_misc' , 'genome.proteins.fasta' )
120
+ Transcripts = os .path .join (outputdir , 'annotate_misc' , 'genome.transcripts.fasta' )
121
+ GFF = os .path .join (outputdir , 'annotate_misc' , 'genome.gff3' )
122
+ lib .log .info ("Checking GenBank file for annotation" )
123
+ if not lib .checkGenBank (genbank ):
124
+ lib .log .error ("Found no annotation in GenBank file, exiting" )
125
+ sys .exit (1 )
126
+ lib .gb2allout (genbank , GFF , Proteins , Transcripts , Scaffolds )
127
+
128
+ else :
129
+ #should be a folder, with funannotate files, thus store results there, no need to create output folder
130
+ if not os .path .isdir (args .input ):
131
+ lib .log .error ("%s directory does not exist" % args .input )
132
+ sys .exit (1 )
133
+ if os .path .isdir (os .path .join (args .input , 'predict_results' )): #funannotate results should be here
134
+ inputdir = os .path .join (args .input , 'predict_results' )
135
+ outputdir = args .input
136
+ else :
137
+ inputdir = os .path .join (args .input ) #here user specified the predict_results folder, or it is a custom folder
138
+
139
+ #get files that you need
140
+ for file in os .listdir (inputdir ):
141
+ if file .endswith ('.gbk' ):
142
+ genbank = os .path .join (inputdir , file )
143
+ if file .endswith ('.gff3' ):
144
+ GFF = os .path .join (inputdir , file )
145
+
146
+ #now create the files from genbank input file for consistency in gene naming, etc
147
+ if not genbank or not GFF :
148
+ lib .log .error ("Properly formatted 'funannotate predict' files do no exist in this directory" )
149
+ sys .exit (1 )
150
+ else :
151
+ if 'predict_results' in inputdir : #if user gave predict_results folder, then set output to up one directory
152
+ outputdir = lib .get_parent_dir (inputdir )
153
+ else :
154
+ if not args .out :
155
+ outputdir = inputdir #output the results in the input directory
156
+ else :
157
+ outputdir = args .out
158
+ if not os .path .isdir (outputdir ):
159
+ os .makedirs (outputdir )
160
+ #create output directories
161
+ if not os .path .isdir (os .path .join (outputdir , 'annotate_misc' )):
162
+ os .makedirs (os .path .join (outputdir , 'annotate_misc' ))
163
+ os .makedirs (os .path .join (outputdir , 'annotate_results' ))
164
+ else :
165
+ lib .log .error ("Output directory %s already exists, will use any existing data. If this is not what you want, exit, and provide a unique name for output folder" % (outputdir ))
166
+ lib .log .info ("Parsing input files" )
167
+ Scaffolds = os .path .join (outputdir , 'annotate_misc' , 'genome.scaffolds.fasta' )
168
+ Proteins = os .path .join (outputdir , 'annotate_misc' ,'genome.proteins.fasta' )
169
+ Transcripts = os .path .join (outputdir , 'annotate_misc' , 'genome.transcripts.fasta' )
170
+ lib .gb2output (genbank , Proteins , Transcripts , Scaffolds )
171
+
172
+ #make sure logfiles directory is present, will need later
173
+ if not os .path .isdir (os .path .join (outputdir , 'logfiles' )):
174
+ os .makedirs (os .path .join (outputdir , 'logfiles' ))
175
+
176
+ #get absolute path for all input so there are no problems later, not using Transcripts yet could be error? so take out here
177
+ Proteins = os .path .abspath (Proteins )
178
+ genbank = os .path .abspath (genbank )
179
+
180
+
181
+ if 'phobius' in args .methods or 'all' in args .methods :
182
+ #run Phobius to predict secreted proteins and membrane, default is local if installed, otherwise remote
183
+ phobius_out = os .path .join (outputdir , 'annotate_misc' , 'phobius.results.txt' )
184
+ phobiusLog = os .path .join (outputdir , 'logfiles' , 'phobius.log' )
185
+ lib .log .info ("Predicting secreted and transmembrane proteins using Phobius" )
186
+ if not lib .checkannotations (phobius_out ):
187
+ if args .email :
188
+ subprocess .call ([os .path .join (parentdir , 'util' , 'phobius-multiproc.py' ), '-i' , Proteins , '-o' , phobius_out , '-e' , str (args .email ), '-l' , phobiusLog ])
189
+ else :
190
+ subprocess .call ([os .path .join (parentdir , 'util' , 'phobius-multiproc.py' ), '-i' , Proteins , '-o' , phobius_out , '-l' , phobiusLog ])
191
+
192
+ if 'interproscan' in args .methods or 'all' in args .methods :
193
+ IPRCombined = os .path .join (outputdir , 'annotate_misc' , 'iprscan.xml' )
194
+ #run interpro scan
195
+ IPROUT = os .path .join (outputdir , 'annotate_misc' , 'iprscan' )
196
+ PROTS = os .path .join (outputdir , 'annotate_misc' , 'protein_tmp' )
197
+ for i in IPROUT ,PROTS :
198
+ if not os .path .exists (i ):
199
+ os .makedirs (i )
200
+ #now run interproscan
201
+ #split input into individual files
202
+ lib .splitFASTA (Proteins , PROTS )
203
+ #now iterate over list using pool and up to 25 submissions at a time
204
+ proteins = []
205
+ for file in os .listdir (PROTS ):
206
+ if file .endswith ('.fa' ):
207
+ file = os .path .join (PROTS , file )
208
+ proteins .append (file )
209
+
210
+ num_files = len (glob .glob1 (IPROUT ,"*.xml" ))
211
+ num_prots = len (proteins )
212
+ lib .log .info ("Now running InterProScan search remotely using EBI servers on " + '{0:,}' .format (num_prots ) + ' proteins' )
213
+ #build in a check before running (in case script gets stopped and needs to restart
214
+ finished = []
215
+ for file in os .listdir (IPROUT ):
216
+ if file .endswith ('.xml' ):
217
+ base = file .split ('.xml' )[0 ]
218
+ fasta_file = os .path .join (PROTS , base + '.fa' )
219
+ finished .append (fasta_file )
220
+
221
+ finished = set (finished ) #make sure no duplicates
222
+ runlist = [x for x in proteins if x not in finished ]
223
+ if len (runlist ) < num_prots :
224
+ lib .log .info ("Results found, querying remaining %i proteins" % len (runlist ))
225
+ #start up the list, max 25 at a time
226
+ lib .runMultiProgress (runIPRpython , runlist , 25 )
227
+ #clean up protein fasta files
228
+ shutil .rmtree (PROTS )
229
+ #now convert to single file and then clean up
230
+ with open (IPRCombined , 'w' ) as output :
231
+ subprocess .call ([sys .executable , XMLCombine , IPROUT ], stdout = output )
232
+ if lib .checkannotations (IPRCombined ):
233
+ shutil .rmtree (IPROUT )
234
+
235
+ if 'antismash' in args .methods or 'all' in args .methods :
236
+ version = requests .get ("http://fungismash.secondarymetabolites.org/api/v1.0/version" )
237
+ as_vers = version .json ()['antismash_generation' ]
238
+ tax = version .json ()['taxon' ]
239
+ as_status = requests .get ("http://fungismash.secondarymetabolites.org/api/v1.0/stats" )
240
+ queue = as_status .json ()['queue_length' ]
241
+ running = as_status .json ()['running' ]
242
+ lib .log .info ("Connecting to antiSMASH %s v%s webserver" % (tax , as_vers ))
243
+ lib .log .info ("Queue Length: %s; Jobs Running: %s" % (queue , running ))
244
+ lib .log .info ("PLEASE to not abuse the webserver, be considerate!" )
245
+ if int (queue ) > 10 and not args .force :
246
+ lib .log .error ("There are more than 10 antiSMASH jobs in queue, use --force to submit anyway" )
247
+ sys .exit (1 )
248
+ job_files = {'seq' : open (genbank , 'rb' )}
249
+ job_parameters = {'email' : args .email , 'smcogs' : 'on' , 'knownclusterblast' : 'on' , 'activesitefinder' : 'on' , 'subclusterblast' : 'on' }
250
+ lib .log .info ("Uploading %s to webserver" % genbank )
251
+ postjob = requests .post ("http://fungismash.secondarymetabolites.org/api/v1.0/submit" , files = job_files , data = job_parameters )
252
+ jobid = postjob .json ()['id' ]
253
+ #now we can query the job every so often, not sure what is reasonable here, start with 2 minutes?
254
+ lib .log .info ("Waiting for results from job: %s" % jobid )
255
+ while True :
256
+ job_status = requests .get ("http://fungismash.secondarymetabolites.org/api/v1.0/status/" + jobid )
257
+ if job_status .json ()['status' ] == 'done' :
258
+ break
259
+ time .sleep (120 ) #check every 2 minutes
260
+ result_url = job_status .json ()['result_url' ]
261
+ base_url = result_url .replace ('index.html' , '' )
262
+ lib .log .info ("antiSMASH v%s job finished" % (as_vers ))
263
+ lib .log .debug ("%s" % job_status .json ())
264
+ #need to retrieve results, have to find link, seems like this might be first scaffold name?
265
+ #after asking Kai Blin - there is no "easy" way to identify the output name, however, think I can grab the html file and parse it
266
+ job_html = requests .get ("http://fungismash.secondarymetabolites.org" + result_url )
267
+ for line in job_html .iter_lines ():
268
+ if 'Download GenBank summary file' in line :
269
+ cols = line .split ('a href="' )
270
+ for x in cols :
271
+ if '.zip' in x :
272
+ link = x .split ('"' )[0 ]
273
+ baselink = link .replace ('.zip' , '' )
274
+ download_url = "http://fungismash.secondarymetabolites.org" + base_url + link
275
+ download (download_url , 'antiSMASH.zip' )
276
+ #now unzip and move folder
277
+ zipref = zipfile .ZipFile ('antiSMASH.zip' , 'r' )
278
+ zipref .extractall (outputdir )
279
+ zipref .close ()
280
+ os .remove ('antiSMASH.zip' )
281
+ lib .log .info ("Results folder: %s/%s" % (outputdir , jobid ))
282
+ #now grab the GBK files from folder as you will need just that for annotation, place in annotate_misc folder for auto-detection
283
+ anti_GBK = os .path .join (outputdir , jobid , baselink + '.final.gbk' )
284
+ final = os .path .join (outputdir , 'annotate_misc' , 'antiSMASH.results.gbk' )
285
+ shutil .copyfile (anti_GBK , final )
286
+ lib .log .info ("Results GBK: %s" % final )
287
+
288
+ lib .log .info ("Remote searches complete" )
289
+ #move logfile
290
+ os .rename (log_name , os .path .join (outputdir , 'logfiles' , log_name ))
0 commit comments