11#!/usr/bin/env python
22
3- import sys , os , subprocess , csv , shutil , inspect , itertools , argparse
3+ import sys , os , subprocess , shutil , inspect , itertools , argparse
44from Bio import SeqIO
5+ from Bio .SeqIO .FastaIO import SimpleFastaParser
# Make the parent directory importable so the project's shared 'lib' module
# (imported further down, outside this view) can be resolved.
currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parentdir = os.path.dirname(currentdir)
sys.path.insert(0, parentdir)
@@ -22,9 +23,9 @@ def __init__(self,prog):
parser.add_argument('-o', '--out', required=True, help='Final exonerate output file')
# NOTE(review): --maxintron has no type=int, so it arrives as a string when
# given on the command line; it is only ever consumed via str(), so this works.
parser.add_argument('--maxintron', default=3000, help='Maximum intron size')
parser.add_argument('--logfile', default='funannotate-p2g.log', help='logfile')
parser.add_argument('--ploidy', default=1, type=int, help='Ploidy of assembly')
args = parser.parse_args()

# Start every run with a fresh logfile: remove any leftover from a previous run.
log_name = args.logfile
if os.path.isfile(log_name):
    os.remove(log_name)
@@ -42,90 +43,153 @@ def __init__(self,prog):
# blast_version/exo_version are captured by version-check commands above
# (outside this view); keep only the token after the last ': ' separator.
blast_version = blast_version.split(': ')[-1]
lib.log.debug("BLAST v%s; Exonerate v%s" % (blast_version, exo_version))
def runtblastn(input, query, cpus, output, maxhits):
    """Soft-mask the genome, build a BLAST database, and run a tblastn pre-filter.

    Results are written to 'filter.tblastn.tab' (tabular outfmt 6) in the
    current working directory. Relies on module globals: args, lib.

    input   -- genome FASTA path
    query   -- protein FASTA path
    cpus    -- thread count for tblastn
    output  -- directory/handle passed through to lib.runSubprocess for logging
    maxhits -- value for -max_target_seqs (scaled by ploidy at the call site)
    """
    # Soft-mask low-complexity sequence so it cannot seed spurious hits.
    dust_cmd = ['dustmasker', '-in', input, '-infmt', 'fasta', '-parse_seqids',
                '-outfmt', 'maskinfo_asn1_bin', '-out', 'genome_dust.asnb']
    lib.runSubprocess(dust_cmd, output, lib.log)
    # Build the nucleotide database carrying the dust mask metadata.
    db_cmd = ['makeblastdb', '-in', input, '-dbtype', 'nucl', '-parse_seqids',
              '-mask_data', 'genome_dust.asnb', '-out', 'genome']
    lib.runSubprocess(db_cmd, output, lib.log)
    # Search every protein against the masked genome; -threshold 999 keeps
    # only high-scoring seed words, making this a fast pre-filter.
    blast_cmd = ['tblastn',
                 '-num_threads', str(cpus),
                 '-db', 'genome',
                 '-query', query,
                 '-max_target_seqs', str(maxhits),
                 '-db_soft_mask', '11',
                 '-threshold', '999',
                 '-max_intron_length', str(args.maxintron),
                 '-evalue', '1e-10',
                 '-outfmt', '6',
                 '-out', 'filter.tblastn.tab']
    lib.runSubprocess(blast_cmd, output, lib.log)
def parseBlast(blastresult):
    """Collapse tabular tblastn HSPs into one region per protein/scaffold pair.

    Reads a BLAST outfmt-6 file (fields: qseqid sseqid pident length mismatch
    gapopen qstart qend sstart send evalue bitscore) and, for every unique
    query/subject pair, records the minimum subject start and maximum subject
    end across all of its HSPs.

    Parameters
    ----------
    blastresult : str
        Path to the tab-separated tblastn result file.

    Returns
    -------
    list of str
        Entries formatted 'query:::scaffold:::start:::stop', consumed by
        runExonerate.
    """
    # hit key -> (min start, max end) over every HSP for that pair
    Results = {}
    # 'r' instead of the legacy 'rU' mode: 'rU' was deprecated and removed in
    # Python 3.11; universal newlines are the default in text mode anyway.
    with open(blastresult, 'r') as input:
        for line in input:
            # skip blank/short lines so a trailing newline cannot raise IndexError
            cols = line.split('\t')
            if len(cols) < 10:
                continue
            hit = cols[0] + ':::' + cols[1]
            # subject coords (sstart/send) are reversed for minus-strand hits;
            # normalize so start <= end
            start, end = sorted((int(cols[8]), int(cols[9])))
            if hit not in Results:
                Results[hit] = (start, end)
            else:
                old = Results[hit]
                # widen the stored region to cover this HSP too
                Results[hit] = (min(start, old[0]), max(end, old[1]))
    # convert dictionary to a list of hit:::scaffold:::start:::stop strings
    HitList = []
    for k, v in Results.items():
        HitList.append(k + ':::' + str(v[0]) + ':::' + str(v[1]))
    return HitList
def runExonerate(input):
    """Run exonerate p2g for one 'protein:::scaffold:::start:::end' hit string.

    Writes the protein query and a trimmed scaffold region (with a 1 kb
    cushion on each side) into tmpdir, runs exonerate protein2genome on the
    pair, and keeps the GFF output only when it is large enough to plausibly
    contain an alignment. Inputs that make exonerate emit a WARNING are moved
    to tmpdir/failed for inspection. Relies on module globals: tmpdir,
    protein_dict, args, lib.
    """
    s = input.split(':::')
    ProtID = s[0]
    ScaffID = s[1]
    ScaffStart = int(s[2])
    ScaffEnd = int(s[3])
    # get the protein model; the pid in the filename avoids collisions between
    # multiprocessing workers handling the same protein
    query = os.path.join(tmpdir, ProtID + '.' + str(os.getpid()) + '.fa')
    with open(query, 'w') as output:
        SeqIO.write(protein_dict[ProtID], output, 'fasta')
    # now get the genome region, use different variable names for SeqRecords
    # to avoid collision
    scaffold = ScaffID + '.' + ProtID + '.' + str(ScaffStart) + '-' + str(ScaffEnd) + '.fa'
    scaffold = os.path.join(tmpdir, scaffold)
    # NOTE(review): 'rU' open mode is Python-2 era and was removed in 3.11
    with open(scaffold, 'w') as output2:
        with open(os.path.join(tmpdir, 'scaffolds', ScaffID + '.fa'), 'rU') as fullscaff:
            for header, Sequence in SimpleFastaParser(fullscaff):
                # grab a 1 kb cushion on either side of hit region, careful of
                # scaffold ends
                start = ScaffStart - 1000
                if start < 1:
                    start = 1
                end = ScaffEnd + 1000
                if end > len(Sequence):
                    end = len(Sequence)
                # NOTE(review): Sequence[start:end] uses the 1-based BLAST
                # coordinate as a 0-based slice index, dropping one leading
                # base — confirm the '__offset__' correction downstream expects
                # exactly this convention
                output2.write('>%s\n%s\n' % (header, Sequence[start:end]))
    # start (and header/Sequence in the WARNING branch) rely on the loop above
    # having run at least once; an empty scaffold file would raise NameError
    # here. The '__start__' token in the filename is parsed later to offset
    # GFF coordinates back to the whole scaffold.
    exoname = ProtID + '.' + ScaffID + '__' + str(start) + '__'
    # check that input files are created and valid
    exonerate_out = 'exonerate.' + exoname + '.out'
    exonerate_out = os.path.join(tmpdir, exonerate_out)
    ryo = "AveragePercentIdentity: %pi\n"
    cmd = ['exonerate', '--model', 'p2g', '--showvulgar', 'no', '--showalignment', 'no', '--showquerygff', 'no', '--showtargetgff', 'yes', '--maxintron', str(args.maxintron), '--percent', '80', '--ryo', ryo, query, scaffold]
    # run exonerate, capture errors
    with open(exonerate_out, 'w') as output3:
        proc = subprocess.Popen(cmd, stdout=output3, stderr=subprocess.PIPE)
        stderr = proc.communicate()
        # NOTE(review): under Python 3, stderr[1] is bytes, so this substring
        # test would need a decode — the script appears written for Python 2
        if 'WARNING' in stderr[1]:
            lib.log.debug('%s, Len=%i, %i-%i; %i-%i' % (header, len(Sequence), ScaffStart, ScaffEnd, start, end))
            # keep the inputs that upset exonerate for post-mortem inspection
            os.rename(query, os.path.join(tmpdir, 'failed', os.path.basename(query)))
            os.rename(scaffold, os.path.join(tmpdir, 'failed', os.path.basename(scaffold)))
        else:
            # best-effort cleanup of per-hit temp files
            for y in [query, scaffold]:
                try:
                    os.remove(y)
                except OSError:
                    lib.log.debug("Error removing %s" % (y))
    # check filesize of exonerate output, no hits still have some output data
    # in them, should be safe dropping anything smaller than 500 bytes
    if lib.getSize(exonerate_out) < 500:
        os.remove(exonerate_out)
# make tmpdir; pid-suffixed so concurrent runs do not collide
tmpdir = 'p2g_' + str(os.getpid())
if not os.path.isdir(tmpdir):
    os.makedirs(tmpdir)
    # subfolders: 'failed' collects inputs exonerate warned on,
    # 'scaffolds' holds one FASTA per genome contig
    os.makedirs(os.path.join(tmpdir, 'failed'))
    os.makedirs(os.path.join(tmpdir, 'scaffolds'))
# check for tblastn input: a pre-computed table skips the expensive search
if args.tblastn:
    lib.log.info("Using pre-calculated tBLASTN result")
    BlastResult = args.tblastn
else:
    lib.log.info("Running pre-filter tBlastn step")
    BlastResult = os.path.join(tmpdir, 'filter.tblastn.tab')
    runtblastn(os.path.abspath(args.genome), os.path.abspath(args.proteins), args.cpus, tmpdir, args.ploidy*2) #2X ploidy for tBLASTn filter

# collapse HSPs into one candidate region per protein/scaffold pair
Hits = parseBlast(BlastResult)
lib.log.info("Found %i preliminary alignments" % (len(Hits)))
# index the proteins so each exonerate worker can pull its query record by ID
protein_dict = SeqIO.index(os.path.abspath(args.proteins), 'fasta') #do index here in case memory problems?

# split genome fasta into individual scaffolds so workers can read just one
# NOTE(review): 'rU' open mode is Python-2 era and was removed in Python 3.11
with open(os.path.abspath(args.genome), 'rU') as input:
    for record in SeqIO.parse(input, "fasta"):
        SeqIO.write(record, os.path.join(tmpdir, 'scaffolds', record.id + ".fa"), "fasta")
# run multiprocessing exonerate over every candidate hit region
lib.runMultiProgress(runExonerate, Hits, args.cpus)

# now need to loop through and offset exonerate predictions back to whole scaffolds
with open(args.out, 'w') as output:
    for file in os.listdir(tmpdir):
        if file.endswith('.out'):
            with open(os.path.join(tmpdir, file), 'rU') as exoresult:
                # the cushioned region start was embedded in the filename
                # by runExonerate as '...__start__...'
                offset = int(file.split('__')[1])
                # skip the 3-line exonerate banner at the top of each file
                for line in itertools.islice(exoresult, 3, None):
                    if line.startswith('#') or line.startswith('Average') or line.startswith('-- completed'):
                        output.write(line)
                    else:
                        # GFF feature line: shift start/end (columns 4-5)
                        # back to whole-scaffold coordinates.
                        # NOTE(review): assumes every non-comment line has at
                        # least 5 tab-separated fields; a blank line would
                        # raise IndexError — confirm exonerate never emits one
                        cols = line.split('\t')
                        cols[3] = str(int(cols[3])+offset)
                        cols[4] = str(int(cols[4])+offset)
                        output.write('\t'.join(cols))

# output some quick summary of exonerate alignments that you found
Found = lib.countGFFgenes(args.out)
lib.log.info("Exonerate finished: found %i alignments" % Found)

# finally clean-up your mess if failed is empty
# os.rmdir only succeeds on an empty directory, so it doubles as a probe for
# whether any exonerate runs failed
try:
    os.rmdir(os.path.join(tmpdir, 'failed'))
    empty = True
except OSError:
    empty = False
if empty:
    shutil.rmtree(tmpdir)
else:
    # keep tmpdir so the failed exonerate inputs can be inspected
    lib.log.error("Failed exonerate alignments found, see files in %s" % os.path.join(tmpdir, 'failed'))
    sys.exit(1)
0 commit comments