@@ -32,6 +32,7 @@ def __init__(self, prog):
32
32
parser .add_argument ('--transcript_evidence' , help = 'Transcript evidence (map to genome with GMAP)' )
33
33
parser .add_argument ('--gmap_gff' , help = 'Pre-computed GMAP transcript alignments (GFF3)' )
34
34
parser .add_argument ('--pasa_gff' , help = 'Pre-computed PASA/TransDecoder high quality models' )
35
+ parser .add_argument ('--other_gff' , help = 'GFF gene prediction pass-through to EVM' )
35
36
parser .add_argument ('--augustus_gff' , help = 'Pre-computed Augustus gene models (GFF3)' )
36
37
parser .add_argument ('--genemark_gtf' , help = 'Pre-computed GeneMark gene models (GTF)' )
37
38
parser .add_argument ('--maker_gff' , help = 'MAKER2 GFF output' )
@@ -191,7 +192,34 @@ def __init__(self, prog):
191
192
192
193
if args .protein_evidence == 'uniprot.fa' :
193
194
args .protein_evidence = os .path .join (parentdir , 'DB' , 'uniprot_sprot.fasta' )
194
-
195
+
196
+ #convert PASA GFF and/or GFF pass-through
197
+ #convert PASA to have 'pasa_pred' in second column to make sure weights work with EVM
198
+ PASA_GFF = os .path .join (args .out , 'predict_misc' , 'pasa_predictions.gff3' )
199
+ PASA_weight = '10'
200
+ if args .pasa_gff :
201
+ if ':' in args .pasa_gff :
202
+ pasacol = args .pasa_gff .split (':' )
203
+ PASA_weight = pasacol [1 ]
204
+ args .pasa_gff = pasacol [0 ]
205
+ lib .renameGFF (args .pasa_gff , 'pasa_pred' , PASA_GFF )
206
+ #validate it will work with EVM
207
+ if not lib .evmGFFvalidate (PASA_GFF , EVM , lib .log ):
208
+ lib .log .error ("ERROR: %s is not a properly formatted PASA GFF file, please consult EvidenceModeler docs" % args .pasa_gff )
209
+ sys .exit (1 )
210
+ OTHER_GFF = os .path .join (args .out , 'predict_misc' , 'other_predictions.gff3' )
211
+ OTHER_weight = '1'
212
+ if args .other_gff :
213
+ if ':' in args .other_gff :
214
+ othercol = args .other_gff .split (':' )
215
+ OTHER_weight = othercol [1 ]
216
+ args .other_gff = othercol [0 ]
217
+ lib .renameGFF (args .other_gff , 'other_pred' , OTHER_GFF )
218
+ #validate it will work with EVM
219
+ if not lib .evmGFFvalidate (OTHER_GFF , EVM , lib .log ):
220
+ lib .log .error ("ERROR: %s is not a properly formatted GFF file, please consult EvidenceModeler docs" % args .other_gff )
221
+ sys .exit (1 )
222
+
195
223
#check input files to make sure they are not empty, first check if multiple files passed to transcript/protein evidence
196
224
input_checks = [args .input , args .masked_genome , args .repeatmasker_gff3 , args .genemark_mod , args .exonerate_proteins , args .gmap_gff , args .pasa_gff , args .repeatmodeler_lib , args .rna_bam ]
197
225
if ',' in args .protein_evidence : #there will always be something here, since defaults to uniprot
@@ -269,10 +297,9 @@ def __init__(self, prog):
269
297
#EVM command line scripts
270
298
Converter = os .path .join (EVM , 'EvmUtils' , 'misc' , 'augustus_GFF3_to_EVM_GFF3.pl' )
271
299
ExoConverter = os .path .join (EVM , 'EvmUtils' , 'misc' , 'exonerate_gff_to_alignment_gff3.pl' )
272
- Validator = os .path .join (EVM , 'EvmUtils' , 'gff3_gene_prediction_file_validator.pl' )
273
300
Converter2 = os .path .join (EVM , 'EvmUtils' , 'misc' , 'augustus_GTF_to_EVM_GFF3.pl' )
274
301
EVM2proteins = os .path .join (EVM , 'EvmUtils' , 'gff3_file_to_proteins.pl' )
275
-
302
+
276
303
#repeatmasker, run if not passed from command line
277
304
if not os .path .isfile (MaskGenome ):
278
305
if not args .repeatmodeler_lib :
@@ -292,17 +319,22 @@ def __init__(self, prog):
292
319
lib .log .error ("RepeatMasking failed, check log files." )
293
320
sys .exit (1 )
294
321
295
- #load contig names and sizes into dictionary.
322
+ #load contig names and sizes into dictionary, get masked repeat stats
296
323
ContigSizes = {}
324
+ GenomeLength = 0
325
+ maskedSize = 0
297
326
with open (MaskGenome , 'rU' ) as input :
298
327
for rec in SeqIO .parse (input , 'fasta' ):
299
328
if not rec .id in ContigSizes :
300
329
ContigSizes [rec .id ] = len (rec .seq )
330
+ GenomeLength += len (rec .seq )
331
+ maskedSize += lib .n_lower_chars (str (rec .seq ))
301
332
else :
302
333
lib .log .error ("Error, duplicate contig names, exiting" )
303
334
sys .exit (1 )
304
- GenomeLength = sum (ContigSizes .values ())
305
- lib .log .info ('Masked genome: {0:,}' .format (len (ContigSizes ))+ ' scaffolds; {0:,}' .format (GenomeLength )+ ' bp' )
335
+ percentMask = maskedSize / float (GenomeLength )
336
+ MaskedStats = '{0:.2f}%' .format (percentMask * 100 ) #percentMask is a 0-1 fraction (maskedSize/GenomeLength), scale to percent
337
+ lib .log .info ('Masked genome: {0:,}' .format (len (ContigSizes ))+ ' scaffolds; {0:,}' .format (GenomeLength )+ ' bp; ' + MaskedStats + ' repeats masked' )
306
338
307
339
#check for previous files and setup output files
308
340
Predictions = os .path .join (args .out , 'predict_misc' , 'gene_predictions.gff3' )
@@ -326,7 +358,7 @@ def __init__(self, prog):
326
358
#append PASA data if exists
327
359
if args .pasa_gff :
328
360
with open (Predictions , 'a' ) as output :
329
- with open (args . pasa_gff ) as input :
361
+ with open (PASA_GFF ) as input :
330
362
output .write (input .read ())
331
363
#setup weights file for EVM
332
364
with open (Weights , 'w' ) as output :
@@ -401,6 +433,9 @@ def __init__(self, prog):
401
433
if args .gmap_gff :
402
434
shutil .copyfile (args .gmap_gff , trans_out )
403
435
Transcripts = os .path .abspath (trans_out )
436
+ if Transcripts :
437
+ total = lib .countGMAPtranscripts (Transcripts )
438
+ lib .log .info ('{0:,}' .format (total ) + ' transcripts aligned with GMAP' )
404
439
if not os .path .isfile (hintsE ): #use previous hints file if exists
405
440
if os .path .isfile (trans_temp ): #if transcripts are available to align, run BLAT
406
441
#now run BLAT for Augustus hints
@@ -417,6 +452,8 @@ def __init__(self, prog):
417
452
blat2hints = os .path .join (AUGUSTUS_BASE , 'scripts' , 'blat2hints.pl' )
418
453
cmd = [blat2hints , b2h_input , b2h_output , '--minintronlen=20' , '--trunkSS' ]
419
454
lib .runSubprocess (cmd , '.' , lib .log )
455
+ total = lib .line_count (blat_sort2 )
456
+ lib .log .info ('{0:,}' .format (total ) + ' filtered BLAT alignments' )
420
457
else :
421
458
lib .log .error ("No transcripts available to generate Augustus hints, provide --transcript_evidence" )
422
459
@@ -542,11 +579,11 @@ def __init__(self, prog):
542
579
shutil .rmtree (os .path .join (args .out , 'predict_misc' , 'braker' ))
543
580
os .rename ('braker' , os .path .join (args .out , 'predict_misc' , 'braker' ))
544
581
#okay, now need to fetch the Augustus GFF and Genemark GTF files
545
- aug_out = os .path .join (args .out , 'predict_misc' , 'braker' , aug_species , 'augustus.gff3 ' )
582
+ aug_out = os .path .join (args .out , 'predict_misc' , 'braker' , aug_species , 'augustus.gff ' )
546
583
gene_out = os .path .join (args .out , 'predict_misc' , 'braker' , aug_species , 'GeneMark-ET' , 'genemark.gtf' )
547
584
#now convert to EVM format
548
585
Augustus = os .path .join (args .out , 'predict_misc' , 'augustus.evm.gff3' )
549
- cmd = ['perl' , Converter , aug_out ]
586
+ cmd = ['perl' , Converter2 , aug_out ]
550
587
lib .runSubprocess2 (cmd , '.' , lib .log , Augustus )
551
588
GeneMarkGFF3 = os .path .join (args .out , 'predict_misc' , 'genemark.gff' )
552
589
cmd = [GeneMark2GFF , gene_out ]
@@ -573,7 +610,7 @@ def __init__(self, prog):
573
610
lib .log .info ("Training Augustus using PASA data, this may take awhile" )
574
611
GFF2GB = os .path .join (AUGUSTUS_BASE , 'scripts' , 'gff2gbSmallDNA.pl' )
575
612
trainingset = os .path .join (args .out , 'predict_misc' , 'augustus.pasa.gb' )
576
- cmd = [GFF2GB , args . pasa_gff , MaskGenome , '500' , trainingset ]
613
+ cmd = [GFF2GB , PASA_GFF , MaskGenome , '500' , trainingset ]
577
614
lib .runSubprocess (cmd , '.' , lib .log )
578
615
if args .optimize_augustus :
579
616
lib .trainAugustus (AUGUSTUS_BASE , aug_species , trainingset , MaskGenome , args .out , args .cpus , True )
@@ -844,7 +881,7 @@ def __init__(self, prog):
844
881
sys .exit (1 )
845
882
846
883
#if hints used for Augustus, get high quality models > 90% coverage to pass to EVM
847
- if os .path .isfile (hints_all ) and not args .rna_bam :
884
+ if os .path .isfile (hints_all ) or args .rna_bam :
848
885
lib .log .info ("Pulling out high quality Augustus predictions" )
849
886
hiQ_models = []
850
887
with open (aug_out , 'rU' ) as augustus :
@@ -881,16 +918,14 @@ def __init__(self, prog):
881
918
882
919
883
920
#EVM related input tasks, find all predictions and concatenate together
921
+ pred_in = [Augustus , GeneMark ]
884
922
if args .pasa_gff :
885
- if os .path .isfile (hints_all ) and not args .rna_bam :
886
- pred_in = [Augustus , GeneMark , args .pasa_gff , AugustusHiQ ]
887
- else :
888
- pred_in = [Augustus , GeneMark , args .pasa_gff ]
889
- else :
890
- if os .path .isfile (hints_all ) and not args .rna_bam :
891
- pred_in = [Augustus , GeneMark , AugustusHiQ ]
892
- else :
893
- pred_in = [Augustus , GeneMark ]
923
+ pred_in .append (PASA_GFF )
924
+ if args .other_gff :
925
+ pred_in .append (OTHER_GFF )
926
+ if os .path .isfile (hints_all ) or args .rna_bam :
927
+ pred_in .append (AugustusHiQ )
928
+
894
929
#write gene predictions file
895
930
with open (Predictions + '.tmp' , 'w' ) as output :
896
931
for f in sorted (pred_in ):
@@ -907,11 +942,13 @@ def __init__(self, prog):
907
942
if os .path .isfile (hints_all ) or args .rna_bam : #same condition that adds AugustusHiQ to pred_in, so HiQ models always get a weight
908
943
output .write ("OTHER_PREDICTION\t HiQ\t 5\n " )
909
944
if args .pasa_gff :
910
- output .write ("OTHER_PREDICTION\t transdecoder \t 10 \n " )
945
+ output .write ("OTHER_PREDICTION\t pasa_pred \t %s \n " % PASA_weight )
911
946
if exonerate_out :
912
947
output .write ("PROTEIN\t exonerate\t 1\n " )
913
948
if Transcripts :
914
949
output .write ("TRANSCRIPT\t genome\t 1\n " )
950
+ if args .other_gff :
951
+ output .write ("OTHER_PREDICTION\t other_pred\t %s\n " % OTHER_weight ) #weight parsed from --other_gff file:weight, defaults to '1'
915
952
916
953
#total up Predictions
917
954
total = lib .countGFFgenes (Predictions )
0 commit comments