1
1
#!/usr/bin/env python
2
2
3
- import sys , os , subprocess , inspect , multiprocessing , shutil , argparse , time
3
+ import sys , os , subprocess , inspect , multiprocessing , shutil , argparse , time , re
4
4
from Bio import SeqIO
5
5
currentdir = os .path .dirname (os .path .abspath (inspect .getfile (inspect .currentframe ())))
6
6
parentdir = os .path .dirname (currentdir )
@@ -35,7 +35,7 @@ def __init__(self,prog):
35
35
parser .add_argument ('--rna_bam' , help = 'BAM (sorted) of RNAseq aligned to reference for BRAKER1' )
36
36
parser .add_argument ('--min_intronlen' , default = 10 , help = 'Minimum intron length for gene models' )
37
37
parser .add_argument ('--max_intronlen' , default = 3000 , help = 'Maximum intron length for gene models' )
38
- parser .add_argument ('--min_protlen' , default = 51 , type = int , help = 'Minimum amino acid length for valid gene model' )
38
+ parser .add_argument ('--min_protlen' , default = 50 , type = int , help = 'Minimum amino acid length for valid gene model' )
39
39
parser .add_argument ('--keep_no_stops' , action = 'store_true' , help = 'Keep gene models without valid stop codons' )
40
40
parser .add_argument ('--cpus' , default = 2 , type = int , help = 'Number of CPUs to use' )
41
41
parser .add_argument ('--busco_seed_species' , default = 'anidulans' , help = 'Augustus species to use as initial training point for BUSCO' )
@@ -448,37 +448,74 @@ def __init__(self,prog):
448
448
subprocess .call (['perl' , Converter , aug_out ], stdout = output , stderr = FNULL )
449
449
450
450
if not GeneMark :
451
+ #count contigs
452
+ num_contigs = lib .countfasta (MaskGenome )
451
453
#now run GeneMark-ES, first check for gmhmm mod file, use if available otherwise run ES
452
454
if not args .genemark_mod :
453
- GeneMarkGFF3 = os .path .join (args .out , 'predict_misc' , 'genemark.gff' )
454
- if not os .path .isfile (GeneMarkGFF3 ):
455
- if args .organism == 'fungus' :
456
- lib .RunGeneMarkES (MaskGenome , args .cpus , os .path .join (args .out , 'predict_misc' ), GeneMarkGFF3 , True )
457
- else :
458
- lib .RunGeneMarkES (MaskGenome , args .cpus , os .path .join (args .out , 'predict_misc' ), GeneMarkGFF3 , False )
459
- GeneMarkTemp = os .path .join (args .out , 'predict_misc' , 'genemark.temp.gff' )
460
- with open (GeneMarkTemp , 'w' ) as output :
461
- subprocess .call (['perl' , Converter , GeneMarkGFF3 ], stdout = output , stderr = FNULL )
462
- GeneMark = os .path .join (args .out , 'predict_misc' , 'genemark.evm.gff3' )
463
- with open (GeneMark , 'w' ) as output :
464
- with open (GeneMarkTemp , 'rU' ) as input :
465
- lines = input .read ().replace ("Augustus" ,"GeneMark" )
466
- output .write (lines )
455
+ #if there are less than 2 data points (contigs, self-training fails), count contigs
456
+ if num_contigs < 2 :
457
+ lib .log .error ("GeneMark-ES cannot run with only a single contig, you must provide --ini_mod file to run GeneMark" )
458
+ else :
459
+ GeneMarkGFF3 = os .path .join (args .out , 'predict_misc' , 'genemark.gff' )
460
+ if not os .path .isfile (GeneMarkGFF3 ):
461
+ if args .organism == 'fungus' :
462
+ lib .RunGeneMarkES (MaskGenome , args .cpus , os .path .join (args .out , 'predict_misc' ), GeneMarkGFF3 , True )
463
+ else :
464
+ lib .RunGeneMarkES (MaskGenome , args .cpus , os .path .join (args .out , 'predict_misc' ), GeneMarkGFF3 , False )
465
+ GeneMarkTemp = os .path .join (args .out , 'predict_misc' , 'genemark.temp.gff' )
466
+ with open (GeneMarkTemp , 'w' ) as output :
467
+ subprocess .call (['perl' , Converter , GeneMarkGFF3 ], stdout = output , stderr = FNULL )
468
+ GeneMark = os .path .join (args .out , 'predict_misc' , 'genemark.evm.gff3' )
469
+ with open (GeneMark , 'w' ) as output :
470
+ with open (GeneMarkTemp , 'rU' ) as input :
471
+ lines = input .read ().replace ("Augustus" ,"GeneMark" )
472
+ output .write (lines )
467
473
else : #have training parameters file, so just run genemark with
468
474
GeneMarkGFF3 = os .path .join (args .out , 'predict_misc' , 'genemark.gff' )
469
- if not os .path .isfile (GeneMarkGFF3 ):
470
- if args .organism == 'fungus' :
471
- lib .RunGeneMark (MaskGenome , args .genemark_mod , args .cpus , os .path .join (args .out , 'predict_misc' ), GeneMarkGFF3 , True )
472
- else :
473
- lib .RunGeneMark (MaskGenome , args .genemark_mod , args .cpus , os .path .join (args .out , 'predict_misc' ), GeneMarkGFF3 , False )
474
- GeneMarkTemp = os .path .join (args .out , 'predict_misc' , 'genemark.temp.gff' )
475
- with open (GeneMarkTemp , 'w' ) as output :
476
- subprocess .call (['perl' , Converter , GeneMarkGFF3 ], stdout = output , stderr = FNULL )
477
- GeneMark = os .path .join (args .out , 'predict_misc' , 'genemark.evm.gff3' )
478
- with open (GeneMark , 'w' ) as output :
479
- with open (GeneMarkTemp , 'rU' ) as input :
480
- lines = input .read ().replace ("Augustus" ,"GeneMark" )
481
- output .write (lines )
475
+ if num_contigs < 2 : #now can run modified prediction on single contig
476
+ with open (MaskGenome , 'rU' ) as genome :
477
+ for line in genome :
478
+ if line .startswith ('>' ):
479
+ header = line .replace ('>' , '' )
480
+ header = header .replace ('\n ' , '' )
481
+ GeneMark = os .path .join (args .out , 'predict_misc' , 'genemark.evm.gff3' )
482
+ GeneMarkTemp = os .path .join (args .out , 'predict_misc' , 'genemark.temp.gff' )
483
+ if not os .path .isfile (GeneMarkGFF3 ):
484
+ lib .log .info ("Running GeneMark on single-contig assembly" )
485
+ subprocess .call (['gmhmme3' , '-m' , args .genemark_mod , '-o' , GeneMarkGFF3 , '-f' , 'gff3' , MaskGenome ])
486
+ #now open output and reformat
487
+ lib .log .info ("Converting GeneMark GTF file to GFF3" )
488
+ with open (GeneMarkTemp , 'w' ) as geneout :
489
+ with open (GeneMarkGFF3 , 'rU' ) as genein :
490
+ for line in genein :
491
+ if not line .startswith ('#' ):
492
+ if not '\t Intron\t ' in line :
493
+ newline = line .replace ('seq' , header )
494
+ newline = newline .replace ('.hmm3' , '' )
495
+ geneout .write (newline )
496
+ GeneMarkTemp2 = os .path .join (args .out , 'predict_misc' , 'genemark.temp2.gff' )
497
+ with open (GeneMarkTemp2 , 'w' ) as output :
498
+ subprocess .call (['perl' , Converter , GeneMarkTemp ], stdout = output , stderr = FNULL )
499
+ with open (GeneMark , 'w' ) as output :
500
+ with open (GeneMarkTemp2 , 'rU' ) as input :
501
+ lines = input .read ().replace ("Augustus" ,"GeneMark" )
502
+ output .write (lines )
503
+
504
+ else :
505
+ GeneMarkGFF3 = os .path .join (args .out , 'predict_misc' , 'genemark.gff' )
506
+ if not os .path .isfile (GeneMarkGFF3 ):
507
+ if args .organism == 'fungus' :
508
+ lib .RunGeneMark (MaskGenome , args .genemark_mod , args .cpus , os .path .join (args .out , 'predict_misc' ), GeneMarkGFF3 , True )
509
+ else :
510
+ lib .RunGeneMark (MaskGenome , args .genemark_mod , args .cpus , os .path .join (args .out , 'predict_misc' ), GeneMarkGFF3 , False )
511
+ GeneMarkTemp = os .path .join (args .out , 'predict_misc' , 'genemark.temp.gff' )
512
+ with open (GeneMarkTemp , 'w' ) as output :
513
+ subprocess .call (['perl' , Converter , GeneMarkGFF3 ], stdout = output , stderr = FNULL )
514
+ GeneMark = os .path .join (args .out , 'predict_misc' , 'genemark.evm.gff3' )
515
+ with open (GeneMark , 'w' ) as output :
516
+ with open (GeneMarkTemp , 'rU' ) as input :
517
+ lines = input .read ().replace ("Augustus" ,"GeneMark" )
518
+ output .write (lines )
482
519
483
520
if not Augustus :
484
521
if not args .augustus_species :
@@ -542,28 +579,70 @@ def __init__(self,prog):
542
579
if GM_check < 3 :
543
580
gmc = 0
544
581
lib .log .error ("GeneMark predictions failed, proceeding with only Augustus" )
582
+
583
+ #if hints used for Augustus, get high quality models > 80% coverage to pass to EVM
584
+ if os .path .isfile (hints_all ):
585
+ lib .log .info ("Pulling out high quality Augustus predictions" )
586
+ hiQ_models = []
587
+ with open (aug_out , 'rU' ) as augustus :
588
+ for pred in lib .readBlocks (augustus , '# start gene' ):
589
+ values = []
590
+ geneID = ''
591
+ support = ''
592
+ if pred [0 ].startswith ('# This output' ):
593
+ continue
594
+ if pred [0 ].startswith ('##gff-version 3' ):
595
+ continue
596
+ for line in pred :
597
+ line = line .replace ('\n ' , '' )
598
+ if line .startswith ('# start gene' ):
599
+ geneID = line .split (' ' )[- 1 ]
600
+ values .append (geneID )
601
+ if line .startswith ('# % of transcript supported by hints' ):
602
+ support = line .split (' ' )[- 1 ]
603
+ values .append (support )
604
+ if float (values [1 ]) > 65 : #greater than ~66% of exons supported, i.e. 2/3 or 3/4, but not less
605
+ hiQ_models .append (values [0 ])
606
+
607
+ #now open evm augustus and pull out models
608
+ HiQ = set (hiQ_models )
609
+ lib .log .info ("Found %i high quality predictions from Augustus (>66%% exon evidence)" % len (HiQ ))
610
+ HiQ_match = re .compile (r'\b(?:%s)[\.t1;]+\b' % '|' .join (HiQ ))
611
+ AugustusHiQ = os .path .join (args .out , 'predict_misc' , 'augustus-HiQ.evm.gff3' )
612
+ with open (AugustusHiQ , 'w' ) as HiQ_out :
613
+ with open (Augustus , 'rU' ) as evm_aug :
614
+ for line in evm_aug :
615
+ if HiQ_match .search (line ):
616
+ newline = line .replace ('\t Augustus\t ' , '\t HiQ\t ' )
617
+ HiQ_out .write (newline )
618
+
545
619
546
620
#EVM related input tasks, find all predictions and concatenate together
547
621
if args .pasa_gff :
548
- pred_in = [Augustus , GeneMark , args .pasa_gff ]
622
+ if os .path .isfile (hints_all ):
623
+ pred_in = [Augustus , GeneMark , args .pasa_gff , AugustusHiQ ]
624
+ else :
625
+ pred_in = [Augustus , GeneMark , args .pasa_gff ]
549
626
else :
550
- pred_in = [Augustus , GeneMark ]
627
+ if os .path .isfile (hints_all ):
628
+ pred_in = [Augustus , GeneMark , AugustusHiQ ]
629
+ else :
630
+ pred_in = [Augustus , GeneMark ]
551
631
Predictions = os .path .join (args .out , 'predict_misc' , 'predictions.gff3' )
552
632
with open (Predictions , 'w' ) as output :
553
633
for f in pred_in :
554
634
with open (f ) as input :
555
635
output .write (input .read ())
556
636
557
- #set Weights file dependent on which data is present. I have yet to find an example of where Augustus outperforms GeneMark for fungi, but i don't have too much evidence to think that genemark is perfect either....
637
+ #set Weights file dependent on which data is present.
558
638
Weights = os .path .join (args .out , 'predict_misc' , 'weights.evm.txt' )
559
639
with open (Weights , 'w' ) as output :
640
+ output .write ("ABINITIO_PREDICTION\t Augustus\t 1\n " )
641
+ output .write ("ABINITIO_PREDICTION\t GeneMark\t 1\n " )
642
+ if os .path .isfile (hints_all ):
643
+ output .write ("OTHER_PREDICTION\t HiQ\t 10\n " )
560
644
if args .pasa_gff :
561
645
output .write ("OTHER_PREDICTION\t transdecoder\t 10\n " )
562
- output .write ("ABINITIO_PREDICTION\t Augustus\t 1\n " )
563
- output .write ("ABINITIO_PREDICTION\t GeneMark\t 1\n " )
564
- else :
565
- output .write ("ABINITIO_PREDICTION\t Augustus\t 1\n " )
566
- output .write ("ABINITIO_PREDICTION\t GeneMark\t 1\n " )
567
646
if exonerate_out :
568
647
output .write ("PROTEIN\t exonerate\t 1\n " )
569
648
if Transcripts :
@@ -607,6 +686,12 @@ def __init__(self,prog):
607
686
else :
608
687
lib .log .info ('{0:,}' .format (total ) + ' total gene models from EVM' )
609
688
689
+ #move EVM folder to predict folder
690
+ if os .path .isdir ('EVM_tmp' ):
691
+ if os .path .isdir (os .path .join (args .out , 'predict_misc' , 'EVM' )):
692
+ shutil .rmtree (os .path .join (args .out , 'predict_misc' , 'EVM' ))
693
+ os .rename ('EVM_tmp' , os .path .join (args .out , 'predict_misc' , 'EVM' ))
694
+
610
695
#run tRNAscan
611
696
lib .log .info ("Predicting tRNAs" )
612
697
tRNAscan = os .path .join (args .out , 'predict_misc' , 'trnascan.gff3' )
@@ -631,7 +716,7 @@ def __init__(self,prog):
631
716
lib .log .info ('{0:,}' .format (total ) + ' total gene models' )
632
717
633
718
#filter bad models
634
- lib .log .info ("Filtering out bad gene models (< %i aa in length, transposable elements, etc)." % (args .min_protlen - 1 ))
719
+ lib .log .info ("Filtering out bad gene models (< %i aa in length, transposable elements, etc)." % (args .min_protlen ))
635
720
Blast_rep_remove = os .path .join (args .out , 'predict_misc' , 'repeat.gene.models.txt' )
636
721
if not os .path .isfile (Blast_rep_remove ):
637
722
lib .RepeatBlast (GAG_proteins , args .cpus , 1e-10 , os .path .join (args .out , 'predict_misc' ), Blast_rep_remove )
0 commit comments