Skip to content

Commit 66433ee

Browse files
Jon PalmerJon Palmer
Jon Palmer
authored and
Jon Palmer
committed
updates to v0.3.1
1 parent 021da94 commit 66433ee

File tree

5 files changed

+167
-56
lines changed

5 files changed

+167
-56
lines changed

bin/augustus_parallel.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -48,9 +48,9 @@ def runAugustus(Input):
4848
aug_out = os.path.join(tmpdir, Input+'.augustus.gff3')
4949
with open(aug_out, 'w') as output:
5050
if args.hints:
51-
subprocess.call(['augustus', species, hints_input, extrinsic, '--gff3=on', os.path.join(tmpdir, Input+'.fa')], stdout = output, stderr= FNULL)
51+
subprocess.call(['augustus', species, hints_input, extrinsic, '--gff3=on', '--stopCodonExcludedFromCDS=False', os.path.join(tmpdir, Input+'.fa')], stdout = output, stderr= FNULL)
5252
else:
53-
subprocess.call(['augustus', species, '--gff3=on', os.path.join(tmpdir, Input+'.fa')], stdout = output, stderr = FNULL)
53+
subprocess.call(['augustus', species, '--gff3=on', '--stopCodonExcludedFromCDS=False', os.path.join(tmpdir, Input+'.fa')], stdout = output, stderr = FNULL)
5454

5555

5656
#first step is to split input fasta file into individual files in tmp folder

bin/funannotate-predict.py

+123-38
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
#!/usr/bin/env python
22

3-
import sys, os, subprocess, inspect, multiprocessing, shutil, argparse, time
3+
import sys, os, subprocess, inspect, multiprocessing, shutil, argparse, time, re
44
from Bio import SeqIO
55
currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
66
parentdir = os.path.dirname(currentdir)
@@ -35,7 +35,7 @@ def __init__(self,prog):
3535
parser.add_argument('--rna_bam', help='BAM (sorted) of RNAseq aligned to reference for BRAKER1')
3636
parser.add_argument('--min_intronlen', default=10, help='Minimum intron length for gene models')
3737
parser.add_argument('--max_intronlen', default=3000, help='Maximum intron length for gene models')
38-
parser.add_argument('--min_protlen', default=51, type=int, help='Minimum amino acid length for valid gene model')
38+
parser.add_argument('--min_protlen', default=50, type=int, help='Minimum amino acid length for valid gene model')
3939
parser.add_argument('--keep_no_stops', action='store_true', help='Keep gene models without valid stop codons')
4040
parser.add_argument('--cpus', default=2, type=int, help='Number of CPUs to use')
4141
parser.add_argument('--busco_seed_species', default='anidulans', help='Augustus species to use as initial training point for BUSCO')
@@ -448,37 +448,74 @@ def __init__(self,prog):
448448
subprocess.call(['perl', Converter, aug_out], stdout = output, stderr = FNULL)
449449

450450
if not GeneMark:
451+
#count contigs
452+
num_contigs = lib.countfasta(MaskGenome)
451453
#now run GeneMark-ES, first check for gmhmm mod file, use if available otherwise run ES
452454
if not args.genemark_mod:
453-
GeneMarkGFF3 = os.path.join(args.out, 'predict_misc', 'genemark.gff')
454-
if not os.path.isfile(GeneMarkGFF3):
455-
if args.organism == 'fungus':
456-
lib.RunGeneMarkES(MaskGenome, args.cpus, os.path.join(args.out, 'predict_misc'), GeneMarkGFF3, True)
457-
else:
458-
lib.RunGeneMarkES(MaskGenome, args.cpus, os.path.join(args.out, 'predict_misc'), GeneMarkGFF3, False)
459-
GeneMarkTemp = os.path.join(args.out, 'predict_misc', 'genemark.temp.gff')
460-
with open(GeneMarkTemp, 'w') as output:
461-
subprocess.call(['perl', Converter, GeneMarkGFF3], stdout = output, stderr = FNULL)
462-
GeneMark = os.path.join(args.out, 'predict_misc', 'genemark.evm.gff3')
463-
with open(GeneMark, 'w') as output:
464-
with open(GeneMarkTemp, 'rU') as input:
465-
lines = input.read().replace("Augustus","GeneMark")
466-
output.write(lines)
455+
#GeneMark-ES self-training fails with fewer than 2 contigs, so check the contig count first
456+
if num_contigs < 2:
457+
lib.log.error("GeneMark-ES cannot run with only a single contig, you must provide --ini_mod file to run GeneMark")
458+
else:
459+
GeneMarkGFF3 = os.path.join(args.out, 'predict_misc', 'genemark.gff')
460+
if not os.path.isfile(GeneMarkGFF3):
461+
if args.organism == 'fungus':
462+
lib.RunGeneMarkES(MaskGenome, args.cpus, os.path.join(args.out, 'predict_misc'), GeneMarkGFF3, True)
463+
else:
464+
lib.RunGeneMarkES(MaskGenome, args.cpus, os.path.join(args.out, 'predict_misc'), GeneMarkGFF3, False)
465+
GeneMarkTemp = os.path.join(args.out, 'predict_misc', 'genemark.temp.gff')
466+
with open(GeneMarkTemp, 'w') as output:
467+
subprocess.call(['perl', Converter, GeneMarkGFF3], stdout = output, stderr = FNULL)
468+
GeneMark = os.path.join(args.out, 'predict_misc', 'genemark.evm.gff3')
469+
with open(GeneMark, 'w') as output:
470+
with open(GeneMarkTemp, 'rU') as input:
471+
lines = input.read().replace("Augustus","GeneMark")
472+
output.write(lines)
467473
else: #have training parameters file, so just run genemark with
468474
GeneMarkGFF3 = os.path.join(args.out, 'predict_misc', 'genemark.gff')
469-
if not os.path.isfile(GeneMarkGFF3):
470-
if args.organism == 'fungus':
471-
lib.RunGeneMark(MaskGenome, args.genemark_mod, args.cpus, os.path.join(args.out, 'predict_misc'), GeneMarkGFF3, True)
472-
else:
473-
lib.RunGeneMark(MaskGenome, args.genemark_mod, args.cpus, os.path.join(args.out, 'predict_misc'), GeneMarkGFF3, False)
474-
GeneMarkTemp = os.path.join(args.out, 'predict_misc', 'genemark.temp.gff')
475-
with open(GeneMarkTemp, 'w') as output:
476-
subprocess.call(['perl', Converter, GeneMarkGFF3], stdout = output, stderr = FNULL)
477-
GeneMark = os.path.join(args.out, 'predict_misc', 'genemark.evm.gff3')
478-
with open(GeneMark, 'w') as output:
479-
with open(GeneMarkTemp, 'rU') as input:
480-
lines = input.read().replace("Augustus","GeneMark")
481-
output.write(lines)
475+
if num_contigs < 2: #now can run modified prediction on single contig
476+
with open(MaskGenome, 'rU') as genome:
477+
for line in genome:
478+
if line.startswith('>'):
479+
header = line.replace('>', '')
480+
header = header.replace('\n', '')
481+
GeneMark = os.path.join(args.out, 'predict_misc', 'genemark.evm.gff3')
482+
GeneMarkTemp = os.path.join(args.out, 'predict_misc', 'genemark.temp.gff')
483+
if not os.path.isfile(GeneMarkGFF3):
484+
lib.log.info("Running GeneMark on single-contig assembly")
485+
subprocess.call(['gmhmme3', '-m', args.genemark_mod, '-o', GeneMarkGFF3, '-f', 'gff3', MaskGenome])
486+
#now open output and reformat
487+
lib.log.info("Converting GeneMark GTF file to GFF3")
488+
with open(GeneMarkTemp, 'w') as geneout:
489+
with open(GeneMarkGFF3, 'rU') as genein:
490+
for line in genein:
491+
if not line.startswith('#'):
492+
if not '\tIntron\t' in line:
493+
newline = line.replace('seq', header)
494+
newline = newline.replace('.hmm3', '')
495+
geneout.write(newline)
496+
GeneMarkTemp2 = os.path.join(args.out, 'predict_misc', 'genemark.temp2.gff')
497+
with open(GeneMarkTemp2, 'w') as output:
498+
subprocess.call(['perl', Converter, GeneMarkTemp], stdout = output, stderr = FNULL)
499+
with open(GeneMark, 'w') as output:
500+
with open(GeneMarkTemp2, 'rU') as input:
501+
lines = input.read().replace("Augustus","GeneMark")
502+
output.write(lines)
503+
504+
else:
505+
GeneMarkGFF3 = os.path.join(args.out, 'predict_misc', 'genemark.gff')
506+
if not os.path.isfile(GeneMarkGFF3):
507+
if args.organism == 'fungus':
508+
lib.RunGeneMark(MaskGenome, args.genemark_mod, args.cpus, os.path.join(args.out, 'predict_misc'), GeneMarkGFF3, True)
509+
else:
510+
lib.RunGeneMark(MaskGenome, args.genemark_mod, args.cpus, os.path.join(args.out, 'predict_misc'), GeneMarkGFF3, False)
511+
GeneMarkTemp = os.path.join(args.out, 'predict_misc', 'genemark.temp.gff')
512+
with open(GeneMarkTemp, 'w') as output:
513+
subprocess.call(['perl', Converter, GeneMarkGFF3], stdout = output, stderr = FNULL)
514+
GeneMark = os.path.join(args.out, 'predict_misc', 'genemark.evm.gff3')
515+
with open(GeneMark, 'w') as output:
516+
with open(GeneMarkTemp, 'rU') as input:
517+
lines = input.read().replace("Augustus","GeneMark")
518+
output.write(lines)
482519

483520
if not Augustus:
484521
if not args.augustus_species:
@@ -542,28 +579,70 @@ def __init__(self,prog):
542579
if GM_check < 3:
543580
gmc = 0
544581
lib.log.error("GeneMark predictions failed, proceeding with only Augustus")
582+
583+
#if hints used for Augustus, get high quality models (>65% of transcript supported by hints) to pass to EVM
584+
if os.path.isfile(hints_all):
585+
lib.log.info("Pulling out high quality Augustus predictions")
586+
hiQ_models = []
587+
with open(aug_out, 'rU') as augustus:
588+
for pred in lib.readBlocks(augustus, '# start gene'):
589+
values = []
590+
geneID = ''
591+
support = ''
592+
if pred[0].startswith('# This output'):
593+
continue
594+
if pred[0].startswith('##gff-version 3'):
595+
continue
596+
for line in pred:
597+
line = line.replace('\n', '')
598+
if line.startswith('# start gene'):
599+
geneID = line.split(' ')[-1]
600+
values.append(geneID)
601+
if line.startswith('# % of transcript supported by hints'):
602+
support = line.split(' ')[-1]
603+
values.append(support)
604+
if float(values[1]) > 65: #greater than ~66% of exons supported, i.e. 2/3 or 3/4, but not less
605+
hiQ_models.append(values[0])
606+
607+
#now open evm augustus and pull out models
608+
HiQ = set(hiQ_models)
609+
lib.log.info("Found %i high quality predictions from Augustus (>66%% exon evidence)" % len(HiQ))
610+
HiQ_match = re.compile(r'\b(?:%s)[\.t1;]+\b' % '|'.join(HiQ))
611+
AugustusHiQ = os.path.join(args.out, 'predict_misc', 'augustus-HiQ.evm.gff3')
612+
with open(AugustusHiQ, 'w') as HiQ_out:
613+
with open(Augustus, 'rU') as evm_aug:
614+
for line in evm_aug:
615+
if HiQ_match.search(line):
616+
newline = line.replace('\tAugustus\t', '\tHiQ\t')
617+
HiQ_out.write(newline)
618+
545619

546620
#EVM related input tasks, find all predictions and concatenate together
547621
if args.pasa_gff:
548-
pred_in = [Augustus, GeneMark, args.pasa_gff]
622+
if os.path.isfile(hints_all):
623+
pred_in = [Augustus, GeneMark, args.pasa_gff, AugustusHiQ]
624+
else:
625+
pred_in = [Augustus, GeneMark, args.pasa_gff]
549626
else:
550-
pred_in = [Augustus, GeneMark]
627+
if os.path.isfile(hints_all):
628+
pred_in = [Augustus, GeneMark, AugustusHiQ]
629+
else:
630+
pred_in = [Augustus, GeneMark]
551631
Predictions = os.path.join(args.out, 'predict_misc', 'predictions.gff3')
552632
with open(Predictions, 'w') as output:
553633
for f in pred_in:
554634
with open(f) as input:
555635
output.write(input.read())
556636

557-
#set Weights file dependent on which data is present. I have yet to find an example of where Augustus outperforms GeneMark for fungi, but i don't have too much evidence to think that genemark is perfect either....
637+
#set Weights file dependent on which data is present.
558638
Weights = os.path.join(args.out, 'predict_misc', 'weights.evm.txt')
559639
with open(Weights, 'w') as output:
640+
output.write("ABINITIO_PREDICTION\tAugustus\t1\n")
641+
output.write("ABINITIO_PREDICTION\tGeneMark\t1\n")
642+
if os.path.isfile(hints_all):
643+
output.write("OTHER_PREDICTION\tHiQ\t10\n")
560644
if args.pasa_gff:
561645
output.write("OTHER_PREDICTION\ttransdecoder\t10\n")
562-
output.write("ABINITIO_PREDICTION\tAugustus\t1\n")
563-
output.write("ABINITIO_PREDICTION\tGeneMark\t1\n")
564-
else:
565-
output.write("ABINITIO_PREDICTION\tAugustus\t1\n")
566-
output.write("ABINITIO_PREDICTION\tGeneMark\t1\n")
567646
if exonerate_out:
568647
output.write("PROTEIN\texonerate\t1\n")
569648
if Transcripts:
@@ -607,6 +686,12 @@ def __init__(self,prog):
607686
else:
608687
lib.log.info('{0:,}'.format(total) + ' total gene models from EVM')
609688

689+
#move EVM folder to predict folder
690+
if os.path.isdir('EVM_tmp'):
691+
if os.path.isdir(os.path.join(args.out, 'predict_misc', 'EVM')):
692+
shutil.rmtree(os.path.join(args.out, 'predict_misc', 'EVM'))
693+
os.rename('EVM_tmp', os.path.join(args.out, 'predict_misc', 'EVM'))
694+
610695
#run tRNAscan
611696
lib.log.info("Predicting tRNAs")
612697
tRNAscan = os.path.join(args.out, 'predict_misc', 'trnascan.gff3')
@@ -631,7 +716,7 @@ def __init__(self,prog):
631716
lib.log.info('{0:,}'.format(total) + ' total gene models')
632717

633718
#filter bad models
634-
lib.log.info("Filtering out bad gene models (< %i aa in length, transposable elements, etc)." % (args.min_protlen - 1))
719+
lib.log.info("Filtering out bad gene models (< %i aa in length, transposable elements, etc)." % (args.min_protlen))
635720
Blast_rep_remove = os.path.join(args.out, 'predict_misc', 'repeat.gene.models.txt')
636721
if not os.path.isfile(Blast_rep_remove):
637722
lib.RepeatBlast(GAG_proteins, args.cpus, 1e-10, os.path.join(args.out, 'predict_misc'), Blast_rep_remove)

bin/funannotate-runEVM.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,8 @@ def safe_run(*args, **kwargs):
9393
x = num_lines
9494
else:
9595
x = cpus - 1
96+
if x < 1:
97+
x = 1
9698
lib.log.info("Running EVM commands with %i CPUs" % (x))
9799
#print "Splitting over", cpus, "CPUs"
98100
n = int(round(num_lines / x))
@@ -133,4 +135,4 @@ def safe_run(*args, **kwargs):
133135
shutil.copyfileobj(readfile, out)
134136

135137
#remove your mess
136-
shutil.rmtree(tmpdir)
138+
#shutil.rmtree(tmpdir)

funannotate.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ def fmtcols(mylist, cols):
3131
for i in range(0,num_lines))
3232
return "\n".join(lines)
3333

34-
version = '0.3.0'
34+
version = '0.3.1'
3535

3636
default_help = """
3737
Usage: funannotate <command> <arguments>

lib/library.py

+38-14
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,16 @@ def checkInternet():
6464
except urllib2.URLError as err: pass
6565
return False
6666

67+
def readBlocks(source, pattern):
    """Group lines from *source* into blocks that each begin at a marker line.

    A new block starts at every line for which ``line.startswith(pattern)``
    is true. Lines seen before the first marker form their own leading
    block. Lines are yielded unmodified (trailing newlines are kept).

    Parameters:
        source: any iterable of strings, e.g. an open file handle.
        pattern: prefix string that marks the first line of a block.

    Yields:
        Non-empty lists of lines, in input order.
    """
    block = []
    for line in source:
        if line.startswith(pattern):
            # A marker closes the block collected so far (if any) and
            # becomes the first line of the next one.
            if block:
                yield block
            block = [line]
        else:
            block.append(line)
    # Flush the final block; skip it when the input was empty so callers
    # that index block[0] never receive an empty list.
    if block:
        yield block
76+
6777
def get_parent_dir(directory):
6878
return os.path.dirname(directory)
6979

@@ -890,26 +900,40 @@ def RemoveBadModels(proteins, gff, length, repeats, BlastResults, tmpdir, output
890900
remove = [w.replace('evm.TU.','') for w in remove]
891901
remove = [w.replace('evm.model.','') for w in remove]
892902
remove = set(remove)
893-
remove_match = re.compile(r'\b(?:%s)[\.;]+\b' % '|'.join(remove))
894-
with open(output, 'w') as out:
895-
with open(os.path.join(tmpdir, 'bad_models.gff'), 'w') as out2:
903+
if len(remove) > 0:
904+
remove_match = re.compile(r'\b(?:%s)[\.;]+\b' % '|'.join(remove))
905+
with open(output, 'w') as out:
906+
with open(os.path.join(tmpdir, 'bad_models.gff'), 'w') as out2:
907+
with open(gff, 'rU') as GFF:
908+
for line in GFF:
909+
if '\tstart_codon\t' in line:
910+
continue
911+
if '\tstop_codon\t' in line:
912+
continue
913+
if not remove_match.search(line):
914+
line = re.sub(';Name=.*$', ';', line) #remove the Name attribute as it sticks around in GBK file
915+
out.write(line)
916+
else:
917+
if "\tgene\t" in line:
918+
bad_ninth = line.split('ID=')[-1]
919+
bad_ID = bad_ninth.split(";")[0]
920+
bad_reason = reason.get(bad_ID)
921+
if bad_reason:
922+
line = line.replace('\n', ';'+bad_reason+'\n')
923+
else:
924+
line = line.replace('\n', ';remove_reason=unknown;\n')
925+
out2.write(line)
926+
else: #if nothing to remove, just print out GFF
927+
with open(output, 'w') as out:
896928
with open(gff, 'rU') as GFF:
897929
for line in GFF:
898930
if '\tstart_codon\t' in line:
899931
continue
900932
if '\tstop_codon\t' in line:
901933
continue
902-
if not remove_match.search(line):
903-
line = re.sub(';Name=.*$', ';', line) #remove the Name attribute as it sticks around in GBK file
904-
out.write(line)
905-
else:
906-
if "\tgene\t" in line:
907-
bad_ninth = line.split('ID=')[-1]
908-
bad_ID = bad_ninth.split(";")[0]
909-
bad_reason = reason.get(bad_ID)
910-
line = line.replace('\n', ';'+bad_reason+'\n')
911-
out2.write(line)
912-
934+
line = re.sub(';Name=.*$', ';', line) #remove the Name attribute as it sticks around in GBK file
935+
out.write(line)
936+
913937
def CleantRNAtbl(GFF, TBL, output):
914938
#clean up genbank tbl file from gag output
915939
#try to read through GFF file, make dictionary of tRNA genes and products

0 commit comments

Comments
 (0)