Skip to content

Commit 96ec504

Browse files
Jon PalmerJon Palmer
Jon Palmer
authored and
Jon Palmer
committed
add other_gff, better handling pasa_gff, custom weights, braker-hiq fix
1 parent 979ab8c commit 96ec504

File tree

3 files changed

+119
-37
lines changed

3 files changed

+119
-37
lines changed

bin/funannotate-predict.py

Lines changed: 58 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ def __init__(self, prog):
3232
parser.add_argument('--transcript_evidence', help='Transcript evidence (map to genome with GMAP)')
3333
parser.add_argument('--gmap_gff', help='Pre-computed GMAP transcript alignments (GFF3)')
3434
parser.add_argument('--pasa_gff', help='Pre-computed PASA/TransDecoder high quality models')
35+
parser.add_argument('--other_gff', help='GFF gene prediction pass-through to EVM')
3536
parser.add_argument('--augustus_gff', help='Pre-computed Augustus gene models (GFF3)')
3637
parser.add_argument('--genemark_gtf', help='Pre-computed GeneMark gene models (GTF)')
3738
parser.add_argument('--maker_gff', help='MAKER2 GFF output')
@@ -191,7 +192,34 @@ def __init__(self, prog):
191192

192193
if args.protein_evidence == 'uniprot.fa':
193194
args.protein_evidence = os.path.join(parentdir, 'DB', 'uniprot_sprot.fasta')
194-
195+
196+
#convert PASA GFF and/or GFF pass-through
197+
#convert PASA to have 'pasa_pred' in second column to make sure weights work with EVM
198+
PASA_GFF = os.path.join(args.out, 'predict_misc', 'pasa_predictions.gff3')
199+
PASA_weight = '10'
200+
if args.pasa_gff:
201+
if ':' in args.pasa_gff:
202+
pasacol = args.pasa_gff.split(':')
203+
PASA_weight = pasacol[1]
204+
args.pasa_gff = pasacol[0]
205+
lib.renameGFF(args.pasa_gff, 'pasa_pred', PASA_GFF)
206+
#validate it will work with EVM
207+
if not lib.evmGFFvalidate(PASA_GFF, EVM, lib.log):
208+
lib.log.error("ERROR: %s is not a properly formatted PASA GFF file, please consult EvidenceModeler docs" % args.pasa_gff)
209+
sys.exit(1)
210+
OTHER_GFF = os.path.join(args.out, 'predict_misc', 'other_predictions.gff3')
211+
OTHER_weight = '1'
212+
if args.other_gff:
213+
if ':' in args.other_gff:
214+
othercol = args.other_gff.split(':')
215+
OTHER_weight = othercol[1]
216+
args.other_gff = othercol[0]
217+
lib.renameGFF(args.other_gff, 'other_pred', OTHER_GFF)
218+
#validate it will work with EVM
219+
if not lib.evmGFFvalidate(OTHER_GFF, EVM, lib.log):
220+
lib.log.error("ERROR: %s is not a properly formatted GFF file, please consult EvidenceModeler docs" % args.other_gff)
221+
sys.exit(1)
222+
195223
#check input files to make sure they are not empty, first check if multiple files passed to transcript/protein evidence
196224
input_checks = [args.input, args.masked_genome, args.repeatmasker_gff3, args.genemark_mod, args.exonerate_proteins, args.gmap_gff, args.pasa_gff, args.repeatmodeler_lib, args.rna_bam]
197225
if ',' in args.protein_evidence: #there will always be something here, since defaults to uniprot
@@ -269,10 +297,9 @@ def __init__(self, prog):
269297
#EVM command line scripts
270298
Converter = os.path.join(EVM, 'EvmUtils', 'misc', 'augustus_GFF3_to_EVM_GFF3.pl')
271299
ExoConverter = os.path.join(EVM, 'EvmUtils', 'misc', 'exonerate_gff_to_alignment_gff3.pl')
272-
Validator = os.path.join(EVM, 'EvmUtils', 'gff3_gene_prediction_file_validator.pl')
273300
Converter2 = os.path.join(EVM, 'EvmUtils', 'misc', 'augustus_GTF_to_EVM_GFF3.pl')
274301
EVM2proteins = os.path.join(EVM, 'EvmUtils', 'gff3_file_to_proteins.pl')
275-
302+
276303
#repeatmasker, run if not passed from command line
277304
if not os.path.isfile(MaskGenome):
278305
if not args.repeatmodeler_lib:
@@ -292,17 +319,22 @@ def __init__(self, prog):
292319
lib.log.error("RepeatMasking failed, check log files.")
293320
sys.exit(1)
294321

295-
#load contig names and sizes into dictionary.
322+
#load contig names and sizes into dictionary, get masked repeat stats
296323
ContigSizes = {}
324+
GenomeLength = 0
325+
maskedSize = 0
297326
with open(MaskGenome, 'rU') as input:
298327
for rec in SeqIO.parse(input, 'fasta'):
299328
if not rec.id in ContigSizes:
300329
ContigSizes[rec.id] = len(rec.seq)
330+
GenomeLength += len(rec.seq)
331+
maskedSize += lib.n_lower_chars(str(rec.seq))
301332
else:
302333
lib.log.error("Error, duplicate contig names, exiting")
303334
sys.exit(1)
304-
GenomeLength = sum(ContigSizes.values())
305-
lib.log.info('Masked genome: {0:,}'.format(len(ContigSizes))+' scaffolds; {0:,}'.format(GenomeLength)+ ' bp')
335+
percentMask = maskedSize / float(GenomeLength)
336+
MaskedStats = '{0:.2f}%'.format(percentMask*10)
337+
lib.log.info('Masked genome: {0:,}'.format(len(ContigSizes))+' scaffolds; {0:,}'.format(GenomeLength)+ ' bp; '+MaskedStats+' repeats masked')
306338

307339
#check for previous files and setup output files
308340
Predictions = os.path.join(args.out, 'predict_misc', 'gene_predictions.gff3')
@@ -326,7 +358,7 @@ def __init__(self, prog):
326358
#append PASA data if exists
327359
if args.pasa_gff:
328360
with open(Predictions, 'a') as output:
329-
with open(args.pasa_gff) as input:
361+
with open(PASA_GFF) as input:
330362
output.write(input.read())
331363
#setup weights file for EVM
332364
with open(Weights, 'w') as output:
@@ -401,6 +433,9 @@ def __init__(self, prog):
401433
if args.gmap_gff:
402434
shutil.copyfile(args.gmap_gff, trans_out)
403435
Transcripts = os.path.abspath(trans_out)
436+
if Transcripts:
437+
total = lib.countGMAPtranscripts(Transcripts)
438+
lib.log.info('{0:,}'.format(total) + ' transcripts aligned with GMAP')
404439
if not os.path.isfile(hintsE): #use previous hints file if exists
405440
if os.path.isfile(trans_temp): #if transcripts are available to algin, run BLAT
406441
#now run BLAT for Augustus hints
@@ -417,6 +452,8 @@ def __init__(self, prog):
417452
blat2hints = os.path.join(AUGUSTUS_BASE, 'scripts', 'blat2hints.pl')
418453
cmd = [blat2hints, b2h_input, b2h_output, '--minintronlen=20', '--trunkSS']
419454
lib.runSubprocess(cmd, '.', lib.log)
455+
total = lib.line_count(blat_sort2)
456+
lib.log.info('{0:,}'.format(total) + ' filtered BLAT alignments')
420457
else:
421458
lib.log.error("No transcripts available to generate Augustus hints, provide --transcript_evidence")
422459

@@ -542,11 +579,11 @@ def __init__(self, prog):
542579
shutil.rmtree(os.path.join(args.out, 'predict_misc', 'braker'))
543580
os.rename('braker', os.path.join(args.out, 'predict_misc', 'braker'))
544581
#okay, now need to fetch the Augustus GFF and Genemark GTF files
545-
aug_out = os.path.join(args.out, 'predict_misc', 'braker', aug_species, 'augustus.gff3')
582+
aug_out = os.path.join(args.out, 'predict_misc', 'braker', aug_species, 'augustus.gff')
546583
gene_out = os.path.join(args.out, 'predict_misc', 'braker', aug_species, 'GeneMark-ET', 'genemark.gtf')
547584
#now convert to EVM format
548585
Augustus = os.path.join(args.out, 'predict_misc', 'augustus.evm.gff3')
549-
cmd = ['perl', Converter, aug_out]
586+
cmd = ['perl', Converter2, aug_out]
550587
lib.runSubprocess2(cmd, '.', lib.log, Augustus)
551588
GeneMarkGFF3 = os.path.join(args.out, 'predict_misc', 'genemark.gff')
552589
cmd = [GeneMark2GFF, gene_out]
@@ -573,7 +610,7 @@ def __init__(self, prog):
573610
lib.log.info("Training Augustus using PASA data, this may take awhile")
574611
GFF2GB = os.path.join(AUGUSTUS_BASE, 'scripts', 'gff2gbSmallDNA.pl')
575612
trainingset = os.path.join(args.out, 'predict_misc', 'augustus.pasa.gb')
576-
cmd = [GFF2GB, args.pasa_gff, MaskGenome, '500', trainingset]
613+
cmd = [GFF2GB, PASA_GFF, MaskGenome, '500', trainingset]
577614
lib.runSubprocess(cmd, '.', lib.log)
578615
if args.optimize_augustus:
579616
lib.trainAugustus(AUGUSTUS_BASE, aug_species, trainingset, MaskGenome, args.out, args.cpus, True)
@@ -844,7 +881,7 @@ def __init__(self, prog):
844881
sys.exit(1)
845882

846883
#if hints used for Augustus, get high quality models > 90% coverage to pass to EVM
847-
if os.path.isfile(hints_all) and not args.rna_bam:
884+
if os.path.isfile(hints_all) or args.rna_bam:
848885
lib.log.info("Pulling out high quality Augustus predictions")
849886
hiQ_models = []
850887
with open(aug_out, 'rU') as augustus:
@@ -881,16 +918,14 @@ def __init__(self, prog):
881918

882919

883920
#EVM related input tasks, find all predictions and concatenate together
921+
pred_in = [Augustus, GeneMark]
884922
if args.pasa_gff:
885-
if os.path.isfile(hints_all) and not args.rna_bam:
886-
pred_in = [Augustus, GeneMark, args.pasa_gff, AugustusHiQ]
887-
else:
888-
pred_in = [Augustus, GeneMark, args.pasa_gff]
889-
else:
890-
if os.path.isfile(hints_all) and not args.rna_bam:
891-
pred_in = [Augustus, GeneMark, AugustusHiQ]
892-
else:
893-
pred_in = [Augustus, GeneMark]
923+
pred_in.append(PASA_GFF)
924+
if args.other_gff:
925+
pred_in.append(OTHER_GFF)
926+
if os.path.isfile(hints_all) or args.rna_bam:
927+
pred_in.append(AugustusHiQ)
928+
894929
#write gene predictions file
895930
with open(Predictions+'.tmp', 'w') as output:
896931
for f in sorted(pred_in):
@@ -907,11 +942,13 @@ def __init__(self, prog):
907942
if os.path.isfile(hints_all) and not args.rna_bam:
908943
output.write("OTHER_PREDICTION\tHiQ\t5\n")
909944
if args.pasa_gff:
910-
output.write("OTHER_PREDICTION\ttransdecoder\t10\n")
945+
output.write("OTHER_PREDICTION\tpasa_pred\t%s\n" % PASA_weight)
911946
if exonerate_out:
912947
output.write("PROTEIN\texonerate\t1\n")
913948
if Transcripts:
914949
output.write("TRANSCRIPT\tgenome\t1\n")
950+
if args.other_gff:
951+
output.write("OTHER_PREDICTION\tother_pred\t1\n" % OTHER_weight)
915952

916953
#total up Predictions
917954
total = lib.countGFFgenes(Predictions)

funannotate.py

Lines changed: 13 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ def flatten(l):
1919
flatList.append(elem)
2020
return flatList
2121

22-
version = '0.6.1'
22+
version = '0.6.2'
2323

2424
default_help = """
2525
Usage: funannotate <command> <arguments>
@@ -42,7 +42,7 @@ def flatten(l):
4242
outgroups Manage outgroups for funannotate compare
4343
eggnog Manage EggNog Databases
4444
45-
Written by Jon Palmer (2016) [email protected]
45+
Written by Jon Palmer (2016-2017) [email protected]
4646
""" % version
4747

4848
if len(sys.argv) > 1:
@@ -60,7 +60,7 @@ def flatten(l):
6060
-c, --cov Percent coverage of overlap. Default = 95
6161
-m, --minlen Minimum length of contig to keep. Default = 500
6262
63-
Written by Jon Palmer (2016) [email protected]
63+
Written by Jon Palmer (2016-2017) [email protected]
6464
""" % (sys.argv[1], version)
6565

6666
arguments = sys.argv[2:]
@@ -87,7 +87,7 @@ def flatten(l):
8787
-b, --base Base name to relabel contigs. Default: scaffold
8888
--minlen Shorter contigs are discarded. Default: 0
8989
90-
Written by Jon Palmer (2016) [email protected]
90+
Written by Jon Palmer (2016-2017) [email protected]
9191
""" % (sys.argv[1], version)
9292

9393
arguments = sys.argv[2:]
@@ -120,7 +120,8 @@ def flatten(l):
120120
Optional: --isolate Strain isolate, e.g. Af293
121121
--name Locus tag name (assigned by NCBI?). Default: FUN_
122122
--maker_gff MAKER2 GFF file. Parse results directly to EVM.
123-
--pasa_gff PASA generated gene models
123+
--pasa_gff PASA generated gene models. filename:weight
124+
--other_gff Annotation pass-through to EVM. filename:weight
124125
--rna_bam RNA-seq mapped to genome to train Augustus/GeneMark-ET
125126
--augustus_species Augustus species config. Default: uses species name
126127
--genemark_mod GeneMark ini mod file.
@@ -152,7 +153,7 @@ def flatten(l):
152153
--GENEMARK_PATH
153154
--BAMTOOLS_PATH
154155
155-
Written by Jon Palmer (2016) [email protected]
156+
Written by Jon Palmer (2016-2017) [email protected]
156157
""" % (sys.argv[1], version)
157158

158159
arguments = sys.argv[2:]
@@ -202,7 +203,7 @@ def flatten(l):
202203
ENV Vars: By default loaded from your $PATH, however you can specify at run-time if not in PATH
203204
--AUGUSTUS_CONFIG_PATH
204205
205-
Written by Jon Palmer (2016) [email protected]
206+
Written by Jon Palmer (2016-2017) [email protected]
206207
""" % (sys.argv[1], version)
207208

208209
arguments = sys.argv[2:]
@@ -236,7 +237,7 @@ def flatten(l):
236237
--eggnog_db EggNog database used for annotation. Default: fuNOG
237238
--proteinortho ProteinOrtho5 POFF results.
238239
239-
Written by Jon Palmer (2016) [email protected]
240+
Written by Jon Palmer (2016-2017) [email protected]
240241
""" % (sys.argv[1], version)
241242

242243
arguments = sys.argv[2:]
@@ -262,7 +263,7 @@ def flatten(l):
262263
-e, --error_report NCBI FCSreport.txt
263264
--sbt NCBI template submission file
264265
265-
Written by Jon Palmer (2016) [email protected]
266+
Written by Jon Palmer (2016-2017) [email protected]
266267
""" % (sys.argv[1], version)
267268

268269
arguments = sys.argv[2:]
@@ -308,7 +309,7 @@ def flatten(l):
308309
Options: -m, --mode Download/format databases and/or check dependencies [all,db,dep]
309310
-d, --database Path to funannotate databse
310311
311-
Written by Jon Palmer (2016) [email protected]
312+
Written by Jon Palmer (2016-2017) [email protected]
312313
""" % (sys.argv[1], version)
313314
arguments = sys.argv[2:]
314315
if len(arguments) > 0:
@@ -338,7 +339,7 @@ def flatten(l):
338339
--show_buscos List the busco_db options
339340
--show_outgroups List the installed outgroup species.
340341
341-
Written by Jon Palmer (2016) [email protected]
342+
Written by Jon Palmer (2016-2017) [email protected]
342343
""" % (sys.argv[1], version)
343344

344345
arguments = sys.argv[2:]
@@ -377,7 +378,7 @@ def flatten(l):
377378
--show_installed Show EggNog 4.5 Databases Installed
378379
--show_all Show all available Databases
379380
380-
Written by Jon Palmer (2016) [email protected]
381+
Written by Jon Palmer (2016-2017) [email protected]
381382
""" % (sys.argv[1], version)
382383

383384
arguments = sys.argv[2:]

lib/library.py

Lines changed: 48 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -207,6 +207,18 @@ def runSubprocess4(cmd, dir, logfile):
207207
logfile.debug(' '.join(cmd))
208208
proc = subprocess.Popen(cmd, cwd=dir, stdout=FNULL, stderr=FNULL)
209209
proc.communicate()
210+
211+
def evmGFFvalidate(input, evmpath, logfile):
212+
Validator = os.path.join(evmpath, 'EvmUtils', 'gff3_gene_prediction_file_validator.pl')
213+
cmd = [Validator, input]
214+
logfile.debug(' '.join(cmd))
215+
proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
216+
stdout, stderr = proc.communicate()
217+
if not stderr:
218+
return True
219+
else:
220+
logfile.debug(stderr)
221+
False
210222

211223
def hashfile(afile, hasher, blocksize=65536):
212224
buf = afile.read(blocksize)
@@ -461,6 +473,20 @@ def countfasta(input):
461473
count += 1
462474
return count
463475

476+
def renameGFF(input, newname, output):
477+
with open(output, 'w') as outfile:
478+
with open(input, 'rU') as infile:
479+
for line in infile:
480+
if line.startswith('>'): #remove any fasta sequences
481+
continue
482+
if line.startswith('#'):
483+
outfile.write(line)
484+
else:
485+
cols = line.split('\t')
486+
#make sure it has correct columns to be GFF
487+
if len(cols) == 9:
488+
outfile.write('%s\t%s\t%s' % (cols[0], newname, '\t'.join(cols[2:])))
489+
464490
def countGFFgenes(input):
465491
count = 0
466492
with open(input, 'rU') as f:
@@ -469,6 +495,14 @@ def countGFFgenes(input):
469495
count += 1
470496
return count
471497

498+
def countGMAPtranscripts(input):
499+
count = 0
500+
with open(input, 'rU') as f:
501+
for line in f:
502+
if line.startswith('###'):
503+
count += 1
504+
return count
505+
472506
def runMultiProgress(function, inputList, cpus):
473507
#setup pool
474508
p = multiprocessing.Pool(cpus)
@@ -537,7 +571,14 @@ def gb2output(input, output1, output2, output3):
537571

538572
def sortGFF(input, output, order):
539573
cmd = ['bedtools', 'sort', '-header', '-faidx', order, '-i', input]
540-
runSubprocess2(cmd, '.', log, output)
574+
with open(output, 'w') as out:
575+
proc = subprocess.Popen(cmd, stdout=out, stderr=subprocess.PIPE)
576+
stderr = proc.communicate()
577+
if stderr:
578+
if stderr[0] == None:
579+
if stderr[1] != '':
580+
log.error("Sort GFF failed, unreferenced scaffold present in gene predictions, check logfile")
581+
sys.exit(1)
541582

542583
def checkGenBank(input):
543584
count = 0
@@ -1165,7 +1206,10 @@ def RepeatMask(input, library, cpus, tmpdir, output, debug):
11651206
rm_gff3 = os.path.join(tmpdir, 'repeatmasker.gff3')
11661207
cmd = ['rmOutToGFF3.pl', file]
11671208
runSubprocess2(cmd, outdir, log, rm_gff3)
1168-
1209+
1210+
def n_lower_chars(string):
1211+
return sum(1 for c in string if c.islower())
1212+
11691213
def CheckAugustusSpecies(input):
11701214
#get the possible species from augustus
11711215
augustus_list = []
@@ -1441,12 +1485,12 @@ def ParseErrorReport(input, Errsummary, val, Discrep, output, keep_stops):
14411485
#parse the discrepency report and look for overlapping genes, so far, all have been tRNA's in introns, so just get those for now.
14421486
with open(Discrep, 'rU') as discrep:
14431487
for line in discrep:
1444-
if line.startswith('DiscRep_ALL:FIND_OVERLAPPED_GENES::'): #skip one line and then move through next lines until line starts with nothing
1488+
if line.startswith('DiscRep_ALL:FIND_OVERLAPPED_GENES::') or line.startswith('DiscRep_ALL:FIND_BADLEN_TRNAS::'): #skip one line and then move through next lines until line starts with nothing
14451489
num = line.split(' ')[0]
14461490
num = num.split('::')[-1]
14471491
num = int(num)
14481492
for i in range(num):
1449-
gene = discrep.next().split('\t')[1]
1493+
gene = discrep.next().split('\t')[-1].replace('\n', '')
14501494
tRNA = gene + '_tRNA'
14511495
exon = gene + '_exon'
14521496
remove.append(gene)

0 commit comments

Comments
 (0)