Skip to content

Commit bfea9f0

Browse files
Jon PalmerJon Palmer
Jon Palmer
authored and
Jon Palmer
committed
updates to v0.3.2
1 parent f5a89d7 commit bfea9f0

8 files changed

+300
-50
lines changed

README.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ To see the help menu, simply type `funannotate` in the terminal window. Similar
3232
$ funannotate
3333
3434
Usage: funannotate <command> <arguments>
35-
version: 0.1.4
35+
version: 0.3.2
3636
3737
Description: Funannotate is a genome prediction, annotation, and comparison pipeline.
3838

bin/funannotate-predict.py

+161-31
Large diffs are not rendered by default.

docs/ubuntu_install.md

+1
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ brew doctor
4343
brew tap homebrew/dupes
4444
brew tap homebrew/science
4545
brew tap nextgenusfs/tap
46+
brew update
4647
```
4748

4849
4) Install funannotate and dependencies using LinuxBrew

funannotate.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -129,7 +129,8 @@ def fmtcols(mylist, cols):
129129
--genemark_mod GeneMark ini mod file.
130130
--protein_evidence Proteins to map to genome (prot1.fa,prot2.fa,uniprot.fa). Default: uniprot.fa
131131
--transcript_evidence mRNA/ESTs to align to genome (trans1.fa,ests.fa,trinity.fa). Default: none
132-
--busco_seed_species Augustus pre-trained species to start BUSCO. Default: generic
132+
--busco_seed_species Augustus pre-trained species to start BUSCO. Default: anidulans
133+
--optimize_augustus Run 'optimze_augustus.pl' to refine training (long runtime)
133134
--busco_db BUSCO models. Default: fungi [fungi,vertebrata,metazoa,eukaryota,arthropoda]
134135
--organism Fungal-specific options. Default: fungus. [fungus,other]
135136

lib/library.py

+22-15
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
from __future__ import division
2-
import os, subprocess, logging, sys, argparse, inspect, csv, time, re, shutil, datetime, glob, platform, multiprocessing
2+
import os, subprocess, logging, sys, argparse, inspect, csv, time, re, shutil, datetime, glob, platform, multiprocessing, itertools
33
from natsort import natsorted
44
import warnings
55
from Bio import SeqIO
@@ -74,6 +74,9 @@ def readBlocks(source, pattern):
7474
buffer.append( line )
7575
yield buffer
7676

77+
def empty_line_sep(line):
78+
return line=='\n'
79+
7780
def get_parent_dir(directory):
7881
return os.path.dirname(directory)
7982

@@ -1732,31 +1735,35 @@ def getTrainResults(input):
17321735
values3 = line.split('|') #get [6] and [7]
17331736
return (values1[1], values1[2], values2[6], values2[7], values3[6], values3[7])
17341737

1735-
def trainAugustus(AUGUSTUS_BASE, train_species, trainingset, genome, outdir, cpus):
1738+
def trainAugustus(AUGUSTUS_BASE, train_species, trainingset, genome, outdir, cpus, optimize):
17361739
RANDOMSPLIT = os.path.join(AUGUSTUS_BASE, 'scripts', 'randomSplit.pl')
17371740
OPTIMIZE = os.path.join(AUGUSTUS_BASE, 'scripts', 'optimize_augustus.pl')
1741+
NEW_SPECIES = os.path.join(AUGUSTUS_BASE, 'scripts', 'new_species.pl')
17381742
aug_cpus = '--cpus='+str(cpus)
17391743
species = '--species='+train_species
17401744
aug_log = os.path.join(outdir, 'logfiles', 'augustus_training.log')
17411745
trainingdir = 'tmp_opt_'+train_species
17421746
with open(aug_log, 'w') as logfile:
1743-
subprocess.call([RANDOMSPLIT, trainingset, '200']) #split off 100 models for testing purposes
1744-
if not CheckAugustusSpecies(train_species): #check if training set exists, if not run etraining
1745-
subprocess.call(['etraining', species, trainingset], stderr = logfile, stdout = logfile)
1747+
if not CheckAugustusSpecies(train_species):
1748+
subprocess.call([NEW_SPECIES, species], stdout = logfile, stderr = logfile)
1749+
#run etraining again to only use best models from EVM for training
1750+
subprocess.call(['etraining', species, trainingset], stderr = logfile, stdout = logfile)
1751+
subprocess.call([RANDOMSPLIT, trainingset, '200']) #split off 200 models for testing purposes
17461752
with open(os.path.join(outdir, 'predict_misc', 'augustus.initial.training.txt'), 'w') as initialtraining:
17471753
subprocess.call(['augustus', species, trainingset+'.test'], stdout=initialtraining)
17481754
train_results = getTrainResults(os.path.join(outdir, 'predict_misc', 'augustus.initial.training.txt'))
17491755
log.info('Initial training: '+'{0:.2%}'.format(float(train_results[4]))+' genes predicted exactly and '+'{0:.2%}'.format(float(train_results[2]))+' of exons predicted exactly')
1750-
#now run optimization
1751-
subprocess.call([OPTIMIZE, species, aug_cpus, trainingset], stderr = logfile, stdout = logfile)
1752-
#run etraining again
1753-
subprocess.call(['etraining', species, trainingset], stderr = logfile, stdout = logfile)
1754-
with open(os.path.join(outdir, 'predict_misc', 'augustus.final.training.txt'), 'w') as finaltraining:
1755-
subprocess.call(['augustus', species, os.path.join(trainingdir, 'bucket1.gb')], stdout=finaltraining)
1756-
train_results = getTrainResults(os.path.join(outdir, 'predict_misc', 'augustus.final.training.txt'))
1757-
log.info('Optimized training: '+'{0:.2%}'.format(float(train_results[4]))+' genes predicted exactly and '+'{0:.2%}'.format(float(train_results[2]))+' of exons predicted exactly')
1758-
#clean up tmp folder
1759-
shutil.rmtree(trainingdir)
1756+
if optimize:
1757+
#now run optimization
1758+
subprocess.call([OPTIMIZE, species, aug_cpus, trainingset], stderr = logfile, stdout = logfile)
1759+
#run etraining again
1760+
subprocess.call(['etraining', species, trainingset], stderr = logfile, stdout = logfile)
1761+
with open(os.path.join(outdir, 'predict_misc', 'augustus.final.training.txt'), 'w') as finaltraining:
1762+
subprocess.call(['augustus', species, trainingset+'.test'], stdout=finaltraining)
1763+
train_results = getTrainResults(os.path.join(outdir, 'predict_misc', 'augustus.final.training.txt'))
1764+
log.info('Optimized training: '+'{0:.2%}'.format(float(train_results[4]))+' genes predicted exactly and '+'{0:.2%}'.format(float(train_results[2]))+' of exons predicted exactly')
1765+
#clean up tmp folder
1766+
shutil.rmtree(trainingdir)
17601767

17611768
HEADER = '''
17621769
<!DOCTYPE html>

util/filter_buscos.py

+62
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
#!/usr/bin/env python
2+
3+
#script to reformat Augustus BUSCO results
4+
import sys, os, itertools
5+
6+
if len(sys.argv) < 4:
7+
print("Usage: filter_buscos.py busco.evm.gff3 full_table_species busco.final.gff3")
8+
sys.exit(1)
9+
10+
def group_separator(line):
11+
return line=='\n'
12+
13+
#parse the busco table into dictionary format
14+
busco_complete = {}
15+
with open(sys.argv[2], 'rU') as buscoinput:
16+
for line in buscoinput:
17+
cols = line.split('\t')
18+
if cols[1] == 'Complete':
19+
ID = cols[2].replace('evm.model.', '')
20+
if not ID in busco_complete:
21+
busco_complete[ID] = (cols[0], cols[3])
22+
else:
23+
score = busco_complete.get(ID)[1]
24+
if float(cols[3]) > float(score):
25+
busco_complete[ID] = (cols[0], cols[3])
26+
print ID, 'updated dictionary'
27+
else:
28+
print ID, 'is repeated and score is less'
29+
30+
#now parse the evm busco file, group them
31+
results = []
32+
with open(sys.argv[1]) as f:
33+
for key, group in itertools.groupby(f, group_separator):
34+
if not key:
35+
results.append(list(group))
36+
37+
#loop through each gene model, lookup the BUSCO name, and then replace the name with counter based and busco model name
38+
'''
39+
scaffold_1 EVM gene 18407 18947 . - . ID=evm.TU.scaffold_1.1;Name=EVM%20prediction%20scaffold_1.1
40+
scaffold_1 EVM mRNA 18407 18947 . - . ID=evm.model.scaffold_1.1;Parent=evm.TU.scaffold_1.1;Name=EVM%20prediction%20scaffold_1.1
41+
scaffold_1 EVM exon 18772 18947 . - . ID=evm.model.scaffold_1.1.exon1;Parent=evm.model.scaffold_1.1
42+
scaffold_1 EVM CDS 18772 18947 . - 0 ID=cds.evm.model.scaffold_1.1;Parent=evm.model.scaffold_1.1
43+
scaffold_1 EVM exon 18407 18615 . - . ID=evm.model.scaffold_1.1.exon2;Parent=evm.model.scaffold_1.1
44+
scaffold_1 EVM CDS 18407 18615 . - 1 ID=cds.evm.model.scaffold_1.1;Parent=evm.model.scaffold_1.1
45+
'''
46+
counter = 0
47+
with open(sys.argv[3], 'w') as output:
48+
for i in results:
49+
counter += 1
50+
cols = i[0].split('\t')
51+
ID = cols[8].split(';')[0]
52+
ID = ID.replace('ID=', '')
53+
lookup = ID.replace('evm.TU.', '')
54+
if lookup in busco_complete:
55+
name = busco_complete.get(lookup)[0]
56+
geneID = 'gene'+str(counter)
57+
mrnaID = 'mrna'+str(counter)
58+
newblock = ''.join(i)
59+
newblock = newblock.replace('EVM%20prediction%20'+lookup, name)
60+
newblock = newblock.replace(ID, geneID)
61+
newblock = newblock.replace('evm.model.'+lookup, mrnaID)
62+
output.write(newblock+'\n')

util/fix_busco_naming.py

+47
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
#!/usr/bin/env python
2+
3+
#script to reformat Augustus BUSCO results
4+
import sys, os, itertools
5+
6+
if len(sys.argv) < 4:
7+
print("Usage: fix_busco_naming.py busco_augustus.gff3 full_table_species busco_augustus.fixed.gff3")
8+
sys.exit(1)
9+
10+
def group_separator(line):
11+
return line=='\n'
12+
13+
#parse the busco table into dictionary format
14+
busco_complete = {}
15+
with open(sys.argv[2], 'rU') as buscoinput:
16+
for line in buscoinput:
17+
cols = line.split('\t')
18+
if cols[1] == 'Complete':
19+
if not cols[0] in busco_complete:
20+
busco_complete[cols[0]] = cols[2]+':'+cols[3]+'-'+cols[4]
21+
22+
#now parse the augustus input file where gene numbers are likely repeated.
23+
results = []
24+
with open(sys.argv[1]) as f:
25+
for key, group in itertools.groupby(f, group_separator):
26+
if not key:
27+
results.append(list(group))
28+
29+
#loop through each gene model, lookup the BUSCO name, and then replace the name with counter based and busco model name
30+
counter = 0
31+
inverse_busco = {v: k for k, v in busco_complete.items()}
32+
with open(sys.argv[3], 'w') as output:
33+
for i in results:
34+
counter += 1
35+
cols = i[0].split('\t')
36+
lookup = cols[0]+':'+cols[3]+'-'+cols[4]
37+
if lookup in inverse_busco:
38+
name = inverse_busco.get(lookup)
39+
else:
40+
name = 'unknown_model'
41+
ID = cols[8].split(';')[0]
42+
ID = ID.replace('ID=', '')
43+
newID = 'gene'+str(counter)
44+
newblock = ''.join(i)
45+
newblock = newblock.replace('Augustus%20prediction', name)
46+
newblock = newblock.replace(ID, newID)
47+
output.write(newblock+'\n')

util/funannotate-BUSCO.py

+4-2
Original file line numberDiff line numberDiff line change
@@ -1089,7 +1089,7 @@ def measuring (nested):
10891089
scaff = seq_id.strip(']').split('[')[1].split(':')[0]
10901090
scaff_start = seq_id.strip(']').split('[')[1].split(':')[1].split('-')[0]
10911091
scaff_end = seq_id.strip(']').split('[')[1].split(':')[1].split('-')[1]
1092-
1092+
scaff_end = scaff_end.replace(']', '')
10931093
out.write('%s\tComplete\t%s\t%s\t%s\t%s\t%s\n' % (entity, scaff, scaff_start, scaff_end, bit_score, seq_len))
10941094
csc[entity] = seq_id
10951095

@@ -1106,6 +1106,7 @@ def measuring (nested):
11061106
scaff = seq_id.strip(']').split('[')[1].split(':')[0]
11071107
scaff_start = seq_id.strip(']').split('[')[1].split(':')[1].split('-')[0]
11081108
scaff_end = seq_id.strip(']').split('[')[1].split(':')[1].split('-')[1]
1109+
scaff_end = scaff_end.replace(']', '')
11091110
out.write('%s\tDuplicated\t%s\t%s\t%s\t%s\t%s\n' % (entity, scaff, scaff_start, scaff_end, bit_score, seq_len))
11101111

11111112

@@ -1122,6 +1123,7 @@ def measuring (nested):
11221123
scaff = seq_id.strip(']').split('[')[1].split(':')[0]
11231124
scaff_start = seq_id.strip(']').split('[')[1].split(':')[1].split('-')[0]
11241125
scaff_end = seq_id.strip(']').split('[')[1].split(':')[1].split('-')[1]
1126+
scaff_end = scaff_end.replace(']', '')
11251127
out.write('%s\tFragmented\t%s\t%s\t%s\t%s\t%s\n' % (entity, scaff, scaff_start, scaff_end, bit_score, seq_len))
11261128

11271129
missing = []
@@ -1227,7 +1229,7 @@ def measuring (nested):
12271229
gff_file.write(line)
12281230
gff_file.close()
12291231

1230-
subprocess.call('$AUGUSTUS_CONFIG_PATH/../scripts/gff2gbSmallDNA.pl %s/gffs/%s.gff %s 1000 %s/gb/%s.raw.gb 1>/dev/null 2>/dev/null' %
1232+
subprocess.call('$AUGUSTUS_CONFIG_PATH/../scripts/gff2gbSmallDNA.pl %s/gffs/%s.gff %s 500 %s/gb/%s.raw.gb 1>/dev/null 2>/dev/null' %
12311233
(mainout, entry, args['genome'], mainout, entry), shell = True)
12321234
#bacteria clade needs to be flagged as "prokaryotic"
12331235
NEW_SPECIES = os.path.join(AUGUSTUS_BASE, 'scripts', 'new_species.pl')

0 commit comments

Comments
 (0)