Skip to content

Commit 8ff6abb

Browse files
Jon PalmerJon Palmer
Jon Palmer
authored and
Jon Palmer
committed
updates to v0.3.0
1 parent 01ac8e3 commit 8ff6abb

7 files changed

+1487
-1051
lines changed

bin/augustus_parallel.py

+105
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,105 @@
1+
#!/usr/bin/env python
2+
3+
import sys, multiprocessing, subprocess, os, shutil, argparse, time
4+
from Bio import SeqIO
5+
6+
# set up the command-line menu with argparse
7+
class MyFormatter(argparse.ArgumentDefaultsHelpFormatter):
    """Help formatter that appends argument defaults and widens the option column.

    Behaves like ``argparse.ArgumentDefaultsHelpFormatter`` except that the
    base class is always initialised with ``max_help_position=48``.
    """

    def __init__(self, prog):
        # Explicit base-class call; equivalent to the two-argument super() form.
        argparse.ArgumentDefaultsHelpFormatter.__init__(
            self, prog, max_help_position=48)
10+
# Command-line interface. This file is a flat script, so the arguments are
# parsed at import time and exposed through the module-level ``args``.
parser = argparse.ArgumentParser(
    prog='augustus_parallel.py',
    usage="%(prog)s [options] -i genome.fasta -s botrytis_cinerea -o new_genome",
    description='''Script that does it all...''',
    epilog="""Written by Jon Palmer (2016) [email protected]""",
    formatter_class=MyFormatter)
parser.add_argument('-i', '--input', required=True, help='Genome in FASTA format')
parser.add_argument('-o', '--out', required=True, help='Basename of output files')
parser.add_argument('-s', '--species', required=True, help='Augustus species name')
parser.add_argument('--hints', help='Hints file (PE)')
parser.add_argument('--cpus', default=2, type=int, help='Number of CPUs to run')
# The installation check below falls back to this option; it was previously
# read from ``args`` without ever being declared.
parser.add_argument('--AUGUSTUS_CONFIG_PATH', help='Path to Augustus config directory, $AUGUSTUS_CONFIG_PATH')
args = parser.parse_args()

# Check for an Augustus installation: the environment variable wins, the
# command-line option is the fallback.
AUGUSTUS = os.environ.get("AUGUSTUS_CONFIG_PATH") or args.AUGUSTUS_CONFIG_PATH
if not AUGUSTUS:
    print("$AUGUSTUS_CONFIG_PATH environmental variable not found, Augustus is not properly configured")
    # sys.exit (unlike os._exit) flushes stdout, so the message is not lost.
    sys.exit(1)
# Export the value so the augustus child processes started later inherit it
# when it was supplied on the command line only.
os.environ.setdefault("AUGUSTUS_CONFIG_PATH", AUGUSTUS)

# AUGUSTUS_BASE is the parent of the trailing "config" directory. Only the
# last path component is removed; str.replace() would also strip any earlier
# "config" substring in the path.
_config_dir = AUGUSTUS.rstrip(os.sep)
if os.path.basename(_config_dir) != 'config':
    print("AUGUSTUS_CONFIG_PATH should point to the Augustus 'config' directory, got: %s" % AUGUSTUS)
    sys.exit(1)
AUGUSTUS_BASE = os.path.dirname(_config_dir)

# set up hints and extrinsic input, hard coded for protein and transcript alignments from funannotate
extrinsic = '--extrinsicCfgFile='+os.path.join(AUGUSTUS_BASE, 'config', 'extrinsic', 'extrinsic.E.XNT.cfg')
35+
36+
def countGFFgenes(input):
    """Return the number of lines in a GFF file that contain a ``gene`` field.

    Args:
        input: Path to the GFF file to scan.

    Returns:
        Count of lines containing the tab-delimited token ``\\tgene\\t``.
    """
    # Plain 'r' instead of 'rU': the 'U' flag was removed in Python 3.11 and
    # universal newlines are already the default for text mode on Python 3.
    with open(input, 'r') as f:
        return sum(1 for line in f if "\tgene\t" in line)
43+
44+
def runAugustus(Input):
    """Run Augustus on one scaffold and write its GFF3 prediction to disk.

    Relies on the module-level globals ``args``, ``tmpdir`` and ``extrinsic``.
    Reads ``<tmpdir>/<Input>.fa`` and writes ``<tmpdir>/<Input>.augustus.gff3``.
    When ``args.hints`` is set, ``<tmpdir>/<Input>.hints.gff`` and the
    extrinsic config are passed to Augustus as well.

    Args:
        Input: Scaffold name, used as the basename of the per-scaffold files.
    """
    aug_out = os.path.join(tmpdir, Input+'.augustus.gff3')
    # Build the command once; the hints options are the only difference
    # between the two modes.
    cmd = ['augustus', '--species='+args.species]
    if args.hints:
        cmd += ['--hintsfile='+os.path.join(tmpdir, Input+'.hints.gff'), extrinsic]
    cmd += ['--gff3=on', os.path.join(tmpdir, Input+'.fa')]
    # Both handles are closed on exit; the devnull handle used to leak on
    # every call.
    with open(aug_out, 'w') as output, open(os.devnull, 'w') as FNULL:
        status = subprocess.call(cmd, stdout=output, stderr=FNULL)
    if status != 0:
        # Augustus' own stderr is discarded, so report the failure here.
        sys.stderr.write("augustus exited with status %i for %s\n" % (status, Input))
54+
55+
56+
# first step is to split input fasta file into individual files in tmp folder
print("Splitting contigs and hints files")
tmpdir = 'augustus_tmp_'+str(os.getpid())
os.makedirs(tmpdir)
scaffolds = []
# Plain 'r': the 'U' mode flag was removed in Python 3.11.
with open(args.input, 'r') as InputFasta:
    for record in SeqIO.parse(InputFasta, 'fasta'):
        name = str(record.id)
        scaffolds.append(name)
        with open(os.path.join(tmpdir, name+'.fa'), 'w') as output:
            SeqIO.write(record, output, 'fasta')

# An empty FASTA would otherwise reach multiprocessing.Pool(0), which raises.
if not scaffolds:
    shutil.rmtree(tmpdir)
    print("No sequences found in %s" % args.input)
    sys.exit(1)

# if hints file passed, split it up by scaffold
if args.hints:
    # runAugustus passes a hints file for every scaffold, so create them all
    # up front (empty when a scaffold has no hints).
    for name in scaffolds:
        open(os.path.join(tmpdir, name+'.hints.gff'), 'w').close()
    known = set(scaffolds)
    current = None
    handle = None
    # Single pass over the hints file instead of re-reading it once per
    # scaffold; lines are appended in file order, so each per-scaffold file
    # has the same content as before.
    with open(args.hints, 'r') as hintsfile:
        try:
            for line in hintsfile:
                seqid = line.split('\t', 1)[0]
                if seqid != current:
                    if handle is not None:
                        handle.close()
                        handle = None
                    current = seqid
                    if seqid in known:
                        handle = open(os.path.join(tmpdir, seqid+'.hints.gff'), 'a')
                if handle is not None:
                    handle.write(line)
        finally:
            if handle is not None:
                handle.close()

# now loop through each scaffold running augustus
# Never ask for more workers than scaffolds, and never fewer than one.
num = max(1, min(args.cpus, len(scaffolds)))
print("Running augustus on %i scaffolds, using %i CPUs" % (len(scaffolds), num))
p = multiprocessing.Pool(num)
# chunksize=1 makes the pending counter below count individual scaffolds.
rs = p.map_async(runAugustus, scaffolds, chunksize=1)
p.close()
while not rs.ready():
    # NOTE: _number_left is a private attribute of the async result.
    print("Waiting for %i augustus jobs to complete..." % rs._number_left)
    # wait() returns as soon as the jobs finish rather than sleeping a full 30 s.
    rs.wait(30)
p.join()
# Re-raise any exception that occurred inside a worker.
rs.get()
print("Augustus prediction is finished, now concatenating results")
combined = os.path.join(tmpdir, 'augustus_all.gff3')
with open(combined, 'w') as output:
    for name in scaffolds:
        with open(os.path.join(tmpdir, name+'.augustus.gff3')) as part:
            shutil.copyfileobj(part, output)

join_script = os.path.join(AUGUSTUS_BASE, 'scripts', 'join_aug_pred.pl')
with open(args.out, 'w') as finalout:
    with open(combined, 'r') as merged:
        status = subprocess.call([join_script], stdin=merged, stdout=finalout)
if status != 0:
    # Keep the per-scaffold predictions so the run does not have to be redone.
    print("%s exited with status %i, intermediate files kept in %s" % (join_script, status, tmpdir))
    sys.exit(1)
shutil.rmtree(tmpdir)
print("Found %i total gene models" % countGFFgenes(args.out))

bin/funannotate-compare.py

+3-2
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ def __init__(self,prog):
2020
super(MyFormatter,self).__init__(prog,max_help_position=48)
2121
parser=argparse.ArgumentParser(prog='funannotate-compare.py', usage="%(prog)s [options] genome1.gbk genome2.gbk",
2222
description='''Funannotate comparative genomics.''',
23-
epilog="""Written by Jon Palmer (2015) [email protected]""",
23+
epilog="""Written by Jon Palmer (2016) [email protected]""",
2424
formatter_class = MyFormatter)
2525
parser.add_argument('-i','--input', nargs='+', help='List of funannotate genome folders')
2626
parser.add_argument('-o','--out', default='funannotate_compare', help='Name of output folder')
@@ -30,6 +30,7 @@ def __init__(self,prog):
3030
parser.add_argument('--bootstrap', default=100, type=int, help='Number of bootstraps to run with RAxML')
3131
parser.add_argument('--num_orthos', default=500, type=int, help='Number of Single-copy orthologs to run with RAxML')
3232
parser.add_argument('--outgroup', help='Name of species for RAxML outgroup')
33+
parser.add_argument('--eggnog_db', default='fuNOG', help='EggNog database')
3334
args=parser.parse_args()
3435

3536

@@ -756,7 +757,7 @@ def __init__(self,prog):
756757
goList.append(description)
757758
goDict[col[0]] = goList
758759

759-
EggNog = lib.eggnog2dict()
760+
EggNog = lib.eggnog2dict(os.path.join(parentdir, 'DB', args.eggnog_db+'.annotations.tsv'))
760761
iprDict = lib.dictFlipLookup(ipr, INTERPRO)
761762
pfamDict = lib.dictFlipLookup(pfam, PFAM)
762763
meropsDict = lib.dictFlip(merops)

bin/funannotate-functional.py

+24-4
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ def __init__(self,prog):
1717
super(MyFormatter,self).__init__(prog,max_help_position=48)
1818
parser=argparse.ArgumentParser(prog='funannotate-functional.py', usage="%(prog)s [options] -i genome.fasta -g genome.gff -o test -e [email protected]",
1919
description='''Script that adds functional annotation to a genome.''',
20-
epilog="""Written by Jon Palmer (2015) [email protected]""",
20+
epilog="""Written by Jon Palmer (2016) [email protected]""",
2121
formatter_class = MyFormatter)
2222
parser.add_argument('-i','--input', help='Folder from funannotate predict.')
2323
parser.add_argument('--genbank', help='Annotated genome in GenBank format')
@@ -36,6 +36,8 @@ def __init__(self,prog):
3636
parser.add_argument('--skip_iprscan', action='store_true', help='skip InterProScan remote query')
3737
parser.add_argument('--force', action='store_true', help='Over-write output folder')
3838
parser.add_argument('--AUGUSTUS_CONFIG_PATH', help='Path to Augustus config directory, $AUGUSTUS_CONFIG_PATH')
39+
parser.add_argument('--eggnog_db', default='fuNOG', help='EggNog database')
40+
parser.add_argument('--busco_db', default='fungi', choices=['fungi', 'metazoa', 'eukaryota', 'arthropoda', 'vertebrata'], help='BUSCO model database')
3941
args=parser.parse_args()
4042

4143
def runIPRpython(Input):
@@ -113,6 +115,23 @@ def runIPRpython(Input):
113115
if args.skip_iprscan:
114116
lib.log.error("To run InterProScan you need to specify an email address to identify yourself to the online service")
115117
os._exit(1)
118+
119+
#check EggNog database, download if necessary.
120+
if not args.eggnog_db in lib.Nogs:
121+
lib.log.error("%s is not a valid EggNog group, options are:\n%s" % (args.eggnog_db, ', '.join(lib.Nogs)))
122+
os._exit(1)
123+
if not os.path.isfile(os.path.join(parentdir, 'DB', args.eggnog_db+'_4.5.hmm')):
124+
lib.log.error("%s EggNog DB not found, trying to download and format..." % args.eggnog_db)
125+
subprocess.call([os.path.join(parentdir, 'util', 'getEggNog.sh'), args.eggnog_db, os.path.join(parentdir, 'DB')], stdout=FNULL, stderr=FNULL)
126+
if not os.path.isfile(os.path.join(parentdir, 'DB', args.eggnog_db+'_4.5.hmm')):
127+
lib.log.error("Downloading failed, exiting")
128+
os._exit(1)
129+
else:
130+
lib.log.error("%s downloaded and formatted, moving on." % args.eggnog_db)
131+
132+
#check buscos, download if necessary
133+
if not os.path.isdir(os.path.join(parentdir, 'DB', args.busco_db)):
134+
lib.download_buscos(args.busco_db)
116135

117136
#need to do some checks here of the input
118137
if not args.input:
@@ -252,14 +271,15 @@ def runIPRpython(Input):
252271
eggnog_out = os.path.join(outputdir, 'annotate_misc', 'annotations.eggnog.txt')
253272
lib.log.info("Annotating proteins with EggNog 4.5 database")
254273
if not lib.checkannotations(eggnog_out):
255-
lib.runEggNog(Proteins, args.cpus, 1e-10, os.path.join(outputdir, 'annotate_misc'), eggnog_out)
274+
lib.runEggNog(Proteins, os.path.join(parentdir, 'DB', args.eggnog_db+'_4.5.hmm'), os.path.join(parentdir, 'DB', args.eggnog_db+'.annotations.tsv'), args.cpus, 1e-10, os.path.join(outputdir, 'annotate_misc'), eggnog_out)
256275
num_annotations = lib.line_count(eggnog_out)
257276
lib.log.info('{0:,}'.format(num_annotations) + ' annotations added')
258277
#run BUSCO OGS search
259278
busco_out = os.path.join(outputdir, 'annotate_misc', 'annotations.busco.txt')
260-
lib.log.info("Annotating proteins with BUSCO models")
279+
lib.log.info("Annotating proteins with BUSCO %s models" % args.busco_db)
280+
buscoDB = os.path.join(parentdir, 'DB', args.busco_db)
261281
if not lib.checkannotations(busco_out):
262-
lib.runBUSCO(Proteins, args.cpus, os.path.join(outputdir, 'annotate_misc'), busco_out)
282+
lib.runBUSCO(Proteins, buscoDB, args.cpus, os.path.join(outputdir, 'annotate_misc'), busco_out)
263283
num_annotations = lib.line_count(busco_out)
264284
lib.log.info('{0:,}'.format(num_annotations) + ' annotations added')
265285
#run signalP if installed, have to manually install, so test if exists first, then run it if it does

0 commit comments

Comments
 (0)