|
| 1 | +#!/usr/bin/env python |
| 2 | + |
| 3 | +import sys, multiprocessing, subprocess, os, shutil, argparse, time |
| 4 | +from Bio import SeqIO |
| 5 | + |
| 6 | +#setup menu with argparse |
| 7 | +class MyFormatter(argparse.ArgumentDefaultsHelpFormatter): |
| 8 | + def __init__(self,prog): |
| 9 | + super(MyFormatter,self).__init__(prog,max_help_position=48) |
| 10 | +parser=argparse.ArgumentParser(prog='augustus_parallel.py', usage="%(prog)s [options] -i genome.fasta -s botrytis_cinera -o new_genome", |
| 11 | + description='''Script that does it all...''', |
| 12 | + epilog="""Written by Jon Palmer (2016) [email protected]""", |
| 13 | + formatter_class = MyFormatter) |
| 14 | +parser.add_argument('-i','--input', required=True, help='Genome in FASTA format') |
| 15 | +parser.add_argument('-o','--out', required=True, help='Basename of output files') |
| 16 | +parser.add_argument('-s','--species', required=True, help='Augustus species name') |
| 17 | +parser.add_argument('--hints', help='Hints file (PE)') |
| 18 | +parser.add_argument('--cpus', default=2, type=int, help='Number of CPUs to run') |
| 19 | +args=parser.parse_args() |
| 20 | + |
| 21 | +#check for augustus installation |
| 22 | +try: |
| 23 | + AUGUSTUS = os.environ["AUGUSTUS_CONFIG_PATH"] |
| 24 | +except KeyError: |
| 25 | + if not args.AUGUSTUS_CONFIG_PATH: |
| 26 | + print("$AUGUSTUS_CONFIG_PATH environmental variable not found, Augustus is not properly configured") |
| 27 | + os._exit(1) |
| 28 | +if AUGUSTUS.endswith('config'): |
| 29 | + AUGUSTUS_BASE = AUGUSTUS.replace('config', '') |
| 30 | +elif AUGUSTUS.endswith('config'+os.sep): |
| 31 | + AUGUSTUS_BASE = AUGUSTUS.replace('config'+os.sep, '') |
| 32 | + |
| 33 | +#setup hints and extrinic input, hard coded for protein and transcript alignments from funannotate |
| 34 | +extrinsic = '--extrinsicCfgFile='+os.path.join(AUGUSTUS_BASE, 'config', 'extrinsic', 'extrinsic.E.XNT.cfg') |
| 35 | + |
| 36 | +def countGFFgenes(input): |
| 37 | + count = 0 |
| 38 | + with open(input, 'rU') as f: |
| 39 | + for line in f: |
| 40 | + if "\tgene\t" in line: |
| 41 | + count += 1 |
| 42 | + return count |
| 43 | + |
| 44 | +def runAugustus(Input): |
| 45 | + FNULL = open(os.devnull, 'w') |
| 46 | + species='--species='+args.species |
| 47 | + hints_input = '--hintsfile='+os.path.join(tmpdir, Input+'.hints.gff') |
| 48 | + aug_out = os.path.join(tmpdir, Input+'.augustus.gff3') |
| 49 | + with open(aug_out, 'w') as output: |
| 50 | + if args.hints: |
| 51 | + subprocess.call(['augustus', species, hints_input, extrinsic, '--gff3=on', os.path.join(tmpdir, Input+'.fa')], stdout = output, stderr= FNULL) |
| 52 | + else: |
| 53 | + subprocess.call(['augustus', species, '--gff3=on', os.path.join(tmpdir, Input+'.fa')], stdout = output, stderr = FNULL) |
| 54 | + |
| 55 | + |
| 56 | +#first step is to split input fasta file into individual files in tmp folder |
| 57 | +print("Splitting contigs and hints files") |
| 58 | +tmpdir = 'augustus_tmp_'+str(os.getpid()) |
| 59 | +os.makedirs(tmpdir) |
| 60 | +scaffolds = [] |
| 61 | +with open(args.input, 'rU') as InputFasta: |
| 62 | + for record in SeqIO.parse(InputFasta, 'fasta'): |
| 63 | + name = str(record.id) |
| 64 | + scaffolds.append(name) |
| 65 | + outputfile = os.path.join(tmpdir, name+'.fa') |
| 66 | + with open(outputfile, 'w') as output: |
| 67 | + SeqIO.write(record, output, 'fasta') |
| 68 | + |
| 69 | +#if hints file passed, split it up by scaffold |
| 70 | +if args.hints: |
| 71 | + for i in scaffolds: |
| 72 | + with open(os.path.join(tmpdir, i+'.hints.gff'), 'w') as output: |
| 73 | + with open(args.hints, 'rU') as hintsfile: |
| 74 | + for line in hintsfile: |
| 75 | + cols = line.split('\t') |
| 76 | + if cols[0] == i: |
| 77 | + output.write(line) |
| 78 | + |
| 79 | +#now loop through each scaffold running augustus |
| 80 | +if args.cpus > len(scaffolds): |
| 81 | + num = len(scaffolds) |
| 82 | +else: |
| 83 | + num = args.cpus |
| 84 | +print("Running augustus on %i scaffolds, using %i CPUs" % (len(scaffolds), num)) |
| 85 | +p = multiprocessing.Pool(num) |
| 86 | +rs = p.map_async(runAugustus, scaffolds) |
| 87 | +p.close() |
| 88 | +while (True): |
| 89 | + if (rs.ready()): break |
| 90 | + remaining = rs._number_left |
| 91 | + print "Waiting for", remaining, "augustus jobs to complete..." |
| 92 | + time.sleep(30) |
| 93 | +print("Augustus prediction is finished, now concatenating results") |
| 94 | +with open(os.path.join(tmpdir, 'augustus_all.gff3'), 'w') as output: |
| 95 | + for file in scaffolds: |
| 96 | + file = os.path.join(tmpdir, file+'.augustus.gff3') |
| 97 | + with open(file) as input: |
| 98 | + output.write(input.read()) |
| 99 | + |
| 100 | +join_script = os.path.join(AUGUSTUS_BASE, 'scripts', 'join_aug_pred.pl') |
| 101 | +with open(args.out, 'w') as finalout: |
| 102 | + with open(os.path.join(tmpdir, 'augustus_all.gff3'), 'rU') as input: |
| 103 | + subprocess.call([join_script],stdin = input, stdout = finalout) |
| 104 | +shutil.rmtree(tmpdir) |
| 105 | +print("Found %i total gene models" % countGFFgenes(args.out)) |
0 commit comments