|
1 | 1 | from __future__ import division
|
2 |
| -import os, subprocess, logging, sys, argparse, inspect, csv, time, re, shutil, datetime, glob, platform, multiprocessing |
| 2 | +import os, subprocess, logging, sys, argparse, inspect, csv, time, re, shutil, datetime, glob, platform, multiprocessing, itertools |
3 | 3 | from natsort import natsorted
|
4 | 4 | import warnings
|
5 | 5 | from Bio import SeqIO
|
@@ -74,6 +74,9 @@ def readBlocks(source, pattern):
|
74 | 74 | buffer.append( line )
|
75 | 75 | yield buffer
|
76 | 76 |
|
def empty_line_sep(line):
    """Return True when *line* is an empty separator line.

    A separator is a line holding nothing but its line terminator. Both the
    Unix ('\\n') and Windows ('\\r\\n') terminators are accepted, so files
    with CRLF endings that were read without newline translation are still
    split on their blank lines. Lines containing any other character
    (including spaces or tabs) and the empty string are not separators.
    """
    return line in ('\n', '\r\n')
| 79 | + |
def get_parent_dir(directory):
    """Return the directory portion of the path *directory*.

    This is everything before the final path separator; a path without a
    separator yields an empty string.
    """
    parent, _tail = os.path.split(directory)
    return parent
|
79 | 82 |
|
@@ -1732,31 +1735,35 @@ def getTrainResults(input):
|
1732 | 1735 | values3 = line.split('|') #get [6] and [7]
|
1733 | 1736 | return (values1[1], values1[2], values2[6], values2[7], values3[6], values3[7])
|
1734 | 1737 |
|
1735 |
def _logTrainingAccuracy(label, train_results):
    """Log gene- and exon-level accuracy taken from a getTrainResults() tuple.

    Index 4 is reported as the fraction of genes predicted exactly and
    index 2 as the fraction of exons predicted exactly.
    """
    log.info(label+' training: '+'{0:.2%}'.format(float(train_results[4]))+' genes predicted exactly and '+'{0:.2%}'.format(float(train_results[2]))+' of exons predicted exactly')


def trainAugustus(AUGUSTUS_BASE, train_species, trainingset, genome, outdir, cpus, optimize=True):
    """Train an AUGUSTUS species model and log its prediction accuracy.

    Parameters:
        AUGUSTUS_BASE: AUGUSTUS installation directory; helper scripts are
            looked up in its 'scripts' sub-directory.
        train_species: species name, passed to every tool as --species=<name>.
        trainingset: path to the training gene-model file handed to
            etraining, randomSplit.pl and optimize_augustus.pl.
        genome: not used by this function; kept for caller compatibility.
        outdir: output directory; the training log goes to
            <outdir>/logfiles/augustus_training.log and accuracy reports to
            <outdir>/predict_misc/.
        cpus: number of CPUs forwarded to optimize_augustus.pl as --cpus=<n>.
        optimize: when true, run optimize_augustus.pl, retrain and re-evaluate.
            Defaults to True so that callers written for the earlier
            six-argument signature keep the always-optimize behaviour.

    Exit codes of the external programs are not checked.
    """
    scripts_dir = os.path.join(AUGUSTUS_BASE, 'scripts')
    RANDOMSPLIT = os.path.join(scripts_dir, 'randomSplit.pl')
    OPTIMIZE = os.path.join(scripts_dir, 'optimize_augustus.pl')
    NEW_SPECIES = os.path.join(scripts_dir, 'new_species.pl')
    aug_cpus = '--cpus='+str(cpus)
    species = '--species='+train_species
    aug_log = os.path.join(outdir, 'logfiles', 'augustus_training.log')
    # scratch directory removed after optimization; presumably created by
    # optimize_augustus.pl in the current working directory -- TODO confirm
    trainingdir = 'tmp_opt_'+train_species
    initial_report = os.path.join(outdir, 'predict_misc', 'augustus.initial.training.txt')
    final_report = os.path.join(outdir, 'predict_misc', 'augustus.final.training.txt')
    with open(aug_log, 'w') as logfile:
        # create the species parameter set only if it does not exist yet
        if not CheckAugustusSpecies(train_species):
            subprocess.call([NEW_SPECIES, species], stdout=logfile, stderr=logfile)
        #run etraining again to only use best models from EVM for training
        # NOTE(review): etraining sees the full training set before the split
        # below, so the '.test' models are presumably also part of what was
        # trained on -- confirm this is intended.
        subprocess.call(['etraining', species, trainingset], stderr=logfile, stdout=logfile)
        subprocess.call([RANDOMSPLIT, trainingset, '200']) #split off 200 models for testing purposes
        with open(initial_report, 'w') as initialtraining:
            subprocess.call(['augustus', species, trainingset+'.test'], stdout=initialtraining)
        train_results = getTrainResults(initial_report)
        _logTrainingAccuracy('Initial', train_results)
        if optimize:
            #now run optimization
            subprocess.call([OPTIMIZE, species, aug_cpus, trainingset], stderr=logfile, stdout=logfile)
            #run etraining again
            subprocess.call(['etraining', species, trainingset], stderr=logfile, stdout=logfile)
            with open(final_report, 'w') as finaltraining:
                subprocess.call(['augustus', species, trainingset+'.test'], stdout=finaltraining)
            train_results = getTrainResults(final_report)
            _logTrainingAccuracy('Optimized', train_results)
            #clean up tmp folder
            # guard the removal: if the optimizer failed before creating the
            # directory, rmtree would raise and abort the caller after
            # training has already finished
            if os.path.isdir(trainingdir):
                shutil.rmtree(trainingdir)
1760 | 1767 |
|
1761 | 1768 | HEADER = '''
|
1762 | 1769 | <!DOCTYPE html>
|
|
0 commit comments