Skip to content

Commit ca6f11b

Browse files
author
Jon Palmer
committed
auto detect training files in predict step
1 parent 7cbe2c2 commit ca6f11b

File tree

2 files changed

+29
-46
lines changed

2 files changed

+29
-46
lines changed

bin/funannotate-predict.py

+26 -34
Original file line number | Diff line number | Diff line change
@@ -61,33 +61,6 @@ def __init__(self, prog):
6161
parser.add_argument('--BAMTOOLS_PATH', help='Path to BamTools exe directory, $BAMTOOLS_PATH')
6262
args=parser.parse_args()
6363

64-
def download(url, name):
65-
file_name = name
66-
try:
67-
u = urllib2.urlopen(url)
68-
f = open(file_name, 'wb')
69-
meta = u.info()
70-
file_size = int(meta.getheaders("Content-Length")[0])
71-
lib.log.info("Downloading: {0} Bytes: {1}".format(url, file_size))
72-
file_size_dl = 0
73-
block_sz = 8192
74-
while True:
75-
buffer = u.read(block_sz)
76-
if not buffer:
77-
break
78-
file_size_dl += len(buffer)
79-
f.write(buffer)
80-
p = float(file_size_dl) / file_size
81-
status = r"{0} [{1:.2%}]".format(file_size_dl, p)
82-
status = status + chr(8)*(len(status)+1)
83-
sys.stdout.write(status)
84-
sys.stdout.flush()
85-
f.close()
86-
except socket.error as e:
87-
if e.errno != errno.ECONNRESET:
88-
raise
89-
pass
90-
9164
#check for conflicting folder names to avoid problems
9265
conflict = ['busco', 'busco_proteins', 'RepeatMasker', 'RepeatModeler', 'genemark', 'EVM_tmp', 'braker']
9366
if args.out in conflict:
@@ -203,6 +176,32 @@ def download(url, name):
203176
programs = ['exonerate', 'diamond', 'tbl2asn', 'gmes_petap.pl', 'rmblastn', 'BuildDatabase', 'RepeatModeler', 'RepeatMasker', GeneMark2GFF, AutoAug, 'bedtools', 'gmap', 'gmap_build', 'blat', 'pslCDnaFilter', 'augustus', 'etraining', 'rmOutToGFF3.pl']
204177
lib.CheckDependencies(programs)
205178

179+
#look for pre-existing data in training folder
180+
#look for pre-existing training data to use
181+
pre_existing = []
182+
if os.path.isdir(os.path.join(args.out, 'training')):
183+
traindir = os.path.join(args.out, 'training')
184+
if os.path.isfile(os.path.join(traindir, 'funannotate_train.coordSorted.bam')):
185+
if not args.rna_bam:
186+
args.rna_bam = os.path.join(traindir, 'funannotate_train.coordSorted.bam')
187+
pre_existing.append(' --rna_bam '+os.path.join(traindir, 'funannotate_train.coordSorted.bam'))
188+
if os.path.isfile(os.path.join(traindir, 'funannotate_train.trinity-GG.fasta')):
189+
if not args.transcript_evidence:
190+
args.transcript_evidence = [os.path.join(traindir, 'funannotate_train.trinity-GG.fasta')]
191+
pre_existing.append(' --transcript_evidence '+os.path.join(traindir, 'funannotate_train.trinity-GG.fasta'))
192+
else: #maybe passed a different one? then append to the list
193+
if not os.path.join(traindir, 'funannotate_train.trinity-GG.fasta') in args.transcript_evidence:
194+
args.transcript_evidence.append(os.path.join(traindir, 'funannotate_train.trinity-GG.fasta'))
195+
pre_existing.append(' --transcript_evidence '+' '.join(args.transcript_evidence))
196+
if os.path.isfile(os.path.join(traindir, 'funannotate_train.pasa.gff3')):
197+
if not args.pasa_gff:
198+
args.pasa_gff = os.path.join(traindir, 'funannotate_train.pasa.gff3')
199+
pre_existing.append(' --pasa_gff '+os.path.join(traindir, 'funannotate_train.pasa.gff3'))
200+
if len(pre_existing) > 0:
201+
lib.log.info("Found training files, will re-use these files:\n%s" % '\n'.join(pre_existing))
202+
203+
204+
206205
#see if organism/species/isolate was passed at command line, build PASA naming scheme
207206
organism = None
208207
if args.species:
@@ -234,13 +233,6 @@ def download(url, name):
234233
if augustuscheck[1] == 0:
235234
lib.log.error("ERROR: %s is not installed properly for BRAKER (check bam2hints compilation)" % augustuscheck[0])
236235
sys.exit(1)
237-
#Braker has some changed output behavior, hate to do this, but requiring at least v2.02
238-
#although braker.pl --version doesn't output a version... so dumb.
239-
#note Braker v2 apparently has a new config file requirement, check for it, download it if it doesn't exist
240-
#braker_extrinsic = os.path.join(AUGUSTUS_BASE, 'config', 'extrinsic', 'extrinsic.M.RM.E.W.P.cfg')
241-
#if not os.path.isfile(braker_extrinsic): #download it
242-
# lib.log.info("Augustus extrinsic file missing, will try to download and install")
243-
# download('https://raw.githubusercontent.com/nextgenusfs/augustus/master/config/extrinsic/extrinsic.M.RM.E.W.P.cfg', braker_extrinsic)
244236

245237
if not augspeciescheck: #means training needs to be done
246238
if augustuscheck[2] == 0:

bin/funannotate-train.py

+3 -12
Original file line number | Diff line number | Diff line change
@@ -623,23 +623,14 @@ def runPASAtrain(genome, transcripts, stranded, intronlen, cpus, dbname, output)
623623
if args.strain:
624624
lib.log.info('Trinity/PASA has completed, you are now ready to run funanotate predict, for example:\n\n\
625625
funannotate predict -i {:} \\\n\
626-
--transcript_evidence {:} \\\n\
627-
--rna_bam {:} \\\n\
628-
--pasa_gff {:} \\\n\
629-
-o {:} -s "{:}" --strain {:} --cpus {:}\n'.format(args.input, TranscriptFinal, BAMfinal, PASA_gff, args.out, organism, args.strain, args.cpus))
626+
-o {:} -s "{:}" --strain {:} --cpus {:}\n'.format(args.input, args.out, organism, args.strain, args.cpus))
630627
elif args.isolate:
631628
lib.log.info('Trinity/PASA has completed, you are now ready to run funanotate predict, for example:\n\n\
632629
funannotate predict -i {:} \\\n\
633-
--transcript_evidence {:} \\\n\
634-
--rna_bam {:} \\\n\
635-
--pasa_gff {:} \\\n\
636-
-o {:} -s "{:}" --isolate {:} --cpus {:}\n'.format(args.input, TranscriptFinal, BAMfinal, PASA_gff, args.out, organism, args.isolate, args.cpus))
630+
-o {:} -s "{:}" --isolate {:} --cpus {:}\n'.format(args.input, args.out, organism, args.isolate, args.cpus))
637631
else:
638632
lib.log.info('Trinity/PASA has completed, you are now ready to run funanotate predict, for example:\n\n\
639633
funannotate predict -i {:} \\\n\
640-
--transcript_evidence {:} \\\n\
641-
--rna_bam {:} \\\n\
642-
--pasa_gff {:} \\\n\
643-
-o {:} -s "{:}" --cpus {:}\n'.format(args.input, TranscriptFinal, BAMfinal, PASA_gff, args.out, organism, args.cpus))
634+
-o {:} -s "{:}" --cpus {:}\n'.format(args.input, args.out, organism, args.cpus))
644635
print("-------------------------------------------------------")
645636
sys.exit(1)

0 commit comments

Comments
 (0)