Skip to content

Commit ba2bc85

Browse files
Jon PalmerJon Palmer
Jon Palmer
authored and
Jon Palmer
committed
fixes/updates for more complete logging
1 parent 671e7fe commit ba2bc85

10 files changed

+308
-240
lines changed

README.md

+3
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ funannotate will likely run on any POSIX system, although it has only been teste
88

99
* [Mac OSX install instructions](docs/mac_install.md)
1010
* [Ubuntu install instructions](docs/ubuntu_install.md)
11+
* [FAQS](docs/faqs.md)
1112

1213
### Setup
1314

@@ -18,6 +19,8 @@ To run the setup script, type:
1819
funannotate setup --all
1920
```
2021

22+
Most problems that people have are with dependencies and installation of funannotate. Here are some Frequently Asked Questions: [FAQ](docs/faqs.md)
23+
2124
### Funannotate help menu
2225

2326
To see the help menu, simply type `funannotate` in the terminal window. Similarly, running any subcommand without arguments (e.g. `funannotate predict`) prints the options available for that script; this is consistent across all of the funannotate commands.

bin/augustus_parallel.py

+29-16
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,11 @@
11
#!/usr/bin/env python
22

3-
import sys, multiprocessing, subprocess, os, shutil, argparse, time
3+
import sys, multiprocessing, subprocess, os, shutil, argparse, time, inspect
44
from Bio import SeqIO
5+
currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
6+
parentdir = os.path.dirname(currentdir)
7+
sys.path.insert(0,parentdir)
8+
import lib.library as lib
59

610
#setup menu with argparse
711
class MyFormatter(argparse.ArgumentDefaultsHelpFormatter):
@@ -17,6 +21,7 @@ def __init__(self,prog):
1721
parser.add_argument('--hints', help='Hints file (PE)')
1822
parser.add_argument('--cpus', default=2, type=int, help='Number of CPUs to run')
1923
parser.add_argument('--debug', action='store_true', help='Keep intermediate files')
24+
parser.add_argument('--logfile', default ='augustus-parallel.log', help='logfile')
2025
args=parser.parse_args()
2126

2227
#check for augustus installation
@@ -43,7 +48,6 @@ def countGFFgenes(input):
4348
return count
4449

4550
def runAugustus(Input):
46-
FNULL = open(os.devnull, 'w')
4751
if '_part' in Input:
4852
chr = Input.split('_part')[0]
4953
else:
@@ -64,9 +68,17 @@ def runAugustus(Input):
6468
with open(aug_out, 'w') as output:
6569
subprocess.call(core_cmd, stdout = output)
6670

71+
log_name = args.logfile
72+
if os.path.isfile(log_name):
73+
os.remove(log_name)
74+
75+
#initialize script, log system info and cmd issue at runtime
76+
lib.setupLogging(log_name)
77+
cmd_args = " ".join(sys.argv)+'\n'
78+
lib.log.debug(cmd_args)
6779

6880
#first step is to split input fasta file into individual files in tmp folder
69-
print("Splitting contigs and hints files")
81+
lib.log.debug("Splitting contigs and hints files")
7082
tmpdir = 'augustus_tmp_'+str(os.getpid())
7183
os.makedirs(tmpdir)
7284
scaffolds = []
@@ -121,22 +133,23 @@ def runAugustus(Input):
121133
num = len(scaffolds)
122134
else:
123135
num = args.cpus
124-
print("Running augustus on %i chunks, using %i CPUs" % (len(scaffolds), num))
136+
lib.log.debug("Running Augustus on %i chunks, using %i CPUs" % (len(scaffolds), num))
125137
p = multiprocessing.Pool(num)
138+
tasks = len(scaffolds)
126139
results = []
127-
r = [p.apply_async(runAugustus, (x,), callback=results.append) for x in scaffolds]
140+
for i in scaffolds:
141+
results.append(p.apply_async(runAugustus, [i]))
142+
while True:
143+
incomplete_count = sum(1 for x in results if not x.ready())
144+
if incomplete_count == 0:
145+
break
146+
sys.stdout.write(" Progress: %.2f%% \r" % (float(tasks - incomplete_count) / tasks * 100))
147+
sys.stdout.flush()
148+
time.sleep(1)
128149
p.close()
129150
p.join()
130-
'''
131-
rs = p.map_async(runAugustus, scaffolds)
132-
p.close()
133-
while (True):
134-
if (rs.ready()): break
135-
remaining = rs._number_left
136-
print "Waiting for", remaining, "augustus jobs to complete..."
137-
time.sleep(30)
138-
'''
139-
print("Augustus prediction is finished, now concatenating results")
151+
152+
lib.log.debug("Augustus prediction is finished, now concatenating results")
140153
with open(os.path.join(tmpdir, 'augustus_all.gff3'), 'w') as output:
141154
for file in scaffolds:
142155
file = os.path.join(tmpdir, file+'.augustus.gff3')
@@ -149,4 +162,4 @@ def runAugustus(Input):
149162
subprocess.call([join_script],stdin = input, stdout = finalout)
150163
if not args.debug:
151164
shutil.rmtree(tmpdir)
152-
print("Found %i total gene models" % countGFFgenes(args.out))
165+
lib.log.info("Found %i gene models" % countGFFgenes(args.out))

bin/funannotate-compare.py

+16-9
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,8 @@ def __init__(self,prog):
4444
sys.exit(1)
4545
if not os.path.isfile(os.path.join(parentdir, 'DB', args.eggnog_db+'_4.5.hmm')):
4646
lib.log.error("%s EggNog DB not found, trying to download and format..." % args.eggnog_db)
47-
subprocess.call([os.path.join(parentdir, 'util', 'getEggNog.sh'), args.eggnog_db, os.path.join(parentdir, 'DB')], stdout=FNULL, stderr=FNULL)
47+
cmd = [os.path.join(parentdir, 'util', 'getEggNog.sh'), args.eggnog_db, os.path.join(parentdir, 'DB')]
48+
lib.runSubprocess(cmd, '.', lib.log)
4849
if not os.path.isfile(os.path.join(parentdir, 'DB', args.eggnog_db+'_4.5.hmm')):
4950
lib.log.error("Downloading failed, exiting")
5051
sys.exit(1)
@@ -596,8 +597,8 @@ def __init__(self,prog):
596597
base = f.replace('.txt', '')
597598
goa_out = os.path.join(args.out, 'go_enrichment', base+'.go.enrichment.txt')
598599
if not lib.checkannotations(goa_out):
599-
with open(goa_out, 'w') as output:
600-
subprocess.call(['find_enrichment.py', '--obo', os.path.join(parentdir, 'DB', 'go.obo'), '--pval', '0.001', '--alpha', '0.001', '--method', 'fdr', file, os.path.join(go_folder, 'population.txt'), os.path.join(go_folder, 'associations.txt')], stderr=FNULL, stdout=output)
600+
cmd = ['find_enrichment.py', '--obo', os.path.join(parentdir, 'DB', 'go.obo'), '--pval', '0.001', '--alpha', '0.001', '--method', 'fdr', file, os.path.join(go_folder, 'population.txt'), os.path.join(go_folder, 'associations.txt')]
601+
lib.runSubprocess2(cmd, '.', lib.log, goa_out)
601602

602603
#load into pandas and write to html
603604
with open(os.path.join(args.out, 'go.html'), 'w') as output:
@@ -608,8 +609,8 @@ def __init__(self,prog):
608609
for f in os.listdir(os.path.join(args.out, 'go_enrichment')):
609610
if f.endswith('go.enrichment.txt'):
610611
file = os.path.join(args.out, 'go_enrichment', f)
611-
base = file.split('.go_enrichment.txt')[0]
612-
name = base.split('/')[-1]
612+
base = os.path.basename(file)
613+
name = base.split('.go_enrichment.txt')[0]
613614
#check goatools output, return is a tuple with True/False and header line #
614615
goresult = lib.checkgoatools(file)
615616
output.write('<h4 class="sub-header" align="left">GO Enrichment: '+name+'</h4>')
@@ -621,7 +622,7 @@ def __init__(self,prog):
621622
df2 = df.loc[df['p_fdr'] < args.go_fdr]
622623
df2.sort_values(by='enrichment', inplace=True)
623624
if len(df2) > 0:
624-
df2.to_csv(base+'.fdr_enriched.csv', index=False)
625+
df2.to_csv(os.path.join(args.out, 'go_enrichment', base+'.fdr_enriched.csv'), index=False)
625626
#apparently goatools also changed the headers....arrggh...
626627
df2['GO'] = '<a target="_blank" href="http://amigo.geneontology.org/amigo/search/ontology?q='+ df2['GO'].astype(str)+'">'+df2['GO']+'</a>'
627628
output.write(df2.to_html(escape=False, index=False, classes='table table-hover'))
@@ -784,13 +785,19 @@ def __init__(self,prog):
784785
line = line.replace('\n', '')
785786
cols = line.split('\t')
786787
if args.run_dnds:
787-
dNdS = dNdSresults.get(cols[0])
788+
if cols[0] in dNdSresults:
789+
dNdS = dNdSresults.get(cols[0])
790+
else:
791+
dNdS = ('NC', 'NC', 'NC')
788792
else:
789793
dNdS = ('NC', 'NC', 'NC')
790794
if args.run_dnds == 'estimate':
791795
output.write("%s\t%s (NC,NC)\t%s\t%s\t%s\n" % (cols[0], dNdS[0], cols[1], cols[2], cols[3]))
792796
else:
793-
output.write("%s\t%s (%.4f,%.4f)\t%s\t%s\t%s\n" % (cols[0], dNdS[0], round(float(dNdS[1]),4), round(float(dNdS[2]),4), cols[1], cols[2], cols[3]))
797+
try:
798+
output.write("%s\t%s (%f,%f)\t%s\t%s\t%s\n" % (cols[0], dNdS[0], round(float(dNdS[1]),4), round(float(dNdS[2]),4), cols[1], cols[2], cols[3]))
799+
except ValueError or NoneType:
800+
output.write("%s\t%s (NA,NA)\t%s\t%s\t%s\n" % (cols[0], dNdS[0], cols[1], cols[2], cols[3]))
794801

795802
#cleanup
796803
os.remove(orthologstmp)
@@ -829,7 +836,7 @@ def __init__(self,prog):
829836
stats[i].append("{0:,}".format(scoCount))
830837

831838
for i in range(0, len(stats)):
832-
summary.append(stats[i])
839+
summary.append(stats[i])
833840

834841

835842
#convert to dataframe for easy output

bin/funannotate-functional.py

+24-15
Original file line numberDiff line numberDiff line change
@@ -121,7 +121,8 @@ def runIPRpython(Input):
121121
sys.exit(1)
122122
if not os.path.isfile(os.path.join(parentdir, 'DB', args.eggnog_db+'_4.5.hmm')):
123123
lib.log.error("%s EggNog DB not found, trying to download and format..." % args.eggnog_db)
124-
subprocess.call([os.path.join(parentdir, 'util', 'getEggNog.sh'), args.eggnog_db, os.path.join(parentdir, 'DB')], stdout=FNULL, stderr=FNULL)
124+
cmd = [os.path.join(parentdir, 'util', 'getEggNog.sh'), args.eggnog_db, os.path.join(parentdir, 'DB')]
125+
lib.runSubprocess(cmd, '.', lib.log)
125126
if not os.path.isfile(os.path.join(parentdir, 'DB', args.eggnog_db+'_4.5.hmm')):
126127
lib.log.error("Downloading failed, exiting")
127128
sys.exit(1)
@@ -133,6 +134,11 @@ def runIPRpython(Input):
133134
lib.download_buscos(args.busco_db)
134135

135136
#need to do some checks here of the input
137+
genbank = ''
138+
Scaffolds = ''
139+
Proteins = ''
140+
Transcripts = ''
141+
GFF = ''
136142
if not args.input:
137143
#did not parse folder of funannotate results, so need either gb + gff or fasta + proteins, + gff and also need to have args.out for output folder
138144
if not args.out:
@@ -226,7 +232,7 @@ def runIPRpython(Input):
226232

227233
#get organism and isolate from GBK file
228234
if not args.species:
229-
if args.genbank:
235+
if genbank != '':
230236
with open(genbank, 'rU') as gbk:
231237
SeqRecords = SeqIO.parse(gbk, 'genbank')
232238
for record in SeqRecords:
@@ -239,8 +245,8 @@ def runIPRpython(Input):
239245
isolate = args.isolate
240246
break
241247
else:
242-
organism = '???'
243-
isolate = '???'
248+
lib.log.error("No species name given will cause problems downstream, please pass a name to -s,--species")
249+
sys.exit(1)
244250
else:
245251
organism = args.species
246252
if not args.isolate:
@@ -355,21 +361,22 @@ def runIPRpython(Input):
355361
os.makedirs(IPROUT)
356362
#now split XML file
357363
splitter = os.path.join(parentdir, 'util', 'prepare_ind_xml.pl')
358-
subprocess.call([splitter, args.iprscan, IPROUT], stdout = FNULL, stderr = FNULL)
364+
cmd = [splitter, args.iprscan, IPROUT]
365+
lib.runSubprocess(cmd, '.', lib.log)
359366

360367
#now collect the results from InterProscan, then start to reformat results
361368
lib.log.info("InterProScan has finished, now pulling out annotations from results")
362369
IPR_terms = os.path.join(outputdir, 'annotate_misc', 'annotations.iprscan.txt')
363370
if not os.path.isfile(IPR_terms):
364371
IPR2TSV = os.path.join(parentdir, 'util', 'ipr2tsv.py')
365-
with open(IPR_terms, 'w') as output:
366-
subprocess.call([sys.executable, IPR2TSV, IPROUT], stdout = output, stderr = FNULL)
372+
cmd = [sys.executable, IPR2TSV, IPROUT]
373+
lib.runSubprocess2(cmd, '.', lib.log, IPR_terms)
367374
GO_terms = os.path.join(outputdir, 'annotate_misc', 'annotations.GO.txt')
368375
if not os.path.isfile(GO_terms):
369376
IPR2GO = os.path.join(parentdir, 'util', 'ipr2go.py')
370377
OBO = os.path.join(parentdir, 'DB', 'go.obo')
371-
with open(GO_terms, 'w') as output:
372-
subprocess.call([sys.executable, IPR2GO, OBO, IPROUT], stdout = output, stderr = FNULL)
378+
cmd = [sys.executable, IPR2GO, OBO, IPROUT]
379+
lib.runSubprocess2(cmd, '.', lib.log, GO_terms)
373380

374381

375382
#check if antiSMASH data is given, if so parse and reformat for annotations and cluster textual output
@@ -409,7 +416,8 @@ def runIPRpython(Input):
409416
#launch gag
410417
GAG = os.path.join(outputdir, 'annotate_misc', 'gag')
411418
lib.log.info("Adding annotations to GFF using GAG")
412-
subprocess.call(['gag.py', '-f', Scaffolds, '-g', GFF, '-a', ANNOTS, '-o', GAG], stdout = FNULL, stderr = FNULL)
419+
cmd = ['gag.py', '-f', Scaffolds, '-g', GFF, '-a', ANNOTS, '-o', GAG]
420+
lib.runSubprocess(cmd, '.', lib.log)
413421

414422
#fix the tbl file for tRNA genes
415423
lib.log.info("Fixing tRNA annotations in GenBank tbl file")
@@ -432,7 +440,8 @@ def runIPRpython(Input):
432440
shutil.copyfile(os.path.join(GAG, 'genome.fasta'), os.path.join(GAG, 'genome.fsa'))
433441
discrep = 'discrepency.report.txt'
434442
lib.log.info("Converting to final Genbank format, good luck!.....")
435-
subprocess.call(['tbl2asn', '-p', GAG, '-t', SBT, '-M', 'n', '-Z', discrep, '-a', 'r10u', '-l', 'paired-ends', '-j', ORGANISM, '-V', 'b', '-c', 'fx'], stdout = FNULL, stderr = FNULL)
443+
cmd = ['tbl2asn', '-p', GAG, '-t', SBT, '-M', 'n', '-Z', discrep, '-a', 'r10u', '-l', 'paired-ends', '-j', ORGANISM, '-V', 'b', '-c', 'fx']
444+
lib.runSubprocess(cmd, '.', lib.log)
436445

437446
#collected output files and rename accordingly
438447
ResultsFolder = os.path.join(outputdir, 'annotate_results')
@@ -451,9 +460,8 @@ def runIPRpython(Input):
451460
lib.log.info("Creating AGP file and corresponding contigs file")
452461
agp2fasta = os.path.join(parentdir, 'util', 'fasta2agp.pl')
453462
AGP = os.path.join(ResultsFolder, baseOUTPUT+'.agp')
454-
with open(AGP, 'w') as output:
455-
subprocess.call(['perl', agp2fasta, baseOUTPUT+'.scaffolds.fa'], cwd = ResultsFolder, stdout = output, stderr = FNULL)
456-
463+
cmd = ['perl', agp2fasta, baseOUTPUT+'.scaffolds.fa']
464+
lib.runSubprocess2(cmd, ResultsFolder, lib.log, AGP)
457465

458466
#write secondary metabolite clusters output using the final genome in gbk format
459467
if args.antismash:
@@ -474,7 +482,8 @@ def runIPRpython(Input):
474482
for record in SeqRecords:
475483
if record.id in AllProts:
476484
SeqIO.write(record, output, 'fasta')
477-
subprocess.call(['blastp', '-query', mibig_fasta, '-db', mibig_db, '-num_threads', str(args.cpus), '-max_target_seqs', '1', '-max_hsps', '1', '-evalue', '0.001', '-outfmt', '6', '-out', mibig_blast])
485+
cmd = ['blastp', '-query', mibig_fasta, '-db', mibig_db, '-num_threads', str(args.cpus), '-max_target_seqs', '1', '-max_hsps', '1', '-evalue', '0.001', '-outfmt', '6', '-out', mibig_blast]
486+
lib.runSubprocess(cmd, '.', lib.log)
478487
#now parse blast results to get {qseqid: hit}
479488
MIBiGBlast = {}
480489
with open(mibig_blast, 'rU') as input:

0 commit comments

Comments
 (0)