nextgenusfs
diff --git a/‎README.md
Lines changed: 3 additions & 0 deletions b/‎README.md
Lines changed: 3 additions & 0 deletions
diff --git a/‎bin/augustus_parallel.py
Lines changed: 29 additions & 16 deletions b/‎bin/augustus_parallel.py
Lines changed: 29 additions & 16 deletions
diff --git a/‎bin/funannotate-compare.py
Lines changed: 16 additions & 9 deletions b/‎bin/funannotate-compare.py
Lines changed: 16 additions & 9 deletions
diff --git a/‎bin/funannotate-functional.py
Lines changed: 24 additions & 15 deletions b/‎bin/funannotate-functional.py
Lines changed: 24 additions & 15 deletions
@@ -8,6 +8,7 @@ funannotate will likely run on any POSIX system, although it has only been teste
 
 * [Mac OSX install instructions](docs/mac_install.md)
 * [Ubuntu install instructions](docs/ubuntu_install.md)
+* [FAQs](docs/faqs.md)
 
 ###Setup
 
@@ -18,6 +19,8 @@ To run the setup script, type:
 funannotate setup --all
 ```
 
+Most problems that people have are with dependencies and installation of funannotate.  Here are some Frequently Asked Questions: [FAQ](docs/faqs.md)
+
 ###Funannotate help menu
 
 To see the help menu, simply type `funannotate` in the terminal window.  Similarly, e.g `funannotate predict` without any arguments will give you the options available to pass to each script, this is consistent for all of the funannotate commands.
 
@@ -1,7 +1,11 @@
 #!/usr/bin/env python
 
-import sys, multiprocessing, subprocess, os, shutil, argparse, time
+import sys, multiprocessing, subprocess, os, shutil, argparse, time, inspect
 from Bio import SeqIO
+currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
+parentdir = os.path.dirname(currentdir)
+sys.path.insert(0,parentdir)
+import lib.library as lib
 
 #setup menu with argparse
 class MyFormatter(argparse.ArgumentDefaultsHelpFormatter):
@@ -17,6 +21,7 @@ def __init__(self,prog):
 parser.add_argument('--hints', help='Hints file (PE)')
 parser.add_argument('--cpus', default=2, type=int, help='Number of CPUs to run')
 parser.add_argument('--debug', action='store_true', help='Keep intermediate files')
+parser.add_argument('--logfile', default ='augustus-parallel.log', help='logfile')
 args=parser.parse_args()
 
 #check for augustus installation
@@ -43,7 +48,6 @@ def countGFFgenes(input):
     return count
 
 def runAugustus(Input):
-    FNULL = open(os.devnull, 'w')
     if '_part' in Input:
         chr = Input.split('_part')[0]
     else:
@@ -64,9 +68,17 @@ def runAugustus(Input):
     with open(aug_out, 'w') as output:
         subprocess.call(core_cmd, stdout = output)
 
+log_name = args.logfile
+if os.path.isfile(log_name):
+    os.remove(log_name)
+
+#initialize script, log system info and cmd issue at runtime
+lib.setupLogging(log_name)
+cmd_args = " ".join(sys.argv)+'\n'
+lib.log.debug(cmd_args)
 
 #first step is to split input fasta file into individual files in tmp folder
-print("Splitting contigs and hints files")
+lib.log.debug("Splitting contigs and hints files")
 tmpdir = 'augustus_tmp_'+str(os.getpid())
 os.makedirs(tmpdir)
 scaffolds = []
@@ -121,22 +133,23 @@ def runAugustus(Input):
     num = len(scaffolds)
 else:
     num = args.cpus
-print("Running augustus on %i chunks, using %i CPUs" % (len(scaffolds), num))
+lib.log.debug("Running Augustus on %i chunks, using %i CPUs" % (len(scaffolds), num))
 p = multiprocessing.Pool(num)
+tasks = len(scaffolds)
 results = []
-r = [p.apply_async(runAugustus, (x,), callback=results.append) for x in scaffolds]
+for i in scaffolds:
+    results.append(p.apply_async(runAugustus, [i]))
+while True:
+    incomplete_count = sum(1 for x in results if not x.ready())
+    if incomplete_count == 0:
+        break
+    sys.stdout.write("     Progress: %.2f%% \r" % (float(tasks - incomplete_count) / tasks * 100))
+    sys.stdout.flush()
+    time.sleep(1)
 p.close()
 p.join()
-'''
-rs = p.map_async(runAugustus, scaffolds)
-p.close()
-while (True):
-    if (rs.ready()): break
-    remaining = rs._number_left
-    print "Waiting for", remaining, "augustus jobs to complete..."
-    time.sleep(30)
-'''
-print("Augustus prediction is finished, now concatenating results")
+
+lib.log.debug("Augustus prediction is finished, now concatenating results")
 with open(os.path.join(tmpdir, 'augustus_all.gff3'), 'w') as output:
     for file in scaffolds:
         file = os.path.join(tmpdir, file+'.augustus.gff3')
@@ -149,4 +162,4 @@ def runAugustus(Input):
         subprocess.call([join_script],stdin = input, stdout = finalout)
 if not args.debug:
     shutil.rmtree(tmpdir)
-print("Found %i total gene models" % countGFFgenes(args.out))
+lib.log.info("Found %i gene models" % countGFFgenes(args.out))
@@ -44,7 +44,8 @@ def __init__(self,prog):
     sys.exit(1)
 if not os.path.isfile(os.path.join(parentdir, 'DB', args.eggnog_db+'_4.5.hmm')):
     lib.log.error("%s EggNog DB not found, trying to download and format..." % args.eggnog_db)
-    subprocess.call([os.path.join(parentdir, 'util', 'getEggNog.sh'), args.eggnog_db, os.path.join(parentdir, 'DB')], stdout=FNULL, stderr=FNULL)
+    cmd = [os.path.join(parentdir, 'util', 'getEggNog.sh'), args.eggnog_db, os.path.join(parentdir, 'DB')]
+    lib.runSubprocess(cmd, '.', lib.log)
     if not os.path.isfile(os.path.join(parentdir, 'DB', args.eggnog_db+'_4.5.hmm')):
         lib.log.error("Downloading failed, exiting")
         sys.exit(1)
@@ -596,8 +597,8 @@ def __init__(self,prog):
         base = f.replace('.txt', '')
         goa_out = os.path.join(args.out, 'go_enrichment', base+'.go.enrichment.txt')
         if not lib.checkannotations(goa_out):
-            with open(goa_out, 'w') as output:
-                subprocess.call(['find_enrichment.py', '--obo', os.path.join(parentdir, 'DB', 'go.obo'), '--pval', '0.001', '--alpha', '0.001', '--method', 'fdr', file, os.path.join(go_folder, 'population.txt'), os.path.join(go_folder, 'associations.txt')], stderr=FNULL, stdout=output)
+            cmd = ['find_enrichment.py', '--obo', os.path.join(parentdir, 'DB', 'go.obo'), '--pval', '0.001', '--alpha', '0.001', '--method', 'fdr', file, os.path.join(go_folder, 'population.txt'), os.path.join(go_folder, 'associations.txt')]
+            lib.runSubprocess2(cmd, '.', lib.log, goa_out)
 
     #load into pandas and write to html
     with open(os.path.join(args.out, 'go.html'), 'w') as output:
@@ -608,8 +609,8 @@ def __init__(self,prog):
         for f in os.listdir(os.path.join(args.out, 'go_enrichment')):
             if f.endswith('go.enrichment.txt'):
                 file = os.path.join(args.out, 'go_enrichment', f)
-                base = file.split('.go_enrichment.txt')[0]
-                name = base.split('/')[-1]
+                base = os.path.basename(file)
+                name = base.split('.go_enrichment.txt')[0]
                 #check goatools output, return is a tuple with True/False and header line #
                 goresult = lib.checkgoatools(file)
                 output.write('<h4 class="sub-header" align="left">GO Enrichment: '+name+'</h4>')
@@ -621,7 +622,7 @@ def __init__(self,prog):
                     df2 = df.loc[df['p_fdr'] < args.go_fdr]
                     df2.sort_values(by='enrichment', inplace=True)
                     if len(df2) > 0:
-                        df2.to_csv(base+'.fdr_enriched.csv', index=False)
+                        df2.to_csv(os.path.join(args.out, 'go_enrichment', base+'.fdr_enriched.csv'), index=False)
                         #apparently goatools also changed the headers....arrggh...
                         df2['GO'] = '<a target="_blank" href="http://amigo.geneontology.org/amigo/search/ontology?q='+ df2['GO'].astype(str)+'">'+df2['GO']+'</a>'
                         output.write(df2.to_html(escape=False, index=False, classes='table table-hover'))
@@ -784,13 +785,19 @@ def __init__(self,prog):
                 line = line.replace('\n', '')
                 cols = line.split('\t')
                 if args.run_dnds:
-                    dNdS = dNdSresults.get(cols[0])
+                    if cols[0] in dNdSresults:
+                        dNdS = dNdSresults.get(cols[0])
+                    else:
+                        dNdS = ('NC', 'NC', 'NC')
                 else:
                     dNdS = ('NC', 'NC', 'NC')
                 if args.run_dnds == 'estimate':
                     output.write("%s\t%s (NC,NC)\t%s\t%s\t%s\n" % (cols[0], dNdS[0], cols[1], cols[2], cols[3]))
                 else:
-                    output.write("%s\t%s (%.4f,%.4f)\t%s\t%s\t%s\n" % (cols[0], dNdS[0], round(float(dNdS[1]),4), round(float(dNdS[2]),4), cols[1], cols[2], cols[3]))
+                    try:
+                        output.write("%s\t%s (%f,%f)\t%s\t%s\t%s\n" % (cols[0], dNdS[0], round(float(dNdS[1]),4), round(float(dNdS[2]),4), cols[1], cols[2], cols[3]))
+                    except ValueError or NoneType:
+                        output.write("%s\t%s (NA,NA)\t%s\t%s\t%s\n" % (cols[0], dNdS[0], cols[1], cols[2], cols[3]))
 
     #cleanup
     os.remove(orthologstmp)
@@ -829,7 +836,7 @@ def __init__(self,prog):
     stats[i].append("{0:,}".format(scoCount))   
 
 for i in range(0, len(stats)):     
-	summary.append(stats[i])
+    summary.append(stats[i])
 
 
 #convert to dataframe for easy output
 
@@ -121,7 +121,8 @@ def runIPRpython(Input):
     sys.exit(1)
 if not os.path.isfile(os.path.join(parentdir, 'DB', args.eggnog_db+'_4.5.hmm')):
     lib.log.error("%s EggNog DB not found, trying to download and format..." % args.eggnog_db)
-    subprocess.call([os.path.join(parentdir, 'util', 'getEggNog.sh'), args.eggnog_db, os.path.join(parentdir, 'DB')], stdout=FNULL, stderr=FNULL)
+    cmd = [os.path.join(parentdir, 'util', 'getEggNog.sh'), args.eggnog_db, os.path.join(parentdir, 'DB')]
+    lib.runSubprocess(cmd, '.', lib.log)
     if not os.path.isfile(os.path.join(parentdir, 'DB', args.eggnog_db+'_4.5.hmm')):
         lib.log.error("Downloading failed, exiting")
         sys.exit(1)
@@ -133,6 +134,11 @@ def runIPRpython(Input):
     lib.download_buscos(args.busco_db)
 
 #need to do some checks here of the input
+genbank = ''
+Scaffolds = ''
+Proteins = ''
+Transcripts = ''
+GFF = ''
 if not args.input:
     #did not parse folder of funannotate results, so need either gb + gff or fasta + proteins, + gff and also need to have args.out for output folder
     if not args.out:
@@ -226,7 +232,7 @@ def runIPRpython(Input):
 
 #get organism and isolate from GBK file
 if not args.species:
-    if args.genbank:
+    if genbank != '':
         with open(genbank, 'rU') as gbk:
             SeqRecords = SeqIO.parse(gbk, 'genbank')
             for record in SeqRecords:
@@ -239,8 +245,8 @@ def runIPRpython(Input):
                             isolate = args.isolate
                         break
     else:
-        organism = '???'
-        isolate = '???'
+        lib.log.error("No species name given will cause problems downstream, please pass a name to -s,--species")
+        sys.exit(1)
 else:
     organism = args.species
     if not args.isolate:
@@ -355,21 +361,22 @@ def runIPRpython(Input):
             os.makedirs(IPROUT)
             #now split XML file
             splitter = os.path.join(parentdir, 'util', 'prepare_ind_xml.pl')
-            subprocess.call([splitter, args.iprscan, IPROUT], stdout = FNULL, stderr = FNULL)
+            cmd = [splitter, args.iprscan, IPROUT]
+            lib.runSubprocess(cmd, '.', lib.log)
 
     #now collect the results from InterProscan, then start to reformat results
     lib.log.info("InterProScan has finished, now pulling out annotations from results")
     IPR_terms = os.path.join(outputdir, 'annotate_misc', 'annotations.iprscan.txt')
     if not os.path.isfile(IPR_terms):
         IPR2TSV = os.path.join(parentdir, 'util', 'ipr2tsv.py')
-        with open(IPR_terms, 'w') as output:
-            subprocess.call([sys.executable, IPR2TSV, IPROUT], stdout = output, stderr = FNULL)
+        cmd = [sys.executable, IPR2TSV, IPROUT]
+        lib.runSubprocess2(cmd, '.', lib.log, IPR_terms)
     GO_terms = os.path.join(outputdir, 'annotate_misc', 'annotations.GO.txt')
     if not os.path.isfile(GO_terms):
         IPR2GO = os.path.join(parentdir, 'util', 'ipr2go.py')
         OBO = os.path.join(parentdir, 'DB', 'go.obo')
-        with open(GO_terms, 'w') as output:
-            subprocess.call([sys.executable, IPR2GO, OBO, IPROUT], stdout = output, stderr = FNULL)
+        cmd = [sys.executable, IPR2GO, OBO, IPROUT]
+        lib.runSubprocess2(cmd, '.', lib.log, GO_terms)
 
 
 #check if antiSMASH data is given, if so parse and reformat for annotations and cluster textual output
@@ -409,7 +416,8 @@ def runIPRpython(Input):
 #launch gag
 GAG = os.path.join(outputdir, 'annotate_misc', 'gag')
 lib.log.info("Adding annotations to GFF using GAG")
-subprocess.call(['gag.py', '-f', Scaffolds, '-g', GFF, '-a', ANNOTS, '-o', GAG], stdout = FNULL, stderr = FNULL)
+cmd = ['gag.py', '-f', Scaffolds, '-g', GFF, '-a', ANNOTS, '-o', GAG]
+lib.runSubprocess(cmd, '.', lib.log)
 
 #fix the tbl file for tRNA genes
 lib.log.info("Fixing tRNA annotations in GenBank tbl file")
@@ -432,7 +440,8 @@ def runIPRpython(Input):
 shutil.copyfile(os.path.join(GAG, 'genome.fasta'), os.path.join(GAG, 'genome.fsa'))
 discrep = 'discrepency.report.txt'
 lib.log.info("Converting to final Genbank format, good luck!.....")
-subprocess.call(['tbl2asn', '-p', GAG, '-t', SBT, '-M', 'n', '-Z', discrep, '-a', 'r10u', '-l', 'paired-ends', '-j', ORGANISM, '-V', 'b', '-c', 'fx'], stdout = FNULL, stderr = FNULL)
+cmd = ['tbl2asn', '-p', GAG, '-t', SBT, '-M', 'n', '-Z', discrep, '-a', 'r10u', '-l', 'paired-ends', '-j', ORGANISM, '-V', 'b', '-c', 'fx']
+lib.runSubprocess(cmd, '.', lib.log)
 
 #collected output files and rename accordingly
 ResultsFolder = os.path.join(outputdir, 'annotate_results')
@@ -451,9 +460,8 @@ def runIPRpython(Input):
 lib.log.info("Creating AGP file and corresponding contigs file")
 agp2fasta = os.path.join(parentdir, 'util', 'fasta2agp.pl')
 AGP = os.path.join(ResultsFolder, baseOUTPUT+'.agp')
-with open(AGP, 'w') as output:
-    subprocess.call(['perl', agp2fasta, baseOUTPUT+'.scaffolds.fa'], cwd = ResultsFolder, stdout = output, stderr = FNULL)
-
+cmd = ['perl', agp2fasta, baseOUTPUT+'.scaffolds.fa']
+lib.runSubprocess2(cmd, ResultsFolder, lib.log, AGP)
 
 #write secondary metabolite clusters output using the final genome in gbk format
 if args.antismash:
@@ -474,7 +482,8 @@ def runIPRpython(Input):
             for record in SeqRecords:
                 if record.id in AllProts:
                     SeqIO.write(record, output, 'fasta')
-    subprocess.call(['blastp', '-query', mibig_fasta, '-db', mibig_db, '-num_threads', str(args.cpus), '-max_target_seqs', '1', '-max_hsps', '1', '-evalue', '0.001', '-outfmt', '6', '-out', mibig_blast])
+    cmd = ['blastp', '-query', mibig_fasta, '-db', mibig_db, '-num_threads', str(args.cpus), '-max_target_seqs', '1', '-max_hsps', '1', '-evalue', '0.001', '-outfmt', '6', '-out', mibig_blast]
+    lib.runSubprocess(cmd, '.', lib.log)
     #now parse blast results to get {qseqid: hit}
     MIBiGBlast = {}
     with open(mibig_blast, 'rU') as input: