fixes and updates to v0.5.2

Jon Palmer · Jon Palmer · commit 5899e9020491 · 2016-12-17T19:01:51.000-06:00
diff --git a/bin/funannotate-compare.py b/bin/funannotate-compare.py
@@ -31,7 +31,7 @@ def __init__(self,prog):
 parser.add_argument('--num_orthos', default=500, type=int, help='Number of Single-copy orthologs to run with RAxML')
 parser.add_argument('--outgroup', help='Name of species for RAxML outgroup')
 parser.add_argument('--eggnog_db', default='fuNOG', help='EggNog database')
-parser.add_argument('--run_dnds', action='store_true', help='Run dN/dS analysis with codeML for each ortholog (long runtime)')
+parser.add_argument('--run_dnds', choices=['estimate', 'full'], help='Run dN/dS analysis with codeML for each ortholog (long runtime)')
 parser.add_argument('--proteinortho', help='Pre-computed ProteinOrtho POFF')
 args=parser.parse_args() 
 
@@ -753,13 +753,6 @@ def __init__(self,prog):
                 with open(KaKstranscript, 'w') as tranout:
                     for i in proteins:
                         SeqIO.write(SeqTranscripts[i], tranout, 'fasta')
-                #calculate dN/dS
-                #DnDs,M1M2p,M7M8p = lib.rundNdS(KaKstranscript, ID, ortho_folder)
-            #else:
-                #DnDs = 'NC'
-                #M1M2p = 'NC'
-                #M7M8p = 'NC'
-
             #write to output
             if len(eggs) > 0:
                 eggs = ', '.join(str(v) for v in eggs)
@@ -776,7 +769,10 @@ def __init__(self,prog):
 if args.run_dnds:
     #multiprocessing dN/dS on list of folders
     dNdSList = lib.get_subdirs(ortho_folder)
-    lib.runMultiProgress(lib.rundNdS, dNdSList, args.cpus)
+    if args.run_dnds == 'estimate':
+        lib.runMultiProgress(lib.rundNdSestimate, dNdSList, args.cpus)
+    else:
+        lib.runMultiProgress(lib.rundNdSexhaustive, dNdSList, args.cpus)
 
     #after all data is run, then parse result log files, return dictionary
     dNdSresults = lib.parsedNdS(ortho_folder)
@@ -791,7 +787,11 @@ def __init__(self,prog):
                     dNdS = dNdSresults.get(cols[0])
                 else:
                     dNdS = ('NC', 'NC', 'NC')
-                output.write("%s\t%s (%.4f,%.4f)\t%s\t%s\t%s\n" % (cols[0], dNdS[0], round(float(dNdS[1]),4), round(float(dNdS[2]),4), cols[1], cols[2], cols[3]))
+                if args.run_dnds == 'estimate':
+                    output.write("%s\t%s (NC,NC)\t%s\t%s\t%s\n" % (cols[0], dNdS[0], cols[1], cols[2], cols[3]))
+                else:
+                    output.write("%s\t%s (%.4f,%.4f)\t%s\t%s\t%s\n" % (cols[0], dNdS[0], round(float(dNdS[1]),4), round(float(dNdS[2]),4), cols[1], cols[2], cols[3]))
+                   
     #cleanup
     os.remove(orthologstmp)
 
diff --git a/bin/funannotate-contig_cleaner.py b/bin/funannotate-contig_cleaner.py
@@ -48,6 +48,14 @@ def Sortbysize(input):
                 contigs.append(rec.id)
         return contigs
 
+def countfasta(input):
+    count = 0
+    with open(input, 'rU') as f:
+        for line in f:
+            if line.startswith (">"):
+                count += 1
+    return count
+
 def getFasta(sequences, header):
     with open('query.fa', 'w') as fasta:
         with open(sequences, 'rU') as input:
@@ -109,7 +117,7 @@ def runNucmer(query, reference, output):
     os.remove('reference.fa')
 
 print"------------------------------------"
-print"%i input contigs, %i duplicated, %i written to file" % (len(scaffolds), (len(scaffolds) - len(keepers)), len(keepers))
+print"%i input contigs, %i duplicated, %i written to file" % (countfasta(args.input), (len(scaffolds) - len(keepers)), len(keepers))
 
 #finally write a new reference based on list of keepers
 with open(args.out, 'w') as output:
diff --git a/funannotate.py b/funannotate.py
@@ -19,7 +19,7 @@ def flatten(l):
             flatList.append(elem)
     return flatList
 
-version = '0.5.0'
+version = '0.5.2'
 
 default_help = """
 Usage:       funannotate <command> <arguments>
@@ -217,7 +217,7 @@ def flatten(l):
 
 Optional:    -o, --out           Output folder name. Default: funannotate_compare
              --cpus              Number of CPUs to use. Default: 2
-             --run_dnds          Calculate dN/dS ratio on all orthologs. Very long runtime.
+             --run_dnds          Calculate dN/dS ratio on all orthologs. [estimate,full]
              --go_fdr            P-value for FDR GO-enrichment. Default: 0.05
              --heatmap_stdev     Cut-off for heatmap. Default: 1.0
              --num_orthos        Number of Single-copy orthologs to use for RAxML. Default: 500
diff --git a/lib/library.py b/lib/library.py
@@ -326,27 +326,21 @@ def countGFFgenes(input):
     return count
 
 def runMultiProgress(function, inputList, cpus):
-    from progressbar import ProgressBar, Percentage
-    try:
-        from progressbar import AdaptiveETA
-        eta = AdaptiveETA()
-    except ImportError:
-        from progressbar import ETA
-        eta = ETA()
-    from time import sleep
     #setup pool
     p = multiprocessing.Pool(cpus)
-    #setup progress bar
-    widgets = ['     Progress: ', Percentage(),' || ', eta]
-    pbar = ProgressBar(widgets=widgets, term_width=30, maxval=len(inputList)).start()
     #setup results and split over cpus
+    tasks = len(inputList)
     results = []
-    r = [p.apply_async(function, (x,), callback=results.append) for x in inputList]
+    for i in inputList:
+        results.append(p.apply_async(function, [i]))
     #refresh pbar every 5 seconds
-    while len(results) != len(inputList):
-        pbar.update(len(results))
-        sleep(5)
-    pbar.finish()
+    while True:
+        incomplete_count = sum(1 for x in results if not x.ready())
+        if incomplete_count == 0:
+            break
+        sys.stdout.write("     Progress: %.2f%% \r" % (float(tasks - incomplete_count) / tasks * 100))
+        sys.stdout.flush()
+        time.sleep(1)
     p.close()
     p.join()
 
@@ -1988,9 +1982,8 @@ def simplestTreeEver(fasta, tree):
             for rec in SeqIO.parse(input, 'fasta'):
                 ids.append(rec.id)
             outfile.write('(%s,%s);' % (ids[0], ids[1]))
-                
 
-def rundNdS(folder):
+def rundNdSexhaustive(folder):
     FNULL = open(os.devnull, 'w')
     #setup intermediate files
     tmpdir = os.path.dirname(folder)
@@ -2025,6 +2018,43 @@ def rundNdS(folder):
     for file in os.listdir(tmpdir):
         if file.startswith(name+'.'):
             os.rename(os.path.join(tmpdir, file), os.path.join(tmpdir, name, file))            
+              
+
+def rundNdSestimate(folder):
+    FNULL = open(os.devnull, 'w')
+    #setup intermediate files
+    tmpdir = os.path.dirname(folder)
+    name = os.path.basename(folder)
+    transcripts = os.path.join(tmpdir, name+'.transcripts.fa')
+    prots = os.path.join(tmpdir, name+'.proteins.fa')
+    aln = os.path.join(tmpdir, name+'.aln')
+    codon = os.path.join(tmpdir, name+'.codon.aln')
+    tree = os.path.join(tmpdir, name+'.tree')
+    log = os.path.join(tmpdir, name+'.log')
+    finallog = os.path.join(tmpdir, name, name+'.log')
+    if not checkannotations(finallog):
+        num_seqs = countfasta(transcripts)
+        #Translate to protein space
+        translatemRNA(transcripts, prots)
+        #align protein sequences
+        alignMAFFT(prots, aln)
+        #convert to codon alignment
+        align2Codon(aln, transcripts, codon)
+        if checkannotations(codon):
+            if num_seqs > 2:
+                #now generate a tree using phyml
+                drawPhyMLtree(codon, tree)
+            else:
+                simplestTreeEver(transcripts, tree)
+            #now run codeml through ete3
+            etecmd = ['ete3', 'evol', '--alg', os.path.abspath(codon), '-t', os.path.abspath(tree), '--models', 'M0', '-o', name, '--clear_all', '--codeml_param', 'cleandata,1']
+            with open(log, 'w') as logfile:
+                logfile.write('\n%s\n' % ' '.join(etecmd))
+                subprocess.call(etecmd, cwd = tmpdir, stdout = logfile, stderr = logfile)
+    #clean up
+    for file in os.listdir(tmpdir):
+        if file.startswith(name+'.'):
+            os.rename(os.path.join(tmpdir, file), os.path.join(tmpdir, name, file))            
 
 def get_subdirs(a_dir):
     return [os.path.join(a_dir, name) for name in os.listdir(a_dir)
@@ -2069,14 +2099,6 @@ def chunkIt(seq, num):
     last += avg
   return out
 
-def countKaks(folder, num):
-    allfiles = []
-    for file in os.listdir(folder):
-        if file.endswith('.fasta.axt'):
-            f = os.path.join(folder, file)
-            allfiles.append(f)
-    #split files by x chunks
-    return chunkIt(allfiles, num)
 
 HEADER = '''
 <!DOCTYPE html>
diff --git a/sample_data/run_unit_tests.sh b/sample_data/run_unit_tests.sh
@@ -31,7 +31,7 @@ cmd='funannotate annotate -i genome3 -e palmer3@wisc.edu --cpus 6 --iprscan iprs
 echo $cmd; eval $cmd
 
 #now run compare
-cmd='funannotate compare -i genome1 genome2 genome3 --cpus 6 --outgroup botrytis_cinerea.dikarya'
+cmd='funannotate compare -i genome1 genome2 genome3 --cpus 6 --outgroup botrytis_cinerea.dikarya --run_dnds estimate'
 echo $cmd; eval $cmd
 
 #clean up augustus training
diff --git a/setup.sh b/setup.sh
@@ -53,7 +53,7 @@ else
         fi    
     else
         echo "HomeBrew installation not detected, specify DB installation directory"
-        outputdir='/usr/local/share/funannotate'
+        outputdir="$HOME/funannotate/"
         echo -n "Default DB directory set to ($outputdir), continue [y/n]: "
         read question1
         if [ $question1 == 'n' ]; then