Update tblastn/exonerate protein-to-genome mapping for speed; add --ploidy option

Jon Palmer · Jon Palmer · commit 74d0e3d1cc90 · 2016-12-23T15:35:18.000-06:00
diff --git a/bin/funannotate-p2g.py b/bin/funannotate-p2g.py
@@ -1,7 +1,8 @@
 #!/usr/bin/env python
 
-import sys, os, subprocess, csv, shutil, inspect, itertools, argparse
+import sys, os, subprocess, shutil, inspect, itertools, argparse
 from Bio import SeqIO
+from Bio.SeqIO.FastaIO import SimpleFastaParser
 currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
 parentdir = os.path.dirname(currentdir)
 sys.path.insert(0,parentdir)
@@ -22,9 +23,9 @@ def __init__(self,prog):
 parser.add_argument('-o','--out', required=True, help='Final exonerate output file')
 parser.add_argument('--maxintron', default = 3000, help='Maximum intron size')
 parser.add_argument('--logfile', default ='funannotate-p2g.log', help='logfile')
+parser.add_argument('--ploidy', default =1, type=int, help='Ploidy of assembly')
 args=parser.parse_args() 
 
-
 log_name = args.logfile
 if os.path.isfile(log_name):
     os.remove(log_name)
@@ -42,90 +43,153 @@ def __init__(self,prog):
 blast_version = blast_version.split(': ')[-1]
 lib.log.debug("BLAST v%s; Exonerate v%s" % (blast_version, exo_version))
 
-def tblastnFilter(input, query, cpus, output):
+def runtblastn(input, query, cpus, output, maxhits): #maxhits is passed to tblastn as -max_target_seqs
     #start by formatting blast db/dustmasker filtered format
     cmd = ['dustmasker', '-in', input, '-infmt', 'fasta', '-parse_seqids', '-outfmt', 'maskinfo_asn1_bin', '-out', 'genome_dust.asnb']
     lib.runSubprocess(cmd, output, lib.log)
     cmd = ['makeblastdb', '-in', input, '-dbtype', 'nucl', '-parse_seqids', '-mask_data', 'genome_dust.asnb', '-out', 'genome']
     lib.runSubprocess(cmd, output, lib.log)
-    cmd = ['tblastn', '-num_threads', str(cpus), '-db', 'genome', '-query', query, '-max_target_seqs', '1', '-db_soft_mask', '11', '-threshold', '999', '-max_intron_length', str(args.maxintron), '-evalue', '1e-10', '-outfmt', '6', '-out', 'filter.tblastn.tab']
+    cmd = ['tblastn', '-num_threads', str(cpus), '-db', 'genome', '-query', query, '-max_target_seqs', str(maxhits), '-db_soft_mask', '11', '-threshold', '999', '-max_intron_length', str(args.maxintron), '-evalue', '1e-10', '-outfmt', '6', '-out', 'filter.tblastn.tab']
     lib.runSubprocess(cmd, output, lib.log)
 
 def parseBlast(blastresult):
-    global HitList
-    HitList = []
-    #now parse through results, generating a list for exonerate function
+    Results = {} #'protein:::scaffold' -> (start, end) kept as strings, spanning every hit line of that pair
     with open(blastresult, 'rU') as input:
-        reader = csv.reader(input, delimiter='\t')
-        for cols in reader:
-            hit = cols[0] + '::' + cols[1]
-            if hit not in HitList:
-                HitList.append(hit)
+        for line in input:
+            cols = line.split('\t')
+            hit = cols[0] + ':::' + cols[1]
+            if int(cols[8]) < int(cols[9]): #assumes BLAST -outfmt 6 columns (8/9 = subject start/end); order them so start <= end
+                start = cols[8]
+                end = cols[9]
+            else: 
+                start = cols[9]
+                end = cols[8]
+            if not hit in Results:
+                Results[hit] = (start, end)
+            else:
+                #widen the stored region so it also covers this hit line
+                old = Results.get(hit)
+                if int(start) < int(old[0]):
+                    newstart = start
+                else:
+                    newstart = old[0]
+                if int(end) > int(old[1]):
+                    newstop = end
+                else:
+                    newstop = old[1]
+                Results[hit] = (newstart, newstop)
+    #flatten the dictionary into a list of 'protein:::scaffold:::start:::stop' strings
+    HitList = []
+    for k,v in Results.items():
+        finalhit = k+':::'+str(v[0])+':::'+str(v[1])
+        HitList.append(finalhit)
+    return HitList
 
 def runExonerate(input):
-    FNULL = open(os.devnull, 'w')
-    s = input.split('::')
-    if s[0].startswith('sp|'):
-        name = s[0].split("|")[1] + '_' + s[1]
-    else:
-        name = s[0].split()[0] + '_' + s[1]
-    query = os.path.join(tmpdir, name+'.fa')
+    s = input.split(':::') #input is one 'protein:::scaffold:::start:::stop' string as built by parseBlast
+    ProtID = s[0]
+    ScaffID = s[1]
+    ScaffStart = int(s[2])
+    ScaffEnd = int(s[3])
+    #write the protein model to its own query file
+    query = os.path.join(tmpdir, ProtID+'.'+str(os.getpid())+'.fa')
     with open(query, 'w') as output:
-        rec = record_dict[s[0]]
-        output.write(">%s\n%s\n" % (rec.id, rec.seq))
-    scaffold = s[1] + '.fa'
+        SeqIO.write(protein_dict[ProtID], output, 'fasta')
+    #now get the genome region, use different variable names for SeqRecords to avoid collision
+    scaffold = ScaffID+'.'+ProtID+'.'+str(ScaffStart)+'-'+str(ScaffEnd)+'.fa'
     scaffold = os.path.join(tmpdir, scaffold)
-    exonerate_out = 'exonerate_' + name + '.out'
+    with open(scaffold, 'w') as output2:
+        with open(os.path.join(tmpdir, 'scaffolds', ScaffID+'.fa'), 'rU') as fullscaff:
+            for header, Sequence in SimpleFastaParser(fullscaff):
+                #grab a 1 kb cushion on either side of hit region; start is a 0-based slice index, clamp both ends to the scaffold
+                start = ScaffStart - 1000
+                if start < 0:
+                    start = 0
+                end = ScaffEnd + 1000
+                if end > len(Sequence):
+                    end = len(Sequence)
+                output2.write('>%s\n%s\n' % (header, Sequence[start:end]))
+    exoname = ProtID+'.'+ScaffID+'__'+str(start)+'__'
+    #the '__start__' part of the file name carries the offset used later to shift coordinates back onto the scaffold
+    exonerate_out = 'exonerate.' + exoname + '.out'
     exonerate_out = os.path.join(tmpdir, exonerate_out)
     ryo = "AveragePercentIdentity: %pi\n"
-    with open(exonerate_out, 'w') as output:
-        subprocess.call(['exonerate', '--model', 'p2g', '--showvulgar', 'no', '--showalignment', 'no', '--showquerygff', 'no', '--showtargetgff', 'yes', '--maxintron', str(args.maxintron), '--percent', '80', '--ryo', ryo , query, scaffold], stdout = output, stderr = FNULL)
-    os.remove(query)
-    #check filesize of exonerate output, no hits are 285 bytes, but lets just filter everything smaller than 310
-    if lib.getSize(exonerate_out) < 310:
+    cmd = ['exonerate', '--model', 'p2g', '--showvulgar', 'no', '--showalignment', 'no', '--showquerygff', 'no', '--showtargetgff', 'yes', '--maxintron', str(args.maxintron), '--percent', '80', '--ryo', ryo , query, scaffold]
+    #run exonerate with stdout written to exonerate_out, capture stderr to look for warnings
+    with open(exonerate_out, 'w') as output3:
+        proc = subprocess.Popen(cmd, stdout = output3, stderr=subprocess.PIPE)
+    stderr = proc.communicate()
+    if 'WARNING' in stderr[1]: #keep the inputs of a run that emitted a warning in the 'failed' folder
+        lib.log.debug('%s, Len=%i, %i-%i; %i-%i' % (header, len(Sequence), ScaffStart, ScaffEnd, start, end))
+        os.rename(query, os.path.join(tmpdir, 'failed', os.path.basename(query)))
+        os.rename(scaffold, os.path.join(tmpdir, 'failed', os.path.basename(scaffold)))
+    else:
+        for y in [query, scaffold]:
+            try:
+                os.remove(y)
+            except OSError:
+                lib.log.debug("Error removing %s" % (y))
+    #check filesize of exonerate output, no hits still have some output data in them, should be safe dropping anything smaller than 500 bytes
+    if lib.getSize(exonerate_out) < 500:
         os.remove(exonerate_out)
-
+  
 #make tmpdir
 tmpdir = 'p2g_'+ str(os.getpid())
 if not os.path.isdir(tmpdir):
     os.makedirs(tmpdir)
-
+    os.makedirs(os.path.join(tmpdir, 'failed')) #runExonerate moves the inputs of runs that emitted warnings here
+    os.makedirs(os.path.join(tmpdir, 'scaffolds')) #holds one FASTA file per genome scaffold
+#use a pre-calculated tBLASTn result if one was given, otherwise run tBLASTn now
 if args.tblastn:
     lib.log.info("Using pre-calculated tBLASTN result")
     BlastResult = args.tblastn
 else:
     lib.log.info("Running pre-filter tBlastn step")
     BlastResult = os.path.join(tmpdir, 'filter.tblastn.tab')
-    tblastnFilter(os.path.abspath(args.genome), os.path.abspath(args.proteins), args.cpus, tmpdir)
+    runtblastn(os.path.abspath(args.genome), os.path.abspath(args.proteins), args.cpus, tmpdir, args.ploidy*2) #2X ploidy for tBLASTn filter
+
+#collapse the tBLASTn hits into one region per protein/scaffold pair
+Hits = parseBlast(BlastResult)
+lib.log.info("Found %i preliminary alignments" % (len(Hits)))
 
-#parse the results
-parseBlast(BlastResult)
-lib.log.info("found %i preliminary alignments" % (len(HitList)))
+#index the proteins by ID; the genome is split into per-scaffold files just below
+protein_dict = SeqIO.index(os.path.abspath(args.proteins), 'fasta') #do index here in case memory problems?
 
 #split genome fasta into individual scaffolds
-if not os.path.exists(tmpdir):
-    os.makedirs(tmpdir)
 with open(os.path.abspath(args.genome), 'rU') as input:
     for record in SeqIO.parse(input, "fasta"):
-        SeqIO.write(record, os.path.join(tmpdir, record.id + ".fa"), "fasta")
-
-#Now run exonerate on hits
-lib.log.info("Polishing alignments with Exonerate")
-record_dict = SeqIO.index(os.path.abspath(args.proteins), 'fasta') #do index here in case memory problems?
+        SeqIO.write(record, os.path.join(tmpdir, 'scaffolds', record.id + ".fa"), "fasta") #read back by runExonerate
 
 #run multiprocessing exonerate
-lib.runMultiProgress(runExonerate, HitList, args.cpus)
-lib.log.info("Exonerate finished")
-
-#now collect all exonerate results into one
-with open(args.out, 'wb') as output:
-    for root, dirs, files in os.walk(tmpdir):
-        for file in files:
-            if file.endswith('.out'):
-                filename = os.path.join(root, file)
-                with open(filename, 'rU') as readfile:
-                    for line in itertools.islice(readfile, 3, None):
+lib.runMultiProgress(runExonerate, Hits, args.cpus)
+
+#exonerate ran on sub-regions, so shift every GFF coordinate back onto the whole scaffold
+with open(args.out, 'w') as output:
+    for file in os.listdir(tmpdir):
+        if file.endswith('.out'):
+            with open(os.path.join(tmpdir, file), 'rU') as exoresult:
+                offset = int(file.rsplit('__', 2)[1]) #name ends in '__<start>__.out'; split from the right so IDs containing '__' are safe
+                for line in itertools.islice(exoresult, 3, None): #skip the first three lines of each exonerate output
+                    if line.startswith('#') or line.startswith('Average') or line.startswith('-- completed'):
                         output.write(line)
-
-#finally clean-up your mess
-shutil.rmtree(tmpdir)
+                    else:
+                        cols = line.split('\t')
+                        cols[3] = str(int(cols[3])+offset) #GFF start column
+                        cols[4] = str(int(cols[4])+offset) #GFF end column
+                        output.write('\t'.join(cols))
+
+#output some quick summary of exonerate alignments that you found
+Found = lib.countGFFgenes(args.out)
+lib.log.info("Exonerate finished: found %i alignments" % Found)
+
+#remove tmpdir only if 'failed' is empty (os.rmdir raises OSError on a non-empty folder); exit non-zero only on failures
+try:
+    os.rmdir(os.path.join(tmpdir, 'failed'))
+    empty = True
+except OSError:
+    empty = False
+if empty:
+    shutil.rmtree(tmpdir)
+else:
+    lib.log.error("Failed exonerate alignments found, see files in %s" % os.path.join(tmpdir, 'failed'))
+    sys.exit(1)
diff --git a/bin/funannotate-predict.py b/bin/funannotate-predict.py
@@ -37,6 +37,7 @@ def __init__(self, prog):
 parser.add_argument('--max_intronlen', default=3000, help='Maximum intron length for gene models')
 parser.add_argument('--min_protlen', default=50, type=int, help='Minimum amino acid length for valid gene model')
 parser.add_argument('--keep_no_stops', action='store_true', help='Keep gene models without valid stop codons')
+parser.add_argument('--ploidy', default=1, type=int, help='Ploidy of assembly')
 parser.add_argument('--cpus', default=2, type=int, help='Number of CPUs to use')
 parser.add_argument('--busco_seed_species', default='anidulans', help='Augustus species to use as initial training point for BUSCO')
 parser.add_argument('--optimize_augustus', action='store_true', help='Run "long" training of Augustus')
@@ -375,24 +376,23 @@ def __init__(self, prog):
             if os.path.isfile(prot_temp):
                 shutil.copyfile(prot_temp, prot_temp+'.old')     
             if ',' in args.protein_evidence:
-                files = args.protein_evidence.split(",")
-                with open(prot_temp, 'w') as output:
-                    for f in files:
-                        with open(f) as input:
-                            output.write(input.read())
+                prot_files = args.protein_evidence.split(",")
             else:
-                shutil.copyfile(args.protein_evidence, prot_temp)
+                prot_files = [args.protein_evidence]
+            #clean up headers, etc
+            lib.cleanProteins(prot_files, prot_temp)
             #run funannotate-p2g to map to genome
-            lib.log.info("Mapping proteins to genome using tBlastn/Exonerate")
-            p2g_cmd = [sys.executable, P2G, '-p', prot_temp, '-g', MaskGenome, '-o', p2g_out, '--maxintron', str(args.max_intronlen), '--cpus', str(args.cpus), '--logfile', os.path.join(args.out, 'logfiles', 'funannotate-p2g.log')]
+            p2g_cmd = [sys.executable, P2G, '-p', prot_temp, '-g', MaskGenome, '-o', p2g_out, '--maxintron', str(args.max_intronlen), '--cpus', str(args.cpus), '--ploidy', str(args.ploidy), '--logfile', os.path.join(args.out, 'logfiles', 'funannotate-p2g.log')]
             #check if protein evidence is same as old evidence
             if os.path.isfile(prot_temp+'.old'):
                 if not lib.sha256_check(prot_temp, prot_temp+'.old'):
+                    lib.log.info("Mapping proteins to genome using tBlastn/Exonerate")
                     subprocess.call(p2g_cmd)
                 else:
                     lib.log.info("Using existing protein evidence alignments")
                     os.remove(prot_temp+'.old')
             if not os.path.isfile(p2g_out):
+                lib.log.info("Mapping proteins to genome using tBlastn/Exonerate")
                 subprocess.call(p2g_cmd)
             exonerate_out = os.path.abspath(p2g_out)
         else:
diff --git a/lib/library.py b/lib/library.py
@@ -369,25 +369,35 @@ def runMultiProgress(function, inputList, cpus):
     p.close()
     p.join()
 
-def update_progress(progress):
-    barLength = 30 # Modify this to change the length of the progress bar
-    status = ""
-    if isinstance(progress, int):
-        progress = float(progress)
-    if not isinstance(progress, float):
-        progress = 0
-        status = "error: progress var must be float\r\n"
-    if progress < 0:
-        progress = 0
-        status = "Halt...\r\n"
-    if progress >= 1:
-        progress = 1
-        status = "Done...\r\n"
-    block = int(round(barLength*progress))
-    text = "\r IPR progress: [{0}] {1:.2f}% {2}".format( "#"*block + "-"*(barLength-block), progress*100, status)
-    sys.stdout.write(text)
-    sys.stdout.flush()
-
+def cleanProteins(inputList, output):
+    #combine a list of protein FASTA files into output, rewriting each header
+    #to a short ID that is unique and free of the characters in badchars
+    #inputList: list of paths to protein FASTA files
+    #output: path of the combined FASTA file to write
+    #(funannotate-p2g uses these IDs to build temporary file names)
+    badchars = [':', ';', '/', '\\', '.', ',', '%']
+    seen = set() #IDs already written to output
+    with open(output, 'w') as out:
+        for x in inputList:
+            with open(x, 'rU') as input:
+                for rec in SeqIO.parse(input, 'fasta'):
+                    #explicitly check for swissprot and jgi, keep the last |-delimited field
+                    if rec.id.startswith('sp|') or rec.id.startswith('jgi|'):
+                        ID = rec.id.split('|')[-1]
+                    else:
+                        ID = rec.id
+                    #replace problematic characters with underscores
+                    for i in badchars:
+                        ID = ID.replace(i, '_')
+                    #a duplicated name gets the first unused _1, _2, ... suffix
+                    if ID in seen:
+                        num = 1
+                        while ID+'_'+str(num) in seen:
+                            num += 1
+                        ID = ID+'_'+str(num)
+                    seen.add(ID)
+                    out.write('>%s\n%s\n' % (ID, rec.seq))
+  
 def gb2output(input, output1, output2, output3):
     with open(output1, 'w') as proteins:
         with open(output2, 'w') as transcripts:
@@ -519,7 +529,7 @@ def runBUSCO(input, DB, cpus, tmpdir, output):
                 if line.startswith('#'):
                     continue
                 col = line.split('\t')
-                if col[1] == 'Complete':
+                if col[1] == 'Complete' or col[1] == 'Duplicated': #if diploid these should show up, but problematic for drawing trees....
                     if col[2].endswith('-T1'):
                         ID = col[2]
                     else: