Skip to content

Commit 74d0e3d

Browse files
Jon PalmerJon Palmer
authored andcommitted
update to tblastn/exonerate for speed and ploidy num
1 parent ba2bc85 commit 74d0e3d

File tree

3 files changed

+157
-83
lines changed

3 files changed

+157
-83
lines changed

bin/funannotate-p2g.py

Lines changed: 119 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
#!/usr/bin/env python
22

3-
import sys, os, subprocess, csv, shutil, inspect, itertools, argparse
3+
import sys, os, subprocess, shutil, inspect, itertools, argparse
44
from Bio import SeqIO
5+
from Bio.SeqIO.FastaIO import SimpleFastaParser
56
currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
67
parentdir = os.path.dirname(currentdir)
78
sys.path.insert(0,parentdir)
@@ -22,9 +23,9 @@ def __init__(self,prog):
2223
parser.add_argument('-o','--out', required=True, help='Final exonerate output file')
2324
parser.add_argument('--maxintron', default = 3000, help='Maximum intron size')
2425
parser.add_argument('--logfile', default ='funannotate-p2g.log', help='logfile')
26+
parser.add_argument('--ploidy', default =1, type=int, help='Ploidy of assembly')
2527
args=parser.parse_args()
2628

27-
2829
log_name = args.logfile
2930
if os.path.isfile(log_name):
3031
os.remove(log_name)
@@ -42,90 +43,153 @@ def __init__(self,prog):
4243
blast_version = blast_version.split(': ')[-1]
4344
lib.log.debug("BLAST v%s; Exonerate v%s" % (blast_version, exo_version))
4445

45-
def tblastnFilter(input, query, cpus, output):
46+
def runtblastn(input, query, cpus, output, maxhits):
4647
#start by formatting blast db/dustmasker filtered format
4748
cmd = ['dustmasker', '-in', input, '-infmt', 'fasta', '-parse_seqids', '-outfmt', 'maskinfo_asn1_bin', '-out', 'genome_dust.asnb']
4849
lib.runSubprocess(cmd, output, lib.log)
4950
cmd = ['makeblastdb', '-in', input, '-dbtype', 'nucl', '-parse_seqids', '-mask_data', 'genome_dust.asnb', '-out', 'genome']
5051
lib.runSubprocess(cmd, output, lib.log)
51-
cmd = ['tblastn', '-num_threads', str(cpus), '-db', 'genome', '-query', query, '-max_target_seqs', '1', '-db_soft_mask', '11', '-threshold', '999', '-max_intron_length', str(args.maxintron), '-evalue', '1e-10', '-outfmt', '6', '-out', 'filter.tblastn.tab']
52+
cmd = ['tblastn', '-num_threads', str(cpus), '-db', 'genome', '-query', query, '-max_target_seqs', str(maxhits), '-db_soft_mask', '11', '-threshold', '999', '-max_intron_length', str(args.maxintron), '-evalue', '1e-10', '-outfmt', '6', '-out', 'filter.tblastn.tab']
5253
lib.runSubprocess(cmd, output, lib.log)
5354

5455
def parseBlast(blastresult):
55-
global HitList
56-
HitList = []
57-
#now parse through results, generating a list for exonerate function
56+
Results = {}
5857
with open(blastresult, 'rU') as input:
59-
reader = csv.reader(input, delimiter='\t')
60-
for cols in reader:
61-
hit = cols[0] + '::' + cols[1]
62-
if hit not in HitList:
63-
HitList.append(hit)
58+
for line in input:
59+
cols = line.split('\t')
60+
hit = cols[0] + ':::' + cols[1]
61+
if int(cols[8]) < int(cols[9]):
62+
start = cols[8]
63+
end = cols[9]
64+
else:
65+
start = cols[9]
66+
end = cols[8]
67+
if not hit in Results:
68+
Results[hit] = (start, end)
69+
else:
70+
#get old start stop
71+
old = Results.get(hit)
72+
if int(start) < int(old[0]):
73+
newstart = start
74+
else:
75+
newstart = old[0]
76+
if int(end) > int(old[1]):
77+
newstop = end
78+
else:
79+
newstop = old[1]
80+
Results[hit] = (newstart, newstop)
81+
#convert Dictionary to a list that has hit:::scaffold:::start:::stop
82+
HitList = []
83+
for k,v in Results.items():
84+
finalhit = k+':::'+str(v[0])+':::'+str(v[1])
85+
HitList.append(finalhit)
86+
return HitList
6487

6588
def runExonerate(input):
66-
FNULL = open(os.devnull, 'w')
67-
s = input.split('::')
68-
if s[0].startswith('sp|'):
69-
name = s[0].split("|")[1] + '_' + s[1]
70-
else:
71-
name = s[0].split()[0] + '_' + s[1]
72-
query = os.path.join(tmpdir, name+'.fa')
89+
s = input.split(':::')
90+
ProtID = s[0]
91+
ScaffID = s[1]
92+
ScaffStart = int(s[2])
93+
ScaffEnd = int(s[3])
94+
#get the protein model
95+
query = os.path.join(tmpdir, ProtID+'.'+str(os.getpid())+'.fa')
7396
with open(query, 'w') as output:
74-
rec = record_dict[s[0]]
75-
output.write(">%s\n%s\n" % (rec.id, rec.seq))
76-
scaffold = s[1] + '.fa'
97+
SeqIO.write(protein_dict[ProtID], output, 'fasta')
98+
#now get the genome region, use different variable names for SeqRecords to avoid collision
99+
scaffold = ScaffID+'.'+ProtID+'.'+str(ScaffStart)+'-'+str(ScaffEnd)+'.fa'
77100
scaffold = os.path.join(tmpdir, scaffold)
78-
exonerate_out = 'exonerate_' + name + '.out'
101+
with open(scaffold, 'w') as output2:
102+
with open(os.path.join(tmpdir, 'scaffolds', ScaffID+'.fa'), 'rU') as fullscaff:
103+
for header, Sequence in SimpleFastaParser(fullscaff):
104+
#grab a 1 kb cushion on either side of hit region, careful of scaffold ends
105+
start = ScaffStart - 1000
106+
if start < 1:
107+
start = 1
108+
end = ScaffEnd + 1000
109+
if end > len(Sequence):
110+
end = len(Sequence)
111+
output2.write('>%s\n%s\n' % (header, Sequence[start:end]))
112+
exoname = ProtID+'.'+ScaffID+'__'+str(start)+'__'
113+
#check that input files are created and valid
114+
exonerate_out = 'exonerate.' + exoname + '.out'
79115
exonerate_out = os.path.join(tmpdir, exonerate_out)
80116
ryo = "AveragePercentIdentity: %pi\n"
81-
with open(exonerate_out, 'w') as output:
82-
subprocess.call(['exonerate', '--model', 'p2g', '--showvulgar', 'no', '--showalignment', 'no', '--showquerygff', 'no', '--showtargetgff', 'yes', '--maxintron', str(args.maxintron), '--percent', '80', '--ryo', ryo , query, scaffold], stdout = output, stderr = FNULL)
83-
os.remove(query)
84-
#check filesize of exonerate output, no hits are 285 bytes, but lets just filter everything smaller than 310
85-
if lib.getSize(exonerate_out) < 310:
117+
cmd = ['exonerate', '--model', 'p2g', '--showvulgar', 'no', '--showalignment', 'no', '--showquerygff', 'no', '--showtargetgff', 'yes', '--maxintron', str(args.maxintron), '--percent', '80', '--ryo', ryo , query, scaffold]
118+
#run exonerate, capture errors
119+
with open(exonerate_out, 'w') as output3:
120+
proc = subprocess.Popen(cmd, stdout = output3, stderr=subprocess.PIPE)
121+
stderr = proc.communicate()
122+
if 'WARNING' in stderr[1]:
123+
lib.log.debug('%s, Len=%i, %i-%i; %i-%i' % (header, len(Sequence), ScaffStart, ScaffEnd, start, end))
124+
os.rename(query, os.path.join(tmpdir, 'failed', os.path.basename(query)))
125+
os.rename(scaffold, os.path.join(tmpdir, 'failed', os.path.basename(scaffold)))
126+
else:
127+
for y in [query, scaffold]:
128+
try:
129+
os.remove(y)
130+
except OSError:
131+
lib.log.debug("Error removing %s" % (y))
132+
#check filesize of exonerate output, no hits still have some output data in them, should be safe dropping anything smaller than 500 bytes
133+
if lib.getSize(exonerate_out) < 500:
86134
os.remove(exonerate_out)
87-
135+
88136
#make tmpdir
89137
tmpdir = 'p2g_'+ str(os.getpid())
90138
if not os.path.isdir(tmpdir):
91139
os.makedirs(tmpdir)
92-
140+
os.makedirs(os.path.join(tmpdir, 'failed'))
141+
os.makedirs(os.path.join(tmpdir, 'scaffolds'))
142+
#check for tblastn input
93143
if args.tblastn:
94144
lib.log.info("Using pre-calculated tBLASTN result")
95145
BlastResult = args.tblastn
96146
else:
97147
lib.log.info("Running pre-filter tBlastn step")
98148
BlastResult = os.path.join(tmpdir, 'filter.tblastn.tab')
99-
tblastnFilter(os.path.abspath(args.genome), os.path.abspath(args.proteins), args.cpus, tmpdir)
149+
runtblastn(os.path.abspath(args.genome), os.path.abspath(args.proteins), args.cpus, tmpdir, args.ploidy*2) #2X ploidy for tBLASTn filter
150+
151+
#new routine
152+
Hits = parseBlast(BlastResult)
153+
lib.log.info("Found %i preliminary alignments" % (len(Hits)))
100154

101-
#parse the results
102-
parseBlast(BlastResult)
103-
lib.log.info("found %i preliminary alignments" % (len(HitList)))
155+
#index the genome and proteins
156+
protein_dict = SeqIO.index(os.path.abspath(args.proteins), 'fasta') #do index here in case memory problems?
104157

105158
#split genome fasta into individual scaffolds
106-
if not os.path.exists(tmpdir):
107-
os.makedirs(tmpdir)
108159
with open(os.path.abspath(args.genome), 'rU') as input:
109160
for record in SeqIO.parse(input, "fasta"):
110-
SeqIO.write(record, os.path.join(tmpdir, record.id + ".fa"), "fasta")
111-
112-
#Now run exonerate on hits
113-
lib.log.info("Polishing alignments with Exonerate")
114-
record_dict = SeqIO.index(os.path.abspath(args.proteins), 'fasta') #do index here in case memory problems?
161+
SeqIO.write(record, os.path.join(tmpdir, 'scaffolds', record.id + ".fa"), "fasta")
115162

116163
#run multiprocessing exonerate
117-
lib.runMultiProgress(runExonerate, HitList, args.cpus)
118-
lib.log.info("Exonerate finished")
119-
120-
#now collect all exonerate results into one
121-
with open(args.out, 'wb') as output:
122-
for root, dirs, files in os.walk(tmpdir):
123-
for file in files:
124-
if file.endswith('.out'):
125-
filename = os.path.join(root, file)
126-
with open(filename, 'rU') as readfile:
127-
for line in itertools.islice(readfile, 3, None):
164+
lib.runMultiProgress(runExonerate, Hits, args.cpus)
165+
166+
#now need to loop through and offset exonerate predictions back to whole scaffolds
167+
with open(args.out, 'w') as output:
168+
for file in os.listdir(tmpdir):
169+
if file.endswith('.out'):
170+
with open(os.path.join(tmpdir, file), 'rU') as exoresult:
171+
offset = int(file.split('__')[1])
172+
for line in itertools.islice(exoresult, 3, None):
173+
if line.startswith('#') or line.startswith('Average') or line.startswith('-- completed'):
128174
output.write(line)
129-
130-
#finally clean-up your mess
131-
shutil.rmtree(tmpdir)
175+
else:
176+
cols = line.split('\t')
177+
cols[3] = str(int(cols[3])+offset)
178+
cols[4] = str(int(cols[4])+offset)
179+
output.write('\t'.join(cols))
180+
181+
#output some quick summary of exonerate alignments that you found
182+
Found = lib.countGFFgenes(args.out)
183+
lib.log.info("Exonerate finished: found %i alignments" % Found)
184+
185+
#finally clean-up your mess if failed is empty
186+
try:
187+
os.rmdir(os.path.join(tmpdir, 'failed'))
188+
empty = True
189+
except OSError:
190+
empty = False
191+
if empty:
192+
shutil.rmtree(tmpdir)
193+
else:
194+
lib.log.error("Failed exonerate alignments found, see files in %s" % os.path.join(tmpdir, 'failed'))
195+
sys.exit(1)

bin/funannotate-predict.py

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ def __init__(self, prog):
3737
parser.add_argument('--max_intronlen', default=3000, help='Maximum intron length for gene models')
3838
parser.add_argument('--min_protlen', default=50, type=int, help='Minimum amino acid length for valid gene model')
3939
parser.add_argument('--keep_no_stops', action='store_true', help='Keep gene models without valid stop codons')
40+
parser.add_argument('--ploidy', default=1, type=int, help='Ploidy of assembly')
4041
parser.add_argument('--cpus', default=2, type=int, help='Number of CPUs to use')
4142
parser.add_argument('--busco_seed_species', default='anidulans', help='Augustus species to use as initial training point for BUSCO')
4243
parser.add_argument('--optimize_augustus', action='store_true', help='Run "long" training of Augustus')
@@ -375,24 +376,23 @@ def __init__(self, prog):
375376
if os.path.isfile(prot_temp):
376377
shutil.copyfile(prot_temp, prot_temp+'.old')
377378
if ',' in args.protein_evidence:
378-
files = args.protein_evidence.split(",")
379-
with open(prot_temp, 'w') as output:
380-
for f in files:
381-
with open(f) as input:
382-
output.write(input.read())
379+
prot_files = args.protein_evidence.split(",")
383380
else:
384-
shutil.copyfile(args.protein_evidence, prot_temp)
381+
prot_files = [args.protein_evidence]
382+
#clean up headers, etc
383+
lib.cleanProteins(prot_files, prot_temp)
385384
#run funannotate-p2g to map to genome
386-
lib.log.info("Mapping proteins to genome using tBlastn/Exonerate")
387-
p2g_cmd = [sys.executable, P2G, '-p', prot_temp, '-g', MaskGenome, '-o', p2g_out, '--maxintron', str(args.max_intronlen), '--cpus', str(args.cpus), '--logfile', os.path.join(args.out, 'logfiles', 'funannotate-p2g.log')]
385+
p2g_cmd = [sys.executable, P2G, '-p', prot_temp, '-g', MaskGenome, '-o', p2g_out, '--maxintron', str(args.max_intronlen), '--cpus', str(args.cpus), '--ploidy', str(args.ploidy), '--logfile', os.path.join(args.out, 'logfiles', 'funannotate-p2g.log')]
388386
#check if protein evidence is same as old evidence
389387
if os.path.isfile(prot_temp+'.old'):
390388
if not lib.sha256_check(prot_temp, prot_temp+'.old'):
389+
lib.log.info("Mapping proteins to genome using tBlastn/Exonerate")
391390
subprocess.call(p2g_cmd)
392391
else:
393392
lib.log.info("Using existing protein evidence alignments")
394393
os.remove(prot_temp+'.old')
395394
if not os.path.isfile(p2g_out):
395+
lib.log.info("Mapping proteins to genome using tBlastn/Exonerate")
396396
subprocess.call(p2g_cmd)
397397
exonerate_out = os.path.abspath(p2g_out)
398398
else:

lib/library.py

Lines changed: 30 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -369,25 +369,35 @@ def runMultiProgress(function, inputList, cpus):
369369
p.close()
370370
p.join()
371371

372-
def update_progress(progress):
373-
barLength = 30 # Modify this to change the length of the progress bar
374-
status = ""
375-
if isinstance(progress, int):
376-
progress = float(progress)
377-
if not isinstance(progress, float):
378-
progress = 0
379-
status = "error: progress var must be float\r\n"
380-
if progress < 0:
381-
progress = 0
382-
status = "Halt...\r\n"
383-
if progress >= 1:
384-
progress = 1
385-
status = "Done...\r\n"
386-
block = int(round(barLength*progress))
387-
text = "\r IPR progress: [{0}] {1:.2f}% {2}".format( "#"*block + "-"*(barLength-block), progress*100, status)
388-
sys.stdout.write(text)
389-
sys.stdout.flush()
390-
372+
def cleanProteins(inputList, output):
373+
#expecting a list of protein fasta files for combining/cleaning headers
374+
#make sure you aren't duplicated sequences names
375+
seen = set()
376+
with open(output, 'w') as out:
377+
for x in inputList:
378+
with open(x, 'rU') as input:
379+
for rec in SeqIO.parse(input, 'fasta'):
380+
#explicitly check for swissprot and jgi
381+
if rec.id.startswith('sp|') or rec.id.startswith('jgi|'):
382+
ID = rec.id.split('|')[-1]
383+
else:
384+
ID = rec.id
385+
#now clean up the shit
386+
badshit = [':', ';', '/', '\\', '.', ',', '%']
387+
for i in badshit:
388+
if i in ID:
389+
ID = ID.replace(i, '_')
390+
if not ID in seen:
391+
seen.add(ID)
392+
else:
393+
ID = ID+'_1'
394+
if not ID in set:
395+
seen.add(ID)
396+
else:
397+
num = int(ID.split('_')[1])
398+
ID = ID.split('_')[0]+str(num+1)
399+
out.write('>%s\n%s\n' % (ID, rec.seq))
400+
391401
def gb2output(input, output1, output2, output3):
392402
with open(output1, 'w') as proteins:
393403
with open(output2, 'w') as transcripts:
@@ -519,7 +529,7 @@ def runBUSCO(input, DB, cpus, tmpdir, output):
519529
if line.startswith('#'):
520530
continue
521531
col = line.split('\t')
522-
if col[1] == 'Complete':
532+
if col[1] == 'Complete' or col[1] == 'Duplicated': #if diploid these should show up, but problematic for drawing trees....
523533
if col[2].endswith('-T1'):
524534
ID = col[2]
525535
else:

0 commit comments

Comments
 (0)