Skip to content

Commit 6e454e6

Browse files
Jon Palmer
Jon Palmer
authored and
Jon Palmer
committed
updates to v0.1.8
1 parent 9854bc6 commit 6e454e6

File tree

4 files changed

+271
-171
lines changed

4 files changed

+271
-171
lines changed

bin/funannotate-compare.py

+146-84
Original file line numberDiff line numberDiff line change
@@ -96,7 +96,7 @@ def __init__(self,prog):
9696
lib.copyDirectory(os.path.join(parentdir, 'html_template', 'css'), os.path.join(args.out, 'css'))
9797
if not os.path.isdir(os.path.join(args.out, 'js')):
9898
lib.copyDirectory(os.path.join(parentdir, 'html_template', 'js'), os.path.join(args.out, 'js'))
99-
99+
100100
#loop through each genome
101101
stats = []
102102
merops = []
@@ -106,6 +106,7 @@ def __init__(self,prog):
106106
eggnog = []
107107
busco = []
108108
gbkfilenames = []
109+
scinames = []
109110
num_input = len(args.input)
110111
if num_input == 0:
111112
lib.log.error("Error, you did not specify an input, -i")
@@ -142,22 +143,24 @@ def __init__(self,prog):
142143
lib.parseGOterms(GBK, go_folder, stats[i][0].replace(' ', '_'))
143144
lib.gb2proteinortho(GBK, protortho, stats[i][0].replace(' ', '_'))
144145
eggnog.append(lib.getEggNogfromNote(GBK))
145-
146-
#convert eggnog to a single dictionary for lookup later
147-
EGGNOG = { k: v for d in eggnog for k, v in d.items() }
146+
scinames.append(stats[i][0].replace(' ', '_'))
148147

149148
#convert busco to dictionary
150-
busco = lib.dictFlip(busco)
149+
busco = lib.busco_dictFlip(busco)
151150

152151
#add species names to pandas table
153152
names = []
154153
for i in stats:
155154
sci_name = i[0]
156-
genus = sci_name.split(' ')[0]
157-
species = ' '.join(sci_name.split(' ')[1:])
158-
abbrev = genus[:1] + '.'
159-
final_name = abbrev + ' ' + species
160-
names.append(final_name)
155+
if '_' in sci_name: #here I'm assuming that somebody used an abbreviated name and an underscore, this would be atypical I think
156+
names.append(sci_name)
157+
else:
158+
genus = sci_name.split(' ')[0]
159+
species = ' '.join(sci_name.split(' ')[1:])
160+
abbrev = genus[:1] + '.'
161+
final_name = abbrev + ' ' + species
162+
names.append(final_name)
163+
161164

162165
#PFAM#############################################
163166
lib.log.info("Summarizing PFAM domain results")
@@ -170,8 +173,11 @@ def __init__(self,prog):
170173
pfamdf['species'] = names
171174
pfamdf.set_index('species', inplace=True)
172175

176+
#remove any "empty" genomes
177+
pfamdf = pfamdf[(pfamdf.T != 0).any()]
178+
173179
#make an nmds
174-
if len(args.input) > 1:
180+
if len(pfamdf.index) > 1: #make sure number of species is at least two
175181
lib.distance2mds(pfamdf, 'braycurtis', 'PFAM', os.path.join(args.out, 'pfam','PFAM.nmds.pdf'))
176182

177183
#get the PFAM descriptions
@@ -195,7 +201,7 @@ def __init__(self,prog):
195201
output.write(lib.FOOTER)
196202

197203
##################################################
198-
204+
199205
####InterProScan##################################
200206
lib.log.info("Summarizing InterProScan results")
201207
if not os.path.isdir(os.path.join(args.out, 'interpro')):
@@ -207,14 +213,21 @@ def __init__(self,prog):
207213
IPRdf['species'] = names
208214
IPRdf.set_index('species', inplace=True)
209215

216+
#some checking here of data, if genome is missing, i.e. counts are zero, drop it
217+
#print IPRdf
218+
#print len(IPRdf.columns)
219+
IPRdf = IPRdf[(IPRdf.T != 0).any()]
220+
#print len(IPRdf.index)
221+
210222
#analysis of InterPro Domains
211223
#get IPR descriptions
224+
lib.log.info("Loading InterPro descriptions")
212225
INTERPRO = lib.iprxml2dict(os.path.join(parentdir, 'DB', 'interpro.xml'))
213226
#NMDS
214-
if len(args.input) > 1:
215-
if len(IPRdf.columns) > 1:
227+
if len(IPRdf.index) > 1: #count number of species
228+
if len(IPRdf.columns) > 1: #count number of IPR domains
216229
lib.distance2mds(IPRdf, 'braycurtis', 'InterProScan', os.path.join(args.out, 'interpro', 'InterProScan.nmds.pdf'))
217-
230+
218231
#write to csv file
219232
ipr2 = IPRdf.transpose()
220233
ipr_desc = []
@@ -232,12 +245,12 @@ def __init__(self,prog):
232245
output.write(lib.HEADER)
233246
output.write(lib.INTERPRO)
234247
if len(IPRdf.columns) > 1:
235-
output.write(ipr2.to_html(index=False, escape=False, classes='table table-hover'))
248+
if len(IPRdf.index) > 1:
249+
output.write(ipr2.to_html(index=False, escape=False, classes='table table-hover'))
236250
output.write(lib.FOOTER)
237251

238252
##############################################
239253

240-
241254
####MEROPS################################
242255
lib.log.info("Summarizing MEROPS protease results")
243256
if not os.path.isdir(os.path.join(args.out, 'merops')):
@@ -388,7 +401,6 @@ def __init__(self,prog):
388401
output.write(lib.FOOTER)
389402
########################################################
390403

391-
392404
####GO Terms, GO enrichment############################
393405
if not os.path.isdir(os.path.join(args.out, 'go_enrichment')):
394406
os.makedirs(os.path.join(args.out, 'go_enrichment'))
@@ -444,7 +456,7 @@ def __init__(self,prog):
444456
else:
445457
output.write('<table border="1" class="dataframe table table-hover">\n<th>No enrichment found</th></table>')
446458
output.write(lib.FOOTER)
447-
459+
448460
####################################################
449461

450462
##ProteinOrtho################################
@@ -455,78 +467,126 @@ def __init__(self,prog):
455467
lib.log.info("Running orthologous clustering tool, ProteinOrtho5. This may take awhile...")
456468
#setup protein ortho inputs, some are a bit strange in the sense that they use equals signs
457469
log = os.path.join(protortho, 'proteinortho.log')
458-
#get list of files in folder
470+
471+
#generate list of files based on input order for consistency
459472
filelist = []
460-
for file in os.listdir(protortho):
461-
if file.endswith('.faa'):
462-
filelist.append(file)
473+
for i in stats:
474+
name = i[0].replace(' ', '_')
475+
name = name+'.faa'
476+
filelist.append(name)
463477
fileinput = ' '.join(filelist)
478+
#print fileinput
464479
cmd = ['proteinortho5.pl', '-project=funannotate', '-synteny', '-cpus='+str(args.cpus), '-singles', '-selfblast']
465480
cmd2 = cmd + filelist
466481
if not os.path.isfile(os.path.join(args.out, 'protortho', 'funannotate.poff')):
467482
with open(log, 'w') as logfile:
468483
subprocess.call(cmd2, cwd = protortho, stderr = logfile, stdout = logfile)
469484

470-
#now process the output, get # of singletons per genome, total orthologs, single-copy orthologs and append to stats, output text file with groups
485+
#open poff in pandas to parse "easier" for stats, orthologs, etc
486+
df = pd.read_csv(os.path.join(args.out, 'protortho', 'funannotate.poff'), sep='\t', header=0)
487+
df.rename(columns=lambda x: x.replace('.faa', ''), inplace=True)
488+
#reorder table so it matches up with busco list of dicts
489+
newhead = [df.columns.values[0], df.columns.values[1], df.columns.values[2]]
490+
newhead += scinames
491+
df = df[newhead]
492+
#write to file (not sure I need this now?)
493+
#df.to_csv(os.path.join(args.out, 'protortho', 'funannotate_reorder.poff'), sep='\t', index=False)
494+
#now filter table to only single copy orthologs to use with phylogeny
495+
num_species = len(df.columns) - 3
496+
sco = df[(df['# Species'] == num_species) & (df['Genes'] == num_species)]
497+
sco_hits = sco.drop(sco.columns[0:3], axis=1)
498+
#now cross reference with busco, as we want this for phylogeny
499+
keep = []
500+
sc_buscos = []
501+
for index, row in sco_hits.iterrows():
502+
busco_check = []
503+
for i in range(0, num_species):
504+
if row[i] in busco[i]:
505+
busco_check.append(busco[i].get(row[i]))
506+
busco_check = lib.flatten(busco_check)
507+
#need to check if outgroup is passed and this model exists in that outgroup
508+
if len(set(busco_check)) == 1:
509+
if args.outgroup:
510+
available_busco = []
511+
with open(outgroup_species, 'rU') as outfasta:
512+
for line in outfasta:
513+
if line.startswith('>'):
514+
line = line.replace('\n', '')
515+
name = line.replace('>', '')
516+
available_busco.append(name)
517+
if busco_check[0] in available_busco:
518+
keep.append(index)
519+
sc_buscos.append(busco_check[0])
520+
else:
521+
keep.append(index)
522+
sco_final = sco_hits.ix[keep]
523+
524+
#take dataframe and output the ortholog table.
525+
dftrim = df.drop(df.columns[0:3], axis=1) #trim down to just gene models
526+
orthdf = df[(df['# Species'] > 1)] #get rid of singletons in this dataset
527+
orth_hits = orthdf.drop(orthdf.columns[0:3], axis=1) #trim to just gene models
528+
471529
orthologs = os.path.join(args.out, 'annotations','orthology_groups.txt')
472530
with open(orthologs, 'w') as output:
473-
with open(os.path.join(args.out, 'protortho', 'funannotate.poff'), 'rU') as input:
474-
count = 0
475-
scoCount = 0
476-
for line in input:
477-
line = line.replace('\n', '') #strip line ending
478-
if line.startswith('#'):
479-
header = line
480-
species = header.split('\t')[3:]
481-
num_species = header.count('\t') - 2
482-
continue
483-
col = re.split(r'[,\t]', line)
484-
if col[0] != '1':
485-
count +=1
486-
ID = 'orth'+str(count)
487-
prots = col[3:]
488-
prots = [x for x in prots if x != '*']
489-
eggs = []
490-
buscos = []
491-
for i in prots:
492-
hit = EGGNOG.get(i)
493-
if not hit in eggs:
494-
eggs.append(hit)
495-
hit2 = busco.get(i)
496-
if not hit2 in buscos:
497-
buscos.append(hit2)
498-
eggs = [x for x in eggs if x is not None]
499-
buscos = [x for x in buscos if x is not None]
500-
buscos = lib.flatten(buscos)
501-
if len(eggs) > 0:
502-
eggs = ', '.join(str(v) for v in eggs)
503-
else:
504-
eggs = 'None'
505-
if len(buscos) > 0:
506-
buscos = set(buscos)
507-
buscos = ', '.join(str(v) for v in buscos)
508-
else:
509-
buscos = 'None'
510-
if col[0] == str(num_species) and col[1] == str(num_species):
511-
scoCount += 1
512-
output.write("%s\t%s\t%s\t%s\n" % (ID, eggs, buscos, ', '.join(prots)))
531+
#should be able to parse the pandas ortho dataframe now
532+
for index, row in orth_hits.iterrows():
533+
ID = 'orth'+str(index)
534+
buscos = []
535+
eggs = []
536+
proteins = []
537+
for x in range(0, len(row)):
538+
if row[x] != '*':
539+
prots = row[x].split(',')
540+
for y in prots:
541+
proteins.append(y)
542+
egghit = eggnog[x].get(y)
543+
if not egghit in eggs:
544+
eggs.append(egghit)
545+
buscohit = busco[x].get(y)
546+
if not buscohit in buscos:
547+
buscos.append(buscohit)
548+
#clean up the None's that get added
549+
eggs = [x for x in eggs if x is not None]
550+
buscos = [x for x in buscos if x is not None]
551+
buscos = lib.flatten(buscos)
552+
553+
#write to output
554+
if len(eggs) > 0:
555+
eggs = ', '.join(str(v) for v in eggs)
556+
else:
557+
eggs = 'None'
558+
if len(buscos) > 0:
559+
buscos = set(buscos)
560+
buscos = ', '.join(str(v) for v in buscos)
561+
else:
562+
buscos = 'None'
563+
output.write("%s\t%s\t%s\t%s\n" % (ID, eggs, buscos, ', '.join(proteins)))
513564

514565
if not os.path.isdir(os.path.join(args.out, 'stats')):
515566
os.makedirs(os.path.join(args.out, 'stats'))
516567
summary = []
517-
for i in stats:
518-
try:
519-
singles = lib.singletons(os.path.join(args.out, 'protortho', 'funannotate.poff'), i[0])
520-
except IOError:
521-
singles = 0
522-
i.append("{0:,}".format(singles))
523-
try:
524-
orthos = lib.orthologs(os.path.join(args.out, 'protortho', 'funannotate.poff'), i[0])
525-
except IOError:
526-
orthos = 0
527-
i.append("{0:,}".format(orthos))
528-
i.append("{0:,}".format(scoCount))
529-
summary.append(i)
568+
#get stats, this is all single copy orthologs
569+
scoCount = len(sco_hits)
570+
for i in range(0, len(stats)):
571+
orthos = 0
572+
for index, row in orth_hits[scinames[i]].iteritems():
573+
if row != '*':
574+
add = row.count(',') + 1
575+
orthos += add
576+
singletons = 0
577+
for index, row in dftrim.iterrows():
578+
if row[scinames[i]] != '*':
579+
others = []
580+
for y in range(0, len(row)):
581+
others.append(row[y])
582+
others = set(others)
583+
if len(others) == 2:
584+
singletons += 1
585+
stats[i].append("{0:,}".format(singletons))
586+
stats[i].append("{0:,}".format(orthos))
587+
stats[i].append("{0:,}".format(scoCount))
588+
summary.append(stats[i])
589+
530590
#convert to dataframe for easy output
531591
header = ['species', 'isolate', 'Assembly Size', 'Largest Scaffold', 'Average Scaffold', 'Num Scaffolds', 'Scaffold N50', 'Percent GC', 'Num Genes', 'Num Proteins', 'Num tRNA', 'Unique Proteins', 'Prots atleast 1 ortholog', 'Single-copy orthologs']
532592
df = pd.DataFrame(summary, columns=header)
@@ -539,6 +599,7 @@ def __init__(self,prog):
539599
output.write(df.transpose().to_html(classes='table table-condensed'))
540600
output.write(lib.FOOTER)
541601
############################################
602+
542603
######summarize all annotation for each gene in a table
543604
lib.log.info("Compiling all annotations for each genome")
544605

@@ -574,14 +635,13 @@ def __init__(self,prog):
574635
meropsDict = lib.dictFlip(merops)
575636
cazyDict = lib.dictFlip(cazy)
576637

577-
578638
table = []
579639
header = ['GeneID','length','description', 'Ortho Group', 'EggNog', 'BUSCO','Protease family', 'CAZyme family', 'InterPro Domains', 'PFAM Domains', 'GO terms', 'SecMet Cluster', 'SMCOG']
580-
for i in range(0,num_input):
581-
outputname = os.path.join(args.out, 'annotations', stats[i][0].replace(' ', '_')+'.all.annotations.tsv')
640+
for y in range(0,num_input):
641+
outputname = os.path.join(args.out, 'annotations', scinames[y]+'.all.annotations.tsv')
582642
with open(outputname, 'w') as output:
583643
output.write("%s\n" % ('\t'.join(header)))
584-
with open(gbkfilenames[i], 'rU') as input:
644+
with open(gbkfilenames[y], 'rU') as input:
585645
SeqRecords = SeqIO.parse(input, 'genbank')
586646
for record in SeqRecords:
587647
for f in record.features:
@@ -608,8 +668,8 @@ def __init__(self,prog):
608668
cazydomains = "; ".join(cazyDict.get(ID))
609669
else:
610670
cazydomains = ''
611-
if ID in busco:
612-
buscogroup = busco.get(ID)[0]
671+
if ID in busco[y]:
672+
buscogroup = busco[y].get(ID)[0]
613673
else:
614674
buscogroup = ''
615675
if ID in goDict:
@@ -635,6 +695,7 @@ def __init__(self,prog):
635695
final_result = [ID, str(length), description, orthogroup, egg, buscogroup, meropsdomains, cazydomains, IPRdomains, pfamdomains, goTerms, cluster, smcog]
636696
output.write("%s\n" % ('\t'.join(final_result)))
637697
############################################
698+
638699
#build phylogeny
639700
if not os.path.isfile(os.path.join(args.out, 'phylogeny', 'RAxML.phylogeny.pdf')):
640701
if outgroup:
@@ -643,7 +704,8 @@ def __init__(self,prog):
643704
num_phylogeny = len(args.input)
644705
if num_phylogeny > 3:
645706
lib.log.info("Inferring phylogeny using RAxML")
646-
lib.ortho2phylogeny(os.path.join(args.out, 'protortho', 'funannotate.poff'), args.num_orthos, busco, args.cpus, args.bootstrap, phylogeny, outgroup, outgroup_species, outgroup_name)
707+
folder = os.path.join(args.out, 'protortho')
708+
lib.ortho2phylogeny(folder, sco_final, args.num_orthos, busco, args.cpus, args.bootstrap, phylogeny, outgroup, outgroup_species, outgroup_name, sc_buscos)
647709
else:
648710
lib.log.info("Skipping RAxML phylogeny as at least 4 taxa are required")
649711
with open(os.path.join(args.out,'phylogeny.html'), 'w') as output:

0 commit comments

Comments
 (0)