@@ -96,7 +96,7 @@ def __init__(self,prog):
96
96
lib .copyDirectory (os .path .join (parentdir , 'html_template' , 'css' ), os .path .join (args .out , 'css' ))
97
97
if not os .path .isdir (os .path .join (args .out , 'js' )):
98
98
lib .copyDirectory (os .path .join (parentdir , 'html_template' , 'js' ), os .path .join (args .out , 'js' ))
99
-
99
+
100
100
#loop through each genome
101
101
stats = []
102
102
merops = []
@@ -106,6 +106,7 @@ def __init__(self,prog):
106
106
eggnog = []
107
107
busco = []
108
108
gbkfilenames = []
109
+ scinames = []
109
110
num_input = len (args .input )
110
111
if num_input == 0 :
111
112
lib .log .error ("Error, you did not specify an input, -i" )
@@ -142,22 +143,24 @@ def __init__(self,prog):
142
143
lib .parseGOterms (GBK , go_folder , stats [i ][0 ].replace (' ' , '_' ))
143
144
lib .gb2proteinortho (GBK , protortho , stats [i ][0 ].replace (' ' , '_' ))
144
145
eggnog .append (lib .getEggNogfromNote (GBK ))
145
-
146
- #convert eggnog to a single dictionary for lookup later
147
- EGGNOG = { k : v for d in eggnog for k , v in d .items () }
146
+ scinames .append (stats [i ][0 ].replace (' ' , '_' ))
148
147
149
148
#convert busco to dictionary
150
- busco = lib .dictFlip (busco )
149
+ busco = lib .busco_dictFlip (busco )
151
150
152
151
#add species names to pandas table
153
152
names = []
154
153
for i in stats :
155
154
sci_name = i [0 ]
156
- genus = sci_name .split (' ' )[0 ]
157
- species = ' ' .join (sci_name .split (' ' )[1 :])
158
- abbrev = genus [:1 ] + '.'
159
- final_name = abbrev + ' ' + species
160
- names .append (final_name )
155
+ if '_' in sci_name : #here I'm assuming that somebody used an abbreviated name and an underscore, this would be atypical I think
156
+ names .append (sci_name )
157
+ else :
158
+ genus = sci_name .split (' ' )[0 ]
159
+ species = ' ' .join (sci_name .split (' ' )[1 :])
160
+ abbrev = genus [:1 ] + '.'
161
+ final_name = abbrev + ' ' + species
162
+ names .append (final_name )
163
+
161
164
162
165
#PFAM#############################################
163
166
lib .log .info ("Summarizing PFAM domain results" )
@@ -170,8 +173,11 @@ def __init__(self,prog):
170
173
pfamdf ['species' ] = names
171
174
pfamdf .set_index ('species' , inplace = True )
172
175
176
+ #remove any "empty" genomes
177
+ pfamdf = pfamdf [(pfamdf .T != 0 ).any ()]
178
+
173
179
#make an nmds
174
- if len (args . input ) > 1 :
180
+ if len (pfamdf . index ) > 1 : #make sure number of species is at least two
175
181
lib .distance2mds (pfamdf , 'braycurtis' , 'PFAM' , os .path .join (args .out , 'pfam' ,'PFAM.nmds.pdf' ))
176
182
177
183
#get the PFAM descriptions
@@ -195,7 +201,7 @@ def __init__(self,prog):
195
201
output .write (lib .FOOTER )
196
202
197
203
##################################################
198
-
204
+
199
205
####InterProScan##################################
200
206
lib .log .info ("Summarizing InterProScan results" )
201
207
if not os .path .isdir (os .path .join (args .out , 'interpro' )):
@@ -207,14 +213,21 @@ def __init__(self,prog):
207
213
IPRdf ['species' ] = names
208
214
IPRdf .set_index ('species' , inplace = True )
209
215
216
+ #some checking here of data, if genome is missing, i.e. counts are zero, drop it
217
+ #print IPRdf
218
+ #print len(IPRdf.columns)
219
+ IPRdf = IPRdf [(IPRdf .T != 0 ).any ()]
220
+ #print len(IPRdf.index)
221
+
210
222
#analysis of InterPro Domains
211
223
#get IPR descriptions
224
+ lib .log .info ("Loading InterPro descriptions" )
212
225
INTERPRO = lib .iprxml2dict (os .path .join (parentdir , 'DB' , 'interpro.xml' ))
213
226
#NMDS
214
- if len (args . input ) > 1 :
215
- if len (IPRdf .columns ) > 1 :
227
+ if len (IPRdf . index ) > 1 : #count number of species
228
+ if len (IPRdf .columns ) > 1 : #count number of IPR domains
216
229
lib .distance2mds (IPRdf , 'braycurtis' , 'InterProScan' , os .path .join (args .out , 'interpro' , 'InterProScan.nmds.pdf' ))
217
-
230
+
218
231
#write to csv file
219
232
ipr2 = IPRdf .transpose ()
220
233
ipr_desc = []
@@ -232,12 +245,12 @@ def __init__(self,prog):
232
245
output .write (lib .HEADER )
233
246
output .write (lib .INTERPRO )
234
247
if len (IPRdf .columns ) > 1 :
235
- output .write (ipr2 .to_html (index = False , escape = False , classes = 'table table-hover' ))
248
+ if len (IPRdf .index ) > 1 :
249
+ output .write (ipr2 .to_html (index = False , escape = False , classes = 'table table-hover' ))
236
250
output .write (lib .FOOTER )
237
251
238
252
##############################################
239
253
240
-
241
254
####MEROPS################################
242
255
lib .log .info ("Summarizing MEROPS protease results" )
243
256
if not os .path .isdir (os .path .join (args .out , 'merops' )):
@@ -388,7 +401,6 @@ def __init__(self,prog):
388
401
output .write (lib .FOOTER )
389
402
########################################################
390
403
391
-
392
404
####GO Terms, GO enrichment############################
393
405
if not os .path .isdir (os .path .join (args .out , 'go_enrichment' )):
394
406
os .makedirs (os .path .join (args .out , 'go_enrichment' ))
@@ -444,7 +456,7 @@ def __init__(self,prog):
444
456
else :
445
457
output .write ('<table border="1" class="dataframe table table-hover">\n <th>No enrichment found</th></table>' )
446
458
output .write (lib .FOOTER )
447
-
459
+
448
460
####################################################
449
461
450
462
##ProteinOrtho################################
@@ -455,78 +467,126 @@ def __init__(self,prog):
455
467
lib .log .info ("Running orthologous clustering tool, ProteinOrtho5. This may take awhile..." )
456
468
#set up ProteinOrtho inputs; some are a bit strange in the sense that they use equals signs
457
469
log = os .path .join (protortho , 'proteinortho.log' )
458
- #get list of files in folder
470
+
471
+ #generate list of files based on input order for consistency
459
472
filelist = []
460
- for file in os .listdir (protortho ):
461
- if file .endswith ('.faa' ):
462
- filelist .append (file )
473
+ for i in stats :
474
+ name = i [0 ].replace (' ' , '_' )
475
+ name = name + '.faa'
476
+ filelist .append (name )
463
477
fileinput = ' ' .join (filelist )
478
+ #print fileinput
464
479
cmd = ['proteinortho5.pl' , '-project=funannotate' , '-synteny' , '-cpus=' + str (args .cpus ), '-singles' , '-selfblast' ]
465
480
cmd2 = cmd + filelist
466
481
if not os .path .isfile (os .path .join (args .out , 'protortho' , 'funannotate.poff' )):
467
482
with open (log , 'w' ) as logfile :
468
483
subprocess .call (cmd2 , cwd = protortho , stderr = logfile , stdout = logfile )
469
484
470
- #now process the output, get # of singletons per genome, total orthologs, single-copy orthologs and append to stats, output text file with groups
485
+ #open poff in pandas to parse "easier" for stats, orthologs, etc
486
+ df = pd .read_csv (os .path .join (args .out , 'protortho' , 'funannotate.poff' ), sep = '\t ' , header = 0 )
487
+ df .rename (columns = lambda x : x .replace ('.faa' , '' ), inplace = True )
488
+ #reorder table so it matches up with busco list of dicts
489
+ newhead = [df .columns .values [0 ], df .columns .values [1 ], df .columns .values [2 ]]
490
+ newhead += scinames
491
+ df = df [newhead ]
492
+ #write to file (not sure I need this now?)
493
+ #df.to_csv(os.path.join(args.out, 'protortho', 'funannotate_reorder.poff'), sep='\t', index=False)
494
+ #now filter table to only single copy orthologs to use with phylogeny
495
+ num_species = len (df .columns ) - 3
496
+ sco = df [(df ['# Species' ] == num_species ) & (df ['Genes' ] == num_species )]
497
+ sco_hits = sco .drop (sco .columns [0 :3 ], axis = 1 )
498
+ #now cross reference with busco, as we want this for phylogeny
499
+ keep = []
500
+ sc_buscos = []
501
+ for index , row in sco_hits .iterrows ():
502
+ busco_check = []
503
+ for i in range (0 , num_species ):
504
+ if row [i ] in busco [i ]:
505
+ busco_check .append (busco [i ].get (row [i ]))
506
+ busco_check = lib .flatten (busco_check )
507
+ #need to check if outgroup is passed and this model exists in that outgroup
508
+ if len (set (busco_check )) == 1 :
509
+ if args .outgroup :
510
+ available_busco = []
511
+ with open (outgroup_species , 'rU' ) as outfasta :
512
+ for line in outfasta :
513
+ if line .startswith ('>' ):
514
+ line = line .replace ('\n ' , '' )
515
+ name = line .replace ('>' , '' )
516
+ available_busco .append (name )
517
+ if busco_check [0 ] in available_busco :
518
+ keep .append (index )
519
+ sc_buscos .append (busco_check [0 ])
520
+ else :
521
+ keep .append (index )
522
+ sco_final = sco_hits .ix [keep ]
523
+
524
+ #take dataframe and output the ortholog table.
525
+ dftrim = df .drop (df .columns [0 :3 ], axis = 1 ) #trim down to just gene models
526
+ orthdf = df [(df ['# Species' ] > 1 )] #get rid of singletons in this dataset
527
+ orth_hits = orthdf .drop (orthdf .columns [0 :3 ], axis = 1 ) #trim to just gene models
528
+
471
529
orthologs = os .path .join (args .out , 'annotations' ,'orthology_groups.txt' )
472
530
with open (orthologs , 'w' ) as output :
473
- with open (os .path .join (args .out , 'protortho' , 'funannotate.poff' ), 'rU' ) as input :
474
- count = 0
475
- scoCount = 0
476
- for line in input :
477
- line = line .replace ('\n ' , '' ) #strip line ending
478
- if line .startswith ('#' ):
479
- header = line
480
- species = header .split ('\t ' )[3 :]
481
- num_species = header .count ('\t ' ) - 2
482
- continue
483
- col = re .split (r'[,\t]' , line )
484
- if col [0 ] != '1' :
485
- count += 1
486
- ID = 'orth' + str (count )
487
- prots = col [3 :]
488
- prots = [x for x in prots if x != '*' ]
489
- eggs = []
490
- buscos = []
491
- for i in prots :
492
- hit = EGGNOG .get (i )
493
- if not hit in eggs :
494
- eggs .append (hit )
495
- hit2 = busco .get (i )
496
- if not hit2 in buscos :
497
- buscos .append (hit2 )
498
- eggs = [x for x in eggs if x is not None ]
499
- buscos = [x for x in buscos if x is not None ]
500
- buscos = lib .flatten (buscos )
501
- if len (eggs ) > 0 :
502
- eggs = ', ' .join (str (v ) for v in eggs )
503
- else :
504
- eggs = 'None'
505
- if len (buscos ) > 0 :
506
- buscos = set (buscos )
507
- buscos = ', ' .join (str (v ) for v in buscos )
508
- else :
509
- buscos = 'None'
510
- if col [0 ] == str (num_species ) and col [1 ] == str (num_species ):
511
- scoCount += 1
512
- output .write ("%s\t %s\t %s\t %s\n " % (ID , eggs , buscos , ', ' .join (prots )))
531
+ #should be able to parse the pandas ortho dataframe now
532
+ for index , row in orth_hits .iterrows ():
533
+ ID = 'orth' + str (index )
534
+ buscos = []
535
+ eggs = []
536
+ proteins = []
537
+ for x in range (0 , len (row )):
538
+ if row [x ] != '*' :
539
+ prots = row [x ].split (',' )
540
+ for y in prots :
541
+ proteins .append (y )
542
+ egghit = eggnog [x ].get (y )
543
+ if not egghit in eggs :
544
+ eggs .append (egghit )
545
+ buscohit = busco [x ].get (y )
546
+ if not buscohit in buscos :
547
+ buscos .append (buscohit )
548
+ #clean up the None's that get added
549
+ eggs = [x for x in eggs if x is not None ]
550
+ buscos = [x for x in buscos if x is not None ]
551
+ buscos = lib .flatten (buscos )
552
+
553
+ #write to output
554
+ if len (eggs ) > 0 :
555
+ eggs = ', ' .join (str (v ) for v in eggs )
556
+ else :
557
+ eggs = 'None'
558
+ if len (buscos ) > 0 :
559
+ buscos = set (buscos )
560
+ buscos = ', ' .join (str (v ) for v in buscos )
561
+ else :
562
+ buscos = 'None'
563
+ output .write ("%s\t %s\t %s\t %s\n " % (ID , eggs , buscos , ', ' .join (proteins )))
513
564
514
565
if not os .path .isdir (os .path .join (args .out , 'stats' )):
515
566
os .makedirs (os .path .join (args .out , 'stats' ))
516
567
summary = []
517
- for i in stats :
518
- try :
519
- singles = lib .singletons (os .path .join (args .out , 'protortho' , 'funannotate.poff' ), i [0 ])
520
- except IOError :
521
- singles = 0
522
- i .append ("{0:,}" .format (singles ))
523
- try :
524
- orthos = lib .orthologs (os .path .join (args .out , 'protortho' , 'funannotate.poff' ), i [0 ])
525
- except IOError :
526
- orthos = 0
527
- i .append ("{0:,}" .format (orthos ))
528
- i .append ("{0:,}" .format (scoCount ))
529
- summary .append (i )
568
+ #get stats, this is all single copy orthologs
569
+ scoCount = len (sco_hits )
570
+ for i in range (0 , len (stats )):
571
+ orthos = 0
572
+ for index , row in orth_hits [scinames [i ]].iteritems ():
573
+ if row != '*' :
574
+ add = row .count (',' ) + 1
575
+ orthos += add
576
+ singletons = 0
577
+ for index , row in dftrim .iterrows ():
578
+ if row [scinames [i ]] != '*' :
579
+ others = []
580
+ for y in range (0 , len (row )):
581
+ others .append (row [y ])
582
+ others = set (others )
583
+ if len (others ) == 2 :
584
+ singletons += 1
585
+ stats [i ].append ("{0:,}" .format (singletons ))
586
+ stats [i ].append ("{0:,}" .format (orthos ))
587
+ stats [i ].append ("{0:,}" .format (scoCount ))
588
+ summary .append (stats [i ])
589
+
530
590
#convert to dataframe for easy output
531
591
header = ['species' , 'isolate' , 'Assembly Size' , 'Largest Scaffold' , 'Average Scaffold' , 'Num Scaffolds' , 'Scaffold N50' , 'Percent GC' , 'Num Genes' , 'Num Proteins' , 'Num tRNA' , 'Unique Proteins' , 'Prots atleast 1 ortholog' , 'Single-copy orthologs' ]
532
592
df = pd .DataFrame (summary , columns = header )
@@ -539,6 +599,7 @@ def __init__(self,prog):
539
599
output .write (df .transpose ().to_html (classes = 'table table-condensed' ))
540
600
output .write (lib .FOOTER )
541
601
############################################
602
+
542
603
######summarize all annotation for each gene in a table
543
604
lib .log .info ("Compiling all annotations for each genome" )
544
605
@@ -574,14 +635,13 @@ def __init__(self,prog):
574
635
meropsDict = lib .dictFlip (merops )
575
636
cazyDict = lib .dictFlip (cazy )
576
637
577
-
578
638
table = []
579
639
header = ['GeneID' ,'length' ,'description' , 'Ortho Group' , 'EggNog' , 'BUSCO' ,'Protease family' , 'CAZyme family' , 'InterPro Domains' , 'PFAM Domains' , 'GO terms' , 'SecMet Cluster' , 'SMCOG' ]
580
- for i in range (0 ,num_input ):
581
- outputname = os .path .join (args .out , 'annotations' , stats [ i ][ 0 ]. replace ( ' ' , '_' ) + '.all.annotations.tsv' )
640
+ for y in range (0 ,num_input ):
641
+ outputname = os .path .join (args .out , 'annotations' , scinames [ y ] + '.all.annotations.tsv' )
582
642
with open (outputname , 'w' ) as output :
583
643
output .write ("%s\n " % ('\t ' .join (header )))
584
- with open (gbkfilenames [i ], 'rU' ) as input :
644
+ with open (gbkfilenames [y ], 'rU' ) as input :
585
645
SeqRecords = SeqIO .parse (input , 'genbank' )
586
646
for record in SeqRecords :
587
647
for f in record .features :
@@ -608,8 +668,8 @@ def __init__(self,prog):
608
668
cazydomains = "; " .join (cazyDict .get (ID ))
609
669
else :
610
670
cazydomains = ''
611
- if ID in busco :
612
- buscogroup = busco .get (ID )[0 ]
671
+ if ID in busco [ y ] :
672
+ buscogroup = busco [ y ] .get (ID )[0 ]
613
673
else :
614
674
buscogroup = ''
615
675
if ID in goDict :
@@ -635,6 +695,7 @@ def __init__(self,prog):
635
695
final_result = [ID , str (length ), description , orthogroup , egg , buscogroup , meropsdomains , cazydomains , IPRdomains , pfamdomains , goTerms , cluster , smcog ]
636
696
output .write ("%s\n " % ('\t ' .join (final_result )))
637
697
############################################
698
+
638
699
#build phylogeny
639
700
if not os .path .isfile (os .path .join (args .out , 'phylogeny' , 'RAxML.phylogeny.pdf' )):
640
701
if outgroup :
@@ -643,7 +704,8 @@ def __init__(self,prog):
643
704
num_phylogeny = len (args .input )
644
705
if num_phylogeny > 3 :
645
706
lib .log .info ("Inferring phylogeny using RAxML" )
646
- lib .ortho2phylogeny (os .path .join (args .out , 'protortho' , 'funannotate.poff' ), args .num_orthos , busco , args .cpus , args .bootstrap , phylogeny , outgroup , outgroup_species , outgroup_name )
707
+ folder = os .path .join (args .out , 'protortho' )
708
+ lib .ortho2phylogeny (folder , sco_final , args .num_orthos , busco , args .cpus , args .bootstrap , phylogeny , outgroup , outgroup_species , outgroup_name , sc_buscos )
647
709
else :
648
710
lib .log .info ("Skipping RAxML phylogeny as at least 4 taxa are required" )
649
711
with open (os .path .join (args .out ,'phylogeny.html' ), 'w' ) as output :
0 commit comments