-
Notifications
You must be signed in to change notification settings - Fork 6
Expand file tree
/
Copy pathrank_taxa_by_aa_compo.py
More file actions
100 lines (91 loc) · 2.98 KB
/
rank_taxa_by_aa_compo.py
File metadata and controls
100 lines (91 loc) · 2.98 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
#implements the GARP:FIMNKY criterion for comp hetero per taxon suggested by Munoz-Gomez et al. 2018
from Bio import SeqIO, AlignIO
import sys, operator
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
#for this dataset, ratio of about 1 seems to be a reasonable cutoff between low and high GC.
highGC = []
lowGC = []
ratios = {}
flat_ratios = []
flat_zeds = []
col_zed = {}
seqaln = SeqIO.index(sys.argv[1], "fasta")
for taxon in seqaln:
garp = 0
fimnky = 0
sequence = str(seqaln[taxon].seq)
for char in sequence:
if char in 'GARP':
garp += 1
elif char in 'FIMNKY':
fimnky += 1
ratio = float(garp)/float(fimnky)
if ratio > 1:
highGC.append(taxon)
else:
lowGC.append(taxon)
ratios[taxon] = ratio
flat_ratios.append(ratio)
#rank by ratio and print out list
sorted_taxa = sorted(ratios.items(), key=operator.itemgetter(1), reverse = True)
for taxon in sorted_taxa:
print taxon[0] + "\t" + str(taxon[1])
#np_rat = np.array(flat_ratios)
#sns.distplot(np_rat)
#plt.show()
#now compute Z for each column in the alignment
alignment = AlignIO.read(sys.argv[1], "fasta")
for col in range(len(str(alignment[0].seq))):
GARP_high = 0.0
GARP_low = 0.0
FIMNKY_high = 0.0
FIMNKY_low = 0.0
highGC_taxa = float(len(highGC))
lowGC_taxa = float(len(lowGC))
for rec in alignment:
char = str(rec.seq)[col]
if rec.id in highGC:
if char in 'GARP':
GARP_high += 1.0/highGC_taxa
elif char in 'FIMNKY':
FIMNKY_high += 1.0/highGC_taxa
elif rec.id in lowGC:
if char in 'GARP':
GARP_low += 1.0/lowGC_taxa
elif char in 'FIMNKY':
FIMNKY_low += 1.0/lowGC_taxa
else:
print "Taxon " + str(rec.id) + " couldn't be assigned to high or low GC..."
quit()
zed = FIMNKY_low - FIMNKY_high + GARP_high - GARP_low
flat_zeds.append(zed)
print str(col) + "\t" + str(zed) + "\t" + str(FIMNKY_low) + "\t" + str(FIMNKY_high) + "\t" + str(GARP_high) + "\t" + str(GARP_low)
print alignment[:, col]
col_zed[col] = zed
#np_z = np.array(flat_zeds)
#sns.distplot(np_z)
#plt.show()
#now write alignments in which top X% of sites by zed have been removed - say 50% for now
sorted_zeds = sorted(col_zed.items(), key=operator.itemgetter(1))
num_cols = len(sorted_zeds)
cols_to_take = int(float(num_cols)/2.0)
print cols_to_take
selected_cols = []
for i in range(cols_to_take):
selected_cols.append(sorted_zeds[i][0])
for rec in seqaln:
print rec + "\t" + str(seqaln[rec].seq)[i]
#print sites for most negative zeds
#print alignment[:, selected_cols[0]]
#outfile = sys.argv[2]
#outh = open(outfile, "w")
#for rec in seqaln:
# seq_to_print = ''
# for col in selected_cols:
# seq_to_print += str(seqaln[rec].seq)[col]
# outh.write(">" + str(rec) + "\n" + str(seq_to_print) + "\n")
#outh.close()