-
Notifications
You must be signed in to change notification settings - Fork 6
Expand file tree
/
Copy pathrank_taxa_by_nuc_compo.py
More file actions
104 lines (95 loc) · 3.04 KB
/
rank_taxa_by_nuc_compo.py
File metadata and controls
104 lines (95 loc) · 3.04 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
#a method for ranking sites in an alignment according to GC bias, for filtering purposes. Inspired by the Munoz-Gomez et al. (2018) zed score for amino acid data
from Bio import SeqIO, AlignIO
import sys, operator
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
highGC = []
lowGC = []
ratios = {}
flat_ratios = []
flat_zeds = []
col_zed = {}
#find high and low GC taxa
seqaln = SeqIO.index(sys.argv[1], "fasta")
scores_by_taxa = {}
for taxon in seqaln:
gc = 0
at = 0
sequence = str(seqaln[taxon].seq)
for char in sequence:
if char in 'GC':
gc += 1
elif char in 'AT':
at += 1
ratio = float(gc)/float(at)
scores_by_taxa[taxon] = ratio
ratios[taxon] = ratio
flat_ratios.append(ratio)
if ratio > 1:
highGC.append(taxon)
else:
lowGC.append(taxon)
#rank by ratio and print out list
sorted_taxa = sorted(ratios.items(), key=operator.itemgetter(1), reverse = True)
just_taxa = []
for taxon in sorted_taxa:
print taxon[0] + "\t" + str(taxon[1])
just_taxa.append(taxon[0])
#np_rat = np.array(flat_ratios)
#sns.distplot(np_rat)
#plt.show()
#now compute Z for each column in the alignment
alignment = AlignIO.read(sys.argv[1], "fasta")
for col in range(len(str(alignment[0].seq))):
GARP_high = 0.0
GARP_low = 0.0
FIMNKY_high = 0.0
FIMNKY_low = 0.0
highGC_taxa = float(len(highGC))
lowGC_taxa = float(len(lowGC))
for rec in alignment:
char = str(rec.seq)[col]
if rec.id in highGC:
if char in 'GC':
GARP_high += 1.0/highGC_taxa
elif char in 'AT':
FIMNKY_high += 1.0/highGC_taxa
elif rec.id in lowGC:
if char in 'GC':
GARP_low += 1.0/lowGC_taxa
elif char in 'AT':
FIMNKY_low += 1.0/lowGC_taxa
else:
print "Taxon " + str(rec.id) + " couldn't be assigned to high or low GC..."
quit()
zed = FIMNKY_low - FIMNKY_high + GARP_high - GARP_low
flat_zeds.append(zed)
print str(col) + "\t" + str(zed) + "\t" + str(FIMNKY_low) + "\t" + str(FIMNKY_high) + "\t" + str(GARP_high) + "\t" + str(GARP_low)
print alignment[:, col]
col_zed[col] = zed
#np_z = np.array(flat_zeds)
#sns.distplot(np_z)
#plt.show()
#now write alignments in which top X% of sites by zed have been removed - say 50% for now
sorted_zeds = sorted(col_zed.items(), key=operator.itemgetter(1))
num_cols = len(sorted_zeds)
cols_to_take = int(float(num_cols)/2.0)
print cols_to_take
selected_cols = []
for i in range(cols_to_take):
selected_cols.append(sorted_zeds[i][0])
# for rec in seqaln:
# print rec + "\t" + str(seqaln[rec].seq)[i]
#print sites for most negative zeds
#print alignment[:, selected_cols[0]]
outfile = sys.argv[2]
outh = open(outfile, "w")
for rec in seqaln:
seq_to_print = ''
for col in selected_cols:
seq_to_print += str(seqaln[rec].seq)[col]
outh.write(">" + str(rec) + "\n" + str(seq_to_print) + "\n")
outh.close()