-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathutils.py
More file actions
277 lines (225 loc) · 9.22 KB
/
utils.py
File metadata and controls
277 lines (225 loc) · 9.22 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
import json
import os
from itertools import chain
import Bio.Data.CodonTable as ct
from scipy.stats import gmean
from collections import Counter
# get rid of Biopython warning
import warnings
from Bio import BiopythonWarning
warnings.simplefilter("ignore", BiopythonWarning)
def count_nucs_sliding(rand_nucs: str, seq: str = None, nucs_count: int = 3, frame: int = 0, on='variant') -> dict:
"""
Count how many `nucs_count` length nucs and which appear
in the `seq` sequence in a sliding mode
"""
counter = {on: seq or rand_nucs}
rand_nucs = rand_nucs[frame:]
for i in range(0, len(rand_nucs) - nucs_count + 1):
nucs = rand_nucs[i:i + nucs_count]
cn = nucs + '_count' + str(frame if frame != 0 else '')
if cn in counter:
counter[cn] += 1
else:
counter[cn] = 1
return counter
def count_codons(sequence, is_sliding=True, codon_length=3):
"""
Count how many of each codon appear in the sequence either
in a sliding mode or in a slicing mode (is_sliding=False)
"""
assert len(sequence) > 3
d = {}
i = 0
while i <= len(sequence) - codon_length:
codon = sequence[i:i + codon_length]
if codon in d:
d[codon] += 1
else:
d[codon] = 1
if is_sliding:
i += 1
else:
i += 3
return d
def apply_counts(row):
"""
Function used to apply func::count_codons on the variants dataframe
for each of its parts (variant, prefix, rand_nucs, suffix) in several frames
and also in a sliding mode
"""
for cn in ['variant', 'prefix', 'rand_nucs', 'suffix']:
for method in ['sliding', 'slicing_frame0', 'slicing_frame1', 'slicing_frame2']:
is_slicing = method.startswith('slicing')
string_to_work_on = row[cn]
if is_slicing:
string_to_work_on = string_to_work_on[int(method[-1]):]
frequency_counts_dictionary = count_codons(string_to_work_on, is_slicing)
for k, v in frequency_counts_dictionary.items():
row[f"{cn}_{method}_{k}_count"] = v
return row
BASE_FOLDER = 'Alpha'
fpath = '../' if not os.getcwd().endswith(BASE_FOLDER) else ''
with open(fpath + 'data/synonymous_codons.json') as f:
_synonymous_codons = json.load(f)
_non_synonymous_codons = {'ATG', 'TGG'}
def RSCU(sequences, genetic_code=11):
r"""Calculates the relative synonymous codon usage (RSCU) for a set of sequences.
RSCU is 'the observed frequency of [a] codon divided by the frequency
expected under the assumption of equal usage of the synonymous codons for an
amino acid' (page 1283).
In math terms, it is
.. math::
\frac{X_{ij}}{\frac{1}{n_i}\sum_{j=1}^{n_i}x_{ij}}
"where :math:`X` is the number of occurrences of the :math:`j` th codon for
the :math:`i` th amino acid, and :math:`n` is the number (from one to six)
of alternative codons for the :math:`i` th amino acid" (page 1283).
Args:
sequences (list): The reference set of sequences.
genetic_code (int, optional): The translation table to use. Defaults to 11, the standard genetic code.
Returns:
dict: The relative synonymous codon usage.
Raises:
ValueError: When an invalid sequence is provided or a list is not provided.
"""
if not isinstance(sequences, (list, tuple)):
raise ValueError(
"Be sure to pass a list of sequences, not a single sequence. "
"To find the RSCU of a single sequence, pass it as a one element list."
)
# ensure all input sequences are divisible by three
for sequence in sequences:
if len(sequence) % 3 != 0:
raise ValueError("Input sequence not divisible by three")
if not sequence:
raise ValueError("Input sequence cannot be empty")
# count the number of each codon in the sequences
sequences = (
(sequence[i: i + 3].upper() for i in range(0, len(sequence), 3))
for sequence in sequences
)
codons = chain.from_iterable(
sequences
) # flat list of all codons (to be used for counting)
counts = Counter(codons)
# "if a certain codon is never used in the reference set... assign [its
# count] a value of 0.5" (page 1285)
for codon in ct.unambiguous_dna_by_id[genetic_code].forward_table:
if counts[codon] == 0:
counts[codon] = 0.5
# determine the synonymous codons for the genetic code
synonymous_codons = _synonymous_codons
# hold the result as it is being calulated
result = {}
# calculate RSCU values
for codon in ct.unambiguous_dna_by_id[genetic_code].forward_table:
result[codon] = counts[codon] / (
(len(synonymous_codons[codon]) ** -1)
* (sum((counts[_codon] for _codon in synonymous_codons[codon])))
)
return result
def relative_adaptiveness(sequences=None, RSCUs=None, genetic_code=11):
r"""Calculates the relative adaptiveness/weight of codons.
The relative adaptiveness is "the frequency of use of that codon compared to
the frequency of the optimal codon for that amino acid" (page 1283).
In math terms, :math:`w_{ij}`, the weight for the :math:`j` th codon for
the :math:`i` th amino acid is
.. math::
w_{ij} = \frac{\text{RSCU}_{ij}}{\text{RSCU}_{imax}}
where ":math:`\text{RSCU}_{imax}` [is] the RSCU... for the frequently used
codon for the :math:`i` th amino acid" (page 1283).
Args:
sequences (list, optional): The reference set of sequences.
RSCUs (dict, optional): The RSCU of the reference set.
genentic_code (int, optional): The translation table to use. Defaults to 11, the standard genetic code.
Note:
Either ``sequences`` or ``RSCUs`` is required.
Returns:
dict: A mapping between each codon and its weight/relative adaptiveness.
Raises:
ValueError: When neither ``sequences`` nor ``RSCUs`` is provided.
ValueError: See :func:`RSCU` for details.
"""
# ensure user gave only and only one input
if sum([bool(sequences), bool(RSCUs)]) != 1:
raise TypeError("Must provide either reference sequences or RSCU dictionary")
# calculate the RSCUs if only given sequences
if sequences:
RSCUs = RSCU(sequences, genetic_code=genetic_code)
# determine the synonymous codons for the genetic code
synonymous_codons = _synonymous_codons
# calculate the weights
weights = {}
for codon in RSCUs:
weights[codon] = RSCUs[codon] / max(
(RSCUs[_codon] for _codon in synonymous_codons[codon])
)
return weights
def CAI(sequence, weights):
"""CAI Index calculation"""
sequence = sequence.upper()
sequence = [sequence[i: i + 3] for i in range(0, len(sequence), 3)]
sequence_weights = []
for codon in sequence:
if codon not in _non_synonymous_codons:
try:
sequence_weights.append(weights[codon])
except KeyError:
# ignore stop codons
if codon in ct.unambiguous_dna_by_id[11].stop_codons:
pass
# return the geometric mean of the weights raised to one over the length of the sequence
return float(gmean(sequence_weights))
def TAI(sequence, weights):
"""TAI Index is the same as CAI calculation except for end codon being irrelevant"""
return CAI(sequence[:-3], weights)
codons_amino_acid_table = {
'ATA': 'I', 'ATC': 'I', 'ATT': 'I', 'ATG': 'M',
'ACA': 'T', 'ACC': 'T', 'ACG': 'T', 'ACT': 'T',
'AAC': 'N', 'AAT': 'N', 'AAA': 'K', 'AAG': 'K',
'AGC': 'S', 'AGT': 'S', 'AGA': 'R', 'AGG': 'R',
'CTA': 'L', 'CTC': 'L', 'CTG': 'L', 'CTT': 'L',
'CCA': 'P', 'CCC': 'P', 'CCG': 'P', 'CCT': 'P',
'CAC': 'H', 'CAT': 'H', 'CAA': 'Q', 'CAG': 'Q',
'CGA': 'R', 'CGC': 'R', 'CGG': 'R', 'CGT': 'R',
'GTA': 'V', 'GTC': 'V', 'GTG': 'V', 'GTT': 'V',
'GCA': 'A', 'GCC': 'A', 'GCG': 'A', 'GCT': 'A',
'GAC': 'D', 'GAT': 'D', 'GAA': 'E', 'GAG': 'E',
'GGA': 'G', 'GGC': 'G', 'GGG': 'G', 'GGT': 'G',
'TCA': 'S', 'TCC': 'S', 'TCG': 'S', 'TCT': 'S',
'TTC': 'F', 'TTT': 'F', 'TTA': 'L', 'TTG': 'L',
'TAC': 'Y', 'TAT': 'Y', 'TAA': '_', 'TAG': '_',
'TGC': 'C', 'TGT': 'C', 'TGA': '_', 'TGG': 'W',
}
# Amino acid degeneracy
degeneracy = {2: ['M'], 9: ['N', 'D', 'C', 'Q', 'E', 'H', 'K', 'F', 'Y'], 1: ['I'], 5: ['A', 'G', 'P', 'T', 'V'],
3: ['S', 'L', 'R']}
def get_deg(aa):
for k, v in degeneracy.items():
if aa in v:
return k
def ENC(sequence: str) -> int:
"""ENC Index -- /shrug"""
amino_acids = {}
for i in range(0, len(sequence), 3):
codon = sequence[i:i + 3]
amino_acid = codons_amino_acid_table[codon]
# appearTimes += 1
if amino_acid in amino_acids:
amino_acids[amino_acid]['appearTimes'] += 1
else:
amino_acids[amino_acid] = {'codons': {}, 'appearTimes': 1}
# codon in aa += 1
if codon in amino_acids[amino_acid]['codons']:
amino_acids[amino_acid]['codons'][codon] += 1
else:
amino_acids[amino_acid]['codons'][codon] = 1
# calculate Pi
for aa, amino_data in amino_acids.items():
for codon in amino_data['codons'].keys():
amino_acids[aa]['codons'][codon] /= amino_acids[aa]['appearTimes']
# calculate F
for aa, amino_data in amino_acids.items():
amino_acids[aa]['F'] = sum(v ** 2 for v in amino_acids[aa]['codons'].values())
# calculate ENC
return sum(get_deg(k) / (1 / v['F']) for k, v in amino_acids.items() if k not in ['W', '_'])