Alpha/utils.py at main · VerifyBot/Alpha · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
import json
import os
from itertools import chain
import Bio.Data.CodonTable as ct
from scipy.stats import gmean
from collections import Counter

# get rid of Biopython warning
import warnings
from Bio import BiopythonWarning

warnings.simplefilter("ignore", BiopythonWarning)


def count_nucs_sliding(rand_nucs: str, seq: str = None, nucs_count: int = 3, frame: int = 0, on='variant') -> dict:
  """
  Count how many `nucs_count` length nucs and which appear
  in the `seq` sequence in a sliding mode
  """
  counter = {on: seq or rand_nucs}

  rand_nucs = rand_nucs[frame:]

  for i in range(0, len(rand_nucs) - nucs_count + 1):
    nucs = rand_nucs[i:i + nucs_count]
    cn = nucs + '_count' + str(frame if frame != 0 else '')

    if cn in counter:
      counter[cn] += 1
    else:
      counter[cn] = 1

  return counter


def count_codons(sequence, is_sliding=True, codon_length=3):
  """
  Count how many of each codon appear in the sequence either
  in a sliding mode or in a slicing mode (is_sliding=False)
  """
  assert len(sequence) > 3

  d = {}
  i = 0
  while i <= len(sequence) - codon_length:
    codon = sequence[i:i + codon_length]
    if codon in d:
      d[codon] += 1
    else:
      d[codon] = 1
    if is_sliding:
      i += 1
    else:
      i += 3
  return d


def apply_counts(row):
  """
  Function used to apply func::count_codons on the variants dataframe
  for each of its parts (variant, prefix, rand_nucs, suffix) in several frames
  and also in a sliding mode
  """

  for cn in ['variant', 'prefix', 'rand_nucs', 'suffix']:
    for method in ['sliding', 'slicing_frame0', 'slicing_frame1', 'slicing_frame2']:
      is_slicing = method.startswith('slicing')
      string_to_work_on = row[cn]
      if is_slicing:
        string_to_work_on = string_to_work_on[int(method[-1]):]
      frequency_counts_dictionary = count_codons(string_to_work_on, is_slicing)
      for k, v in frequency_counts_dictionary.items():
        row[f"{cn}_{method}_{k}_count"] = v
  return row


BASE_FOLDER = 'Alpha'
fpath = '../' if not os.getcwd().endswith(BASE_FOLDER) else ''
with open(fpath + 'data/synonymous_codons.json') as f:
  _synonymous_codons = json.load(f)
_non_synonymous_codons = {'ATG', 'TGG'}


def RSCU(sequences, genetic_code=11):
  r"""Calculates the relative synonymous codon usage (RSCU) for a set of sequences.
  RSCU is 'the observed frequency of [a] codon divided by the frequency
  expected under the assumption of equal usage of the synonymous codons for an
  amino acid' (page 1283).
  In math terms, it is
  .. math::
      \frac{X_{ij}}{\frac{1}{n_i}\sum_{j=1}^{n_i}x_{ij}}
  "where :math:`X` is the number of occurrences of the :math:`j` th codon for
  the :math:`i` th amino acid, and :math:`n` is the number (from one to six)
  of alternative codons for the :math:`i` th amino acid" (page 1283).
  Args:
      sequences (list): The reference set of sequences.
      genetic_code (int, optional): The translation table to use. Defaults to 11, the standard genetic code.
  Returns:
      dict: The relative synonymous codon usage.
  Raises:
      ValueError: When an invalid sequence is provided or a list is not provided.
  """

  if not isinstance(sequences, (list, tuple)):
    raise ValueError(
      "Be sure to pass a list of sequences, not a single sequence. "
      "To find the RSCU of a single sequence, pass it as a one element list."
    )

  # ensure all input sequences are divisible by three
  for sequence in sequences:
    if len(sequence) % 3 != 0:
      raise ValueError("Input sequence not divisible by three")
    if not sequence:
      raise ValueError("Input sequence cannot be empty")

  # count the number of each codon in the sequences
  sequences = (
    (sequence[i: i + 3].upper() for i in range(0, len(sequence), 3))
    for sequence in sequences
  )
  codons = chain.from_iterable(
    sequences
  )  # flat list of all codons (to be used for counting)
  counts = Counter(codons)

  # "if a certain codon is never used in the reference set... assign [its
  # count] a value of 0.5" (page 1285)
  for codon in ct.unambiguous_dna_by_id[genetic_code].forward_table:
    if counts[codon] == 0:
      counts[codon] = 0.5

  # determine the synonymous codons for the genetic code
  synonymous_codons = _synonymous_codons

  # hold the result as it is being calulated
  result = {}

  # calculate RSCU values
  for codon in ct.unambiguous_dna_by_id[genetic_code].forward_table:
    result[codon] = counts[codon] / (
        (len(synonymous_codons[codon]) ** -1)
        * (sum((counts[_codon] for _codon in synonymous_codons[codon])))
    )

  return result


def relative_adaptiveness(sequences=None, RSCUs=None, genetic_code=11):
  r"""Calculates the relative adaptiveness/weight of codons.
  The relative adaptiveness is "the frequency of use of that codon compared to
  the frequency of the optimal codon for that amino acid" (page 1283).
  In math terms, :math:`w_{ij}`, the weight for the :math:`j` th codon for
  the :math:`i` th amino acid is
  .. math::
      w_{ij} = \frac{\text{RSCU}_{ij}}{\text{RSCU}_{imax}}
  where ":math:`\text{RSCU}_{imax}` [is] the RSCU... for the frequently used
  codon for the :math:`i` th amino acid" (page 1283).
  Args:
      sequences (list, optional): The reference set of sequences.
      RSCUs (dict, optional): The RSCU of the reference set.
      genentic_code (int, optional): The translation table to use. Defaults to 11, the standard genetic code.
  Note:
      Either ``sequences`` or ``RSCUs`` is required.
  Returns:
      dict: A mapping between each codon and its weight/relative adaptiveness.
  Raises:
      ValueError: When neither ``sequences`` nor ``RSCUs`` is provided.
      ValueError: See :func:`RSCU` for details.
  """

  # ensure user gave only and only one input
  if sum([bool(sequences), bool(RSCUs)]) != 1:
    raise TypeError("Must provide either reference sequences or RSCU dictionary")

  # calculate the RSCUs if only given sequences
  if sequences:
    RSCUs = RSCU(sequences, genetic_code=genetic_code)

  # determine the synonymous codons for the genetic code
  synonymous_codons = _synonymous_codons

  # calculate the weights
  weights = {}
  for codon in RSCUs:
    weights[codon] = RSCUs[codon] / max(
      (RSCUs[_codon] for _codon in synonymous_codons[codon])
    )

  return weights


def CAI(sequence, weights):
  """CAI Index calculation"""
  sequence = sequence.upper()
  sequence = [sequence[i: i + 3] for i in range(0, len(sequence), 3)]

  sequence_weights = []
  for codon in sequence:
    if codon not in _non_synonymous_codons:
      try:
        sequence_weights.append(weights[codon])
      except KeyError:
        # ignore stop codons
        if codon in ct.unambiguous_dna_by_id[11].stop_codons:
          pass

  # return the geometric mean of the weights raised to one over the length of the sequence
  return float(gmean(sequence_weights))


def TAI(sequence, weights):
  """TAI Index is the same as CAI calculation except for end codon being irrelevant"""
  return CAI(sequence[:-3], weights)


codons_amino_acid_table = {
  'ATA': 'I', 'ATC': 'I', 'ATT': 'I', 'ATG': 'M',
  'ACA': 'T', 'ACC': 'T', 'ACG': 'T', 'ACT': 'T',
  'AAC': 'N', 'AAT': 'N', 'AAA': 'K', 'AAG': 'K',
  'AGC': 'S', 'AGT': 'S', 'AGA': 'R', 'AGG': 'R',
  'CTA': 'L', 'CTC': 'L', 'CTG': 'L', 'CTT': 'L',
  'CCA': 'P', 'CCC': 'P', 'CCG': 'P', 'CCT': 'P',
  'CAC': 'H', 'CAT': 'H', 'CAA': 'Q', 'CAG': 'Q',
  'CGA': 'R', 'CGC': 'R', 'CGG': 'R', 'CGT': 'R',
  'GTA': 'V', 'GTC': 'V', 'GTG': 'V', 'GTT': 'V',
  'GCA': 'A', 'GCC': 'A', 'GCG': 'A', 'GCT': 'A',
  'GAC': 'D', 'GAT': 'D', 'GAA': 'E', 'GAG': 'E',
  'GGA': 'G', 'GGC': 'G', 'GGG': 'G', 'GGT': 'G',
  'TCA': 'S', 'TCC': 'S', 'TCG': 'S', 'TCT': 'S',
  'TTC': 'F', 'TTT': 'F', 'TTA': 'L', 'TTG': 'L',
  'TAC': 'Y', 'TAT': 'Y', 'TAA': '_', 'TAG': '_',
  'TGC': 'C', 'TGT': 'C', 'TGA': '_', 'TGG': 'W',
}

# Amino acid degeneracy
degeneracy = {2: ['M'], 9: ['N', 'D', 'C', 'Q', 'E', 'H', 'K', 'F', 'Y'], 1: ['I'], 5: ['A', 'G', 'P', 'T', 'V'],
              3: ['S', 'L', 'R']}


def get_deg(aa):
  for k, v in degeneracy.items():
    if aa in v:
      return k


def ENC(sequence: str) -> int:
  """ENC Index -- /shrug"""
  amino_acids = {}

  for i in range(0, len(sequence), 3):
    codon = sequence[i:i + 3]
    amino_acid = codons_amino_acid_table[codon]

    # appearTimes += 1
    if amino_acid in amino_acids:
      amino_acids[amino_acid]['appearTimes'] += 1
    else:
      amino_acids[amino_acid] = {'codons': {}, 'appearTimes': 1}

    # codon in aa += 1
    if codon in amino_acids[amino_acid]['codons']:
      amino_acids[amino_acid]['codons'][codon] += 1
    else:
      amino_acids[amino_acid]['codons'][codon] = 1

  # calculate Pi
  for aa, amino_data in amino_acids.items():
    for codon in amino_data['codons'].keys():
      amino_acids[aa]['codons'][codon] /= amino_acids[aa]['appearTimes']

  # calculate F
  for aa, amino_data in amino_acids.items():
    amino_acids[aa]['F'] = sum(v ** 2 for v in amino_acids[aa]['codons'].values())

  # calculate ENC
  return sum(get_deg(k) / (1 / v['F']) for k, v in amino_acids.items() if k not in ['W', '_'])