Skip to content

Commit 697dd60

Browse files
committed
Add maybe_stop_codon and DnaDisambiguation
1 parent ad25ea7 commit 697dd60

7 files changed

Lines changed: 389 additions & 11 deletions

File tree

CHANGELOG.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,11 @@ is roughly based on [Keep a Changelog], and this project tries to adheres to
66

77
## [0.0.29] - TBD
88

9+
### Added
10+
11+
- Adds `DnaDisambiguation` for checking whether an ambiguous residue potentially codes for an A, C, G, or T
12+
- Adds `maybe_stop_codon` to check whether a codon with ambiguous residues potentially codes for a stop codon
13+
914
### Changed
1015

1116
- Optional auxilary fields in SAM files `SamTags` is now `SamAuxRaw`.
Lines changed: 178 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,178 @@
1+
/// A look-up table for disambiguating IUPAC nucleotides.
2+
///
3+
/// See the [IUPAC definitions](https://www.bioinformatics.org/sms/iupac.html)
4+
/// for more details.
5+
pub struct DnaDisambiguation;
6+
7+
impl DnaDisambiguation {
8+
/// The look-up table used for disambiguation.
9+
const TABLE: &[u8; 256] = &IUPAC_DISAMBIGUATION;
10+
11+
/// Returns whether the codon potentially codes for a stop codon under the
12+
/// standard genetic code (`TAA`, `TAG`, or `TGA`).
13+
///
14+
/// Note that `NNN` returns `true`.
15+
#[must_use]
16+
pub fn maybe_std_stop_codon(codon: [u8; 3]) -> bool {
17+
// Check that first base is possibly `T`, second base is possibly `A` or
18+
// `G`, third base is possibly `A` or `G`, and either the second or
19+
// third base is possibly `A`.
20+
Self::maybe_t(codon[0]) && {
21+
let encoded1 = Self::TABLE[codon[1] as usize];
22+
let encoded2 = Self::TABLE[codon[2] as usize];
23+
(encoded1 & (MAYBE_A | MAYBE_G) > 0)
24+
&& (encoded2 & (MAYBE_A | MAYBE_G) > 0)
25+
&& ((encoded1 | encoded2) & MAYBE_A > 0)
26+
}
27+
}
28+
29+
/// Returns whether the `base` potentially represents `A`.
30+
#[must_use]
31+
pub fn maybe_a(base: u8) -> bool {
32+
Self::TABLE[base as usize] & MAYBE_A > 0
33+
}
34+
35+
/// Returns whether the `base` potentially represents `C`.
36+
#[must_use]
37+
pub fn maybe_c(base: u8) -> bool {
38+
Self::TABLE[base as usize] & MAYBE_C > 0
39+
}
40+
41+
/// Returns whether the `base` potentially represents `G`.
42+
#[must_use]
43+
pub fn maybe_g(base: u8) -> bool {
44+
Self::TABLE[base as usize] & MAYBE_G > 0
45+
}
46+
47+
/// Returns whether the `base` potentially represents `T`.
48+
#[must_use]
49+
pub fn maybe_t(base: u8) -> bool {
50+
Self::TABLE[base as usize] & MAYBE_T > 0
51+
}
52+
53+
/// Returns whether the `base` potentially represents `A` or `C`.
54+
#[must_use]
55+
pub fn maybe_ac(base: u8) -> bool {
56+
Self::TABLE[base as usize] & (MAYBE_A | MAYBE_C) > 0
57+
}
58+
59+
/// Returns whether the `base` potentially represents `A` or `G`.
60+
#[must_use]
61+
pub fn maybe_ag(base: u8) -> bool {
62+
Self::TABLE[base as usize] & (MAYBE_A | MAYBE_G) > 0
63+
}
64+
65+
/// Returns whether the `base` potentially represents `A` or `T`.
66+
#[must_use]
67+
pub fn maybe_at(base: u8) -> bool {
68+
Self::TABLE[base as usize] & (MAYBE_A | MAYBE_T) > 0
69+
}
70+
71+
/// Returns whether the `base` potentially represents `C` or `G`.
72+
#[must_use]
73+
pub fn maybe_cg(base: u8) -> bool {
74+
Self::TABLE[base as usize] & (MAYBE_C | MAYBE_G) > 0
75+
}
76+
77+
/// Returns whether the `base` potentially represents `C` or `T`.
78+
#[must_use]
79+
pub fn maybe_ct(base: u8) -> bool {
80+
Self::TABLE[base as usize] & (MAYBE_C | MAYBE_T) > 0
81+
}
82+
83+
/// Returns whether the `base` potentially represents `G` or `T`.
84+
#[must_use]
85+
pub fn maybe_gt(base: u8) -> bool {
86+
Self::TABLE[base as usize] & (MAYBE_G | MAYBE_T) > 0
87+
}
88+
89+
/// Returns whether the `base` potentially represents `A`, `C`, or `G`.
90+
#[must_use]
91+
pub fn maybe_acg(base: u8) -> bool {
92+
Self::TABLE[base as usize] & (MAYBE_A | MAYBE_C | MAYBE_G) > 0
93+
}
94+
95+
/// Returns whether the `base` potentially represents `A`, `C`, or `T`.
96+
#[must_use]
97+
pub fn maybe_act(base: u8) -> bool {
98+
Self::TABLE[base as usize] & (MAYBE_A | MAYBE_C | MAYBE_T) > 0
99+
}
100+
101+
/// Returns whether the `base` potentially represents `A`, `G`, or `T`.
102+
#[must_use]
103+
pub fn maybe_agt(base: u8) -> bool {
104+
Self::TABLE[base as usize] & (MAYBE_A | MAYBE_G | MAYBE_T) > 0
105+
}
106+
107+
/// Returns whether the `base` potentially represents `A`, `G`, or `T`.
108+
#[must_use]
109+
pub fn maybe_cgt(base: u8) -> bool {
110+
Self::TABLE[base as usize] & (MAYBE_C | MAYBE_G | MAYBE_T) > 0
111+
}
112+
113+
/// Returns whether the `base` is valid IUPAC, and hence represents either
114+
/// `A`, `C`, `G`, or `T`.
115+
#[must_use]
116+
pub fn maybe_acgt(base: u8) -> bool {
117+
Self::TABLE[base as usize] > 0
118+
}
119+
}
120+
121+
/// The bit used to represent an IUPAC code potentially representing `A`.
122+
const MAYBE_A: u8 = 0b0001;
123+
/// The bit used to represent an IUPAC code potentially representing `C`.
124+
const MAYBE_C: u8 = 0b0010;
125+
/// The bit used to represent an IUPAC code potentially representing `G`.
126+
const MAYBE_G: u8 = 0b0100;
127+
/// The bit used to represent an IUPAC code potentially representing `T`.
128+
const MAYBE_T: u8 = 0b1000;
129+
130+
/// The look-up table used in [`DnaDisambiguation`], where each index represents
131+
/// a byte and each value has bits set to indicate which bases it might
132+
/// represent.
133+
const IUPAC_DISAMBIGUATION: [u8; 256] = {
134+
/// A helper function for constructing [`IUPAC_DISAMBIGUATION`], which sets
135+
/// the value for the uppercase and lowercase `byte`.
136+
const fn fill(byte: u8, to: &[u8], out: &mut [u8; 256]) {
137+
let lower = byte.to_ascii_lowercase();
138+
let upper = byte.to_ascii_uppercase();
139+
140+
let mut to_encoded = 0;
141+
let mut i = 0;
142+
while i < to.len() {
143+
let bit = match to[i] {
144+
b'A' => MAYBE_A,
145+
b'C' => MAYBE_C,
146+
b'G' => MAYBE_G,
147+
b'T' => MAYBE_T,
148+
_ => panic!("Invalid byte found"),
149+
};
150+
to_encoded |= bit;
151+
i += 1;
152+
}
153+
154+
out[lower as usize] = to_encoded;
155+
out[upper as usize] = to_encoded;
156+
}
157+
158+
let mut out = [0; 256];
159+
160+
fill(b'A', b"A", &mut out);
161+
fill(b'C', b"C", &mut out);
162+
fill(b'G', b"G", &mut out);
163+
fill(b'T', b"T", &mut out);
164+
fill(b'U', b"T", &mut out);
165+
fill(b'R', b"AG", &mut out);
166+
fill(b'Y', b"CT", &mut out);
167+
fill(b'S', b"GC", &mut out);
168+
fill(b'W', b"AT", &mut out);
169+
fill(b'K', b"GT", &mut out);
170+
fill(b'M', b"AC", &mut out);
171+
fill(b'B', b"CGT", &mut out);
172+
fill(b'D', b"AGT", &mut out);
173+
fill(b'H', b"ACT", &mut out);
174+
fill(b'V', b"ACG", &mut out);
175+
fill(b'N', b"ACGT", &mut out);
176+
177+
out
178+
};

src/data/constants/mappings/gc.rs

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
use crate::data::mappings::disambiguation::DnaDisambiguation;
2+
13
macro_rules! fill_std_gc {
24
($( $key: expr => $val: expr ),*) => {{
35
let mut map = StdGeneticCode::new_raw_table();
@@ -105,13 +107,13 @@ impl StdGeneticCode {
105107
Self::get(codon).unwrap_or(b'X')
106108
}
107109

108-
/// Determine whether the provided codon is a stop codon.
110+
/// Determines whether the provided codon is a stop codon.
109111
///
110112
/// Any additional elements past the first three are ignored and partial
111113
/// codons are considered `false`. Any ambiguous codons must always
112114
/// translate to a stop codon for this to return `true`.
113-
#[must_use]
114115
#[inline]
116+
#[must_use]
115117
pub fn is_stop_codon(c: &[u8]) -> bool {
116118
c.len() > 2
117119
&& matches!(
@@ -124,6 +126,20 @@ impl StdGeneticCode {
124126
)
125127
}
126128

129+
/// Returns whether the codon potentially codes for a stop codon under the
130+
/// standard genetic code (`TAA`, `TAG`, or `TGA`).
131+
///
132+
/// Note that `NNN` returns `true`.
133+
#[inline]
134+
#[must_use]
135+
pub fn maybe_stop_codon(c: &[u8]) -> bool {
136+
if let Some(codon) = c.first_chunk() {
137+
DnaDisambiguation::maybe_std_stop_codon(*codon)
138+
} else {
139+
false
140+
}
141+
}
142+
127143
/// Initializes the [`StdGeneticCode`] perfect hashmap such that no entries
128144
/// are populated.
129145
#[must_use]

src/data/constants/mappings/mod.rs

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,8 @@
66
77
pub(crate) mod aa;
88
pub(crate) mod byte_index;
9-
mod dna;
9+
mod disambiguation;
10+
pub(crate) mod dna;
1011
pub(crate) mod gc;
1112
mod validator;
1213

@@ -15,6 +16,7 @@ mod test;
1516

1617
pub use aa::*;
1718
pub use byte_index::*;
19+
pub use disambiguation::*;
1820
pub use dna::*;
1921
pub use gc::*;
2022
pub use validator::*;

0 commit comments

Comments
 (0)