|
| 1 | +use std::fmt; |
| 2 | + |
/// One of the four bases that can appear in a stored nucleotide sequence.
///
/// The type is a plain `Copy` value, so it is passed around by value.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum Nucleotide {
    /// The `A` base.
    A,
    /// The `C` base.
    C,
    /// The `T` base.
    T,
    /// The `G` base.
    G,
}
| 10 | + |
| 11 | +impl From<u8> for Nucleotide { |
| 12 | + fn from(value: u8) -> Self { |
| 13 | + match value { |
| 14 | + 0 => Self::A, |
| 15 | + 1 => Self::C, |
| 16 | + 2 => Self::T, |
| 17 | + 3 => Self::G, |
| 18 | + _ => panic!("Not a Nucleotide!"), |
| 19 | + } |
| 20 | + } |
| 21 | +} |
| 22 | + |
| 23 | +impl From<Nucleotide> for u8 { |
| 24 | + fn from(value: Nucleotide) -> Self { |
| 25 | + match value { |
| 26 | + Nucleotide::A => 0, |
| 27 | + Nucleotide::C => 1, |
| 28 | + Nucleotide::T => 2, |
| 29 | + Nucleotide::G => 3, |
| 30 | + } |
| 31 | + } |
| 32 | +} |
| 33 | + |
| 34 | +impl From<Nucleotide> for char { |
| 35 | + fn from(value: Nucleotide) -> Self { |
| 36 | + match value { |
| 37 | + Nucleotide::A => 'A', |
| 38 | + Nucleotide::C => 'C', |
| 39 | + Nucleotide::G => 'G', |
| 40 | + Nucleotide::T => 'T', |
| 41 | + } |
| 42 | + } |
| 43 | +} |
| 44 | + |
/// A compressed vector-like structure for storing nucleotide sequences.
///
/// Two elements are stored per byte: the element at an even index occupies
/// the low nibble of a byte and the element at the following odd index
/// occupies the high nibble of the same byte.
#[derive(Debug, Clone)]
pub struct PackedVec {
    /// A vector that stores a compressed encoding of this PackedVec's sequence
    data: Vec<u8>,

    /// True if the final element in the sequence is stored at a high nibble.
    /// Also true for an empty vector, so that the next push starts a new byte.
    high_nibble_end: bool,
}
| 56 | + |
| 57 | +impl PackedVec { |
| 58 | + /// Creates a new empty PackedVec |
| 59 | + pub fn new() -> Self { |
| 60 | + PackedVec { |
| 61 | + data: Vec::new(), |
| 62 | + high_nibble_end: true, |
| 63 | + } |
| 64 | + } |
| 65 | + |
| 66 | + /// Returns a compressed PackedVec given an uncompressed vector `arr` |
| 67 | + pub fn create(arr: Vec<Nucleotide>) -> Self { |
| 68 | + let mut new_vec = PackedVec::new(); |
| 69 | + for item in arr { |
| 70 | + new_vec.push(item); |
| 71 | + } |
| 72 | + new_vec |
| 73 | + } |
| 74 | + |
| 75 | + /// Appends `input` to the end of this PackedVec |
| 76 | + pub fn push(&mut self, input: Nucleotide) { |
| 77 | + let value = input.into(); |
| 78 | + assert!(value <= 0xF); |
| 79 | + if self.high_nibble_end { |
| 80 | + self.data.push(value); |
| 81 | + self.high_nibble_end = false; |
| 82 | + } else { |
| 83 | + let last_index = self.data.len() - 1; |
| 84 | + self.data[last_index] |= value << 4; |
| 85 | + self.high_nibble_end = true; |
| 86 | + } |
| 87 | + } |
| 88 | + |
| 89 | + pub fn len(&self) -> usize { |
| 90 | + if self.high_nibble_end { |
| 91 | + self.data.len() * 2 |
| 92 | + } else { |
| 93 | + self.data.len() * 2 - 1 |
| 94 | + } |
| 95 | + } |
| 96 | + |
| 97 | + pub fn is_empty(&self) -> bool { |
| 98 | + self.data.is_empty() |
| 99 | + } |
| 100 | + |
| 101 | + /// Returns the element of this PackedVec at index `index` |
| 102 | + pub fn get(&self, index: usize) -> Nucleotide { |
| 103 | + let i = index / 2; |
| 104 | + if index % 2 == 1 { |
| 105 | + ((self.data[i] & 0b11110000u8) >> 4).into() |
| 106 | + } else { |
| 107 | + (self.data[i] & 0b00001111u8).into() |
| 108 | + } |
| 109 | + } |
| 110 | + |
| 111 | + /// Sets the element of this PackedVec at index `index` to `elem` |
| 112 | + pub fn set(&mut self, index: usize, input: Nucleotide) { |
| 113 | + let elem: u8 = input.into(); |
| 114 | + let i = index / 2; |
| 115 | + if index % 2 == 1 { |
| 116 | + println!("i: {}", i); |
| 117 | + self.data[i] = (0b00001111u8 & self.data[i]) | (elem << 4); |
| 118 | + } else { |
| 119 | + self.data[i] = (0b11110000u8 & self.data[i]) | elem; |
| 120 | + } |
| 121 | + } |
| 122 | + |
| 123 | + pub fn get_range(&self, span: std::ops::Range<usize>) -> Vec<Nucleotide> { |
| 124 | + let mut arr: Vec<Nucleotide> = Vec::with_capacity(span.end - span.start); |
| 125 | + for i in span.start..=span.end { |
| 126 | + arr.push(self.get(i)); |
| 127 | + } |
| 128 | + arr |
| 129 | + } |
| 130 | + |
| 131 | + /// Returns a uncompressed vector that contains the same sequence as this PackedVec |
| 132 | + pub fn get_elements(&self) -> Vec<Nucleotide> { |
| 133 | + self.get_range(0..(self.len() - 1)) |
| 134 | + } |
| 135 | +} |
| 136 | + |
| 137 | +impl Default for PackedVec { |
| 138 | + fn default() -> Self { |
| 139 | + Self::new() |
| 140 | + } |
| 141 | +} |
| 142 | + |
| 143 | +impl fmt::Display for PackedVec { |
| 144 | + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| 145 | + write!(f, "[")?; |
| 146 | + let mut i = 0; |
| 147 | + for item in PackedVecIterator::new(self) { |
| 148 | + if i == 0 { |
| 149 | + i = 1; |
| 150 | + } else { |
| 151 | + write!(f, ", ")?; |
| 152 | + } |
| 153 | + let c: char = item.into(); |
| 154 | + write!(f, "{}", c)?; |
| 155 | + } |
| 156 | + write!(f, "]") |
| 157 | + } |
| 158 | +} |
| 159 | + |
| 160 | +struct PackedVecIterator<'a> { |
| 161 | + data: &'a PackedVec, |
| 162 | + cur_index: usize, |
| 163 | +} |
| 164 | + |
| 165 | +impl<'a> PackedVecIterator<'a> { |
| 166 | + pub fn new(vec: &'a PackedVec) -> Self { |
| 167 | + Self { |
| 168 | + data: vec, |
| 169 | + cur_index: 0, |
| 170 | + } |
| 171 | + } |
| 172 | +} |
| 173 | + |
| 174 | +impl Iterator for PackedVecIterator<'_> { |
| 175 | + type Item = Nucleotide; |
| 176 | + |
| 177 | + fn next(&mut self) -> Option<Self::Item> { |
| 178 | + if self.cur_index < self.data.len() { |
| 179 | + self.cur_index += 1; |
| 180 | + Some(self.data.get(self.cur_index - 1)) |
| 181 | + } else { |
| 182 | + None |
| 183 | + } |
| 184 | + } |
| 185 | +} |
| 186 | + |
| 187 | +/// A reference to a subsection of a nucleotide sequence stored in a PackedVec |
| 188 | +pub struct PackedSlice<'a> { |
| 189 | + /// The underlying vector that stores the sequence referenced by this slice |
| 190 | + vec_ref: &'a PackedVec, |
| 191 | + |
| 192 | + /// The specific section of the sequence that this slice references |
| 193 | + span: std::ops::Range<usize>, |
| 194 | +} |
| 195 | + |
| 196 | +/// Returns a PackedSlice given a compressed PackVec `vec` that acts as a reference |
| 197 | +/// to the section of `vec` contained within the index bounds of Span `s`. |
| 198 | +pub fn create_slice(vec: &PackedVec, s: std::ops::Range<usize>) -> PackedSlice<'_> { |
| 199 | + PackedSlice { |
| 200 | + vec_ref: vec, |
| 201 | + span: s, |
| 202 | + } |
| 203 | +} |
| 204 | + |
| 205 | +/// Returns a vector containing the base pairs referenced by `slice` |
| 206 | +pub fn get_slice_seq(slice: PackedSlice<'_>) -> Vec<Nucleotide> { |
| 207 | + slice.vec_ref.get_range(slice.span) |
| 208 | +} |
| 209 | + |
/// Builds a vector with `create`, appends one more base with `push`, and
/// checks the first six unpacked elements.
#[test]
fn test_vec() {
    let initial = vec![
        Nucleotide::A,
        Nucleotide::C,
        Nucleotide::G,
        Nucleotide::T,
        Nucleotide::A,
    ];
    let mut vec = PackedVec::create(initial);
    vec.push(Nucleotide::A);

    let expected = [
        Nucleotide::A,
        Nucleotide::C,
        Nucleotide::G,
        Nucleotide::T,
        Nucleotide::A,
        Nucleotide::A,
    ];
    let arr = vec.get_elements();
    for (index, want) in expected.iter().enumerate() {
        assert_eq!(arr[index], *want);
    }
}
| 228 | + |
/// Creates a four-element vector, pushes four more bases, and checks the
/// first eight unpacked elements.
#[test]
fn test_vec_push() {
    let pattern = [
        Nucleotide::A,
        Nucleotide::C,
        Nucleotide::G,
        Nucleotide::T,
    ];
    let mut vec = PackedVec::create(pattern.to_vec());
    for base in pattern.iter() {
        vec.push(*base);
    }

    let arr = vec.get_elements();
    // The expected sequence is the pattern repeated twice.
    for (index, want) in pattern.iter().chain(pattern.iter()).enumerate() {
        assert_eq!(arr[index], *want);
    }
}
| 251 | + |
/// Takes a slice with span `1..4` over a six-element vector and checks that
/// the elements at indices 1 through 4 come back in order.
#[test]
fn test_slice() {
    let vec = PackedVec::create(vec![
        Nucleotide::A,
        Nucleotide::C,
        Nucleotide::G,
        Nucleotide::T,
        Nucleotide::A,
        Nucleotide::G,
    ]);
    let span = 1..4;
    let arr = get_slice_seq(create_slice(&vec, span));

    let expected = [
        Nucleotide::C,
        Nucleotide::G,
        Nucleotide::T,
        Nucleotide::A,
    ];
    for (index, want) in expected.iter().enumerate() {
        assert_eq!(arr[index], *want);
    }
}
| 270 | + |
/// Checks the `Display` output for a vector with an even number of elements.
#[test]
fn test_display_even() {
    let bases = vec![
        Nucleotide::C,
        Nucleotide::A,
        Nucleotide::T,
        Nucleotide::C,
        Nucleotide::G,
        Nucleotide::C,
    ];
    let rendered = PackedVec::create(bases).to_string();
    assert_eq!("[C, A, T, C, G, C]", rendered);
}
| 283 | + |
/// Checks the `Display` output for a vector holding exactly one element.
#[test]
fn test_display_single() {
    let only: Nucleotide = Nucleotide::T;
    let rendered = PackedVec::create(vec![only]).to_string();
    assert_eq!("[T]", rendered);
}
| 289 | + |
/// Checks the `Display` output for a vector with an odd number of elements.
#[test]
fn test_display_odd() {
    let bases = vec![
        Nucleotide::C,
        Nucleotide::A,
        Nucleotide::T,
        Nucleotide::C,
        Nucleotide::G,
        Nucleotide::C,
        Nucleotide::C,
    ];
    let rendered = PackedVec::create(bases).to_string();
    assert_eq!("[C, A, T, C, G, C, C]", rendered);
}
| 303 | + |
/// Reads the first two elements, overwrites the one at index 1 with `set`,
/// and checks that `get` returns the new value.
#[test]
fn test_getter_setter() {
    let bases = vec![
        Nucleotide::A,
        Nucleotide::A,
        Nucleotide::T,
        Nucleotide::C,
        Nucleotide::G,
        Nucleotide::C,
        Nucleotide::C,
    ];
    let mut vec = PackedVec::create(bases);
    for index in 0..2 {
        assert_eq!(vec.get(index), Nucleotide::A);
    }

    vec.set(1, Nucleotide::G);
    assert_eq!(vec.get(1), Nucleotide::G);
}
0 commit comments