Skip to content

Commit 51effc8

Browse files
committed
Add back in packedseqnibble
1 parent b7363f5 commit 51effc8

File tree

3 files changed

+321
-17
lines changed

3 files changed

+321
-17
lines changed

flatgfa/src/lib.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,8 @@ pub mod memfile;
66
pub mod namemap;
77
pub mod ops;
88
pub mod packedseqcrumb;
9+
pub mod packedseqnibble;
910
pub mod parse;
1011
pub mod pool;
1112
pub mod print;
12-
1313
pub use flatgfa::*;

flatgfa/src/packedseqcrumb.rs

Lines changed: 1 addition & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ impl From<Nucleotide> for char {
4343
}
4444

4545
/// A compressed vector-like structure for storing nucleotide sequences
46-
/// - Two base pairs are stored per byte
46+
/// - Four base pairs are stored per byte
4747
///
4848
pub struct PackedVec {
4949
/// A vector that stores a compressed encoding of this PackedVec's sequence
@@ -86,10 +86,6 @@ impl PackedVec {
8686
}
8787

8888
pub fn len(&self) -> usize {
89-
println!(
90-
"len: {}",
91-
self.data.len() * 4 - (3 - self.crumb_end as usize)
92-
);
9389
self.data.len() * 4 - (3 - self.crumb_end as usize)
9490
}
9591

@@ -101,8 +97,6 @@ impl PackedVec {
10197
pub fn get(&self, index: usize) -> Nucleotide {
10298
let i = index / 4;
10399
let j = index % 4;
104-
// println!("get: {}", (self.data[i] >> (6 - 2 * j)) & 0b00000011u8);
105-
println!("self.data[i]: {}", self.data[i]);
106100
((self.data[i] >> (6 - 2 * j)) & 0b00000011u8).into()
107101
}
108102

@@ -112,7 +106,6 @@ impl PackedVec {
112106
let i = index / 4;
113107
let j = index % 4;
114108
if j == 0 {
115-
println!("i: {}", i);
116109
self.data[i] = (0b00111111u8 & self.data[i]) | (elem << 6);
117110
} else if j == 1 {
118111
self.data[i] = (0b11001111u8 & self.data[i]) | (elem << 4);
@@ -210,14 +203,6 @@ pub fn get_slice_seq(slice: PackedSlice<'_>) -> Vec<Nucleotide> {
210203
slice.vec_ref.get_range(slice.span)
211204
}
212205

213-
#[test]
214-
fn test_simple() {
215-
let vec = PackedVec::create(vec![Nucleotide::A, Nucleotide::C]);
216-
let arr = vec.get_elements();
217-
assert_eq!(arr[0], Nucleotide::A);
218-
assert_eq!(arr[1], Nucleotide::C);
219-
}
220-
221206
#[test]
222207
fn test_vec() {
223208
let mut vec = PackedVec::create(vec![

flatgfa/src/packedseqnibble.rs

Lines changed: 319 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,319 @@
1+
use std::fmt;
2+
3+
/// One of the four bases that can be stored in a packed sequence.
///
/// The `u8` conversions in this module use the codes
/// `A = 0`, `C = 1`, `T = 2`, `G = 3`.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum Nucleotide {
    /// Base written as `'A'`.
    A,
    /// Base written as `'C'`.
    C,
    /// Base written as `'T'`.
    T,
    /// Base written as `'G'`.
    G,
}
10+
11+
impl From<u8> for Nucleotide {
12+
fn from(value: u8) -> Self {
13+
match value {
14+
0 => Self::A,
15+
1 => Self::C,
16+
2 => Self::T,
17+
3 => Self::G,
18+
_ => panic!("Not a Nucleotide!"),
19+
}
20+
}
21+
}
22+
23+
impl From<Nucleotide> for u8 {
24+
fn from(value: Nucleotide) -> Self {
25+
match value {
26+
Nucleotide::A => 0,
27+
Nucleotide::C => 1,
28+
Nucleotide::T => 2,
29+
Nucleotide::G => 3,
30+
}
31+
}
32+
}
33+
34+
impl From<Nucleotide> for char {
35+
fn from(value: Nucleotide) -> Self {
36+
match value {
37+
Nucleotide::A => 'A',
38+
Nucleotide::C => 'C',
39+
Nucleotide::G => 'G',
40+
Nucleotide::T => 'T',
41+
}
42+
}
43+
}
44+
45+
/// A compressed vector-like structure for storing nucleotide sequences
/// - Two base pairs are stored per byte: the element at an even index
///   occupies the low nibble of its byte, the element at the following odd
///   index occupies the high nibble of the same byte.
///
#[derive(Debug, Clone)]
pub struct PackedVec {
    /// A vector that stores a compressed encoding of this PackedVec's sequence
    data: Vec<u8>,

    /// True if the final base pair in the sequence is stored at a
    /// high nibble. Also true for an empty sequence, so that the next
    /// pushed element starts a fresh byte.
    high_nibble_end: bool,
}
56+
57+
impl PackedVec {
58+
/// Creates a new empty PackedVec
59+
pub fn new() -> Self {
60+
PackedVec {
61+
data: Vec::new(),
62+
high_nibble_end: true,
63+
}
64+
}
65+
66+
/// Returns a compressed PackedVec given an uncompressed vector `arr`
67+
pub fn create(arr: Vec<Nucleotide>) -> Self {
68+
let mut new_vec = PackedVec::new();
69+
for item in arr {
70+
new_vec.push(item);
71+
}
72+
new_vec
73+
}
74+
75+
/// Appends `input` to the end of this PackedVec
76+
pub fn push(&mut self, input: Nucleotide) {
77+
let value = input.into();
78+
assert!(value <= 0xF);
79+
if self.high_nibble_end {
80+
self.data.push(value);
81+
self.high_nibble_end = false;
82+
} else {
83+
let last_index = self.data.len() - 1;
84+
self.data[last_index] |= value << 4;
85+
self.high_nibble_end = true;
86+
}
87+
}
88+
89+
pub fn len(&self) -> usize {
90+
if self.high_nibble_end {
91+
self.data.len() * 2
92+
} else {
93+
self.data.len() * 2 - 1
94+
}
95+
}
96+
97+
pub fn is_empty(&self) -> bool {
98+
self.data.is_empty()
99+
}
100+
101+
/// Returns the element of this PackedVec at index `index`
102+
pub fn get(&self, index: usize) -> Nucleotide {
103+
let i = index / 2;
104+
if index % 2 == 1 {
105+
((self.data[i] & 0b11110000u8) >> 4).into()
106+
} else {
107+
(self.data[i] & 0b00001111u8).into()
108+
}
109+
}
110+
111+
/// Sets the element of this PackedVec at index `index` to `elem`
112+
pub fn set(&mut self, index: usize, input: Nucleotide) {
113+
let elem: u8 = input.into();
114+
let i = index / 2;
115+
if index % 2 == 1 {
116+
println!("i: {}", i);
117+
self.data[i] = (0b00001111u8 & self.data[i]) | (elem << 4);
118+
} else {
119+
self.data[i] = (0b11110000u8 & self.data[i]) | elem;
120+
}
121+
}
122+
123+
pub fn get_range(&self, span: std::ops::Range<usize>) -> Vec<Nucleotide> {
124+
let mut arr: Vec<Nucleotide> = Vec::with_capacity(span.end - span.start);
125+
for i in span.start..=span.end {
126+
arr.push(self.get(i));
127+
}
128+
arr
129+
}
130+
131+
/// Returns a uncompressed vector that contains the same sequence as this PackedVec
132+
pub fn get_elements(&self) -> Vec<Nucleotide> {
133+
self.get_range(0..(self.len() - 1))
134+
}
135+
}
136+
137+
impl Default for PackedVec {
138+
fn default() -> Self {
139+
Self::new()
140+
}
141+
}
142+
143+
impl fmt::Display for PackedVec {
144+
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
145+
write!(f, "[")?;
146+
let mut i = 0;
147+
for item in PackedVecIterator::new(self) {
148+
if i == 0 {
149+
i = 1;
150+
} else {
151+
write!(f, ", ")?;
152+
}
153+
let c: char = item.into();
154+
write!(f, "{}", c)?;
155+
}
156+
write!(f, "]")
157+
}
158+
}
159+
160+
struct PackedVecIterator<'a> {
161+
data: &'a PackedVec,
162+
cur_index: usize,
163+
}
164+
165+
impl<'a> PackedVecIterator<'a> {
166+
pub fn new(vec: &'a PackedVec) -> Self {
167+
Self {
168+
data: vec,
169+
cur_index: 0,
170+
}
171+
}
172+
}
173+
174+
impl Iterator for PackedVecIterator<'_> {
175+
type Item = Nucleotide;
176+
177+
fn next(&mut self) -> Option<Self::Item> {
178+
if self.cur_index < self.data.len() {
179+
self.cur_index += 1;
180+
Some(self.data.get(self.cur_index - 1))
181+
} else {
182+
None
183+
}
184+
}
185+
}
186+
187+
/// A reference to a subsection of a nucleotide sequence stored in a PackedVec
188+
pub struct PackedSlice<'a> {
189+
/// The underlying vector that stores the sequence referenced by this slice
190+
vec_ref: &'a PackedVec,
191+
192+
/// The specific section of the sequence that this slice references
193+
span: std::ops::Range<usize>,
194+
}
195+
196+
/// Returns a PackedSlice given a compressed PackVec `vec` that acts as a reference
197+
/// to the section of `vec` contained within the index bounds of Span `s`.
198+
pub fn create_slice(vec: &PackedVec, s: std::ops::Range<usize>) -> PackedSlice<'_> {
199+
PackedSlice {
200+
vec_ref: vec,
201+
span: s,
202+
}
203+
}
204+
205+
/// Returns a vector containing the base pairs referenced by `slice`
206+
pub fn get_slice_seq(slice: PackedSlice<'_>) -> Vec<Nucleotide> {
207+
slice.vec_ref.get_range(slice.span)
208+
}
209+
210+
/// Builds a five-element vector, pushes a sixth element, and checks that all
/// six decode in order.
#[test]
fn test_vec() {
    use Nucleotide::{A, C, G, T};

    let mut seq = PackedVec::create(vec![A, C, G, T, A]);
    seq.push(A);
    let decoded = seq.get_elements();
    let expected = [A, C, G, T, A, A];
    assert_eq!(&decoded[..6], &expected[..]);
}
228+
229+
/// Pushes four more elements onto a four-element vector and checks that all
/// eight decode in order.
#[test]
fn test_vec_push() {
    use Nucleotide::{A, C, G, T};

    let mut seq = PackedVec::create(vec![A, C, G, T]);
    for base in [A, C, G, T] {
        seq.push(base);
    }
    let decoded = seq.get_elements();
    let expected = [A, C, G, T, A, C, G, T];
    assert_eq!(&decoded[..8], &expected[..]);
}
251+
252+
/// Takes the slice `1..4` of a six-element vector and checks the elements at
/// indices 1 through 4 of the original sequence.
#[test]
fn test_slice() {
    use Nucleotide::{A, C, G, T};

    let seq = PackedVec::create(vec![A, C, G, T, A, G]);
    let window = create_slice(&seq, 1..4);
    let decoded = get_slice_seq(window);
    let expected = [C, G, T, A];
    assert_eq!(&decoded[..4], &expected[..]);
}
270+
271+
/// Checks the Display output of a sequence with an even number of elements.
#[test]
fn test_display_even() {
    use Nucleotide::{A, C, G, T};

    let seq = PackedVec::create(vec![C, A, T, C, G, C]);
    let rendered = seq.to_string();
    assert_eq!("[C, A, T, C, G, C]", rendered);
}
283+
284+
/// Checks the Display output of a one-element sequence.
#[test]
fn test_display_single() {
    let seq = PackedVec::create(vec![Nucleotide::T]);
    let rendered = seq.to_string();
    assert_eq!("[T]", rendered);
}
289+
290+
/// Checks the Display output of a sequence with an odd number of elements.
#[test]
fn test_display_odd() {
    use Nucleotide::{A, C, G, T};

    let seq = PackedVec::create(vec![C, A, T, C, G, C, C]);
    let rendered = seq.to_string();
    assert_eq!("[C, A, T, C, G, C, C]", rendered);
}
303+
304+
/// Reads two elements, overwrites the one at index 1, and reads it back.
#[test]
fn test_getter_setter() {
    use Nucleotide::{A, C, G, T};

    let mut seq = PackedVec::create(vec![A, A, T, C, G, C, C]);
    for index in 0..2 {
        assert_eq!(seq.get(index), A);
    }
    seq.set(1, G);
    assert_eq!(seq.get(1), G);
}

0 commit comments

Comments
 (0)