-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathpacked_array.rs
More file actions
250 lines (208 loc) · 7.3 KB
/
packed_array.rs
File metadata and controls
250 lines (208 loc) · 7.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
/// Variable-width bit-packed array matching STAR's PackedArray format.
///
/// Stores integers with a specified bit width, packing them at bit-level
/// granularity (LSB-first, little-endian bit packing).
#[derive(Clone)]
pub struct PackedArray {
/// Number of bits per element
word_length: u32,
/// Complement bits (64 - word_length) — kept for STAR compatibility
#[allow(dead_code)]
word_comp_length: u32,
/// Mask for extracting an element (word_length bits set)
bit_rec_mask: u64,
/// Number of elements
length: usize,
/// Raw byte storage
data: Vec<u8>,
}
impl PackedArray {
/// Create a new PackedArray with specified bit width and length.
///
/// # Arguments
/// * `word_length` - Bits per element (1-64)
/// * `length` - Number of elements
pub fn new(word_length: u32, length: usize) -> Self {
assert!(word_length > 0 && word_length <= 64);
let word_comp_length = 64 - word_length;
let bit_rec_mask = if word_length == 64 {
u64::MAX
} else {
(1u64 << word_length) - 1
};
// Calculate bytes needed (matching STAR's formula)
let length_byte = if length == 0 {
0
} else {
((length - 1) as u64 * word_length as u64) / 8 + 8
};
let data = vec![0u8; length_byte as usize];
Self {
word_length,
word_comp_length,
bit_rec_mask,
length,
data,
}
}
/// Write an element at the specified index.
///
/// # Arguments
/// * `index` - Element index
/// * `value` - Value to write (will be masked to word_length bits)
pub fn write(&mut self, index: usize, value: u64) {
assert!(index < self.length);
let b = (index as u64 * self.word_length as u64) as usize; // bit offset
let byte_offset = b / 8;
let bit_shift = (b % 8) as u32;
let masked_value = (value & self.bit_rec_mask) << bit_shift;
let mask = self.bit_rec_mask << bit_shift;
// Read current 8-byte word, update bits, write back
let mut word = u64::from_le_bytes([
self.data.get(byte_offset).copied().unwrap_or(0),
self.data.get(byte_offset + 1).copied().unwrap_or(0),
self.data.get(byte_offset + 2).copied().unwrap_or(0),
self.data.get(byte_offset + 3).copied().unwrap_or(0),
self.data.get(byte_offset + 4).copied().unwrap_or(0),
self.data.get(byte_offset + 5).copied().unwrap_or(0),
self.data.get(byte_offset + 6).copied().unwrap_or(0),
self.data.get(byte_offset + 7).copied().unwrap_or(0),
]);
word = (word & !mask) | masked_value;
let bytes = word.to_le_bytes();
for (i, &byte) in bytes.iter().enumerate() {
if byte_offset + i < self.data.len() {
self.data[byte_offset + i] = byte;
}
}
}
/// Read an element at the specified index.
///
/// # Arguments
/// * `index` - Element index
///
/// # Returns
/// The value at the specified index
pub fn read(&self, index: usize) -> u64 {
assert!(index < self.length);
let b = (index as u64 * self.word_length as u64) as usize; // bit offset
let byte_offset = b / 8;
let bit_shift = (b % 8) as u32;
let word = if byte_offset + 8 <= self.data.len() {
// Fast path: read 8 bytes directly (no per-byte bounds checks)
// SAFETY: We just verified byte_offset + 8 <= data.len()
let bytes = &self.data[byte_offset..byte_offset + 8];
u64::from_le_bytes(bytes.try_into().unwrap())
} else {
// Slow path: near end of array, read byte-by-byte with bounds checks
u64::from_le_bytes([
self.data.get(byte_offset).copied().unwrap_or(0),
self.data.get(byte_offset + 1).copied().unwrap_or(0),
self.data.get(byte_offset + 2).copied().unwrap_or(0),
self.data.get(byte_offset + 3).copied().unwrap_or(0),
self.data.get(byte_offset + 4).copied().unwrap_or(0),
self.data.get(byte_offset + 5).copied().unwrap_or(0),
self.data.get(byte_offset + 6).copied().unwrap_or(0),
self.data.get(byte_offset + 7).copied().unwrap_or(0),
])
};
// Extract and mask the value
(word >> bit_shift) & self.bit_rec_mask
}
/// Get the number of elements.
pub fn len(&self) -> usize {
self.length
}
/// Check if the array is empty.
pub fn is_empty(&self) -> bool {
self.length == 0
}
/// Get the number of bits per element.
pub fn word_length(&self) -> u32 {
self.word_length
}
/// Get a reference to the raw byte data.
pub fn data(&self) -> &[u8] {
&self.data
}
/// Create a PackedArray from raw byte data.
///
/// # Arguments
/// * `word_length` - Bits per element
/// * `length` - Number of elements
/// * `data` - Raw byte data
pub fn from_bytes(word_length: u32, length: usize, data: Vec<u8>) -> Self {
assert!(word_length > 0 && word_length <= 64);
let word_comp_length = 64 - word_length;
let bit_rec_mask = if word_length == 64 {
u64::MAX
} else {
(1u64 << word_length) - 1
};
Self {
word_length,
word_comp_length,
bit_rec_mask,
length,
data,
}
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn round_trip_single_byte() {
let mut arr = PackedArray::new(5, 10);
arr.write(0, 31); // 11111 in 5 bits
arr.write(1, 0); // 00000
arr.write(2, 17); // 10001
assert_eq!(arr.read(0), 31);
assert_eq!(arr.read(1), 0);
assert_eq!(arr.read(2), 17);
}
#[test]
fn round_trip_cross_byte_boundary() {
let mut arr = PackedArray::new(33, 100); // Human genome SA width
let test_values = [
0x1FFFFFFFF, // All 33 bits set
0x100000000, // Bit 32 set (strand bit)
0x0FFFFFFFF, // Bits 0-31 set (max forward position)
0,
12345678,
];
for (i, &val) in test_values.iter().enumerate() {
arr.write(i, val);
}
for (i, &expected) in test_values.iter().enumerate() {
assert_eq!(arr.read(i), expected);
}
}
#[test]
fn masking() {
let mut arr = PackedArray::new(10, 5);
// Write value larger than 10 bits — should be masked
arr.write(0, 0xFFFF); // All bits set
assert_eq!(arr.read(0), 0x3FF); // Only 10 bits = 1023
}
#[test]
fn bit_width_32() {
let mut arr = PackedArray::new(32, 10);
arr.write(0, 0xDEADBEEF);
arr.write(1, 0x12345678);
arr.write(5, 0xCAFEBABE);
assert_eq!(arr.read(0), 0xDEADBEEF);
assert_eq!(arr.read(1), 0x12345678);
assert_eq!(arr.read(5), 0xCAFEBABE);
}
#[test]
fn sequential_writes() {
let mut arr = PackedArray::new(7, 1000);
for i in 0..1000 {
arr.write(i, (i % 128) as u64);
}
for i in 0..1000 {
assert_eq!(arr.read(i), (i % 128) as u64);
}
}
}