-
Notifications
You must be signed in to change notification settings - Fork 64
Expand file tree
/
Copy pathchunking.rs
More file actions
1014 lines (882 loc) · 42.7 KB
/
chunking.rs
File metadata and controls
1014 lines (882 loc) · 42.7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
use std::cmp::min;
use std::io::{Read, Seek, SeekFrom};
use bytes::Bytes;
use more_asserts::{debug_assert_ge, debug_assert_le};
use crate::Chunk;
use crate::constants::{MAXIMUM_CHUNK_MULTIPLIER, MINIMUM_CHUNK_DIVISOR, TARGET_CHUNK_SIZE};
/// Chunk Generator given an input stream. Do not use directly.
/// Use `chunk_target_default`.
pub struct Chunker {
// configs
hash: gearhash::Hasher<'static>,
minimum_chunk: usize,
maximum_chunk: usize,
mask: u64,
// generator state
chunkbuf: Vec<u8>,
}
impl Default for Chunker {
fn default() -> Self {
Self::new(*TARGET_CHUNK_SIZE)
}
}
impl Chunker {
pub fn new(target_chunk_size: usize) -> Self {
assert_eq!(target_chunk_size.count_ones(), 1);
// Some of the logic only works if the target_chunk_size is greater than the
// window size of the hash.
assert!(target_chunk_size > 64);
// note the strict lesser than. Combined with count_ones() == 1,
// this limits to 2^31
assert!(target_chunk_size < u32::MAX as usize);
let mask = (target_chunk_size - 1) as u64;
// we will like to shift the mask left by a bunch since the right
// bits of the gear hash are affected by only a small number of bytes
// really. we just shift it all the way left.
let mask = mask << mask.leading_zeros();
let minimum_chunk = target_chunk_size / *MINIMUM_CHUNK_DIVISOR;
let maximum_chunk = target_chunk_size * *MAXIMUM_CHUNK_MULTIPLIER;
assert!(maximum_chunk > minimum_chunk);
let hash = gearhash::Hasher::default();
Chunker {
hash,
minimum_chunk,
maximum_chunk,
mask,
// generator state init
chunkbuf: Vec::with_capacity(maximum_chunk),
}
}
/// Create a chunker with custom min chunk sizes.
/// Only used by the partitioner which has special requirements.
fn new_with_min(target_chunk_size: usize, min_chunk_size: usize) -> Self {
let mut chunker = Self::new(target_chunk_size);
chunker.minimum_chunk = min_chunk_size;
chunker
}
/// Looks for the next chunk boundary in the data. Assumes that whatever is in the current
/// state has been prepended to the current data. If a boundary cannot be found based on the
/// current amount of data, then None is returned.
#[inline]
pub fn next_boundary(&mut self, data: &[u8]) -> Option<usize> {
const HASH_WINDOW_SIZE: usize = 64;
let n_bytes = data.len();
if n_bytes == 0 {
return None;
}
let previous_len = self.chunkbuf.len();
let mut cur_index = 0;
let mut create_chunk = false;
// skip the minimum chunk size
// and noting that the hash has a window size of 64
// so we should be careful to skip only minimum_chunk - 64 - 1
if previous_len + HASH_WINDOW_SIZE < self.minimum_chunk {
let skip = min(self.minimum_chunk - previous_len - HASH_WINDOW_SIZE - 1, n_bytes);
cur_index += skip;
}
// If we have a lot of data, don't read all the way to the end when we'll stop reading
// at the maximum chunk boundary.
let read_end = n_bytes.min(cur_index + self.maximum_chunk - previous_len);
loop {
if let Some(next_match) = self.hash.next_match(&data[cur_index..read_end], self.mask) {
cur_index += next_match;
// If we trigger a boundary before the end, create a chunk.
// We must enforce that the next boundary is actually past the minimum chunk size.
// Because of how the rolling hash is computed, bytes before HASH_WINDOW_SIZE don't affect the hash,
// so with the above skip we depend on it running for at least HASH_WINDOW_SIZE bytes before triggering
// a boundary. However, in rare occurances, there can be a boundary triggered before HASH_WINDOW_SIZE
// bytes have been processed, which means the boundary is triggered based on the previous state of the
// hasher rather than on the current chunk content. Thus we ensure this can't happen by ensuring that
// we have processed at least HASH_WINDOW_SIZE bytes.
if cur_index + previous_len < self.minimum_chunk {
continue;
}
create_chunk = true;
} else {
cur_index = read_end;
}
break;
}
// if we hit maximum chunk we must create a chunk
if cur_index + previous_len >= self.maximum_chunk {
cur_index = self.maximum_chunk - previous_len;
create_chunk = true;
}
if create_chunk {
self.hash.set_hash(0); // Reset for the next time.
debug_assert_ge!(cur_index + previous_len, self.minimum_chunk);
debug_assert_le!(cur_index + previous_len, self.maximum_chunk);
Some(cur_index)
} else {
None
}
}
fn reset_state(&mut self) {
// Strictly speaking, this is unneccesary, as we should always hash 64 bytes out making the previous state
// of the hasher irrelevant. However, this explicitly declares we're resetting things to the
// initial state.
self.hash.set_hash(0);
debug_assert!(self.chunkbuf.is_empty());
}
/// Process more data; this is a continuation of any data from before when calls were
///
/// Returns the next chunk, if available, and the amount of data that was digested.
///
/// If is_final is true, then it is assumed that no more data after this block will come,
/// and any data currently present and at the end will be put into a final chunk.
pub fn next(&mut self, data: &[u8], is_final: bool) -> (Option<Chunk>, usize) {
let (chunk_data, consume): (Bytes, usize) = {
if let Some(next_boundary) = self.next_boundary(data) {
if self.chunkbuf.is_empty() {
(Bytes::copy_from_slice(&data[..next_boundary]), next_boundary)
} else {
self.chunkbuf.extend_from_slice(&data[..next_boundary]);
(std::mem::take(&mut self.chunkbuf).into(), next_boundary)
}
} else if is_final {
// Put the rest of the data in the chunkbuf.
let r = if self.chunkbuf.is_empty() {
(Bytes::copy_from_slice(data), data.len())
} else {
self.chunkbuf.extend_from_slice(data);
(std::mem::take(&mut self.chunkbuf).into(), data.len())
};
if is_final {
self.reset_state();
}
r
} else {
self.chunkbuf.extend_from_slice(data);
return (None, data.len());
}
};
// Special case this specific case.
if chunk_data.is_empty() {
return (None, 0);
}
(Some(Chunk::new(chunk_data)), consume)
}
/// Keeps chunking until no more chunks can be reliably produced, returning a
/// vector of the resulting chunks.
pub fn next_block(&mut self, data: &[u8], is_final: bool) -> Vec<Chunk> {
let mut ret = Vec::new();
let mut pos = 0;
loop {
debug_assert!(pos <= data.len());
if pos == data.len() {
if is_final {
self.reset_state();
}
return ret;
}
let (maybe_chunk, bytes_consumed) = self.next(&data[pos..], is_final);
if let Some(chunk) = maybe_chunk {
ret.push(chunk);
}
pos += bytes_consumed;
}
}
/// Parallel chunking for files
///
/// Uses memory mapping and multi-threading for content-defined chunking.
/// Requires the parallel-chunking feature to be enabled.
///
/// # Arguments
/// * `file_path` - Path to the file to chunk
/// * `thread_count` - Number of threads (None = auto-detect)
///
/// # Returns
/// * `Ok(Vec<Chunk>)` - Successfully chunked file
/// * `Err` - File too small for parallel processing or feature not enabled
#[cfg(feature = "parallel-chunking")]
pub fn chunk_file_parallel<P: AsRef<std::path::Path>>(
&self,
file_path: P,
thread_count: Option<usize>,
) -> std::io::Result<Vec<Chunk>> {
crate::parallel_chunking::chunk_file_parallel(file_path, thread_count.map(|t| t as u32))
}
/// Fallback method when parallel chunking is not available
#[cfg(not(feature = "parallel-chunking"))]
pub fn chunk_file_parallel<P: AsRef<std::path::Path>>(
&self,
_file_path: P,
_thread_count: Option<usize>,
) -> std::io::Result<Vec<Chunk>> {
Err(std::io::Error::new(
std::io::ErrorKind::Unsupported,
"Parallel chunking not available. Enable with --features parallel-chunking",
))
}
/// Keeps chunking until no more chunks can be reliably produced, returning a
/// vector of the resulting chunks.
///
/// The data is inserted here as a Bytes object, which means that no copying of the data
/// is performed except at the boundaries. The resulting chunks then end up each holding
/// a reference to the original data object, which will not be deallocated until all the
/// original bytes are gone.
pub fn next_block_bytes(&mut self, data: &Bytes, is_final: bool) -> Vec<Chunk> {
let mut ret = Vec::new();
let mut pos = 0;
// In this case, we have to perform a single cut using the old method,
// which would copy the data.
if !self.chunkbuf.is_empty() {
let (maybe_chunk, skip_idx) = self.next(data, is_final);
if let Some(chunk) = maybe_chunk {
ret.push(chunk);
}
pos = skip_idx;
}
while pos < data.len() {
let maybe_next_boundary = self.next_boundary(&data[pos..]);
if let Some(chunk_size) = maybe_next_boundary {
let next_pos = pos + chunk_size;
ret.push(Chunk::new(data.slice(pos..next_pos)));
pos = next_pos;
} else {
// No more chunks in this block.
if is_final {
ret.push(Chunk::new(data.slice(pos..)));
} else {
self.chunkbuf.extend_from_slice(&data[pos..]);
}
break;
}
}
if is_final {
self.reset_state();
}
ret
}
// Finishes, returning the final chunk if one exists, and resets the hasher to
pub fn finish(&mut self) -> Option<Chunk> {
self.next(&[], true).0
}
}
/// Find valid partition points in a file where we can
/// chunk in parallel. Returns the start points of each partition
/// (i.e. file offset 0 is always the first entry, and `file_size`
/// is never in the result).
/// Note that reader position is modified and not restored.
///
/// partition_scan_bytes is the number of bytes to scan at each
/// proposed partition boundary in search of a valid chunk.
///
/// Due to a known issue in how we do chunking, note that these
/// partitions are not 100% guaranteed to align. See the
/// parallel_chunking.pdf for details.
pub fn find_partitions<R: Read + Seek>(
reader: &mut R,
file_size: usize,
target_chunk_size: usize,
min_partition_size: usize,
partition_scan_bytes: usize,
) -> std::io::Result<Vec<usize>> {
assert!(min_partition_size > 0);
let mut partitions: Vec<usize> = Vec::new();
partitions.push(0);
// minimum chunk must be at least the hash window size.
// the way the chunker works, the minimum may be up to
// target_min_chunk_size - 64
let minimum_chunk = target_chunk_size / *MINIMUM_CHUNK_DIVISOR;
let maximum_chunk = target_chunk_size * *MAXIMUM_CHUNK_MULTIPLIER;
assert!(minimum_chunk > 64);
if maximum_chunk >= min_partition_size {
return Ok(partitions);
}
let mut buf = vec![0u8; partition_scan_bytes];
let mut curpos: usize = 0;
// we jump curpos forward by min_partition_size
// and read *PARALLEL_CHUNKING_PARTITION_SCAN_BYTES bytes
// and try to find a partition boundary condition.
//
// We should also make sure There should also be at least
// min_partition_size bytes remaining at curpos so that
// we do not make a teeny tiny partition.
while curpos < file_size {
curpos += min_partition_size;
// there are not enough bytes to make a full partition
// or not enough bytes to scan for a partition
if curpos + min_partition_size >= file_size || curpos + partition_scan_bytes >= file_size {
break;
}
// read and chunk the scan bytes
reader.seek(SeekFrom::Start(curpos as u64))?;
reader.read_exact(&mut buf)?;
let mut chunker = Chunker::new_with_min(target_chunk_size, 0);
// TODO: there is a definite optimization here
// as we really only need the chunk lengths and not the data
let chunks = chunker.next_block(&buf, false);
if chunks.is_empty() {
continue;
}
// skip the first chunk
let mut offset = chunks[0].data.len();
offset += chunks[1].data.len();
for i in 2..chunks.len() {
let cprev = chunks[i - 1].data.len();
let c = chunks[i].data.len();
offset += chunks[i].data.len();
if cprev > minimum_chunk
&& cprev < maximum_chunk - minimum_chunk
&& c > minimum_chunk
&& c < maximum_chunk - minimum_chunk
{
// we have a valid partition at this position
partitions.push(curpos + offset);
break;
}
}
}
Ok(partitions)
}
#[cfg(test)]
mod tests {
use std::collections::HashSet;
use std::io::Cursor;
use rand::rngs::StdRng;
use rand::{Rng, SeedableRng};
use super::*;
/// A helper to create random test data using a specified `seed` and `len`.
/// Using a fixed seed ensures tests are reproducible.
fn make_test_data(seed: u64, len: usize) -> Vec<u8> {
let mut rng = StdRng::seed_from_u64(seed);
let mut data = vec![0; len];
rng.fill(&mut data[..]);
data
}
fn check_chunks_equal(chunks: &[Chunk], data: &[u8]) {
// Validate all the chunks are exact.
let mut new_vec = Vec::with_capacity(10000);
for c in chunks.iter() {
new_vec.extend_from_slice(&c.data[..]);
}
assert!(new_vec == data);
}
// A chunker that wraps two versions of the Chunker class,
// exposing next_block but then internally testing next_bytes_block and next_block and
// verifying the output is identical.
#[derive(Default)]
struct ChunkerTestWrapper {
chunker_chunks: Chunker,
chunker_bytes: Chunker,
}
impl ChunkerTestWrapper {
fn new(target_chunk_size: usize) -> Self {
ChunkerTestWrapper {
chunker_chunks: Chunker::new(target_chunk_size),
chunker_bytes: Chunker::new(target_chunk_size),
}
}
fn next_block(&mut self, data: &[u8], is_final: bool) -> Vec<Chunk> {
let chunks = self.chunker_chunks.next_block(data, is_final);
let bytes_chunks = self.chunker_bytes.next_block_bytes(&Bytes::copy_from_slice(data), is_final);
// Check that the two match.
assert_eq!(chunks.len(), bytes_chunks.len());
for (c1, c2) in chunks.iter().zip(bytes_chunks.iter()) {
assert_eq!(c1.data, c2.data);
}
chunks
}
fn next_chunk(&mut self, data: &[u8], is_final: bool) -> (Option<Chunk>, usize) {
let (chunk, consumed) = self.chunker_chunks.next(data, is_final);
let (bytes_chunk, bytes_consumed) = self.chunker_bytes.next(&Bytes::copy_from_slice(data), is_final);
// Check that the two match.
if let Some(c) = &chunk {
assert_eq!(c.data, bytes_chunk.unwrap().data);
} else {
assert!(bytes_chunk.is_none());
}
(chunk, consumed.max(bytes_consumed))
}
}
#[test]
fn test_empty_data_no_chunk_until_final() {
let mut chunker = ChunkerTestWrapper::new(128);
// Passing empty slice without final => no chunk
let (chunk, consumed) = chunker.next_chunk(&[], false);
assert!(chunk.is_none());
assert_eq!(consumed, 0);
// Passing empty slice again with is_final = true => no leftover data, so no chunk
let (chunk, consumed) = chunker.next_chunk(&[], true);
assert!(chunk.is_none());
assert_eq!(consumed, 0);
}
#[test]
fn test_data_smaller_than_minimum_no_boundary() {
let mut chunker = ChunkerTestWrapper::new(128);
// Create a small random data buffer. For example, length=3.
let data = make_test_data(0, 63);
// We expect no chunk until we finalize, because there's not enough data
// to trigger a boundary, nor to reach the maximum chunk size.
let (chunk, consumed) = chunker.next_chunk(&data, false);
assert!(chunk.is_none());
assert_eq!(consumed, data.len());
// Now finalize: we expect a chunk with the leftover data
let (chunk, consumed) = chunker.next_chunk(&[], true);
assert!(chunk.is_some());
assert_eq!(consumed, 0);
let chunk = chunk.unwrap();
assert_eq!(chunk.data.len(), 63);
assert_eq!(&chunk.data[..], &data[..], "Chunk should contain exactly what was passed in");
}
#[test]
fn test_multiple_chunks_produced() {
let mut chunker = ChunkerTestWrapper::new(128);
// Produce 100 bytes of random data
let data = make_test_data(42, 10000);
// Pass everything at once, final = true
let chunks = chunker.next_block(&data, true);
assert!(!chunks.is_empty());
check_chunks_equal(&chunks, &data);
}
#[test]
fn test_repeated_calls_partial_consumption() {
// We'll feed in two pieces of data to ensure partial consumption
let data = make_test_data(42, 10000);
let mut chunks_1 = Vec::new();
let mut pos = 0;
let mut chunker = ChunkerTestWrapper::new(128);
while pos < data.len() {
for i in 0..16 {
let next_pos = (pos + i).min(data.len());
chunks_1.append(&mut chunker.next_block(&data[pos..next_pos], next_pos == data.len()));
pos = next_pos;
}
}
check_chunks_equal(&chunks_1, &data);
// Now, rechunk with all at once and make sure it's equal.
let chunks_2 = ChunkerTestWrapper::new(128).next_block(&data, true);
assert_eq!(chunks_1, chunks_2);
}
#[test]
fn test_exact_maximum_chunk() {
// If the data hits the maximum chunk size exactly, we should force a boundary.
// For target_chunk_size = 128, if MAXIMUM_CHUNK_MULTIPLIER = 2, then max = 256.
// Adjust if your constants differ.
let mut chunker = ChunkerTestWrapper::new(512);
// Use constant data
let data = vec![0; 8 * *MAXIMUM_CHUNK_MULTIPLIER * 512];
let chunks = chunker.next_block(&data, true);
assert_eq!(chunks.len(), 8);
for c in chunks.iter() {
assert_eq!(c.data.len(), *MAXIMUM_CHUNK_MULTIPLIER * 512);
}
}
#[test]
fn test_partition() {
for _i in 1..5 {
let data = make_test_data(42, 1000000);
let mut chunker = Chunker::new(1024);
let chunks = chunker.next_block(&data, true);
let mut chunk_offsets = HashSet::new();
let mut offset = 0;
eprintln!("{:?}", chunker.minimum_chunk);
for i in 0..chunks.len() {
chunk_offsets.insert(offset);
offset += chunks[i].data.len();
}
let partitions =
find_partitions(&mut Cursor::new(&mut data.as_slice()), data.len(), 1024, 100000, 10000).unwrap();
assert!(partitions.len() > 1);
for i in 0..partitions.len() {
assert!(chunk_offsets.contains(&partitions[i]));
}
}
}
/// Simple SplitMix64-based deterministic random number generator.
/// Portable to C, Python, etc. (see https://prng.di.unimi.it/splitmix64.c)
fn splitmix64_next(state: &mut u64) -> u64 {
*state = state.wrapping_add(0x9E3779B97F4A7C15);
let mut z = *state;
z = (z ^ (z >> 30)).wrapping_mul(0xBF58476D1CE4E5B9);
z = (z ^ (z >> 27)).wrapping_mul(0x94D049BB133111EB);
z ^ (z >> 31)
}
fn create_random_data(n: usize, seed: u64) -> Vec<u8> {
// This test will actually need to be run in different environments, so to generate
// the table below, create random data using a simple SplitMix rng that can be ported here
// as is without dependening on other po
let mut ret = Vec::with_capacity(n + 7);
let mut state = seed;
while ret.len() < n {
let next_u64 = splitmix64_next(&mut state);
ret.extend_from_slice(&next_u64.to_le_bytes());
}
// Has extra bits on there since we're adding in blocks of 8.
ret.resize(n, 0);
ret
}
fn get_chunk_boundaries(chunks: &[Chunk]) -> Vec<usize> {
chunks
.iter()
.scan(0, |state, chunk| {
*state += chunk.data.len();
Some(*state)
})
.collect()
}
#[test]
fn test_chunk_boundaries() {
let data = create_random_data(256000, 1);
// Now, run the chunks through the default chunker.
let chunks = ChunkerTestWrapper::default().next_block(&data, true);
// Get the boundaries indices as determined by the size of the chunks above.
let ref_chunk_boundaries: Vec<usize> = get_chunk_boundaries(&chunks);
// Test that it's correct across different chunk varieties.
for add_size in [1, 37, 255] {
let mut chunker = Chunker::default();
// Add repeatedly in blocks of add_size, appending to alt_chunks
let mut alt_chunks = Vec::with_capacity(chunks.len());
let mut pos = 0;
while pos < data.len() {
let next_pos = (pos + add_size).min(data.len());
let next_chunk = chunker.next_block(&data[pos..next_pos], next_pos == data.len());
alt_chunks.extend(next_chunk);
pos = next_pos;
}
let alt_boundaries = get_chunk_boundaries(&alt_chunks);
assert_eq!(alt_boundaries, ref_chunk_boundaries);
}
}
#[test]
fn test_correctness_1mb_random_data() {
// Test this data.
let data = create_random_data(1000000, 0);
// Uncomment these to create the lines below:
// eprintln!("(data[0], {});", data[0] as usize);
// eprintln!("(data[127], {});", data[127] as usize);
// eprintln!("(data[111111], {});", data[111111] as usize);
assert_eq!(data[0], 175);
assert_eq!(data[127], 132);
assert_eq!(data[111111], 118);
// Now, run the chunks through the default chunker.
let chunks = ChunkerTestWrapper::default().next_block(&data, true);
// Get the boundaries indices as determined by the size of the chunks above.
let chunk_boundaries: Vec<usize> = get_chunk_boundaries(&chunks);
// Uncomment this to create the line below.
// eprintln!("assert_eq!(chunk_boundaries, vec!{chunk_boundaries:?})");
assert_eq!(
chunk_boundaries,
vec![
84493, 134421, 144853, 243318, 271793, 336457, 467529, 494581, 582000, 596735, 616815, 653164, 678202,
724510, 815591, 827760, 958832, 991092, 1000000
]
);
}
#[test]
fn test_correctness_1mb_const_data() {
// Test this data.
let data = vec![59u8; 1000000];
// Now, run the chunks through the default chunker.
let chunks = ChunkerTestWrapper::default().next_block(&data, true);
// Get the boundaries indices as determined by the size of the chunks above.
let chunk_boundaries: Vec<usize> = get_chunk_boundaries(&chunks);
// Uncomment this to create the line below.
// eprintln!("assert_eq!(chunk_boundaries, vec!{chunk_boundaries:?})");
assert_eq!(chunk_boundaries, vec![131072, 262144, 393216, 524288, 655360, 786432, 917504, 1000000])
}
fn get_triggering_base_data(n: usize, padding: usize) -> Vec<u8> {
// This pattern is known to trigger the boundary detection in the chunker, so repeat it to test the
// correctness of the minimum chunk size processing.
let mut data = vec![
154, 52, 42, 34, 159, 75, 126, 224, 70, 236, 12, 196, 79, 236, 178, 124, 127, 50, 99, 178, 44, 176, 174,
126, 250, 235, 205, 174, 252, 122, 35, 10, 20, 101, 214, 69, 193, 8, 115, 105, 158, 228, 120, 111, 136,
162, 198, 251, 211, 183, 253, 252, 164, 147, 63, 16, 186, 162, 117, 23, 170, 36, 205, 187, 174, 76, 210,
174, 211, 175, 12, 173, 145, 59, 2, 70, 222, 181, 159, 227, 182, 156, 189, 51, 226, 106, 24, 50, 183, 157,
140, 10, 8, 23, 212, 70, 10, 234, 23, 33, 219, 254, 39, 236, 70, 49, 191, 116, 9, 115, 15, 101, 26, 159,
165, 220, 15, 170, 56, 125, 92, 163, 94, 235, 38, 40, 49, 81,
];
// Add padding so we can comprehensively test the nuances of boundaries.
data.resize(data.len() + padding, 0u8);
// Repeat the above pattern until we've filled out n bytes.
while data.len() < n {
let n_take = (n - data.len()).min(data.len());
data.extend_from_within(0..n_take);
}
data
}
#[test]
fn test_correctness_100kb_hitting_data() {
// To ensure we've checked all the nuances of dealing with minimum chunk boundaries,
// and with the correct chunks as well, run through all the different options with the padding,
// checking each one. With this, then, we have a pattern that hits once per pattern with varying
// bits between the widths.
let mut data_sample_at_11111 = [0u8; 128];
let mut ref_cb = vec![Vec::new(); 128];
data_sample_at_11111[0] = 236;
ref_cb[0] = vec![8256, 16448, 24640, 32832, 41024, 49216, 57408, 65536];
data_sample_at_11111[1] = 50;
ref_cb[1] = vec![8320, 16576, 24832, 33088, 41344, 49600, 57856, 65536];
data_sample_at_11111[2] = 36;
ref_cb[2] = vec![8254, 16574, 24894, 33214, 41534, 49854, 58174, 65536];
data_sample_at_11111[3] = 116;
ref_cb[3] = vec![8317, 16570, 24823, 33076, 41329, 49582, 57835, 65536];
data_sample_at_11111[4] = 126;
ref_cb[4] = vec![8248, 16564, 24880, 33196, 41512, 49828, 58144, 65536];
data_sample_at_11111[5] = 145;
ref_cb[5] = vec![8310, 16556, 24802, 33048, 41294, 49540, 57786, 65536];
data_sample_at_11111[6] = 235;
ref_cb[6] = vec![8238, 16546, 24854, 33162, 41470, 49778, 58086, 65536];
data_sample_at_11111[7] = 228;
ref_cb[7] = vec![8299, 16534, 24769, 33004, 41239, 49474, 57709, 65536];
data_sample_at_11111[8] = 70;
ref_cb[8] = vec![8224, 16520, 24816, 33112, 41408, 49704, 58000, 65536];
data_sample_at_11111[9] = 178;
ref_cb[9] = vec![8284, 16504, 24724, 32944, 41164, 49384, 57604, 65536];
data_sample_at_11111[10] = 173;
ref_cb[10] = vec![8206, 16486, 24766, 33046, 41326, 49606, 57886, 65536];
data_sample_at_11111[11] = 0;
ref_cb[11] = vec![8265, 16466, 24667, 32868, 41069, 49270, 57471, 65536];
data_sample_at_11111[12] = 252;
ref_cb[12] = vec![8324, 16584, 24844, 33104, 41364, 49624, 57884, 65536];
data_sample_at_11111[13] = 159;
ref_cb[13] = vec![8242, 16561, 24880, 33199, 41518, 49837, 58156, 65536];
data_sample_at_11111[14] = 69;
ref_cb[14] = vec![8300, 16536, 24772, 33008, 41244, 49480, 57716, 65536];
data_sample_at_11111[15] = 219;
ref_cb[15] = vec![8215, 16509, 24803, 33097, 41391, 49685, 57979, 65536];
data_sample_at_11111[16] = 126;
ref_cb[16] = vec![8272, 16480, 24688, 32896, 41104, 49312, 57520, 65536];
data_sample_at_11111[17] = 10;
ref_cb[17] = vec![8329, 16594, 24859, 33124, 41389, 49654, 57919, 65536];
data_sample_at_11111[18] = 124;
ref_cb[18] = vec![8240, 16562, 24884, 33206, 41528, 49850, 58172, 65536];
data_sample_at_11111[19] = 24;
ref_cb[19] = vec![8296, 16528, 24760, 32992, 41224, 49456, 57688, 65536];
data_sample_at_11111[20] = 196;
ref_cb[20] = vec![8204, 16492, 24780, 33068, 41356, 49644, 57932, 65536];
data_sample_at_11111[21] = 106;
ref_cb[21] = vec![8259, 16454, 24649, 32844, 41039, 49234, 57429, 65536];
data_sample_at_11111[22] = 196;
ref_cb[22] = vec![8314, 16564, 24814, 33064, 41314, 49564, 57814, 65536];
data_sample_at_11111[23] = 183;
ref_cb[23] = vec![8218, 16523, 24828, 33133, 41438, 49743, 58048, 65536];
data_sample_at_11111[24] = 124;
ref_cb[24] = vec![8272, 16480, 24688, 32896, 41104, 49312, 57520, 65536];
data_sample_at_11111[25] = 70;
ref_cb[25] = vec![8326, 16588, 24850, 33112, 41374, 49636, 57898, 65536];
data_sample_at_11111[26] = 126;
ref_cb[26] = vec![8226, 16542, 24858, 33174, 41490, 49806, 58122, 65536];
data_sample_at_11111[27] = 191;
ref_cb[27] = vec![8279, 16494, 24709, 32924, 41139, 49354, 57569, 65536];
data_sample_at_11111[28] = 69;
ref_cb[28] = vec![8332, 16600, 24868, 33136, 41404, 49672, 57940, 65536];
data_sample_at_11111[29] = 163;
ref_cb[29] = vec![8228, 16549, 24870, 33191, 41512, 49833, 58154, 65536];
data_sample_at_11111[30] = 252;
ref_cb[30] = vec![8280, 16496, 24712, 32928, 41144, 49360, 57576, 65536];
data_sample_at_11111[31] = 0;
ref_cb[31] = vec![8332, 16600, 24868, 33136, 41404, 49672, 57940, 65536];
data_sample_at_11111[32] = 173;
ref_cb[32] = vec![8224, 16544, 24864, 33184, 41504, 49824, 58144, 65536];
data_sample_at_11111[33] = 42;
ref_cb[33] = vec![8275, 16486, 24697, 32908, 41119, 49330, 57541, 65536];
data_sample_at_11111[34] = 70;
ref_cb[34] = vec![8326, 16588, 24850, 33112, 41374, 49636, 57898, 65536];
data_sample_at_11111[35] = 174;
ref_cb[35] = vec![8214, 16527, 24840, 33153, 41466, 49779, 58092, 65536];
data_sample_at_11111[36] = 235;
ref_cb[36] = vec![8264, 16464, 24664, 32864, 41064, 49264, 57464, 65536];
data_sample_at_11111[37] = 186;
ref_cb[37] = vec![8314, 16564, 24814, 33064, 41314, 49564, 57814, 65536];
data_sample_at_11111[38] = 0;
ref_cb[38] = vec![8198, 16498, 24798, 33098, 41398, 49698, 57998, 65536];
data_sample_at_11111[39] = 157;
ref_cb[39] = vec![8247, 16597, 24947, 33297, 41647, 49997, 58347, 65536];
data_sample_at_11111[40] = 126;
ref_cb[40] = vec![8296, 16528, 24760, 32992, 41224, 49456, 57688, 65536];
data_sample_at_11111[41] = 49;
ref_cb[41] = vec![8345, 16626, 24907, 33188, 41469, 49750, 58031, 65536];
data_sample_at_11111[42] = 36;
ref_cb[42] = vec![8224, 16554, 24884, 33214, 41544, 49874, 58204, 65536];
data_sample_at_11111[43] = 0;
ref_cb[43] = vec![8272, 16480, 24688, 32896, 41104, 49312, 57520, 65536];
data_sample_at_11111[44] = 236;
ref_cb[44] = vec![8320, 16576, 24832, 33088, 41344, 49600, 57856, 65536];
data_sample_at_11111[45] = 105;
ref_cb[45] = vec![8195, 16499, 24803, 33107, 41411, 49715, 58019, 65536];
data_sample_at_11111[46] = 0;
ref_cb[46] = vec![8242, 16594, 24946, 33298, 41650, 50002, 58354, 65536];
data_sample_at_11111[47] = 24;
ref_cb[47] = vec![8289, 16514, 24739, 32964, 41189, 49414, 57639, 65536];
data_sample_at_11111[48] = 126;
ref_cb[48] = vec![8336, 16608, 24880, 33152, 41424, 49696, 57968, 65536];
data_sample_at_11111[49] = 0;
ref_cb[49] = vec![8206, 16525, 24844, 33163, 41482, 49801, 58120, 65536];
data_sample_at_11111[50] = 70;
ref_cb[50] = vec![8252, 16618, 24984, 33350, 41716, 50082, 58448, 65536];
data_sample_at_11111[51] = 236;
ref_cb[51] = vec![8298, 16532, 24766, 33000, 41234, 49468, 57702, 65536];
data_sample_at_11111[52] = 0;
ref_cb[52] = vec![8344, 16624, 24904, 33184, 41464, 49744, 58024, 65536];
data_sample_at_11111[53] = 12;
ref_cb[53] = vec![8209, 16535, 24861, 33187, 41513, 49839, 58165, 65536];
data_sample_at_11111[54] = 236;
ref_cb[54] = vec![8254, 16626, 24998, 33370, 41742, 50114, 58486, 65536];
data_sample_at_11111[55] = 0;
ref_cb[55] = vec![8299, 16534, 24769, 33004, 41239, 49474, 57709, 65536];
data_sample_at_11111[56] = 173;
ref_cb[56] = vec![8344, 16624, 24904, 33184, 41464, 49744, 58024, 65536];
data_sample_at_11111[57] = 196;
ref_cb[57] = vec![8204, 16529, 24854, 33179, 41504, 49829, 58154, 65536];
data_sample_at_11111[58] = 0;
ref_cb[58] = vec![8248, 16618, 24988, 33358, 41728, 50098, 58468, 65536];
data_sample_at_11111[59] = 159;
ref_cb[59] = vec![8292, 16520, 24748, 32976, 41204, 49432, 57660, 65536];
data_sample_at_11111[60] = 178;
ref_cb[60] = vec![8336, 16608, 24880, 33152, 41424, 49696, 57968, 65536];
data_sample_at_11111[61] = 0;
ref_cb[61] = vec![8380, 16696, 25012, 33328, 41644, 49960, 58276, 65536];
data_sample_at_11111[62] = 10;
ref_cb[62] = vec![8234, 16594, 24954, 33314, 41674, 50034, 58394, 65536];
data_sample_at_11111[63] = 101;
ref_cb[63] = vec![8277, 16490, 24703, 32916, 41129, 49342, 57555, 65536];
data_sample_at_11111[64] = 0;
ref_cb[64] = vec![8320, 16576, 24832, 33088, 41344, 49600, 57856, 65536];
data_sample_at_11111[65] = 15;
ref_cb[65] = vec![8363, 16662, 24961, 33260, 41559, 49858, 58157, 65536];
data_sample_at_11111[66] = 147;
ref_cb[66] = vec![8212, 16554, 24896, 33238, 41580, 49922, 58264, 65536];
data_sample_at_11111[67] = 0;
ref_cb[67] = vec![8254, 16639, 25024, 33409, 41794, 50179, 58564, 65536];
data_sample_at_11111[68] = 0;
ref_cb[68] = vec![8296, 16528, 24760, 32992, 41224, 49456, 57688, 65536];
data_sample_at_11111[69] = 227;
ref_cb[69] = vec![8338, 16612, 24886, 33160, 41434, 49708, 57982, 65536];
data_sample_at_11111[70] = 126;
ref_cb[70] = vec![8380, 16696, 25012, 33328, 41644, 49960, 58276, 65536];
data_sample_at_11111[71] = 0;
ref_cb[71] = vec![8223, 16581, 24939, 33297, 41655, 50013, 58371, 65536];
data_sample_at_11111[72] = 101;
ref_cb[72] = vec![8264, 16464, 24664, 32864, 41064, 49264, 57464, 65536];
data_sample_at_11111[73] = 186;
ref_cb[73] = vec![8305, 16546, 24787, 33028, 41269, 49510, 57751, 65536];
data_sample_at_11111[74] = 52;
ref_cb[74] = vec![8346, 16628, 24910, 33192, 41474, 49756, 58038, 65536];
data_sample_at_11111[75] = 0;
ref_cb[75] = vec![8387, 16710, 25033, 33356, 41679, 50002, 58325, 65536];
data_sample_at_11111[76] = 70;
ref_cb[76] = vec![8224, 16588, 24952, 33316, 41680, 50044, 58408, 65536];
data_sample_at_11111[77] = 228;
ref_cb[77] = vec![8264, 16464, 24664, 32864, 41064, 49264, 57464, 65536];
data_sample_at_11111[78] = 0;
ref_cb[78] = vec![8304, 16544, 24784, 33024, 41264, 49504, 57744, 65536];
data_sample_at_11111[79] = 0;
ref_cb[79] = vec![8344, 16624, 24904, 33184, 41464, 49744, 58024, 65536];
data_sample_at_11111[80] = 50;
ref_cb[80] = vec![8384, 16704, 25024, 33344, 41664, 49984, 58304, 65536];
data_sample_at_11111[81] = 214;
ref_cb[81] = vec![8215, 16575, 24935, 33295, 41655, 50015, 58375, 65536];
data_sample_at_11111[82] = 0;
ref_cb[82] = vec![8254, 16654, 25054, 33454, 41854, 50254, 58654, 65536];
data_sample_at_11111[83] = 0;
ref_cb[83] = vec![8293, 16522, 24751, 32980, 41209, 49438, 57667, 65536];
data_sample_at_11111[84] = 50;
ref_cb[84] = vec![8332, 16600, 24868, 33136, 41404, 49672, 57940, 65536];
data_sample_at_11111[85] = 69;
ref_cb[85] = vec![8371, 16678, 24985, 33292, 41599, 49906, 58213, 65536];
data_sample_at_11111[86] = 0;
ref_cb[86] = vec![8196, 16542, 24888, 33234, 41580, 49926, 58272, 65536];
data_sample_at_11111[87] = 0;
ref_cb[87] = vec![8234, 16619, 25004, 33389, 41774, 50159, 58544, 65536];
data_sample_at_11111[88] = 70;
ref_cb[88] = vec![8272, 16480, 24688, 32896, 41104, 49312, 57520, 65536];
data_sample_at_11111[89] = 136;
ref_cb[89] = vec![8310, 16556, 24802, 33048, 41294, 49540, 57786, 65536];
data_sample_at_11111[90] = 0;
ref_cb[90] = vec![8348, 16632, 24916, 33200, 41484, 49768, 58052, 65536];
data_sample_at_11111[91] = 0;
ref_cb[91] = vec![8386, 16708, 25030, 33352, 41674, 49996, 58318, 65536];
data_sample_at_11111[92] = 101;
ref_cb[92] = vec![8204, 16564, 24924, 33284, 41644, 50004, 58364, 65536];
data_sample_at_11111[93] = 36;
ref_cb[93] = vec![8241, 16639, 25037, 33435, 41833, 50231, 58629, 65536];
data_sample_at_11111[94] = 196;
ref_cb[94] = vec![8278, 16492, 24706, 32920, 41134, 49348, 57562, 65536];
data_sample_at_11111[95] = 0;
ref_cb[95] = vec![8315, 16566, 24817, 33068, 41319, 49570, 57821, 65536];
data_sample_at_11111[96] = 0;
ref_cb[96] = vec![8352, 16640, 24928, 33216, 41504, 49792, 58080, 65536];
data_sample_at_11111[97] = 24;
ref_cb[97] = vec![8389, 16714, 25039, 33364, 41689, 50014, 58339, 65536];
data_sample_at_11111[98] = 8;
ref_cb[98] = vec![8200, 16562, 24924, 33286, 41648, 50010, 58372, 65536];
data_sample_at_11111[99] = 0;
ref_cb[99] = vec![8236, 16635, 25034, 33433, 41832, 50231, 58630, 65536];
data_sample_at_11111[100] = 0;
ref_cb[100] = vec![8272, 16480, 24688, 32896, 41104, 49312, 57520, 65536];
data_sample_at_11111[101] = 125;
ref_cb[101] = vec![8308, 16552, 24796, 33040, 41284, 49528, 57772, 65536];
data_sample_at_11111[102] = 173;
ref_cb[102] = vec![8344, 16624, 24904, 33184, 41464, 49744, 58024, 65536];
data_sample_at_11111[103] = 126;
ref_cb[103] = vec![8380, 16696, 25012, 33328, 41644, 49960, 58276, 65536];
data_sample_at_11111[104] = 0;
ref_cb[104] = vec![8416, 16768, 25120, 33472, 41824, 50176, 58528, 65536];
data_sample_at_11111[105] = 0;
ref_cb[105] = vec![8219, 16607, 24995, 33383, 41771, 50159, 58547, 65536];
data_sample_at_11111[106] = 159;
ref_cb[106] = vec![8254, 16678, 25102, 33526, 41950, 50374, 58798, 65536];
data_sample_at_11111[107] = 210;
ref_cb[107] = vec![8289, 16514, 24739, 32964, 41189, 49414, 57639, 65536];
data_sample_at_11111[108] = 178;
ref_cb[108] = vec![8324, 16584, 24844, 33104, 41364, 49624, 57884, 65536];
data_sample_at_11111[109] = 0;
ref_cb[109] = vec![8359, 16654, 24949, 33244, 41539, 49834, 58129, 65536];
data_sample_at_11111[110] = 0;
ref_cb[110] = vec![8394, 16724, 25054, 33384, 41714, 50044, 58374, 65536];
data_sample_at_11111[111] = 170;
ref_cb[111] = vec![8429, 16794, 25159, 33524, 41889, 50254, 58619, 65536];
data_sample_at_11111[112] = 173;
ref_cb[112] = vec![8224, 16624, 25024, 33424, 41824, 50224, 58624, 65536];
data_sample_at_11111[113] = 235;
ref_cb[113] = vec![8258, 16452, 24646, 32840, 41034, 49228, 57422, 65536];
data_sample_at_11111[114] = 0;
ref_cb[114] = vec![8292, 16520, 24748, 32976, 41204, 49432, 57660, 65536];
data_sample_at_11111[115] = 0;
ref_cb[115] = vec![8326, 16588, 24850, 33112, 41374, 49636, 57898, 65536];
data_sample_at_11111[116] = 0;
ref_cb[116] = vec![8360, 16656, 24952, 33248, 41544, 49840, 58136, 65536];
data_sample_at_11111[117] = 24;
ref_cb[117] = vec![8394, 16724, 25054, 33384, 41714, 50044, 58374, 65536];
data_sample_at_11111[118] = 228;
ref_cb[118] = vec![8428, 16792, 25156, 33520, 41884, 50248, 58612, 65536];
data_sample_at_11111[119] = 0;
ref_cb[119] = vec![8215, 16613, 25011, 33409, 41807, 50205, 58603, 65536];
data_sample_at_11111[120] = 0;
ref_cb[120] = vec![8248, 16680, 25112, 33544, 41976, 50408, 58840, 65536];
data_sample_at_11111[121] = 0;
ref_cb[121] = vec![8281, 16498, 24715, 32932, 41149, 49366, 57583, 65536];
data_sample_at_11111[122] = 101;
ref_cb[122] = vec![8314, 16564, 24814, 33064, 41314, 49564, 57814, 65536];
data_sample_at_11111[123] = 174;
ref_cb[123] = vec![8347, 16630, 24913, 33196, 41479, 49762, 58045, 65536];
data_sample_at_11111[124] = 126;
ref_cb[124] = vec![8380, 16696, 25012, 33328, 41644, 49960, 58276, 65536];
data_sample_at_11111[125] = 0;
ref_cb[125] = vec![8413, 16762, 25111, 33460, 41809, 50158, 58507, 65536];
data_sample_at_11111[126] = 0;
ref_cb[126] = vec![8192, 16574, 24956, 33338, 41720, 50102, 58484, 65536];
data_sample_at_11111[127] = 0;
ref_cb[127] = vec![8224, 16639, 25054, 33469, 41884, 50299, 58714, 65536];
// Now run the loop with this reference data.
for i in 0..128 {
let data = get_triggering_base_data(65536, i);
// This check is here so that the tests written against this chunker
// can verify that the test data input is correct.
assert_eq!(data[11111], data_sample_at_11111[i]);
// Uncomment to create the line above.
// eprintln!("data_sample_at_11111[{i}]={};", data[11111]);
// Now, run the chunks through the default chunker.
let chunks = ChunkerTestWrapper::default().next_block(&data, true);