Skip to content

Commit 9d8e336

Browse files
avoid duplicate buffer copy (#2)
`to_vec` and `_platform_memmove` took 17% of `decode` time. The decoder copies the input buffer twice: once for SIMD padding and once for caching. This PR combines both into a single copy by writing directly into `cached_buf`, padding it for the SIMD pipeline, and then truncating it back to the original length before field extraction. No benchmark yet, because I'm running other benchmarks at the moment. <img width="1509" height="375" alt="Screenshot 2026-03-26 at 5 47 55 PM" src="https://github.com/user-attachments/assets/ccd84add-1902-4280-a1a0-37974b504be5" />
1 parent e2e9afe commit 9d8e336

1 file changed

Lines changed: 8 additions & 5 deletions

File tree

src/decoder.rs

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -63,9 +63,13 @@ impl Decoder {
6363
return self.extract_from_cache();
6464
}
6565

66-
// run SIMD pipeline once on the entire buffer
67-
let mut padded = buf.to_vec();
68-
padded.resize(padded.len().next_multiple_of(64), 0);
66+
self.cached_buf.clear();
67+
self.cached_buf.extend_from_slice(buf);
68+
69+
let original_len = self.cached_buf.len();
70+
self.cached_buf.resize(original_len.next_multiple_of(64), 0);
71+
72+
let padded = &self.cached_buf;
6973

7074
// phase 1+2: classify and build bitsets in one pass
7175
let low_nibbles = u8x16::from_slice_unchecked(&LOW_NIBBLES);
@@ -118,8 +122,7 @@ impl Decoder {
118122
extract_positions(n, base, &mut newline_pos);
119123
}
120124

121-
// cache everything
122-
self.cached_buf = buf.to_vec();
125+
self.cached_buf.truncate(original_len);
123126
self.cached_comma_pos = comma_pos;
124127
self.cached_newline_pos = newline_pos;
125128
self.cached_ci = 0;

0 commit comments

Comments
 (0)