avoid duplicate buffer copy (#2)

friendlymatthew · web-flow · commit 9d8e33609990 · 2026-03-26T17:51:40.000-04:00
`to_vec` and `_platform_memmove` took 17% of `decode` time. The decoder copies the input buffer twice: once for SIMD padding and once for caching. This PR combines both into a single copy by writing directly into `cached_buf`, padding it for the SIMD pipeline, then truncating it back to the original length before field extraction. (No benchmark numbers yet because I'm running other benchmarks at the moment.) <img width="1509" height="375" alt="Screenshot 2026-03-26 at 5 47 55 PM" src="https://github.com/user-attachments/assets/ccd84add-1902-4280-a1a0-37974b504be5" />
diff --git a/src/decoder.rs b/src/decoder.rs
@@ -63,9 +63,13 @@ impl Decoder {
             return self.extract_from_cache();
         }
 
-        // run SIMD pipeline once on the entire buffer
-        let mut padded = buf.to_vec();
-        padded.resize(padded.len().next_multiple_of(64), 0);
+        self.cached_buf.clear();
+        self.cached_buf.extend_from_slice(buf);
+
+        let original_len = self.cached_buf.len();
+        self.cached_buf.resize(original_len.next_multiple_of(64), 0);
+
+        let padded = &self.cached_buf;
 
         // phase 1+2: classify and build bitsets in one pass
         let low_nibbles = u8x16::from_slice_unchecked(&LOW_NIBBLES);
@@ -118,8 +122,7 @@ impl Decoder {
             extract_positions(n, base, &mut newline_pos);
         }
 
-        // cache everything
-        self.cached_buf = buf.to_vec();
+        self.cached_buf.truncate(original_len);
         self.cached_comma_pos = comma_pos;
         self.cached_newline_pos = newline_pos;
         self.cached_ci = 0;