feat: add zero-copy string access via ValueView and allocation-reuse APIs

bartlomieju · claude · bartlomieju · commit eaa92b08566c · 2026-03-11T23:01:45.000+01:00
Add ValueView::as_str() for true zero-copy &str access to ASCII strings,
ValueView::to_cow_lossy() for zero-copy-when-possible string conversion,
String::write_utf8_into() for allocation reuse, and a public latin1_to_utf8
SIMD-friendly transcoder utility.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
diff --git a/src/lib.rs b/src/lib.rs
@@ -175,6 +175,7 @@ pub use string::ValueView;
 pub use string::ValueViewData;
 pub use string::WriteFlags;
 pub use string::WriteOptions;
+pub use string::latin1_to_utf8;
 pub use support::SharedPtr;
 pub use support::SharedRef;
 pub use support::UniquePtr;
diff --git a/src/string.rs b/src/string.rs
@@ -17,6 +17,71 @@ use std::mem::MaybeUninit;
 use std::ptr::NonNull;
 use std::slice;
 
+/// Converts Latin-1 encoded bytes to UTF-8, writing into the output buffer.
+///
+/// The output buffer must have at least `2 * input_length` bytes of capacity,
+/// since each Latin-1 byte can expand to at most 2 UTF-8 bytes.
+///
+/// Returns the number of bytes written to the output buffer.
+///
+/// # Safety
+///
+/// - `inbuf`: at least `input_length` readable bytes; any alignment.
+/// - `outbuf`: at least `2 * input_length` writable bytes; any alignment.
+#[inline(always)]
+pub unsafe fn latin1_to_utf8(
+  input_length: usize,
+  inbuf: *const u8,
+  outbuf: *mut u8,
+) -> usize {
+  unsafe {
+    let mut output = 0;
+    let mut input = 0;
+
+    // Fast path: one AND with the high-bit mask tests 8 bytes for ASCII
+    while input + 8 <= input_length {
+      let chunk = (inbuf.add(input) as *const u64).read_unaligned();
+      if chunk & 0x8080_8080_8080_8080 == 0 {
+        // No high bit set: ASCII is already UTF-8, copy the word as-is
+        (outbuf.add(output) as *mut u64).write_unaligned(chunk);
+        input += 8;
+        output += 8;
+      } else {
+        // Some byte is >= 0x80: transcode these 8 bytes one at a time
+        let end = input + 8;
+        while input < end {
+          let byte = *(inbuf.add(input));
+          if byte < 0x80 {
+            *(outbuf.add(output)) = byte;
+            output += 1;
+          } else {
+            // 0x80..=0xFF: lead byte 110000xx (top 2 bits), then 10xxxxxx
+            *(outbuf.add(output)) = (byte >> 6) | 0b1100_0000;
+            *(outbuf.add(output + 1)) = (byte & 0b0011_1111) | 0b1000_0000;
+            output += 2;
+          }
+          input += 1;
+        }
+      }
+    }
+
+    // Scalar tail: the final `input_length % 8` bytes, same encoding
+    while input < input_length {
+      let byte = *(inbuf.add(input));
+      if byte < 0x80 {
+        *(outbuf.add(output)) = byte;
+        output += 1;
+      } else {
+        *(outbuf.add(output)) = (byte >> 6) | 0b1100_0000;
+        *(outbuf.add(output + 1)) = (byte & 0b0011_1111) | 0b1000_0000;
+        output += 2;
+      }
+      input += 1;
+    }
+    output
+  }
+}
+
 unsafe extern "C" {
   fn v8__String__Empty(isolate: *mut RealIsolate) -> *const String;
 
@@ -878,6 +943,59 @@ impl String {
     }
   }
 
+  /// Writes the UTF-8 representation of this string into an existing
+  /// [`std::string::String`], reusing its allocation.
+  ///
+  /// The buffer is always cleared first, then filled with the string's
+  /// UTF-8 contents. This avoids repeated heap allocation when converting
+  /// many V8 strings — callers can keep a single `String` and reuse it.
+  pub fn write_utf8_into(
+    &self,
+    scope: &Isolate,
+    buf: &mut std::string::String,
+  ) {
+    buf.clear();
+    let len_utf16 = self.length();
+    if len_utf16 == 0 {
+      return;
+    }
+
+    let len_utf8 = self.utf8_length(scope);
+    buf.reserve(len_utf8);
+
+    // SAFETY: clear() leaves len == 0 and reserve(len_utf8) guarantees
+    // that many spare bytes, so both writes below stay in bounds before
+    // set_len(). kReplaceInvalidUtf8 guarantees valid UTF-8.
+    unsafe {
+      let vec = buf.as_mut_vec();
+      if self.is_onebyte() && len_utf8 == len_utf16 {
+        // All ASCII: the UTF-8 length equals the code-unit count
+        self.write_one_byte_uninit_v2(
+          scope,
+          0,
+          slice::from_raw_parts_mut(
+            vec.as_mut_ptr() as *mut MaybeUninit<u8>,
+            len_utf16,
+          ),
+          WriteFlags::kReplaceInvalidUtf8,
+        );
+        vec.set_len(len_utf16);
+      } else {
+        let written = self.write_utf8_uninit_v2(
+          scope,
+          slice::from_raw_parts_mut(
+            vec.as_mut_ptr() as *mut MaybeUninit<u8>,
+            len_utf8,
+          ),
+          WriteFlags::kReplaceInvalidUtf8,
+          None,
+        );
+        debug_assert!(written == len_utf8);
+        vec.set_len(written);
+      }
+    }
+  }
+
   /// Converts a [`crate::String`] to either an owned [`std::string::String`], or a borrowed [`str`], depending on whether it fits into the
   /// provided buffer.
   pub fn to_rust_cow_lossy<'a, const N: usize>(
@@ -1037,6 +1155,66 @@ impl<'s> ValueView<'s> {
       }
     }
   }
+
+  /// Returns a zero-copy `&str` if the string is one-byte and pure ASCII.
+  ///
+  /// This is the fastest way to access a V8 string's contents as a Rust
+  /// `&str` — no allocation, no copy, no transcoding. Returns `None` for
+  /// strings that contain non-ASCII Latin-1 bytes or are two-byte encoded.
+  ///
+  /// The returned reference borrows from this `ValueView` (via `&self`).
+  #[inline(always)]
+  pub fn as_str(&self) -> Option<&str> {
+    match self.data() {
+      ValueViewData::OneByte(bytes) => {
+        if bytes.is_ascii() {
+          // SAFETY: is_ascii() held, and all-ASCII bytes are valid UTF-8.
+          Some(unsafe { std::str::from_utf8_unchecked(bytes) })
+        } else {
+          None
+        }
+      }
+      ValueViewData::TwoByte(_) => None,
+    }
+  }
+
+  /// Returns the string contents as a `Cow<str>`.
+  ///
+  /// - **One-byte ASCII**: returns `Cow::Borrowed(&str)` — true zero-copy.
+  /// - **One-byte Latin-1** (non-ASCII): transcodes to UTF-8, returns
+  ///   `Cow::Owned`.
+  /// - **Two-byte** (UTF-16): transcodes to UTF-8 via
+  ///   [`std::string::String::from_utf16_lossy`], returns `Cow::Owned`.
+  ///
+  /// For the common case of ASCII strings this is zero-copy. Latin-1
+  /// input is transcoded by [`latin1_to_utf8`], which checks 8 bytes
+  /// at a time for ASCII.
+  #[inline(always)]
+  pub fn to_cow_lossy(&self) -> Cow<'_, str> {
+    match self.data() {
+      ValueViewData::OneByte(bytes) => {
+        if bytes.is_ascii() {
+          // SAFETY: is_ascii() held, and all-ASCII bytes are valid UTF-8.
+          Cow::Borrowed(unsafe { std::str::from_utf8_unchecked(bytes) })
+        } else {
+          // Latin-1 → UTF-8 transcoding. Each byte expands to at most
+          // 2 UTF-8 bytes, so 2x the input length always suffices.
+          let mut buf = Vec::with_capacity(bytes.len() * 2);
+          // SAFETY: `bytes` is a live slice, buf has capacity >=
+          // bytes.len() * 2, and latin1_to_utf8 writes valid UTF-8.
+          unsafe {
+            let written =
+              latin1_to_utf8(bytes.len(), bytes.as_ptr(), buf.as_mut_ptr());
+            buf.set_len(written);
+            Cow::Owned(std::string::String::from_utf8_unchecked(buf))
+          }
+        }
+      }
+      ValueViewData::TwoByte(units) => {
+        Cow::Owned(std::string::String::from_utf16_lossy(units))
+      }
+    }
+  }
 }
 
 impl Drop for ValueView<'_> {
diff --git a/tests/test_api.rs b/tests/test_api.rs
@@ -12270,6 +12270,178 @@ fn string_valueview() {
   }
 }
 
+#[test]
+fn string_valueview_as_str() {
+  let _setup_guard = setup::parallel_test();
+  let mut isolate = v8::Isolate::new(Default::default());
+  let scope = pin!(v8::HandleScope::new(&mut isolate));
+  let mut scope = scope.init();
+  let context = v8::Context::new(&scope, Default::default());
+  let scope = &mut v8::ContextScope::new(&mut scope, context);
+
+  // Pure ASCII string: as_str returns Some with the same text
+  {
+    let s = v8::String::new(scope, "hello world").unwrap();
+    let view = v8::ValueView::new(scope, s);
+    assert_eq!(view.as_str(), Some("hello world"));
+  }
+
+  // Empty string: as_str returns Some(""), not None
+  {
+    let s = v8::String::empty(scope);
+    let view = v8::ValueView::new(scope, s);
+    assert_eq!(view.as_str(), Some(""));
+  }
+
+  // One-byte string with bytes >= 0x80 (À é ñ): as_str returns None
+  {
+    let s = v8::String::new_from_one_byte(
+      scope,
+      &[0xC0, 0xE9, 0xF1],
+      v8::NewStringType::Normal,
+    )
+    .unwrap();
+    let view = v8::ValueView::new(scope, s);
+    assert_eq!(view.as_str(), None);
+  }
+
+  // Two-byte (UTF-16) string: as_str returns None
+  {
+    let s = v8::String::new_from_two_byte(
+      scope,
+      &[0x4F60, 0x597D],
+      v8::NewStringType::Normal,
+    )
+    .unwrap();
+    let view = v8::ValueView::new(scope, s);
+    assert_eq!(view.as_str(), None);
+  }
+}
+
+#[test]
+fn string_valueview_to_cow_lossy() {
+  let _setup_guard = setup::parallel_test();
+  let mut isolate = v8::Isolate::new(Default::default());
+  let scope = pin!(v8::HandleScope::new(&mut isolate));
+  let mut scope = scope.init();
+  let context = v8::Context::new(&scope, Default::default());
+  let scope = &mut v8::ContextScope::new(&mut scope, context);
+
+  // ASCII: must come back as Cow::Borrowed (no allocation)
+  {
+    let s = v8::String::new(scope, "hello").unwrap();
+    let view = v8::ValueView::new(scope, s);
+    let cow = view.to_cow_lossy();
+    assert!(matches!(cow, std::borrow::Cow::Borrowed(_)));
+    assert_eq!(&*cow, "hello");
+  }
+
+  // Latin-1 À é (0xC0, 0xE9): transcoded to UTF-8, so Cow::Owned
+  {
+    let s = v8::String::new_from_one_byte(
+      scope,
+      &[0xC0, 0xE9],
+      v8::NewStringType::Normal,
+    )
+    .unwrap();
+    let view = v8::ValueView::new(scope, s);
+    let cow = view.to_cow_lossy();
+    assert!(matches!(cow, std::borrow::Cow::Owned(_)));
+    assert_eq!(&*cow, "\u{00C0}\u{00E9}");
+  }
+
+  // Two-byte (U+4F60 U+597D): converted from UTF-16, so Cow::Owned
+  {
+    let s = v8::String::new_from_two_byte(
+      scope,
+      &[0x4F60, 0x597D],
+      v8::NewStringType::Normal,
+    )
+    .unwrap();
+    let view = v8::ValueView::new(scope, s);
+    let cow = view.to_cow_lossy();
+    assert!(matches!(cow, std::borrow::Cow::Owned(_)));
+    assert_eq!(&*cow, "你好");
+  }
+}
+
+#[test]
+fn string_write_utf8_into() {
+  let _setup_guard = setup::parallel_test();
+  let mut isolate = v8::Isolate::new(Default::default());
+  let scope = pin!(v8::HandleScope::new(&mut isolate));
+  let mut scope = scope.init();
+  let context = v8::Context::new(&scope, Default::default());
+  let scope = &mut v8::ContextScope::new(&mut scope, context);
+
+  let mut buf = String::new();
+
+  // ASCII string written into an initially empty buffer
+  {
+    let s = v8::String::new(scope, "hello world").unwrap();
+    s.write_utf8_into(scope, &mut buf);
+    assert_eq!(buf, "hello world");
+  }
+
+  // Shorter string fits the existing capacity: pointer must not change
+  {
+    let ptr_before = buf.as_ptr();
+    let s = v8::String::new(scope, "hi").unwrap();
+    s.write_utf8_into(scope, &mut buf);
+    assert_eq!(buf, "hi");
+    assert_eq!(buf.as_ptr(), ptr_before);
+  }
+
+  // Empty string: previous contents are cleared
+  {
+    let s = v8::String::empty(scope);
+    s.write_utf8_into(scope, &mut buf);
+    assert_eq!(buf, "");
+  }
+
+  // Non-ASCII string: é and ☕ need multi-byte UTF-8
+  {
+    let s = v8::String::new(scope, "café ☕").unwrap();
+    s.write_utf8_into(scope, &mut buf);
+    assert_eq!(buf, "café ☕");
+  }
+}
+
+#[test]
+fn latin1_to_utf8() {
+  // Pure ASCII: output is byte-identical to the input
+  let input = b"hello world";
+  let mut output = vec![0u8; input.len() * 2];
+  let written = unsafe {
+    v8::latin1_to_utf8(input.len(), input.as_ptr(), output.as_mut_ptr())
+  };
+  assert_eq!(&output[..written], b"hello world");
+
+  // Latin-1 with non-ASCII: À = 0xC0, é = 0xE9
+  let input = &[0xC0u8, 0xE9];
+  let mut output = vec![0u8; input.len() * 2];
+  let written = unsafe {
+    v8::latin1_to_utf8(input.len(), input.as_ptr(), output.as_mut_ptr())
+  };
+  let s = std::str::from_utf8(&output[..written]).unwrap();
+  assert_eq!(s, "\u{00C0}\u{00E9}");
+
+  // 8 ASCII bytes (bulk 8-byte path) followed by 2 Latin-1 bytes (tail)
+  let input = b"ABCDEFGH\xC0\xE9";
+  let mut output = vec![0u8; input.len() * 2];
+  let written = unsafe {
+    v8::latin1_to_utf8(input.len(), input.as_ptr(), output.as_mut_ptr())
+  };
+  let s = std::str::from_utf8(&output[..written]).unwrap();
+  assert_eq!(s, "ABCDEFGH\u{00C0}\u{00E9}");
+
+  // Zero-length input: nothing is written
+  let mut output = vec![0u8; 4];
+  let written =
+    unsafe { v8::latin1_to_utf8(0, [].as_ptr(), output.as_mut_ptr()) };
+  assert_eq!(written, 0);
+}
+
 #[test]
 fn host_defined_options() {
   let _setup_guard = setup::parallel_test();