Skip to content

Commit eaa92b0

Browse files
bartlomiejuclaude
andcommitted
feat: add zero-copy string access via ValueView and allocation-reuse APIs
Add ValueView::as_str() for true zero-copy &str access to ASCII strings, ValueView::to_cow_lossy() for zero-copy-when-possible string conversion, String::write_utf8_into() for allocation reuse, and a public latin1_to_utf8 SIMD-friendly transcoder utility. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 9eaafb5 commit eaa92b0

File tree

3 files changed

+351
-0
lines changed

3 files changed

+351
-0
lines changed

src/lib.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -175,6 +175,7 @@ pub use string::ValueView;
175175
pub use string::ValueViewData;
176176
pub use string::WriteFlags;
177177
pub use string::WriteOptions;
178+
pub use string::latin1_to_utf8;
178179
pub use support::SharedPtr;
179180
pub use support::SharedRef;
180181
pub use support::UniquePtr;

src/string.rs

Lines changed: 178 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,71 @@ use std::mem::MaybeUninit;
1717
use std::ptr::NonNull;
1818
use std::slice;
1919

20+
/// Converts Latin-1 encoded bytes to UTF-8, writing into the output buffer.
///
/// Bytes below `0x80` are copied unchanged; every other byte is expanded to
/// the two-byte UTF-8 sequence of the code point with the same value. The
/// output buffer must therefore have at least `2 * input_length` bytes of
/// capacity.
///
/// Returns the number of bytes written to the output buffer.
///
/// # Safety
///
/// - `inbuf` must point to at least `input_length` readable bytes.
/// - `outbuf` must point to at least `2 * input_length` writable bytes.
/// - The input and output regions must not overlap: the write position can
///   run ahead of the read position, so overlapping buffers would clobber
///   bytes that have not been read yet.
#[inline(always)]
pub unsafe fn latin1_to_utf8(
  input_length: usize,
  inbuf: *const u8,
  outbuf: *mut u8,
) -> usize {
  /// Transcodes a single Latin-1 byte to `dst` and returns how many bytes
  /// were written (1 for ASCII, 2 otherwise).
  ///
  /// # Safety
  ///
  /// `dst` must be writable for 1 byte when `byte < 0x80`, and for 2 bytes
  /// otherwise.
  #[inline(always)]
  unsafe fn transcode_byte(byte: u8, dst: *mut u8) -> usize {
    // SAFETY: the caller guarantees `dst` is writable for as many bytes as
    // the chosen branch stores.
    unsafe {
      if byte < 0x80 {
        *dst = byte;
        1
      } else {
        // Two-byte UTF-8 sequence: 110000xx 10xxxxxx.
        *dst = (byte >> 6) | 0b1100_0000;
        *dst.add(1) = (byte & 0b0011_1111) | 0b1000_0000;
        2
      }
    }
  }

  // SAFETY: the caller guarantees `inbuf` is readable for `input_length`
  // bytes and `outbuf` is writable for `2 * input_length` bytes. `input`
  // never exceeds `input_length`, and `output` never exceeds `2 * input`,
  // so every access below stays inside those regions.
  unsafe {
    let mut output = 0;
    let mut input = 0;

    // Process 8 bytes at a time. `input <= input_length` always holds, so
    // the subtraction cannot underflow (and, unlike `input + 8`, the
    // comparison cannot overflow).
    while input_length - input >= 8 {
      let chunk = (inbuf.add(input) as *const u64).read_unaligned();
      // A byte is non-ASCII exactly when its high bit is set, so a single
      // mask tests all 8 bytes at once.
      if chunk & 0x8080_8080_8080_8080 == 0 {
        // All 8 bytes are ASCII, copy them in bulk.
        (outbuf.add(output) as *mut u64).write_unaligned(chunk);
        input += 8;
        output += 8;
      } else {
        // At least one non-ASCII byte: transcode this chunk byte by byte.
        for _ in 0..8 {
          output += transcode_byte(*inbuf.add(input), outbuf.add(output));
          input += 1;
        }
      }
    }

    // Handle the remaining (fewer than 8) bytes.
    while input < input_length {
      output += transcode_byte(*inbuf.add(input), outbuf.add(output));
      input += 1;
    }

    output
  }
}
84+
2085
unsafe extern "C" {
2186
fn v8__String__Empty(isolate: *mut RealIsolate) -> *const String;
2287

@@ -878,6 +943,59 @@ impl String {
878943
}
879944
}
880945

946+
/// Writes the UTF-8 representation of this string into an existing
947+
/// [`std::string::String`], reusing its allocation.
948+
///
949+
/// The buffer is cleared first, then filled with the string's UTF-8
950+
/// contents. This avoids repeated heap allocation when converting
951+
/// many V8 strings — callers can keep a single `String` and reuse it.
952+
pub fn write_utf8_into(
953+
&self,
954+
scope: &Isolate,
955+
buf: &mut std::string::String,
956+
) {
957+
buf.clear();
958+
let len_utf16 = self.length();
959+
if len_utf16 == 0 {
960+
return;
961+
}
962+
963+
let len_utf8 = self.utf8_length(scope);
964+
buf.reserve(len_utf8);
965+
966+
// SAFETY: We write valid UTF-8 data into the spare capacity, then
967+
// set the length. After clear(), len == 0 so spare_capacity covers
968+
// the full allocation. kReplaceInvalidUtf8 guarantees valid UTF-8.
969+
unsafe {
970+
let vec = buf.as_mut_vec();
971+
if self.is_onebyte() && len_utf8 == len_utf16 {
972+
// ASCII fast path
973+
self.write_one_byte_uninit_v2(
974+
scope,
975+
0,
976+
slice::from_raw_parts_mut(
977+
vec.as_mut_ptr() as *mut MaybeUninit<u8>,
978+
len_utf16,
979+
),
980+
WriteFlags::kReplaceInvalidUtf8,
981+
);
982+
vec.set_len(len_utf16);
983+
} else {
984+
let written = self.write_utf8_uninit_v2(
985+
scope,
986+
slice::from_raw_parts_mut(
987+
vec.as_mut_ptr() as *mut MaybeUninit<u8>,
988+
len_utf8,
989+
),
990+
WriteFlags::kReplaceInvalidUtf8,
991+
None,
992+
);
993+
debug_assert!(written == len_utf8);
994+
vec.set_len(written);
995+
}
996+
}
997+
}
998+
881999
/// Converts a [`crate::String`] to either an owned [`std::string::String`], or a borrowed [`str`], depending on whether it fits into the
8821000
/// provided buffer.
8831001
pub fn to_rust_cow_lossy<'a, const N: usize>(
@@ -1037,6 +1155,66 @@ impl<'s> ValueView<'s> {
10371155
}
10381156
}
10391157
}
1158+
1159+
/// Returns a zero-copy `&str` if the string is one-byte and pure ASCII.
1160+
///
1161+
/// This is the fastest way to access a V8 string's contents as a Rust
1162+
/// `&str` — no allocation, no copy, no transcoding. Returns `None` for
1163+
/// strings that contain non-ASCII Latin-1 bytes or are two-byte encoded.
1164+
///
1165+
/// The returned reference is valid as long as this `ValueView` is alive.
1166+
#[inline(always)]
1167+
pub fn as_str(&self) -> Option<&str> {
1168+
match self.data() {
1169+
ValueViewData::OneByte(bytes) => {
1170+
if bytes.is_ascii() {
1171+
// SAFETY: ASCII bytes are valid UTF-8.
1172+
Some(unsafe { std::str::from_utf8_unchecked(bytes) })
1173+
} else {
1174+
None
1175+
}
1176+
}
1177+
ValueViewData::TwoByte(_) => None,
1178+
}
1179+
}
1180+
1181+
/// Returns the string contents as a `Cow<str>`.
1182+
///
1183+
/// - **One-byte ASCII**: returns `Cow::Borrowed(&str)` — true zero-copy.
1184+
/// - **One-byte Latin-1** (non-ASCII): transcodes to UTF-8, returns
1185+
/// `Cow::Owned`.
1186+
/// - **Two-byte** (UTF-16): transcodes to UTF-8 via
1187+
/// [`std::string::String::from_utf16_lossy`], returns `Cow::Owned`.
1188+
///
1189+
/// For the common case of ASCII strings this is zero-copy. The
1190+
/// Latin-1 transcoding uses a SIMD-friendly loop that processes 8 bytes
1191+
/// at a time.
1192+
#[inline(always)]
1193+
pub fn to_cow_lossy(&self) -> Cow<'_, str> {
1194+
match self.data() {
1195+
ValueViewData::OneByte(bytes) => {
1196+
if bytes.is_ascii() {
1197+
// SAFETY: ASCII bytes are valid UTF-8.
1198+
Cow::Borrowed(unsafe { std::str::from_utf8_unchecked(bytes) })
1199+
} else {
1200+
// Latin-1 → UTF-8 transcoding. Each byte can expand to at
1201+
// most 2 UTF-8 bytes.
1202+
let mut buf = Vec::with_capacity(bytes.len() * 2);
1203+
// SAFETY: buf has capacity >= bytes.len() * 2, and
1204+
// latin1_to_utf8 writes valid UTF-8.
1205+
unsafe {
1206+
let written =
1207+
latin1_to_utf8(bytes.len(), bytes.as_ptr(), buf.as_mut_ptr());
1208+
buf.set_len(written);
1209+
Cow::Owned(std::string::String::from_utf8_unchecked(buf))
1210+
}
1211+
}
1212+
}
1213+
ValueViewData::TwoByte(units) => {
1214+
Cow::Owned(std::string::String::from_utf16_lossy(units))
1215+
}
1216+
}
1217+
}
10401218
}
10411219

10421220
impl Drop for ValueView<'_> {

tests/test_api.rs

Lines changed: 172 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12270,6 +12270,178 @@ fn string_valueview() {
1227012270
}
1227112271
}
1227212272

12273+
#[test]
fn string_valueview_as_str() {
  let _setup_guard = setup::parallel_test();
  let mut isolate = v8::Isolate::new(Default::default());
  let scope = pin!(v8::HandleScope::new(&mut isolate));
  let mut scope = scope.init();
  let context = v8::Context::new(&scope, Default::default());
  let scope = &mut v8::ContextScope::new(&mut scope, context);

  // Each case lives in its own block so its view is dropped before the
  // next string is created.

  // Pure ASCII content is exposed as a borrowed `&str`.
  {
    let ascii = v8::String::new(scope, "hello world").unwrap();
    let ascii_view = v8::ValueView::new(scope, ascii);
    assert_eq!(ascii_view.as_str(), Some("hello world"));
  }

  // The empty string counts as ASCII and yields Some("").
  {
    let empty = v8::String::empty(scope);
    let empty_view = v8::ValueView::new(scope, empty);
    assert_eq!(empty_view.as_str(), Some(""));
  }

  // One-byte string with non-ASCII Latin-1 bytes: nothing to borrow.
  {
    let latin1 = v8::String::new_from_one_byte(
      scope,
      &[0xC0, 0xE9, 0xF1],
      v8::NewStringType::Normal,
    )
    .unwrap();
    let latin1_view = v8::ValueView::new(scope, latin1);
    assert!(latin1_view.as_str().is_none());
  }

  // Two-byte string: nothing to borrow either.
  {
    let two_byte = v8::String::new_from_two_byte(
      scope,
      &[0x4F60, 0x597D],
      v8::NewStringType::Normal,
    )
    .unwrap();
    let two_byte_view = v8::ValueView::new(scope, two_byte);
    assert!(two_byte_view.as_str().is_none());
  }
}
12320+
12321+
#[test]
fn string_valueview_to_cow_lossy() {
  let _setup_guard = setup::parallel_test();
  let mut isolate = v8::Isolate::new(Default::default());
  let scope = pin!(v8::HandleScope::new(&mut isolate));
  let mut scope = scope.init();
  let context = v8::Context::new(&scope, Default::default());
  let scope = &mut v8::ContextScope::new(&mut scope, context);

  // Each case lives in its own block so its view is dropped before the
  // next string is created.

  // ASCII input must come back borrowed, i.e. without a copy.
  {
    let ascii = v8::String::new(scope, "hello").unwrap();
    let ascii_view = v8::ValueView::new(scope, ascii);
    let text = ascii_view.to_cow_lossy();
    assert_eq!(text, "hello");
    assert!(matches!(text, std::borrow::Cow::Borrowed(_)));
  }

  // Non-ASCII Latin-1 input is transcoded into an owned UTF-8 string.
  {
    let latin1 = v8::String::new_from_one_byte(
      scope,
      &[0xC0, 0xE9],
      v8::NewStringType::Normal,
    )
    .unwrap();
    let latin1_view = v8::ValueView::new(scope, latin1);
    let text = latin1_view.to_cow_lossy();
    assert_eq!(text, "\u{00C0}\u{00E9}");
    assert!(matches!(text, std::borrow::Cow::Owned(_)));
  }

  // Two-byte input is converted into an owned UTF-8 string.
  {
    let two_byte = v8::String::new_from_two_byte(
      scope,
      &[0x4F60, 0x597D],
      v8::NewStringType::Normal,
    )
    .unwrap();
    let two_byte_view = v8::ValueView::new(scope, two_byte);
    let text = two_byte_view.to_cow_lossy();
    assert_eq!(text, "你好");
    assert!(matches!(text, std::borrow::Cow::Owned(_)));
  }
}
12367+
12368+
#[test]
fn string_write_utf8_into() {
  let _setup_guard = setup::parallel_test();
  let mut isolate = v8::Isolate::new(Default::default());
  let scope = pin!(v8::HandleScope::new(&mut isolate));
  let mut scope = scope.init();
  let context = v8::Context::new(&scope, Default::default());
  let scope = &mut v8::ContextScope::new(&mut scope, context);

  // One destination buffer shared by every case below.
  let mut reused = String::new();

  // Plain ASCII content.
  {
    let ascii = v8::String::new(scope, "hello world").unwrap();
    ascii.write_utf8_into(scope, &mut reused);
    assert_eq!(reused, "hello world");
  }

  // A shorter string fits in the existing allocation, so the backing
  // pointer must not change.
  {
    let original_ptr = reused.as_ptr();
    let shorter = v8::String::new(scope, "hi").unwrap();
    shorter.write_utf8_into(scope, &mut reused);
    assert_eq!(reused, "hi");
    assert_eq!(reused.as_ptr(), original_ptr);
  }

  // The empty string leaves the buffer cleared.
  {
    let empty = v8::String::empty(scope);
    empty.write_utf8_into(scope, &mut reused);
    assert_eq!(reused, "");
  }

  // Non-ASCII content goes through the general UTF-8 path.
  {
    let unicode = v8::String::new(scope, "café ☕").unwrap();
    unicode.write_utf8_into(scope, &mut reused);
    assert_eq!(reused, "café ☕");
  }
}
12409+
12410+
#[test]
fn latin1_to_utf8() {
  // Runs the transcoder over `input` with a worst-case sized output buffer
  // and returns exactly the bytes it reported as written.
  fn transcode(input: &[u8]) -> Vec<u8> {
    let mut output = vec![0u8; input.len() * 2];
    // SAFETY: `input` is readable for `input.len()` bytes, `output` is a
    // separate allocation writable for `2 * input.len()` bytes.
    let written = unsafe {
      v8::latin1_to_utf8(input.len(), input.as_ptr(), output.as_mut_ptr())
    };
    output.truncate(written);
    output
  }

  // Pure ASCII (one 8-byte bulk copy plus a 3-byte tail)
  assert_eq!(transcode(b"hello world"), b"hello world");

  // Latin-1 with non-ASCII: À = 0xC0, é = 0xE9
  let utf8 = transcode(&[0xC0u8, 0xE9]);
  assert_eq!(std::str::from_utf8(&utf8).unwrap(), "\u{00C0}\u{00E9}");

  // All-ASCII 8-byte chunk (bulk copy path) followed by a non-ASCII tail
  let utf8 = transcode(b"ABCDEFGH\xC0\xE9");
  assert_eq!(
    std::str::from_utf8(&utf8).unwrap(),
    "ABCDEFGH\u{00C0}\u{00E9}"
  );

  // Non-ASCII byte inside a full 8-byte chunk: forces the per-byte
  // fallback within the chunked loop, then a mixed tail
  let utf8 = transcode(b"ABC\xC0DEFG\xE9H");
  assert_eq!(
    std::str::from_utf8(&utf8).unwrap(),
    "ABC\u{00C0}DEFG\u{00E9}H"
  );

  // Empty
  assert!(transcode(&[]).is_empty());
}
12444+
1227312445
#[test]
1227412446
fn host_defined_options() {
1227512447
let _setup_guard = setup::parallel_test();

0 commit comments

Comments
 (0)