Skip to content

Commit fa63f51

Browse files
bartlomieju and claude committed
refactor: rewrite to_rust_cow_lossy to use ValueView internally
Eliminates the utf8_length pre-scan by using ValueView for direct access to string contents. For one-byte strings this reduces from 2 FFI calls + 2 passes to 1 FFI call + 1 pass. Latin-1 transcoding uses latin1_to_utf8, and two-byte strings are transcoded directly into the stack buffer without an intermediate allocation.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent eaa92b0 commit fa63f51

File tree

1 file changed

+110
-88
lines changed

1 file changed

+110
-88
lines changed

src/string.rs

Lines changed: 110 additions & 88 deletions
Original file line number | Diff line number | Diff line change
@@ -996,106 +996,111 @@ impl String {
996996
}
997997
}
998998

999-
/// Converts a [`crate::String`] to either an owned [`std::string::String`], or a borrowed [`str`], depending on whether it fits into the
1000-
/// provided buffer.
999+
/// Converts a [`crate::String`] to either an owned [`std::string::String`],
1000+
/// or a borrowed [`str`], depending on whether it fits into the provided
1001+
/// buffer.
1002+
///
1003+
/// Uses [`ValueView`] internally for direct access to the string's
1004+
/// contents, eliminating the `utf8_length` pre-scan that the previous
1005+
/// implementation required.
10011006
pub fn to_rust_cow_lossy<'a, const N: usize>(
10021007
&self,
10031008
scope: &mut Isolate,
10041009
buffer: &'a mut [MaybeUninit<u8>; N],
10051010
) -> Cow<'a, str> {
1006-
let len_utf16 = self.length();
1007-
1008-
// No need to allocate or do any work for zero-length strings
1009-
if len_utf16 == 0 {
1011+
let len = self.length();
1012+
if len == 0 {
10101013
return "".into();
10111014
}
10121015

1013-
// TODO(mmastrac): Ideally we should be able to access the string's internal representation
1014-
let len_utf8 = self.utf8_length(scope);
1016+
// SAFETY: `self` is a valid V8 string reachable from a handle scope.
1017+
// The ValueView is dropped before we return, so the
1018+
// DisallowGarbageCollection scope it holds is properly scoped.
1019+
let view = unsafe { ValueView::new_from_ref(scope, self) };
10151020

1016-
// If len_utf8 == len_utf16 and the string is one-byte, we can take the fast memcpy path. This is true iff the
1017-
// string is 100% 7-bit ASCII.
1018-
if self.is_onebyte() && len_utf8 == len_utf16 {
1019-
if len_utf16 <= N {
1020-
self.write_one_byte_uninit_v2(scope, 0, buffer, WriteFlags::empty());
1021-
unsafe {
1022-
// Get a slice of &[u8] of what we know is initialized now
1023-
let buffer = &mut buffer[..len_utf16];
1024-
let buffer = &mut *(buffer as *mut [_] as *mut [u8]);
1025-
1026-
// We know it's valid UTF-8, so make a string
1027-
return Cow::Borrowed(std::str::from_utf8_unchecked(buffer));
1021+
match view.data() {
1022+
ValueViewData::OneByte(bytes) => {
1023+
if bytes.is_ascii() {
1024+
// ASCII: direct memcpy, no transcoding needed.
1025+
if bytes.len() <= N {
1026+
unsafe {
1027+
std::ptr::copy_nonoverlapping(
1028+
bytes.as_ptr(),
1029+
buffer.as_mut_ptr() as *mut u8,
1030+
bytes.len(),
1031+
);
1032+
let buf = &mut buffer[..bytes.len()];
1033+
let buf = &mut *(buf as *mut [_] as *mut [u8]);
1034+
Cow::Borrowed(std::str::from_utf8_unchecked(buf))
1035+
}
1036+
} else {
1037+
// SAFETY: ASCII bytes are valid UTF-8.
1038+
unsafe {
1039+
Cow::Owned(std::string::String::from_utf8_unchecked(
1040+
bytes.to_vec(),
1041+
))
1042+
}
1043+
}
1044+
} else {
1045+
// Latin-1 non-ASCII: each byte can expand to at most 2 UTF-8
1046+
// bytes. Use conservative size check.
1047+
let max_utf8_len = bytes.len() * 2;
1048+
if max_utf8_len <= N {
1049+
let written = unsafe {
1050+
latin1_to_utf8(
1051+
bytes.len(),
1052+
bytes.as_ptr(),
1053+
buffer.as_mut_ptr() as *mut u8,
1054+
)
1055+
};
1056+
unsafe {
1057+
let buf = &mut buffer[..written];
1058+
let buf = &mut *(buf as *mut [_] as *mut [u8]);
1059+
Cow::Borrowed(std::str::from_utf8_unchecked(buf))
1060+
}
1061+
} else {
1062+
let mut buf = Vec::with_capacity(max_utf8_len);
1063+
unsafe {
1064+
let written =
1065+
latin1_to_utf8(bytes.len(), bytes.as_ptr(), buf.as_mut_ptr());
1066+
buf.set_len(written);
1067+
Cow::Owned(std::string::String::from_utf8_unchecked(buf))
1068+
}
1069+
}
10281070
}
10291071
}
1030-
1031-
unsafe {
1032-
// Create an uninitialized buffer of `capacity` bytes. We need to be careful here to avoid
1033-
// accidentally creating a slice of u8 which would be invalid.
1034-
let layout = std::alloc::Layout::from_size_align(len_utf16, 1).unwrap();
1035-
let data = std::alloc::alloc(layout) as *mut MaybeUninit<u8>;
1036-
let buffer = std::ptr::slice_from_raw_parts_mut(data, len_utf16);
1037-
1038-
// Write to this MaybeUninit buffer, assuming we're going to fill this entire buffer
1039-
self.write_one_byte_uninit_v2(
1040-
scope,
1041-
0,
1042-
&mut *buffer,
1043-
WriteFlags::kReplaceInvalidUtf8,
1044-
);
1045-
1046-
// Return an owned string from this guaranteed now-initialized data
1047-
let buffer = data as *mut u8;
1048-
return Cow::Owned(std::string::String::from_raw_parts(
1049-
buffer, len_utf16, len_utf16,
1050-
));
1051-
}
1052-
}
1053-
1054-
if len_utf8 <= N {
1055-
// No malloc path
1056-
let length = self.write_utf8_uninit_v2(
1057-
scope,
1058-
buffer,
1059-
WriteFlags::kReplaceInvalidUtf8,
1060-
None,
1061-
);
1062-
debug_assert!(length == len_utf8);
1063-
1064-
// SAFETY: We know that we wrote `length` UTF-8 bytes. See `slice_assume_init_mut` for additional guarantee information.
1065-
unsafe {
1066-
// Get a slice of &[u8] of what we know is initialized now
1067-
let buffer = &mut buffer[..length];
1068-
let buffer = &mut *(buffer as *mut [_] as *mut [u8]);
1069-
1070-
// We know it's valid UTF-8, so make a string
1071-
return Cow::Borrowed(std::str::from_utf8_unchecked(buffer));
1072+
ValueViewData::TwoByte(units) => {
1073+
// Transcode UTF-16 directly into the stack buffer when possible.
1074+
let mut pos = 0;
1075+
let mut tmp = [0u8; 4];
1076+
let mut all_fit = true;
1077+
for result in std::char::decode_utf16(units.iter().copied()) {
1078+
let c = result.unwrap_or('\u{FFFD}');
1079+
let encoded = c.encode_utf8(&mut tmp);
1080+
if pos + encoded.len() > N {
1081+
all_fit = false;
1082+
break;
1083+
}
1084+
unsafe {
1085+
std::ptr::copy_nonoverlapping(
1086+
encoded.as_ptr(),
1087+
(buffer.as_mut_ptr() as *mut u8).add(pos),
1088+
encoded.len(),
1089+
);
1090+
}
1091+
pos += encoded.len();
1092+
}
1093+
if all_fit {
1094+
unsafe {
1095+
let buf = &mut buffer[..pos];
1096+
let buf = &mut *(buf as *mut [_] as *mut [u8]);
1097+
Cow::Borrowed(std::str::from_utf8_unchecked(buf))
1098+
}
1099+
} else {
1100+
Cow::Owned(std::string::String::from_utf16_lossy(units))
1101+
}
10721102
}
10731103
}
1074-
1075-
// SAFETY: This allocates a buffer manually using the default allocator using the string's capacity.
1076-
// We have a large number of invariants to uphold, so please check changes to this code carefully
1077-
unsafe {
1078-
// Create an uninitialized buffer of `capacity` bytes. We need to be careful here to avoid
1079-
// accidentally creating a slice of u8 which would be invalid.
1080-
let layout = std::alloc::Layout::from_size_align(len_utf8, 1).unwrap();
1081-
let data = std::alloc::alloc(layout) as *mut MaybeUninit<u8>;
1082-
let buffer = std::ptr::slice_from_raw_parts_mut(data, len_utf8);
1083-
1084-
// Write to this MaybeUninit buffer, assuming we're going to fill this entire buffer
1085-
let length = self.write_utf8_uninit_v2(
1086-
scope,
1087-
&mut *buffer,
1088-
WriteFlags::kReplaceInvalidUtf8,
1089-
None,
1090-
);
1091-
debug_assert!(length == len_utf8);
1092-
1093-
// Return an owned string from this guaranteed now-initialized data
1094-
let buffer = data as *mut u8;
1095-
Cow::Owned(std::string::String::from_raw_parts(
1096-
buffer, length, len_utf8,
1097-
))
1098-
}
10991104
}
11001105
}
11011106

@@ -1132,12 +1137,29 @@ pub struct ValueView<'s>(
11321137
impl<'s> ValueView<'s> {
11331138
#[inline(always)]
11341139
pub fn new(isolate: &mut Isolate, string: Local<'s, String>) -> Self {
1140+
// SAFETY: Local<'s, String> derefs to &String; delegate to new_from_ref.
1141+
unsafe { Self::new_from_ref(isolate, &*string) }
1142+
}
1143+
1144+
/// Constructs a `ValueView` from a raw string reference.
1145+
///
1146+
/// # Safety
1147+
///
1148+
/// The caller must ensure that `string` is a valid V8 string that
1149+
/// remains alive for at least `'s`. In practice this means the
1150+
/// string must be reachable from a handle scope that outlives the
1151+
/// returned `ValueView`.
1152+
#[inline(always)]
1153+
pub(crate) unsafe fn new_from_ref(
1154+
isolate: &mut Isolate,
1155+
string: &'s String,
1156+
) -> Self {
11351157
let mut v = std::mem::MaybeUninit::uninit();
11361158
unsafe {
11371159
v8__String__ValueView__CONSTRUCT(
11381160
v.as_mut_ptr(),
11391161
isolate.as_real_ptr(),
1140-
&*string,
1162+
string,
11411163
);
11421164
v.assume_init()
11431165
}

0 commit comments

Comments
 (0)