Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 43 additions & 1 deletion core/string/src/iter.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
use std::iter::FusedIterator;

use crate::JsStr;
use crate::{CodePoint, JsStr};

use super::JsStrVariant;

Expand Down Expand Up @@ -97,3 +97,45 @@ impl ExactSizeIterator for Windows<'_> {
}
}
}

/// Encoding-specific state backing [`CodePointsIter`].
///
/// One variant per string encoding, so the iterator can walk the underlying
/// slice directly instead of going through a common representation.
#[derive(Debug, Clone)]
enum CodePointsIterInner<'a> {
/// Iterates the bytes of a Latin1 slice by value.
Latin1(std::iter::Copied<std::slice::Iter<'a, u8>>),
/// Decodes a slice of `u16` code units as UTF-16.
Utf16(std::char::DecodeUtf16<std::iter::Copied<std::slice::Iter<'a, u16>>>),
}

/// Iterator over the code points of a [`JsStr`], yielding [`CodePoint`] items.
///
/// The concrete iteration strategy depends on the encoding of the source
/// string and is chosen once, when the iterator is constructed.
#[derive(Debug, Clone)]
pub struct CodePointsIter<'a> {
// Encoding-specific iterator state (Latin1 bytes or UTF-16 code units).
inner: CodePointsIterInner<'a>,
}

impl<'a> CodePointsIter<'a> {
    /// Creates a code point iterator over `s`.
    ///
    /// The encoding of `s` is inspected once here: a Latin1 string is walked
    /// byte by byte, while a UTF-16 string is run through
    /// [`char::decode_utf16`].
    pub(crate) fn new(s: JsStr<'a>) -> Self {
        Self {
            inner: match s.variant() {
                JsStrVariant::Latin1(bytes) => CodePointsIterInner::Latin1(bytes.iter().copied()),
                JsStrVariant::Utf16(units) => {
                    let units = units.iter().copied();
                    CodePointsIterInner::Utf16(char::decode_utf16(units))
                }
            },
        }
    }
}

impl Iterator for CodePointsIter<'_> {
    type Item = CodePoint;

    /// Yields the next code point, or `None` once the string is exhausted.
    ///
    /// Latin1 bytes are widened directly to `char`. For UTF-16 input, a code
    /// unit that cannot be decoded is reported as
    /// [`CodePoint::UnpairedSurrogate`] rather than as an error.
    #[inline]
    fn next(&mut self) -> Option<Self::Item> {
        match &mut self.inner {
            CodePointsIterInner::Latin1(iter) => {
                // `char::from(u8)` maps the byte to the `char` with the same value.
                iter.next().map(|b| CodePoint::Unicode(char::from(b)))
            }
            CodePointsIterInner::Utf16(iter) => iter.next().map(|res| match res {
                Ok(c) => CodePoint::Unicode(c),
                Err(e) => CodePoint::UnpairedSurrogate(e.unpaired_surrogate()),
            }),
        }
    }

    /// Forwards the bounds of the underlying iterator.
    ///
    /// Without this override the default `(0, None)` would be reported, which
    /// prevents consumers such as `collect` from reserving capacity up front.
    /// For Latin1 the bounds are exact, since every byte is one code point.
    #[inline]
    fn size_hint(&self) -> (usize, Option<usize>) {
        match &self.inner {
            CodePointsIterInner::Latin1(iter) => iter.size_hint(),
            CodePointsIterInner::Utf16(iter) => iter.size_hint(),
        }
    }
}

// Marker impl: declares that once `next` has returned `None`, every later
// call keeps returning `None`.
impl FusedIterator for CodePointsIter<'_> {}
11 changes: 4 additions & 7 deletions core/string/src/str.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ use std::{
slice::SliceIndex,
};

use super::iter::Windows;
use super::iter::{CodePointsIter, Windows};

// Modified port of <https://doc.rust-lang.org/std/primitive.slice.html#method.trim_ascii_start>
#[inline]
Expand Down Expand Up @@ -450,13 +450,10 @@ impl<'a> JsStr<'a> {
}

/// Gets an iterator of all the Unicode codepoints of a [`JsStr`].
// TODO: optimize for Latin1 strings.
#[inline]
pub fn code_points(&self) -> impl Iterator<Item = CodePoint> + Clone + use<'a> {
char::decode_utf16(self.iter()).map(|res| match res {
Ok(c) => CodePoint::Unicode(c),
Err(e) => CodePoint::UnpairedSurrogate(e.unpaired_surrogate()),
})
#[must_use]
pub fn code_points(&self) -> CodePointsIter<'a> {
CodePointsIter::new(*self)
}

/// Checks if the [`JsStr`] contains a byte.
Expand Down
20 changes: 19 additions & 1 deletion core/string/src/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
use std::hash::{BuildHasher, BuildHasherDefault, Hash};

use crate::{
CommonJsStringBuilder, JsStr, JsString, Latin1JsStringBuilder, StaticJsStrings,
CodePoint, CommonJsStringBuilder, JsStr, JsString, Latin1JsStringBuilder, StaticJsStrings,
Utf16JsStringBuilder,
};

Expand Down Expand Up @@ -469,3 +469,21 @@ fn common_js_string_builder() {
"Déjà vu2024年5月21日🎹"
);
}

/// Checks that the Latin1 path of `code_points` yields the expected code
/// points, and that a UTF-16 string with the same content yields the same
/// sequence.
#[test]
fn code_points_optimization() {
    // Latin1 path, including bytes above 0x7F ("Café naïve" in Latin1 encoding).
    let narrow = JsStr::latin1(b"Caf\xe9 na\xefve");
    let from_latin1: Vec<CodePoint> = narrow.code_points().collect();
    let wanted: Vec<CodePoint> = "Café naïve".chars().map(CodePoint::Unicode).collect();
    assert_eq!(from_latin1, wanted);

    // UTF-16 path: the same text spelled out as code units.
    let units: [u16; 10] = [
        0x0043, 0x0061, 0x0066, 0x00E9, // "Café"
        0x0020, // space
        0x006E, 0x0061, 0x00EF, 0x0076, 0x0065, // "naïve"
    ];
    let wide = JsStr::utf16(&units);
    let from_utf16: Vec<CodePoint> = wide.code_points().collect();

    // Both encodings of the same content must decode identically.
    assert_eq!(from_latin1, from_utf16);
}
1 change: 1 addition & 0 deletions typos.toml
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,4 @@ forin = "forin"
mis = "mis"
pn = "pn"
tru = "tru"
Caf = "Caf"
Loading