diff --git a/core/string/src/iter.rs b/core/string/src/iter.rs index ead0e8c94c7..e90c394d7b7 100644 --- a/core/string/src/iter.rs +++ b/core/string/src/iter.rs @@ -1,6 +1,6 @@ use std::iter::FusedIterator; -use crate::JsStr; +use crate::{CodePoint, JsStr}; use super::JsStrVariant; @@ -97,3 +97,45 @@ impl ExactSizeIterator for Windows<'_> { } } } + +#[derive(Debug, Clone)] +enum CodePointsIterInner<'a> { + Latin1(std::iter::Copied<std::slice::Iter<'a, u8>>), + Utf16(std::char::DecodeUtf16<std::iter::Copied<std::slice::Iter<'a, u16>>>), +} + +#[derive(Debug, Clone)] +pub struct CodePointsIter<'a> { + inner: CodePointsIterInner<'a>, +} + +impl<'a> CodePointsIter<'a> { + pub(crate) fn new(s: JsStr<'a>) -> Self { + let inner = match s.variant() { + JsStrVariant::Latin1(s) => CodePointsIterInner::Latin1(s.iter().copied()), + JsStrVariant::Utf16(s) => { + CodePointsIterInner::Utf16(char::decode_utf16(s.iter().copied())) + } + }; + CodePointsIter { inner } + } +} + +impl Iterator for CodePointsIter<'_> { + type Item = CodePoint; + + #[inline] + fn next(&mut self) -> Option<Self::Item> { + match &mut self.inner { + CodePointsIterInner::Latin1(iter) => { + iter.next().map(|b| CodePoint::Unicode(char::from(b))) + } + CodePointsIterInner::Utf16(iter) => iter.next().map(|res| match res { + Ok(c) => CodePoint::Unicode(c), + Err(e) => CodePoint::UnpairedSurrogate(e.unpaired_surrogate()), + }), + } + } +} + +impl FusedIterator for CodePointsIter<'_> {} diff --git a/core/string/src/str.rs b/core/string/src/str.rs index c915aeb2390..489beea4e99 100644 --- a/core/string/src/str.rs +++ b/core/string/src/str.rs @@ -9,7 +9,7 @@ use std::{ slice::SliceIndex, }; -use super::iter::Windows; +use super::iter::{CodePointsIter, Windows}; // Modified port of #[inline] @@ -450,13 +450,10 @@ impl<'a> JsStr<'a> { } /// Gets an iterator of all the Unicode codepoints of a [`JsStr`]. - // TODO: optimize for Latin1 strings. 
#[inline] - pub fn code_points(&self) -> impl Iterator<Item = CodePoint> + Clone + use<'a> { - char::decode_utf16(self.iter()).map(|res| match res { - Ok(c) => CodePoint::Unicode(c), - Err(e) => CodePoint::UnpairedSurrogate(e.unpaired_surrogate()), - }) + #[must_use] + pub fn code_points(&self) -> CodePointsIter<'a> { + CodePointsIter::new(*self) } /// Checks if the [`JsStr`] contains a byte. diff --git a/core/string/src/tests.rs b/core/string/src/tests.rs index 533333883c7..114092978c8 100644 --- a/core/string/src/tests.rs +++ b/core/string/src/tests.rs @@ -3,7 +3,7 @@ use std::hash::{BuildHasher, BuildHasherDefault, Hash}; use crate::{ - CommonJsStringBuilder, JsStr, JsString, Latin1JsStringBuilder, StaticJsStrings, + CodePoint, CommonJsStringBuilder, JsStr, JsString, Latin1JsStringBuilder, StaticJsStrings, Utf16JsStringBuilder, }; @@ -469,3 +469,21 @@ fn common_js_string_builder() { "Déjà vu2024年5月21日🎹" ); } + +#[test] +fn code_points_optimization() { + // Test Latin1 optimization with extended Latin1 characters + let latin1_str = JsStr::latin1(b"Caf\xe9 na\xefve"); // "Café naïve" in Latin1 encoding + let latin1_points: Vec<CodePoint> = latin1_str.code_points().collect(); + let expected_latin1: Vec<CodePoint> = "Café naïve".chars().map(CodePoint::Unicode).collect(); + assert_eq!(latin1_points, expected_latin1); + + // Test UTF-16 behavior unchanged (including non-ASCII) + let utf16_str = JsStr::utf16(&[ + 0x0043, 0x0061, 0x0066, 0x00E9, // "Café" + 0x0020, // space + 0x006E, 0x0061, 0x00EF, 0x0076, 0x0065, // "naïve" + ]); + let utf16_points: Vec<CodePoint> = utf16_str.code_points().collect(); + assert_eq!(latin1_points, utf16_points); // Same result for same content +} diff --git a/typos.toml b/typos.toml index 82bafb0cb73..3cc33d77a77 100644 --- a/typos.toml +++ b/typos.toml @@ -9,3 +9,4 @@ forin = "forin" mis = "mis" pn = "pn" tru = "tru" +Caf = "Caf"