From be9f63dc15a26760ee9b3993b00a3e37ed630e68 Mon Sep 17 00:00:00 2001 From: Robert Bastian <4706271+robertbastian@users.noreply.github.com> Date: Tue, 14 Apr 2026 13:01:50 +0200 Subject: [PATCH 1/4] check struct invariants during deserialization --- components/collator/src/comparison.rs | 80 ++++++++------------------- components/collator/src/elements.rs | 19 +++---- components/collator/src/provider.rs | 73 ++++++++++++++++++++++-- 3 files changed, 101 insertions(+), 71 deletions(-) diff --git a/components/collator/src/comparison.rs b/components/collator/src/comparison.rs index cdfb4a36595..5f2fd289cbd 100644 --- a/components/collator/src/comparison.rs +++ b/components/collator/src/comparison.rs @@ -21,7 +21,7 @@ use crate::elements::FALLBACK_CE32; use crate::elements::NON_ROUND_TRIP_MARKER; use crate::elements::{ char_from_u32, CollationElement, CollationElements, NonPrimary, FFFD_CE32, - HANGUL_SYLLABLE_MARKER, HIGH_ZEROS_MASK, JAMO_COUNT, LOW_ZEROS_MASK, NO_CE, NO_CE_PRIMARY, + HANGUL_SYLLABLE_MARKER, HIGH_ZEROS_MASK, LOW_ZEROS_MASK, NO_CE, NO_CE_PRIMARY, NO_CE_QUATERNARY, NO_CE_SECONDARY, NO_CE_TERTIARY, OPTIMIZED_DIACRITICS_MAX_COUNT, QUATERNARY_MASK, }; @@ -43,7 +43,7 @@ use crate::provider::CollationSpecialPrimariesV1; use crate::provider::CollationSpecialPrimariesValidated; use crate::provider::CollationTailoringV1; use core::cmp::Ordering; -use core::convert::{Infallible, TryFrom}; +use core::convert::Infallible; use icu_normalizer::provider::DecompositionData; use icu_normalizer::provider::DecompositionTables; use icu_normalizer::provider::NormalizerNfdDataV1; @@ -55,7 +55,6 @@ use icu_provider::prelude::*; use smallvec::SmallVec; use utf16_iter::Utf16CharsEx; use utf8_iter::Utf8CharsEx; -use zerovec::ule::AsULE; // Special sort key bytes for all levels. const LEVEL_SEPARATOR_BYTE: u8 = 1; @@ -649,16 +648,6 @@ impl Collator { let locale_dependent = LocaleSpecificDataHolder::try_new_unstable_internal(provider, prefs, options)?; - // TODO: redesign Korean search collation handling - if jamo.get().ce32s.len() != JAMO_COUNT { - return Err(DataError::custom("invalid").with_marker(CollationJamoV1::INFO)); - } - - // `variant_count` isn't stable yet: - // https://github.com/rust-lang/rust/issues/73662 - if special_primaries.get().last_primaries.len() <= (MaxVariable::Currency as usize) { - return Err(DataError::custom("invalid").with_marker(CollationSpecialPrimariesV1::INFO)); - } let special_primaries = special_primaries.map_project(|csp, _| { let compressible_bytes = (csp.last_primaries.len() == MaxVariable::Currency as usize + 16) @@ -758,25 +747,6 @@ impl CollatorBorrowed<'static> { let locale_dependent = LocaleSpecificDataHolder::try_new_unstable_internal(provider, prefs, options)?; - // TODO: redesign Korean search collation handling - const _: () = assert!( - crate::provider::Baked::SINGLETON_COLLATION_JAMO_V1 - .ce32s - .as_slice() - .len() - == JAMO_COUNT - ); - - // `variant_count` isn't stable yet: - // https://github.com/rust-lang/rust/issues/73662 - const _: () = assert!( - crate::provider::Baked::SINGLETON_COLLATION_SPECIAL_PRIMARIES_V1 - .last_primaries - .as_slice() - .len() - > (MaxVariable::Currency as usize) - ); - let special_primaries = const { &CollationSpecialPrimariesValidated { last_primaries: zerovec::ZeroSlice::from_ule_slice( @@ -791,7 +761,7 @@ impl CollatorBorrowed<'static> { numeric_primary: crate::provider::Baked::SINGLETON_COLLATION_SPECIAL_PRIMARIES_V1 .numeric_primary, compressible_bytes: { - const C: &[::ULE] = + const C: &[::ULE] = crate::provider::Baked::SINGLETON_COLLATION_SPECIAL_PRIMARIES_V1 .last_primaries .as_slice() @@ -874,27 +844,25 @@ impl CollatorBorrowed<'static> { } } -macro_rules! collation_elements { - ($self:expr, $chars:expr, $tailoring:expr, $numeric_primary:expr) => {{ - let jamo = <&[::ULE; JAMO_COUNT]>::try_from($self.jamo.ce32s.as_ule_slice()); - - let jamo = jamo.unwrap(); - +impl<'a> CollatorBorrowed<'a> { + fn collation_elements>( + &self, + chars: C, + tailoring: &'a CollationData<'a>, + numeric_primary: Option, + ) -> CollationElements<'a, C> { CollationElements::new( - $chars, - $self.root, - $tailoring, - jamo, - &$self.diacritics.secondaries, - $self.decompositions, - $self.tables, - $numeric_primary, - $self.lithuanian_dot_above, + chars, + self.root, + tailoring, + self.jamo.as_array(), + &self.diacritics.secondaries, + self.decompositions, + self.tables, + numeric_primary, + self.lithuanian_dot_above, ) - }}; -} - -impl CollatorBorrowed<'_> { + } /// The resolved options showing how the default options, the requested options, /// and the options from locale data were combined. pub fn resolved_options(&self) -> ResolvedCollatorOptions { @@ -971,7 +939,7 @@ impl CollatorBorrowed<'_> { ); #[inline(always)] - fn tailoring_or_root(&self) -> &CollationData<'_> { + fn tailoring_or_root(&self) -> &'a CollationData<'a> { if let Some(tailoring) = &self.tailoring { tailoring } else { @@ -1052,8 +1020,8 @@ impl CollatorBorrowed<'_> { let tailoring = self.tailoring_or_root(); let numeric_primary = self.numeric_primary(); - let mut left = collation_elements!(self, left_chars, tailoring, numeric_primary); - let mut right = collation_elements!(self, right_chars, tailoring, numeric_primary); + let mut left = self.collation_elements(left_chars, tailoring, numeric_primary); + let mut right = self.collation_elements(right_chars, tailoring, numeric_primary); // Start identical prefix @@ -1922,7 +1890,7 @@ impl CollatorBorrowed<'_> { let levels = self.sort_key_levels(); let mut iter = - collation_elements!(self, iter, self.tailoring_or_root(), self.numeric_primary()); + self.collation_elements(iter, self.tailoring_or_root(), self.numeric_primary()); iter.init(); let variable_top = self.variable_top(); diff --git a/components/collator/src/elements.rs b/components/collator/src/elements.rs index 56b95edafec..31c244e057c 100644 --- a/components/collator/src/elements.rs +++ b/components/collator/src/elements.rs @@ -1502,16 +1502,7 @@ where if (decomposition & !(BACKWARD_COMBINING_MARKER | NON_ROUND_TRIP_MARKER)) == 0 { // The character is its own decomposition let jamo_index = (c as usize).wrapping_sub(HANGUL_L_BASE as usize); - // Attribute belongs on an inner expression, but - // https://github.com/rust-lang/rust/issues/15701 - #[expect(clippy::indexing_slicing)] - if jamo_index >= self.jamo.len() { - ce32 = data.ce32_for_char(c); - if ce32 == FALLBACK_CE32 { - data = self.root; - ce32 = data.ce32_for_char(c); - } - } else { + if let Some(&jamo) = self.jamo.get(jamo_index) { // The purpose of reading the CE32 from the jamo table instead // of the trie even in this case is to make it unnecessary // for all search collation tries to carry a copy of the Hangul @@ -1531,7 +1522,13 @@ where data = self.root; // Index in range by construction above. Not using `get` with // `if let` in order to put the likely branch first. - ce32 = CollationElement32::new_from_ule(self.jamo[jamo_index]); + ce32 = CollationElement32::new_from_ule(jamo); + } else { + ce32 = data.ce32_for_char(c); + if ce32 == FALLBACK_CE32 { + data = self.root; + ce32 = data.ce32_for_char(c); + } } if self.is_next_decomposition_starts_with_starter() { if let Some(ce) = ce32.to_ce_simple_or_long_primary() { diff --git a/components/collator/src/provider.rs b/components/collator/src/provider.rs index 2b7e221930e..6058c77a3ce 100644 --- a/components/collator/src/provider.rs +++ b/components/collator/src/provider.rs @@ -323,14 +323,50 @@ icu_provider::data_struct!( #[derive(Debug, PartialEq, Clone, yoke::Yokeable, zerofrom::ZeroFrom)] #[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake))] #[cfg_attr(feature = "datagen", databake(path = icu_collator::provider))] -#[cfg_attr(feature = "serde", derive(serde::Deserialize))] pub struct CollationJamo<'data> { /// `CollationElement32`s (as `u32`s) for the Hangul Jamo Unicode Block. /// The length must be equal to the size of the block (256). - #[cfg_attr(feature = "serde", serde(borrow))] pub ce32s: ZeroVec<'data, u32>, } +impl<'data> CollationJamo<'data> { + pub(crate) fn as_array( + &'data self, + ) -> &'data [::ULE; crate::elements::JAMO_COUNT] { + #[allow(clippy::unwrap_used)] // by invariant + self.ce32s.as_ule_slice().try_into().unwrap() + } +} + +// TODO: redesign Korean search collation handling + +#[cfg(feature = "compiled_data")] +const _: () = assert!( + Baked::SINGLETON_COLLATION_JAMO_V1.ce32s.as_slice().len() == crate::elements::JAMO_COUNT +); + +#[cfg(feature = "serde")] +impl<'de> serde::Deserialize<'de> for CollationJamo<'de> { + fn deserialize(deserializer: D) -> Result + where + D: serde::Deserializer<'de>, + { + #[derive(serde::Deserialize)] + struct Raw<'data> { + #[cfg_attr(feature = "serde", serde(borrow))] + ce32s: ZeroVec<'data, u32>, + } + + let Raw { ce32s } = Raw::deserialize(deserializer)?; + + if ce32s.len() != crate::elements::JAMO_COUNT { + return Err(serde::de::Error::custom("invalid")); + } + + Ok(Self { ce32s }) + } +} + icu_provider::data_struct!( CollationJamo<'_>, #[cfg(feature = "datagen")] @@ -554,7 +590,6 @@ impl CollationMetadata { #[derive(Debug, PartialEq, Clone, yoke::Yokeable, zerofrom::ZeroFrom)] #[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake))] #[cfg_attr(feature = "datagen", databake(path = icu_collator::provider))] -#[cfg_attr(feature = "serde", derive(serde::Deserialize))] pub struct CollationSpecialPrimaries<'data> { /// The primaries corresponding to `MaxVariable` /// character classes packed so that each fits in @@ -564,12 +599,42 @@ pub struct CollationSpecialPrimaries<'data> { /// This is potentially followed by 256 bits /// (packed in 16 u16s) to classify every possible /// byte into compressible or non-compressible. - #[cfg_attr(feature = "serde", serde(borrow))] pub last_primaries: ZeroVec<'data, u16>, /// The high 8 bits of the numeric primary pub numeric_primary: u8, } +#[cfg(feature = "serde")] +impl<'de> serde::Deserialize<'de> for CollationSpecialPrimaries<'de> { + fn deserialize(deserializer: D) -> Result + where + D: serde::Deserializer<'de>, + { + #[derive(serde::Deserialize)] + struct Raw<'data> { + #[cfg_attr(feature = "serde", serde(borrow))] + last_primaries: ZeroVec<'data, u16>, + numeric_primary: u8, + } + + let Raw { + last_primaries, + numeric_primary, + } = Raw::deserialize(deserializer)?; + + // `variant_count` isn't stable yet: + // https://github.com/rust-lang/rust/issues/73662 + if last_primaries.len() <= (MaxVariable::Currency as usize) { + return Err(serde::de::Error::custom("invalid")); + } + + Ok(Self { + last_primaries, + numeric_primary, + }) + } +} + #[derive(Debug, PartialEq, Clone, yoke::Yokeable, zerofrom::ZeroFrom)] pub(crate) struct CollationSpecialPrimariesValidated<'data> { /// The primaries corresponding to `MaxVariable` From 708bc76ec5a4742d6652df9a4675ec8608daba45 Mon Sep 17 00:00:00 2001 From: Robert Bastian <4706271+robertbastian@users.noreply.github.com> Date: Tue, 14 Apr 2026 12:27:36 +0200 Subject: [PATCH 2/4] consolidate special primaries code --- components/collator/src/comparison.rs | 72 ++------------------------- components/collator/src/provider.rs | 61 ++++++++++++++++++++++- 2 files changed, 62 insertions(+), 71 deletions(-) diff --git a/components/collator/src/comparison.rs b/components/collator/src/comparison.rs index 5f2fd289cbd..85a6ff34b46 100644 --- a/components/collator/src/comparison.rs +++ b/components/collator/src/comparison.rs @@ -26,9 +26,7 @@ use crate::elements::{ QUATERNARY_MASK, }; use crate::options::CollatorOptionsBitField; -use crate::options::{ - AlternateHandling, CollatorOptions, MaxVariable, ResolvedCollatorOptions, Strength, -}; +use crate::options::{AlternateHandling, CollatorOptions, ResolvedCollatorOptions, Strength}; use crate::preferences::{CollationCaseFirst, CollationNumericOrdering, CollationType}; use crate::provider::CollationData; use crate::provider::CollationDiacritics; @@ -648,28 +646,7 @@ impl Collator { let locale_dependent = LocaleSpecificDataHolder::try_new_unstable_internal(provider, prefs, options)?; - let special_primaries = special_primaries.map_project(|csp, _| { - let compressible_bytes = (csp.last_primaries.len() - == MaxVariable::Currency as usize + 16) - .then(|| { - csp.last_primaries - .as_maybe_borrowed()? - .as_ule_slice() - .get((MaxVariable::Currency as usize)..)? - .try_into() - .ok() - }) - .flatten() - .unwrap_or( - CollationSpecialPrimariesValidated::HARDCODED_COMPRESSIBLE_BYTES_FALLBACK, - ); - - CollationSpecialPrimariesValidated { - last_primaries: csp.last_primaries.truncated(MaxVariable::Currency as usize), - numeric_primary: csp.numeric_primary, - compressible_bytes, - } - }); + let special_primaries = special_primaries.map_project(|csp, _| csp.validated()); Ok(Collator { special_primaries, @@ -748,50 +725,7 @@ impl CollatorBorrowed<'static> { LocaleSpecificDataHolder::try_new_unstable_internal(provider, prefs, options)?; let special_primaries = const { - &CollationSpecialPrimariesValidated { - last_primaries: zerovec::ZeroSlice::from_ule_slice( - crate::provider::Baked::SINGLETON_COLLATION_SPECIAL_PRIMARIES_V1 - .last_primaries - .as_slice() - .as_ule_slice() - .split_at(MaxVariable::Currency as usize) - .0, - ) - .as_zerovec(), - numeric_primary: crate::provider::Baked::SINGLETON_COLLATION_SPECIAL_PRIMARIES_V1 - .numeric_primary, - compressible_bytes: { - const C: &[::ULE] = - crate::provider::Baked::SINGLETON_COLLATION_SPECIAL_PRIMARIES_V1 - .last_primaries - .as_slice() - .as_ule_slice(); - if C.len() == MaxVariable::Currency as usize + 16 { - let i = MaxVariable::Currency as usize; - #[allow(clippy::indexing_slicing)] // protected, const - &[ - C[i], - C[i + 1], - C[i + 2], - C[i + 3], - C[i + 4], - C[i + 5], - C[i + 6], - C[i + 7], - C[i + 8], - C[i + 9], - C[i + 10], - C[i + 11], - C[i + 12], - C[i + 13], - C[i + 14], - C[i + 15], - ] - } else { - CollationSpecialPrimariesValidated::HARDCODED_COMPRESSIBLE_BYTES_FALLBACK - } - }, - } + &crate::provider::Baked::SINGLETON_COLLATION_SPECIAL_PRIMARIES_V1.const_validated() }; // Attribute belongs closer to `unwrap`, but diff --git a/components/collator/src/provider.rs b/components/collator/src/provider.rs index 6058c77a3ce..0dd3ea6564d 100644 --- a/components/collator/src/provider.rs +++ b/components/collator/src/provider.rs @@ -649,8 +649,8 @@ pub(crate) struct CollationSpecialPrimariesValidated<'data> { pub compressible_bytes: &'data [::ULE; 16], } -impl CollationSpecialPrimariesValidated<'static> { - pub(crate) const HARDCODED_COMPRESSIBLE_BYTES_FALLBACK: &'static [::ULE; 16] = &[ +impl<'a> CollationSpecialPrimaries<'a> { + const HARDCODED_COMPRESSIBLE_BYTES_FALLBACK: &'static [::ULE; 16] = &[ ::ULE::from_unsigned(0b0000_0000_0000_0000), ::ULE::from_unsigned(0b0000_0000_0000_0000), ::ULE::from_unsigned(0b0000_0000_0000_0000), @@ -668,6 +668,63 @@ impl CollationSpecialPrimariesValidated<'static> { ::ULE::from_unsigned(0b0000_0000_0000_0000), ::ULE::from_unsigned(0b0100_0000_0000_0000), ]; + + pub(crate) fn validated(self) -> CollationSpecialPrimariesValidated<'a> { + let (last_primaries, compressible_bytes) = + if let Some(borrowed) = self.last_primaries.as_maybe_borrowed() { + let (l, c) = borrowed + .as_ule_slice() + // by invariant + .split_at(MaxVariable::Currency as usize + 1); + ( + l, + c.try_into() + .unwrap_or(Self::HARDCODED_COMPRESSIBLE_BYTES_FALLBACK), + ) + } else { + ( + self.last_primaries.as_slice().as_ule_slice(), + Self::HARDCODED_COMPRESSIBLE_BYTES_FALLBACK, + ) + }; + + let last_primaries_truncate_len = last_primaries.len(); + CollationSpecialPrimariesValidated { + last_primaries: self.last_primaries.truncated(last_primaries_truncate_len), + numeric_primary: self.numeric_primary, + compressible_bytes, + } + } + + pub(crate) const fn const_validated(&'static self) -> CollationSpecialPrimariesValidated<'a> { + let borrowed = self.last_primaries.as_slice(); + let (last_primaries, compressible_bytes) = borrowed + .as_ule_slice() + // by invariant + .split_at(MaxVariable::Currency as usize + 1); + // TODO: use c.as_array() on MSRV 1.93 + let compressible_bytes = if compressible_bytes.len() == 16 { + unsafe { &*(compressible_bytes.as_ptr() as *const [::ULE; 16]) } + } else { + Self::HARDCODED_COMPRESSIBLE_BYTES_FALLBACK + }; + + CollationSpecialPrimariesValidated { + last_primaries: ZeroSlice::from_ule_slice(last_primaries).as_zerovec(), + numeric_primary: self.numeric_primary, + compressible_bytes, + } + } +} + +#[test] +fn compressible_bytes() { + assert_eq!( + Baked::SINGLETON_COLLATION_SPECIAL_PRIMARIES_V1 + .clone() + .validated(), + Baked::SINGLETON_COLLATION_SPECIAL_PRIMARIES_V1.const_validated(), + ); } icu_provider::data_struct!( From ecef237c35a0c73bc74f24f322bfefd97ce0da6c Mon Sep 17 00:00:00 2001 From: Robert Bastian <4706271+robertbastian@users.noreply.github.com> Date: Tue, 14 Apr 2026 16:47:04 +0200 Subject: [PATCH 3/4] evolve special primaries data struct --- components/collator/src/comparison.rs | 14 +- components/collator/src/provider.rs | 165 +++++++----------- .../collation_special_primaries_v1.rs.data | 4 +- provider/data/collator/fingerprints.csv | 2 +- .../collation_special_primaries_v1.rs.data | 4 +- provider/source/src/collator/mod.rs | 8 +- 6 files changed, 75 insertions(+), 122 deletions(-) diff --git a/components/collator/src/comparison.rs b/components/collator/src/comparison.rs index 85a6ff34b46..f41f215b388 100644 --- a/components/collator/src/comparison.rs +++ b/components/collator/src/comparison.rs @@ -37,8 +37,8 @@ use crate::provider::CollationMetadataV1; use crate::provider::CollationReordering; use crate::provider::CollationReorderingV1; use crate::provider::CollationRootV1; +use crate::provider::CollationSpecialPrimaries; use crate::provider::CollationSpecialPrimariesV1; -use crate::provider::CollationSpecialPrimariesValidated; use crate::provider::CollationTailoringV1; use core::cmp::Ordering; use core::convert::Infallible; @@ -48,7 +48,6 @@ use icu_normalizer::provider::NormalizerNfdDataV1; use icu_normalizer::provider::NormalizerNfdTablesV1; use icu_normalizer::DecomposingNormalizerBorrowed; use icu_normalizer::Decomposition; -use icu_provider::marker::ErasedMarker; use icu_provider::prelude::*; use smallvec::SmallVec; use utf16_iter::Utf16CharsEx; @@ -546,7 +545,7 @@ impl LocaleSpecificDataHolder { /// Compares strings according to culturally-relevant ordering. #[derive(Debug)] pub struct Collator { - special_primaries: DataPayload>>, + special_primaries: DataPayload, root: DataPayload, tailoring: Option>, jamo: DataPayload, @@ -646,8 +645,6 @@ impl Collator { let locale_dependent = LocaleSpecificDataHolder::try_new_unstable_internal(provider, prefs, options)?; - let special_primaries = special_primaries.map_project(|csp, _| csp.validated()); - Ok(Collator { special_primaries, root, @@ -693,7 +690,7 @@ macro_rules! compare { /// borrowed version. #[derive(Debug)] pub struct CollatorBorrowed<'a> { - special_primaries: &'a CollationSpecialPrimariesValidated<'a>, + special_primaries: &'a CollationSpecialPrimaries<'a>, root: &'a CollationData<'a>, tailoring: Option<&'a CollationData<'a>>, jamo: &'a CollationJamo<'a>, @@ -720,13 +717,10 @@ impl CollatorBorrowed<'static> { let tables = icu_normalizer::provider::Baked::SINGLETON_NORMALIZER_NFD_TABLES_V1; let root = crate::provider::Baked::SINGLETON_COLLATION_ROOT_V1; let jamo = crate::provider::Baked::SINGLETON_COLLATION_JAMO_V1; - let locale_dependent = LocaleSpecificDataHolder::try_new_unstable_internal(provider, prefs, options)?; - let special_primaries = const { - &crate::provider::Baked::SINGLETON_COLLATION_SPECIAL_PRIMARIES_V1.const_validated() - }; + let special_primaries = crate::provider::Baked::SINGLETON_COLLATION_SPECIAL_PRIMARIES_V1; // Attribute belongs closer to `unwrap`, but // https://github.com/rust-lang/rust/issues/15701 diff --git a/components/collator/src/provider.rs b/components/collator/src/provider.rs index 0dd3ea6564d..127998f7ed2 100644 --- a/components/collator/src/provider.rs +++ b/components/collator/src/provider.rs @@ -588,20 +588,22 @@ impl CollationMetadata { /// to be stable, their Rust representation might not be. Use with caution. /// #[derive(Debug, PartialEq, Clone, yoke::Yokeable, zerofrom::ZeroFrom)] -#[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake))] +#[cfg_attr(feature = "datagen", derive(databake::Bake))] #[cfg_attr(feature = "datagen", databake(path = icu_collator::provider))] pub struct CollationSpecialPrimaries<'data> { /// The primaries corresponding to `MaxVariable` /// character classes packed so that each fits in /// 16 bits. Length must match the number of enum /// variants in `MaxVariable`, currently 4. - /// - /// This is potentially followed by 256 bits - /// (packed in 16 u16s) to classify every possible - /// byte into compressible or non-compressible. pub last_primaries: ZeroVec<'data, u16>, /// The high 8 bits of the numeric primary pub numeric_primary: u8, + /// 256 bits (packed in 16 u16s) to classify every possible + /// byte into compressible or non-compressible. + /// + /// In the serde encoding, this is appended to `last_primaries`, + /// or might be missing. + pub compressible_bytes: ZeroVec<'data, u16>, } #[cfg(feature = "serde")] @@ -613,126 +615,88 @@ impl<'de> serde::Deserialize<'de> for CollationSpecialPrimaries<'de> { #[derive(serde::Deserialize)] struct Raw<'data> { #[cfg_attr(feature = "serde", serde(borrow))] - last_primaries: ZeroVec<'data, u16>, + concatenated: &'data ZeroSlice, numeric_primary: u8, } let Raw { - last_primaries, + concatenated, numeric_primary, } = Raw::deserialize(deserializer)?; - // `variant_count` isn't stable yet: - // https://github.com/rust-lang/rust/issues/73662 - if last_primaries.len() <= (MaxVariable::Currency as usize) { + let Some((l, c)) = concatenated + .as_ule_slice() + // `variant_count` isn't stable yet: + // https://github.com/rust-lang/rust/issues/73662 + .split_at_checked(MaxVariable::Currency as usize + 1) + else { return Err(serde::de::Error::custom("invalid")); + }; + + let last_primaries = ZeroSlice::from_ule_slice(l).as_zerovec(); + let mut compressible_bytes = ZeroSlice::from_ule_slice(c).as_zerovec(); + + if c.len() != 16 { + compressible_bytes = zerovec::zerovec!( + u16; ::ULE::from_unsigned; [ + 0b0000_0000_0000_0000, + 0b0000_0000_0000_0000, + 0b0000_0000_0000_0000, + 0b0000_0000_0000_0000, + 0b0000_0000_0000_0000, + 0b0000_0000_0000_0000, + 0b1111_1111_1111_1110, + 0b1111_1111_1111_1111, + 0b0000_0000_0000_0001, + 0b0000_0000_0000_0000, + 0b0000_0000_0000_0000, + 0b0000_0000_0000_0000, + 0b0000_0000_0000_0000, + 0b0000_0000_0000_0000, + 0b0000_0000_0000_0000, + 0b0100_0000_0000_0000, + ]); } Ok(Self { last_primaries, numeric_primary, + compressible_bytes, }) } } -#[derive(Debug, PartialEq, Clone, yoke::Yokeable, zerofrom::ZeroFrom)] -pub(crate) struct CollationSpecialPrimariesValidated<'data> { - /// The primaries corresponding to `MaxVariable` - /// character classes packed so that each fits in - /// 16 bits. Length must match the number of enum - /// variants in `MaxVariable`, currently 4. - pub last_primaries: ZeroVec<'data, u16>, - /// The high 8 bits of the numeric primary - pub numeric_primary: u8, - /// 256 bits (packed in 16 u16s) to classify every possible - /// byte into compressible or non-compressible. - pub compressible_bytes: &'data [::ULE; 16], -} - -impl<'a> CollationSpecialPrimaries<'a> { - const HARDCODED_COMPRESSIBLE_BYTES_FALLBACK: &'static [::ULE; 16] = &[ - ::ULE::from_unsigned(0b0000_0000_0000_0000), - ::ULE::from_unsigned(0b0000_0000_0000_0000), - ::ULE::from_unsigned(0b0000_0000_0000_0000), - ::ULE::from_unsigned(0b0000_0000_0000_0000), - ::ULE::from_unsigned(0b0000_0000_0000_0000), - ::ULE::from_unsigned(0b0000_0000_0000_0000), - ::ULE::from_unsigned(0b1111_1111_1111_1110), - ::ULE::from_unsigned(0b1111_1111_1111_1111), - ::ULE::from_unsigned(0b0000_0000_0000_0001), - ::ULE::from_unsigned(0b0000_0000_0000_0000), - ::ULE::from_unsigned(0b0000_0000_0000_0000), - ::ULE::from_unsigned(0b0000_0000_0000_0000), - ::ULE::from_unsigned(0b0000_0000_0000_0000), - ::ULE::from_unsigned(0b0000_0000_0000_0000), - ::ULE::from_unsigned(0b0000_0000_0000_0000), - ::ULE::from_unsigned(0b0100_0000_0000_0000), - ]; - - pub(crate) fn validated(self) -> CollationSpecialPrimariesValidated<'a> { - let (last_primaries, compressible_bytes) = - if let Some(borrowed) = self.last_primaries.as_maybe_borrowed() { - let (l, c) = borrowed - .as_ule_slice() - // by invariant - .split_at(MaxVariable::Currency as usize + 1); - ( - l, - c.try_into() - .unwrap_or(Self::HARDCODED_COMPRESSIBLE_BYTES_FALLBACK), - ) - } else { - ( - self.last_primaries.as_slice().as_ule_slice(), - Self::HARDCODED_COMPRESSIBLE_BYTES_FALLBACK, - ) - }; - - let last_primaries_truncate_len = last_primaries.len(); - CollationSpecialPrimariesValidated { - last_primaries: self.last_primaries.truncated(last_primaries_truncate_len), - numeric_primary: self.numeric_primary, - compressible_bytes, +#[cfg(feature = "datagen")] +impl serde::Serialize for CollationSpecialPrimaries<'_> { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + #[derive(serde::Serialize)] + struct Raw { + #[serde(rename = "last_primaries")] + concatenated: ZeroVec<'static, u16>, + numeric_primary: u8, } - } - pub(crate) const fn const_validated(&'static self) -> CollationSpecialPrimariesValidated<'a> { - let borrowed = self.last_primaries.as_slice(); - let (last_primaries, compressible_bytes) = borrowed - .as_ule_slice() - // by invariant - .split_at(MaxVariable::Currency as usize + 1); - // TODO: use c.as_array() on MSRV 1.93 - let compressible_bytes = if compressible_bytes.len() == 16 { - unsafe { &*(compressible_bytes.as_ptr() as *const [::ULE; 16]) } - } else { - Self::HARDCODED_COMPRESSIBLE_BYTES_FALLBACK - }; - - CollationSpecialPrimariesValidated { - last_primaries: ZeroSlice::from_ule_slice(last_primaries).as_zerovec(), + Raw { + concatenated: self + .last_primaries + .iter() + .chain(self.compressible_bytes.iter()) + .collect(), numeric_primary: self.numeric_primary, - compressible_bytes, } + .serialize(serializer) } } -#[test] -fn compressible_bytes() { - assert_eq!( - Baked::SINGLETON_COLLATION_SPECIAL_PRIMARIES_V1 - .clone() - .validated(), - Baked::SINGLETON_COLLATION_SPECIAL_PRIMARIES_V1.const_validated(), - ); -} - icu_provider::data_struct!( CollationSpecialPrimaries<'_>, #[cfg(feature = "datagen")] ); -impl CollationSpecialPrimariesValidated<'_> { +impl CollationSpecialPrimaries<'_> { #[expect(clippy::unwrap_used)] pub(crate) fn last_primary_for_group(&self, max_variable: MaxVariable) -> u32 { // `unwrap` is OK, because `Collator::try_new` validates the length. @@ -744,11 +708,10 @@ impl CollationSpecialPrimariesValidated<'_> { #[allow(dead_code)] pub(crate) fn is_compressible(&self, b: u8) -> bool { - // Indexing slicing OK by construction and pasting this - // into Compiler Explorer shows that the panic - // is optimized away. - #[expect(clippy::indexing_slicing)] - let field = u16::from_unaligned(self.compressible_bytes[usize::from(b >> 4)]); + let field = self + .compressible_bytes + .get(usize::from(b >> 4)) + .unwrap_or_default(); let mask = 1 << (b & 0b1111); (field & mask) != 0 } diff --git a/provider/data/collator/data/collation_special_primaries_v1.rs.data b/provider/data/collator/data/collation_special_primaries_v1.rs.data index c91beed2438..60fd85c90c7 100644 --- a/provider/data/collator/data/collation_special_primaries_v1.rs.data +++ b/provider/data/collator/data/collation_special_primaries_v1.rs.data @@ -4,7 +4,7 @@ /// `icu`'s `_unstable` constructors. /// /// Using this implementation will embed the following data in the binary's data segment: -/// * 72B[^1] for the singleton data struct +/// * 96B[^1] for the singleton data struct /// /// [^1]: these numbers can be smaller in practice due to linker deduplication /// @@ -21,7 +21,7 @@ macro_rules! __impl_collation_special_primaries_v1 { #[clippy::msrv = "1.86"] impl $provider { #[doc(hidden)] - pub const SINGLETON_COLLATION_SPECIAL_PRIMARIES_V1: &'static ::DataStruct = &icu::collator::provider::CollationSpecialPrimaries { last_primaries: unsafe { zerovec::ZeroVec::from_bytes_unchecked(b"\x06\x05\0\x0C\xA3\r\0\x0F\0\0\0\0\0\0\0\0\0\0\0\0\xFE\xFF\xFF\xFF\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0@") }, numeric_primary: 16u8 }; + pub const SINGLETON_COLLATION_SPECIAL_PRIMARIES_V1: &'static ::DataStruct = &icu::collator::provider::CollationSpecialPrimaries { last_primaries: unsafe { zerovec::ZeroVec::from_bytes_unchecked(b"\x06\x05\0\x0C\xA3\r\0\x0F") }, numeric_primary: 16u8, compressible_bytes: unsafe { zerovec::ZeroVec::from_bytes_unchecked(b"\0\0\0\0\0\0\0\0\0\0\0\0\xFE\xFF\xFF\xFF\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0@") } }; } #[clippy::msrv = "1.86"] impl icu_provider::DataProvider for $provider { diff --git a/provider/data/collator/fingerprints.csv b/provider/data/collator/fingerprints.csv index 4a63f8fa557..2c3f4ab9e05 100644 --- a/provider/data/collator/fingerprints.csv +++ b/provider/data/collator/fingerprints.csv @@ -182,7 +182,7 @@ collation/reordering/v1, und-Hans, -> und-Hani/pinyin collation/reordering/v1, und-Hant, -> und-Hani/stroke collation/reordering/v1, ur, -> ar collation/root/v1, , 131040B, 130923B, 787ce37ea65e1e9 -collation/special/primaries/v1, , 72B, 42B, 46181a77c61fe445 +collation/special/primaries/v1, , 96B, 42B, 46181a77c61fe445 collation/tailoring/v1, , 618B, 103 identifiers collation/tailoring/v1, , 920210B, 908458B, 93 unique payloads collation/tailoring/v1, af, 1004B, 877B, 8e3ca7ba0c0efe4b diff --git a/provider/data/collator/stubdata/collation_special_primaries_v1.rs.data b/provider/data/collator/stubdata/collation_special_primaries_v1.rs.data index c91beed2438..60fd85c90c7 100644 --- a/provider/data/collator/stubdata/collation_special_primaries_v1.rs.data +++ b/provider/data/collator/stubdata/collation_special_primaries_v1.rs.data @@ -4,7 +4,7 @@ /// `icu`'s `_unstable` constructors. /// /// Using this implementation will embed the following data in the binary's data segment: -/// * 72B[^1] for the singleton data struct +/// * 96B[^1] for the singleton data struct /// /// [^1]: these numbers can be smaller in practice due to linker deduplication /// @@ -21,7 +21,7 @@ macro_rules! __impl_collation_special_primaries_v1 { #[clippy::msrv = "1.86"] impl $provider { #[doc(hidden)] - pub const SINGLETON_COLLATION_SPECIAL_PRIMARIES_V1: &'static ::DataStruct = &icu::collator::provider::CollationSpecialPrimaries { last_primaries: unsafe { zerovec::ZeroVec::from_bytes_unchecked(b"\x06\x05\0\x0C\xA3\r\0\x0F\0\0\0\0\0\0\0\0\0\0\0\0\xFE\xFF\xFF\xFF\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0@") }, numeric_primary: 16u8 }; + pub const SINGLETON_COLLATION_SPECIAL_PRIMARIES_V1: &'static ::DataStruct = &icu::collator::provider::CollationSpecialPrimaries { last_primaries: unsafe { zerovec::ZeroVec::from_bytes_unchecked(b"\x06\x05\0\x0C\xA3\r\0\x0F") }, numeric_primary: 16u8, compressible_bytes: unsafe { zerovec::ZeroVec::from_bytes_unchecked(b"\0\0\0\0\0\0\0\0\0\0\0\0\xFE\xFF\xFF\xFF\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0@") } }; } #[clippy::msrv = "1.86"] impl icu_provider::DataProvider for $provider { diff --git a/provider/source/src/collator/mod.rs b/provider/source/src/collator/mod.rs index dcb53d6b14c..29710617e12 100644 --- a/provider/source/src/collator/mod.rs +++ b/provider/source/src/collator/mod.rs @@ -395,13 +395,9 @@ impl TryInto> for &collator_serde::CollationS } Ok(CollationSpecialPrimaries { - last_primaries: self - .last_primaries - .iter() - .copied() - .chain(packed_compressible_bytes) - .collect(), + last_primaries: self.last_primaries.iter().copied().collect(), numeric_primary: self.numeric_primary, + compressible_bytes: packed_compressible_bytes.into_iter().collect(), }) } } From 1e966fe705dbf57aff458c750fae449df17e4681 Mon Sep 17 00:00:00 2001 From: Robert Bastian <4706271+robertbastian@users.noreply.github.com> Date: Wed, 15 Apr 2026 17:15:22 +0200 Subject: [PATCH 4/4] inline --- components/collator/src/comparison.rs | 56 ++++++++++++++++----------- 1 file changed, 34 insertions(+), 22 deletions(-) diff --git a/components/collator/src/comparison.rs b/components/collator/src/comparison.rs index f41f215b388..78c89dbf669 100644 --- a/components/collator/src/comparison.rs +++ b/components/collator/src/comparison.rs @@ -773,24 +773,6 @@ impl CollatorBorrowed<'static> { } impl<'a> CollatorBorrowed<'a> { - fn collation_elements>( - &self, - chars: C, - tailoring: &'a CollationData<'a>, - numeric_primary: Option, - ) -> CollationElements<'a, C> { - CollationElements::new( - chars, - self.root, - tailoring, - self.jamo.as_array(), - &self.diacritics.secondaries, - self.decompositions, - self.tables, - numeric_primary, - self.lithuanian_dot_above, - ) - } /// The resolved options showing how the default options, the requested options, /// and the options from locale data were combined. pub fn resolved_options(&self) -> ResolvedCollatorOptions { @@ -948,8 +930,29 @@ impl<'a> CollatorBorrowed<'a> { let tailoring = self.tailoring_or_root(); let numeric_primary = self.numeric_primary(); - let mut left = self.collation_elements(left_chars, tailoring, numeric_primary); - let mut right = self.collation_elements(right_chars, tailoring, numeric_primary); + let jamo = self.jamo.as_array(); + let mut left = CollationElements::new( + left_chars, + self.root, + tailoring, + jamo, + &self.diacritics.secondaries, + self.decompositions, + self.tables, + numeric_primary, + self.lithuanian_dot_above, + ); + let mut right = CollationElements::new( + right_chars, + self.root, + tailoring, + self.jamo.as_array(), + &self.diacritics.secondaries, + self.decompositions, + self.tables, + numeric_primary, + self.lithuanian_dot_above, + ); // Start identical prefix @@ -1817,8 +1820,17 @@ impl<'a> CollatorBorrowed<'a> { // This algorithm comes from `CollationKeys::writeSortKeyUpToQuaternary` in ICU4C. let levels = self.sort_key_levels(); - let mut iter = - self.collation_elements(iter, self.tailoring_or_root(), self.numeric_primary()); + let mut iter = CollationElements::new( + iter, + self.root, + self.tailoring_or_root(), + self.jamo.as_array(), + &self.diacritics.secondaries, + self.decompositions, + self.tables, + self.numeric_primary(), + self.lithuanian_dot_above, + ); iter.init(); let variable_top = self.variable_top();