Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
178 changes: 43 additions & 135 deletions components/collator/src/comparison.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,14 +21,12 @@ use crate::elements::FALLBACK_CE32;
use crate::elements::NON_ROUND_TRIP_MARKER;
use crate::elements::{
char_from_u32, CollationElement, CollationElements, NonPrimary, FFFD_CE32,
HANGUL_SYLLABLE_MARKER, HIGH_ZEROS_MASK, JAMO_COUNT, LOW_ZEROS_MASK, NO_CE, NO_CE_PRIMARY,
HANGUL_SYLLABLE_MARKER, HIGH_ZEROS_MASK, LOW_ZEROS_MASK, NO_CE, NO_CE_PRIMARY,
NO_CE_QUATERNARY, NO_CE_SECONDARY, NO_CE_TERTIARY, OPTIMIZED_DIACRITICS_MAX_COUNT,
QUATERNARY_MASK,
};
use crate::options::CollatorOptionsBitField;
use crate::options::{
AlternateHandling, CollatorOptions, MaxVariable, ResolvedCollatorOptions, Strength,
};
use crate::options::{AlternateHandling, CollatorOptions, ResolvedCollatorOptions, Strength};
use crate::preferences::{CollationCaseFirst, CollationNumericOrdering, CollationType};
use crate::provider::CollationData;
use crate::provider::CollationDiacritics;
Expand All @@ -39,23 +37,21 @@ use crate::provider::CollationMetadataV1;
use crate::provider::CollationReordering;
use crate::provider::CollationReorderingV1;
use crate::provider::CollationRootV1;
use crate::provider::CollationSpecialPrimaries;
use crate::provider::CollationSpecialPrimariesV1;
use crate::provider::CollationSpecialPrimariesValidated;
use crate::provider::CollationTailoringV1;
use core::cmp::Ordering;
use core::convert::{Infallible, TryFrom};
use core::convert::Infallible;
use icu_normalizer::provider::DecompositionData;
use icu_normalizer::provider::DecompositionTables;
use icu_normalizer::provider::NormalizerNfdDataV1;
use icu_normalizer::provider::NormalizerNfdTablesV1;
use icu_normalizer::DecomposingNormalizerBorrowed;
use icu_normalizer::Decomposition;
use icu_provider::marker::ErasedMarker;
use icu_provider::prelude::*;
use smallvec::SmallVec;
use utf16_iter::Utf16CharsEx;
use utf8_iter::Utf8CharsEx;
use zerovec::ule::AsULE;

// Special sort key bytes for all levels.
const LEVEL_SEPARATOR_BYTE: u8 = 1;
Expand Down Expand Up @@ -549,7 +545,7 @@ impl LocaleSpecificDataHolder {
/// Compares strings according to culturally-relevant ordering.
#[derive(Debug)]
pub struct Collator {
special_primaries: DataPayload<ErasedMarker<CollationSpecialPrimariesValidated<'static>>>,
special_primaries: DataPayload<CollationSpecialPrimariesV1>,
root: DataPayload<CollationRootV1>,
tailoring: Option<DataPayload<CollationTailoringV1>>,
jamo: DataPayload<CollationJamoV1>,
Expand Down Expand Up @@ -649,39 +645,6 @@ impl Collator {
let locale_dependent =
LocaleSpecificDataHolder::try_new_unstable_internal(provider, prefs, options)?;

// TODO: redesign Korean search collation handling
if jamo.get().ce32s.len() != JAMO_COUNT {
return Err(DataError::custom("invalid").with_marker(CollationJamoV1::INFO));
}

// `variant_count` isn't stable yet:
// https://github.com/rust-lang/rust/issues/73662
if special_primaries.get().last_primaries.len() <= (MaxVariable::Currency as usize) {
return Err(DataError::custom("invalid").with_marker(CollationSpecialPrimariesV1::INFO));
}
let special_primaries = special_primaries.map_project(|csp, _| {
let compressible_bytes = (csp.last_primaries.len()
== MaxVariable::Currency as usize + 16)
.then(|| {
csp.last_primaries
.as_maybe_borrowed()?
.as_ule_slice()
.get((MaxVariable::Currency as usize)..)?
.try_into()
.ok()
})
.flatten()
.unwrap_or(
CollationSpecialPrimariesValidated::HARDCODED_COMPRESSIBLE_BYTES_FALLBACK,
);

CollationSpecialPrimariesValidated {
last_primaries: csp.last_primaries.truncated(MaxVariable::Currency as usize),
numeric_primary: csp.numeric_primary,
compressible_bytes,
}
});

Ok(Collator {
special_primaries,
root,
Expand Down Expand Up @@ -727,7 +690,7 @@ macro_rules! compare {
/// borrowed version.
#[derive(Debug)]
pub struct CollatorBorrowed<'a> {
special_primaries: &'a CollationSpecialPrimariesValidated<'a>,
special_primaries: &'a CollationSpecialPrimaries<'a>,
root: &'a CollationData<'a>,
tailoring: Option<&'a CollationData<'a>>,
jamo: &'a CollationJamo<'a>,
Expand All @@ -754,75 +717,10 @@ impl CollatorBorrowed<'static> {
let tables = icu_normalizer::provider::Baked::SINGLETON_NORMALIZER_NFD_TABLES_V1;
let root = crate::provider::Baked::SINGLETON_COLLATION_ROOT_V1;
let jamo = crate::provider::Baked::SINGLETON_COLLATION_JAMO_V1;

let locale_dependent =
LocaleSpecificDataHolder::try_new_unstable_internal(provider, prefs, options)?;

// TODO: redesign Korean search collation handling
const _: () = assert!(
crate::provider::Baked::SINGLETON_COLLATION_JAMO_V1
.ce32s
.as_slice()
.len()
== JAMO_COUNT
);

// `variant_count` isn't stable yet:
// https://github.com/rust-lang/rust/issues/73662
const _: () = assert!(
crate::provider::Baked::SINGLETON_COLLATION_SPECIAL_PRIMARIES_V1
.last_primaries
.as_slice()
.len()
> (MaxVariable::Currency as usize)
);

let special_primaries = const {
&CollationSpecialPrimariesValidated {
last_primaries: zerovec::ZeroSlice::from_ule_slice(
crate::provider::Baked::SINGLETON_COLLATION_SPECIAL_PRIMARIES_V1
.last_primaries
.as_slice()
.as_ule_slice()
.split_at(MaxVariable::Currency as usize)
.0,
)
.as_zerovec(),
numeric_primary: crate::provider::Baked::SINGLETON_COLLATION_SPECIAL_PRIMARIES_V1
.numeric_primary,
compressible_bytes: {
const C: &[<u16 as AsULE>::ULE] =
crate::provider::Baked::SINGLETON_COLLATION_SPECIAL_PRIMARIES_V1
.last_primaries
.as_slice()
.as_ule_slice();
if C.len() == MaxVariable::Currency as usize + 16 {
let i = MaxVariable::Currency as usize;
#[allow(clippy::indexing_slicing)] // protected, const
&[
C[i],
C[i + 1],
C[i + 2],
C[i + 3],
C[i + 4],
C[i + 5],
C[i + 6],
C[i + 7],
C[i + 8],
C[i + 9],
C[i + 10],
C[i + 11],
C[i + 12],
C[i + 13],
C[i + 14],
C[i + 15],
]
} else {
CollationSpecialPrimariesValidated::HARDCODED_COMPRESSIBLE_BYTES_FALLBACK
}
},
}
};
let special_primaries = crate::provider::Baked::SINGLETON_COLLATION_SPECIAL_PRIMARIES_V1;

// Attribute belongs closer to `unwrap`, but
// https://github.com/rust-lang/rust/issues/15701
Expand Down Expand Up @@ -874,27 +772,7 @@ impl CollatorBorrowed<'static> {
}
}

// Expands to a `CollationElements::new(...)` call over `$chars`, filling in the
// remaining arguments from the fields of `$self` so that call sites do not have
// to repeat the full argument list.
//
// Arguments:
// * `$self` - expression whose `root`, `jamo`, `diacritics`, `decompositions`,
//   `tables` and `lithuanian_dot_above` fields are read.
// * `$chars` - passed as the first argument of `CollationElements::new`.
// * `$tailoring` - passed right after `$self.root`.
// * `$numeric_primary` - forwarded unchanged to `CollationElements::new`.
//
// Panics: the `unwrap` below panics if `$self.jamo.ce32s` does not contain
// exactly `JAMO_COUNT` elements.
macro_rules! collation_elements {
($self:expr, $chars:expr, $tailoring:expr, $numeric_primary:expr) => {{
// Reinterpret the jamo CE32 slice as a reference to a fixed-size array; the
// conversion yields `Err` unless the slice length equals `JAMO_COUNT`.
let jamo = <&[<u32 as AsULE>::ULE; JAMO_COUNT]>::try_from($self.jamo.ce32s.as_ule_slice());

// NOTE(review): presumably the length is validated when the collator is
// constructed, which would make this `unwrap` infallible — confirm.
let jamo = jamo.unwrap();

CollationElements::new(
$chars,
$self.root,
$tailoring,
jamo,
&$self.diacritics.secondaries,
$self.decompositions,
$self.tables,
$numeric_primary,
$self.lithuanian_dot_above,
)
}};
}

impl CollatorBorrowed<'_> {
impl<'a> CollatorBorrowed<'a> {
/// The resolved options showing how the default options, the requested options,
/// and the options from locale data were combined.
pub fn resolved_options(&self) -> ResolvedCollatorOptions {
Expand Down Expand Up @@ -971,7 +849,7 @@ impl CollatorBorrowed<'_> {
);

#[inline(always)]
fn tailoring_or_root(&self) -> &CollationData<'_> {
fn tailoring_or_root(&self) -> &'a CollationData<'a> {
if let Some(tailoring) = &self.tailoring {
tailoring
} else {
Expand Down Expand Up @@ -1052,8 +930,29 @@ impl CollatorBorrowed<'_> {

let tailoring = self.tailoring_or_root();
let numeric_primary = self.numeric_primary();
let mut left = collation_elements!(self, left_chars, tailoring, numeric_primary);
let mut right = collation_elements!(self, right_chars, tailoring, numeric_primary);
let jamo = self.jamo.as_array();
let mut left = CollationElements::new(
left_chars,
self.root,
tailoring,
jamo,
&self.diacritics.secondaries,
self.decompositions,
self.tables,
numeric_primary,
self.lithuanian_dot_above,
);
let mut right = CollationElements::new(
right_chars,
self.root,
tailoring,
self.jamo.as_array(),
&self.diacritics.secondaries,
self.decompositions,
self.tables,
numeric_primary,
self.lithuanian_dot_above,
);

// Start identical prefix

Expand Down Expand Up @@ -1921,8 +1820,17 @@ impl CollatorBorrowed<'_> {
// This algorithm comes from `CollationKeys::writeSortKeyUpToQuaternary` in ICU4C.
let levels = self.sort_key_levels();

let mut iter =
collation_elements!(self, iter, self.tailoring_or_root(), self.numeric_primary());
let mut iter = CollationElements::new(
iter,
self.root,
self.tailoring_or_root(),
self.jamo.as_array(),
&self.diacritics.secondaries,
self.decompositions,
self.tables,
self.numeric_primary(),
self.lithuanian_dot_above,
);
iter.init();
let variable_top = self.variable_top();

Expand Down
19 changes: 8 additions & 11 deletions components/collator/src/elements.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1502,16 +1502,7 @@ where
if (decomposition & !(BACKWARD_COMBINING_MARKER | NON_ROUND_TRIP_MARKER)) == 0 {
// The character is its own decomposition
let jamo_index = (c as usize).wrapping_sub(HANGUL_L_BASE as usize);
// Attribute belongs on an inner expression, but
// https://github.com/rust-lang/rust/issues/15701
#[expect(clippy::indexing_slicing)]
if jamo_index >= self.jamo.len() {
ce32 = data.ce32_for_char(c);
if ce32 == FALLBACK_CE32 {
data = self.root;
ce32 = data.ce32_for_char(c);
}
} else {
if let Some(&jamo) = self.jamo.get(jamo_index) {
// The purpose of reading the CE32 from the jamo table instead
// of the trie even in this case is to make it unnecessary
// for all search collation tries to carry a copy of the Hangul
Expand All @@ -1531,7 +1522,13 @@ where
data = self.root;
// `jamo` was already fetched with `get` in the enclosing `if let`, so no
// further indexing into `self.jamo` is needed here.
ce32 = CollationElement32::new_from_ule(self.jamo[jamo_index]);
ce32 = CollationElement32::new_from_ule(jamo);
} else {
ce32 = data.ce32_for_char(c);
if ce32 == FALLBACK_CE32 {
data = self.root;
ce32 = data.ce32_for_char(c);
}
}
if self.is_next_decomposition_starts_with_starter() {
if let Some(ce) = ce32.to_ce_simple_or_long_primary() {
Expand Down
Loading
Loading