Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 7 additions & 7 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 3 additions & 3 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -262,9 +262,9 @@ smallvec = { version = "1.10.0", default-features = false }
stable_deref_trait = { version = "1.2.0", default-features = false }
twox-hash = { version = "2.0.0", default-features = false, features = ["xxhash64"] }
unicode-bidi = { version = "0.3.11", default-features = false }
utf16_iter = { version = "1.0.2", default-features = false }
utf8_iter = { version = "1.0.2", default-features = false }
write16 = { version = "1.0.0", default-features = false }
utf16_iter = { path = "../utf16_iter", default-features = false }
utf8_iter = { path = "../utf8_iter", default-features = false }
write16 = { path = "../write16", default-features = false }

## External Deps Group 2: Heavy Dev and Datagen deps. No default features.
zip = { version = "2", default-features = false }
Expand Down
4 changes: 2 additions & 2 deletions components/collator/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,8 @@ icu_normalizer = { workspace = true, features = ["utf8_iter", "utf16_iter"] }
icu_locale_core = { workspace = true, features = ["alloc"] }
icu_properties = { workspace = true }
icu_provider = { workspace = true }
utf8_iter = { workspace = true }
utf16_iter = { workspace = true }
utf16_iter = { path = "../../../utf16_iter", features = ["icu_collections"] }
utf8_iter = { path = "../../../utf8_iter", features = ["icu_collections"] }
smallvec = { workspace = true, features = ["union", "const_generics", "const_new"] } # alloc
zerovec = { workspace = true }

Expand Down
128 changes: 69 additions & 59 deletions components/collator/src/comparison.rs
Original file line number Diff line number Diff line change
Expand Up @@ -44,19 +44,32 @@ use crate::provider::CollationSpecialPrimariesValidated;
use crate::provider::CollationTailoringV1;
use core::cmp::Ordering;
use core::convert::{Infallible, TryFrom};
use icu_collections::codepointtrie::AbstractCodePointTrie;
use icu_collections::codepointtrie::CharsWithTrieDefaultForAsciiEx;
#[cfg(feature = "serde")]
use icu_collections::codepointtrie::CodePointTrie;
#[cfg(not(feature = "serde"))]
use icu_collections::codepointtrie::FastCodePointTrie;
#[cfg(feature = "latin1")]
use icu_collections::codepointtrie::Latin1CharsWithTrieEx;
use icu_collections::codepointtrie::WithTrie;
use icu_normalizer::provider::DecompositionData;
use icu_normalizer::provider::DecompositionTables;
use icu_normalizer::provider::NormalizerNfdDataV1;
use icu_normalizer::provider::NormalizerNfdTablesV1;
use icu_normalizer::DecomposingNormalizerBorrowed;
use icu_normalizer::Decomposition;
use icu_provider::marker::ErasedMarker;
use icu_provider::prelude::*;
use smallvec::SmallVec;
use utf16_iter::Utf16CharsEx;
use utf8_iter::Utf8CharsEx;
use utf16_iter::Utf16CharsWithTrieEx;
use utf8_iter::Utf8CharsWithTrieDefaultForAsciiEx;
use zerovec::ule::AsULE;

/// Trie type (with `u32` values) used when the `serde` Cargo feature is
/// enabled. `CollatorBorrowed::norm_trie` returns a reference to this type.
#[cfg(feature = "serde")]
type NormTrie<'trie> = CodePointTrie<'trie, u32>;

/// Trie type (with `u32` values) used when the `serde` Cargo feature is
/// disabled. `CollatorBorrowed::norm_trie` returns a reference to this type.
#[cfg(not(feature = "serde"))]
type NormTrie<'trie> = FastCodePointTrie<'trie, u32>;

// Special sort key bytes for all levels.
const LEVEL_SEPARATOR_BYTE: u8 = 1;

Expand Down Expand Up @@ -187,21 +200,6 @@ fn in_inclusive_range16(i: u16, start: u16, end: u16) -> bool {
i.wrapping_sub(start) <= (end - start)
}

/// Extension trait that exposes Latin1-encoded bytes as a sequence of `char`s.
///
/// ✨ *Enabled with the `latin1` Cargo feature.*
#[cfg(feature = "latin1")]
trait Latin1Chars {
    /// Returns a double-ended iterator yielding one `char` per Latin1 byte.
    fn latin1_chars(&self) -> impl DoubleEndedIterator<Item = char>;
}

#[cfg(feature = "latin1")]
impl Latin1Chars for [u8] {
    // Every byte is widened with `char::from(u8)`, i.e. it becomes the `char`
    // whose scalar value equals the byte value.
    fn latin1_chars(&self) -> impl DoubleEndedIterator<Item = char> {
        self.iter().copied().map(char::from)
    }
}

/// Finds the identical prefix of `left` and `right` containing
/// Latin1.
///
Expand Down Expand Up @@ -712,10 +710,12 @@ macro_rules! compare {
if left_tail.is_empty() && right_tail.is_empty() {
return Ordering::Equal;
}
let ret = self.compare_impl(left_tail.$left_to_iter(), right_tail.$right_to_iter(), head.$left_to_iter().rev());
let norm_trie = self.norm_trie();
let ret = self.compare_impl(left_tail.$left_to_iter(norm_trie), right_tail.$right_to_iter(norm_trie), head.$left_to_iter(norm_trie).rev());
if self.options.strength() == Strength::Identical && ret == Ordering::Equal {
return Decomposition::new(left_tail.$left_to_iter(), self.decompositions, self.tables).map(|c| if c != MERGE_SEPARATOR { c as i32 } else { -1i32 }).cmp(
Decomposition::new(right_tail.$right_to_iter(), self.decompositions, self.tables).map(|c| if c != MERGE_SEPARATOR { c as i32 } else { -1i32 }),
// We don't need to remove the leading U+0000, because it compares equal anyway.
return icu_normalizer::new_decomposition(left_tail.$left_to_iter(norm_trie), self.tables).map(|c| if c != MERGE_SEPARATOR { c as i32 } else { -1i32 }).cmp(
icu_normalizer::new_decomposition(right_tail.$right_to_iter(norm_trie), self.tables).map(|c| if c != MERGE_SEPARATOR { c as i32 } else { -1i32 }),
);
}
ret
Expand Down Expand Up @@ -886,30 +886,35 @@ macro_rules! collation_elements {
$tailoring,
jamo,
&$self.diacritics.secondaries,
$self.decompositions,
$self.tables,
$numeric_primary,
$self.lithuanian_dot_above,
)
}};
}

impl CollatorBorrowed<'_> {
impl<'data> CollatorBorrowed<'data> {
/// Reports the options that are actually in effect for this collator, i.e.
/// the outcome of combining the default options, the requested options,
/// and the options from locale data.
pub fn resolved_options(&self) -> ResolvedCollatorOptions {
    // Same `Into` conversion as a method call; the target type is fixed by
    // the return type.
    Into::into(self.options)
}

/// Borrows the trie stored in `self.decompositions` as a `NormTrie`.
///
/// # Panics
///
/// Hits `unreachable!("Incompatible data")` if the `TryFrom` conversion to
/// `&NormTrie` fails.
// NOTE(review): presumably the conversion can only fail if the loaded data
// uses a trie type that does not match the build configuration — confirm.
fn norm_trie(&self) -> &'data NormTrie<'data> {
    // The conversion may be an identity conversion depending on which
    // `NormTrie` alias is active, hence the lint allowance.
    #[allow(clippy::useless_conversion)]
    let converted = <&NormTrie<'data>>::try_from(&self.decompositions.trie);
    match converted {
        Ok(trie) => trie,
        Err(_) => unreachable!("Incompatible data"),
    }
}

compare!(
/// Compare guaranteed well-formed UTF-8 slices.
,
compare,
str,
str,
split_prefix,
chars,
chars,
chars_with_trie_default_for_ascii,
chars_with_trie_default_for_ascii,
);

compare!(
Expand All @@ -921,8 +926,8 @@ impl CollatorBorrowed<'_> {
[u8],
[u8],
split_prefix_u8,
chars,
chars,
chars_with_trie_default_for_ascii,
chars_with_trie_default_for_ascii,
);

compare!(
Expand All @@ -933,8 +938,8 @@ impl CollatorBorrowed<'_> {
[u16],
[u16],
split_prefix_u16,
chars,
chars,
chars_with_trie,
chars_with_trie,
);

compare!(
Expand All @@ -947,8 +952,8 @@ impl CollatorBorrowed<'_> {
[u8],
[u8],
split_prefix_latin1,
latin1_chars,
latin1_chars,
latin1_chars_with_trie,
latin1_chars_with_trie,
);

compare!(
Expand All @@ -966,8 +971,8 @@ impl CollatorBorrowed<'_> {
[u8],
[u16],
split_prefix_latin1_utf16,
latin1_chars,
chars,
latin1_chars_with_trie,
chars_with_trie,
);

#[inline(always)]
Expand Down Expand Up @@ -1018,16 +1023,18 @@ impl CollatorBorrowed<'_> {
/// `head_chars` is an iterator _backward_ over the identical
/// prefix and `left_chars` and `right_chars` are iterators
/// _forward_ over the parts after the identical prefix.
fn compare_impl<
L: Iterator<Item = char>,
R: Iterator<Item = char>,
H: Iterator<Item = char>,
>(
&self,
fn compare_impl<L, R, H, T>(
&'data self,
left_chars: L,
right_chars: R,
mut head_chars: H,
) -> Ordering {
) -> Ordering
where
L: Iterator<Item = (char, u32)> + WithTrie<'data, T, u32> + 'data,
R: Iterator<Item = (char, u32)> + WithTrie<'data, T, u32> + 'data,
H: Iterator<Item = (char, u32)> + 'data,
T: AbstractCodePointTrie<'data, u32> + 'data,
{
// Sadly, it looks like variable CEs and backward second level
// require us to store the full 64-bit CEs instead of storing only
// the NonPrimary part.
Expand All @@ -1052,6 +1059,7 @@ impl CollatorBorrowed<'_> {

let tailoring = self.tailoring_or_root();
let numeric_primary = self.numeric_primary();

let mut left = collation_elements!(self, left_chars, tailoring, numeric_primary);
let mut right = collation_elements!(self, right_chars, tailoring, numeric_primary);

Expand All @@ -1077,11 +1085,10 @@ impl CollatorBorrowed<'_> {
// This loop is only broken out of as goto forward.
#[expect(clippy::never_loop)]
'prefix: loop {
if let Some(mut head_last_c) = head_chars.next() {
let norm_trie = &self.decompositions.trie;
if let Some((mut head_last_c, head_last_trie_val)) = head_chars.next() {
let mut head_last = CharacterAndClassAndTrieValue::new_with_trie_val(
head_last_c,
norm_trie.get(head_last_c),
head_last_trie_val,
);
let mut head_last_ce32 = CollationElement32::default();
let mut head_last_ok = false;
Expand Down Expand Up @@ -1274,14 +1281,12 @@ impl CollatorBorrowed<'_> {
tail_first_ce32 = head_last_ce32;
tail_first_ok = head_last_ok;

head_last_c = if let Some(head_last_c) = head_chars.next() {
head_last_c
} else {
let Some((head_last_c_new, decomposition)) = head_chars.next() else {
// We need to step back beyond the start of the prefix.
// Treat as good boundary.
break 'prefix;
};
let decomposition = norm_trie.get(head_last_c);
head_last_c = head_last_c_new;
head_last = CharacterAndClassAndTrieValue::new_with_trie_val(
head_last_c,
decomposition,
Expand Down Expand Up @@ -1852,7 +1857,7 @@ impl CollatorBorrowed<'_> {
S: CollationKeySink + ?Sized,
S::State: Default,
{
self.write_sort_key_impl(s.chars(), sink)
self.write_sort_key_impl(s.chars_with_trie_default_for_ascii(self.norm_trie()), sink)
}

/// Given potentially invalid UTF-8, write the sort key bytes up to the collator's strength.
Expand All @@ -1863,7 +1868,7 @@ impl CollatorBorrowed<'_> {
S: CollationKeySink + ?Sized,
S::State: Default,
{
self.write_sort_key_impl(s.chars(), sink)
self.write_sort_key_impl(s.chars_with_trie_default_for_ascii(self.norm_trie()), sink)
}

/// Given potentially invalid UTF-16, write the sort key bytes up to the collator's strength.
Expand All @@ -1874,12 +1879,17 @@ impl CollatorBorrowed<'_> {
S: CollationKeySink + ?Sized,
S::State: Default,
{
self.write_sort_key_impl(s.chars(), sink)
self.write_sort_key_impl(s.chars_with_trie(self.norm_trie()), sink)
}

fn write_sort_key_impl<I, S>(&self, iter: I, sink: &mut S) -> Result<S::Output, S::Error>
fn write_sort_key_impl<I, T, S>(
&'data self,
iter: I,
sink: &mut S,
) -> Result<S::Output, S::Error>
where
I: Iterator<Item = char> + Clone,
I: Iterator<Item = (char, u32)> + WithTrie<'data, T, u32> + Clone + 'data,
T: AbstractCodePointTrie<'data, u32> + 'data,
S: CollationKeySink + ?Sized,
S::State: Default,
{
Expand All @@ -1893,11 +1903,10 @@ impl CollatorBorrowed<'_> {
self.write_sort_key_up_to_quaternary(iter, sink, &mut state)?;

if let Some(iter) = identical {
let nfd =
DecomposingNormalizerBorrowed::new_with_data(self.decompositions, self.tables);
sink.write_byte(&mut state, LEVEL_SEPARATOR_BYTE)?;

let iter = nfd.normalize_iter(iter);
let mut iter = icu_normalizer::new_decomposition(iter, self.tables);
let _ = iter.next(); // Discard the U+0000.
write_identical_level(iter, sink, &mut state)?;
}

Expand All @@ -1908,14 +1917,15 @@ impl CollatorBorrowed<'_> {
///
/// Optionally write the case level. Separate levels with the `LEVEL_SEPARATOR_BYTE`, but
/// do not write a terminating zero as with a C string.
fn write_sort_key_up_to_quaternary<I, S>(
&self,
fn write_sort_key_up_to_quaternary<I, S, T>(
&'data self,
iter: I,
sink: &mut S,
state: &mut S::State,
) -> Result<(), S::Error>
where
I: Iterator<Item = char>,
I: Iterator<Item = (char, u32)> + WithTrie<'data, T, u32> + Clone + 'data,
T: AbstractCodePointTrie<'data, u32> + 'data,
S: CollationKeySink + ?Sized,
{
// This algorithm comes from `CollationKeys::writeSortKeyUpToQuaternary` in ICU4C.
Expand Down
Loading
Loading