Skip to content

Commit afbcd33

Browse files
committed
evolve special primaries data struct
1 parent 3bc2131 commit afbcd33

File tree

6 files changed

+74
-122
lines changed

6 files changed

+74
-122
lines changed

components/collator/src/comparison.rs

Lines changed: 4 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -37,8 +37,8 @@ use crate::provider::CollationMetadataV1;
3737
use crate::provider::CollationReordering;
3838
use crate::provider::CollationReorderingV1;
3939
use crate::provider::CollationRootV1;
40+
use crate::provider::CollationSpecialPrimaries;
4041
use crate::provider::CollationSpecialPrimariesV1;
41-
use crate::provider::CollationSpecialPrimariesValidated;
4242
use crate::provider::CollationTailoringV1;
4343
use core::cmp::Ordering;
4444
use core::convert::Infallible;
@@ -48,7 +48,6 @@ use icu_normalizer::provider::NormalizerNfdDataV1;
4848
use icu_normalizer::provider::NormalizerNfdTablesV1;
4949
use icu_normalizer::DecomposingNormalizerBorrowed;
5050
use icu_normalizer::Decomposition;
51-
use icu_provider::marker::ErasedMarker;
5251
use icu_provider::prelude::*;
5352
use smallvec::SmallVec;
5453
use utf16_iter::Utf16CharsEx;
@@ -546,7 +545,7 @@ impl LocaleSpecificDataHolder {
546545
/// Compares strings according to culturally-relevant ordering.
547546
#[derive(Debug)]
548547
pub struct Collator {
549-
special_primaries: DataPayload<ErasedMarker<CollationSpecialPrimariesValidated<'static>>>,
548+
special_primaries: DataPayload<CollationSpecialPrimariesV1>,
550549
root: DataPayload<CollationRootV1>,
551550
tailoring: Option<DataPayload<CollationTailoringV1>>,
552551
jamo: DataPayload<CollationJamoV1>,
@@ -646,8 +645,6 @@ impl Collator {
646645
let locale_dependent =
647646
LocaleSpecificDataHolder::try_new_unstable_internal(provider, prefs, options)?;
648647

649-
let special_primaries = special_primaries.map_project(|csp, _| csp.validated());
650-
651648
Ok(Collator {
652649
special_primaries,
653650
root,
@@ -693,7 +690,7 @@ macro_rules! compare {
693690
/// borrowed version.
694691
#[derive(Debug)]
695692
pub struct CollatorBorrowed<'a> {
696-
special_primaries: &'a CollationSpecialPrimariesValidated<'a>,
693+
special_primaries: &'a CollationSpecialPrimaries<'a>,
697694
root: &'a CollationData<'a>,
698695
tailoring: Option<&'a CollationData<'a>>,
699696
jamo: &'a CollationJamo<'a>,
@@ -720,13 +717,10 @@ impl CollatorBorrowed<'static> {
720717
let tables = icu_normalizer::provider::Baked::SINGLETON_NORMALIZER_NFD_TABLES_V1;
721718
let root = crate::provider::Baked::SINGLETON_COLLATION_ROOT_V1;
722719
let jamo = crate::provider::Baked::SINGLETON_COLLATION_JAMO_V1;
723-
724720
let locale_dependent =
725721
LocaleSpecificDataHolder::try_new_unstable_internal(provider, prefs, options)?;
726722

727-
let special_primaries = const {
728-
&crate::provider::Baked::SINGLETON_COLLATION_SPECIAL_PRIMARIES_V1.const_validated()
729-
};
723+
let special_primaries = crate::provider::Baked::SINGLETON_COLLATION_SPECIAL_PRIMARIES_V1;
730724

731725
// Attribute belongs closer to `unwrap`, but
732726
// https://github.com/rust-lang/rust/issues/15701

components/collator/src/provider.rs

Lines changed: 63 additions & 101 deletions
Original file line numberDiff line numberDiff line change
@@ -588,20 +588,22 @@ impl CollationMetadata {
588588
/// to be stable, their Rust representation might not be. Use with caution.
589589
/// </div>
590590
#[derive(Debug, PartialEq, Clone, yoke::Yokeable, zerofrom::ZeroFrom)]
591-
#[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake))]
591+
#[cfg_attr(feature = "datagen", derive(databake::Bake))]
592592
#[cfg_attr(feature = "datagen", databake(path = icu_collator::provider))]
593593
pub struct CollationSpecialPrimaries<'data> {
594594
/// The primaries corresponding to `MaxVariable`
595595
/// character classes packed so that each fits in
596596
/// 16 bits. Length must match the number of enum
597597
/// variants in `MaxVariable`, currently 4.
598-
///
599-
/// This is potentially followed by 256 bits
600-
/// (packed in 16 u16s) to classify every possible
601-
/// byte into compressible or non-compressible.
602598
pub last_primaries: ZeroVec<'data, u16>,
603599
/// The high 8 bits of the numeric primary
604600
pub numeric_primary: u8,
601+
/// 256 bits (packed in 16 u16s) to classify every possible
602+
/// byte into compressible or non-compressible.
603+
///
604+
/// In the serde encoding, this is appended to `last_primaries`,
605+
/// or might be missing (deserialization then substitutes a hardcoded default table).
606+
pub compressible_bytes: ZeroVec<'data, u16>,
605607
}
606608

607609
#[cfg(feature = "serde")]
@@ -613,126 +615,87 @@ impl<'de> serde::Deserialize<'de> for CollationSpecialPrimaries<'de> {
613615
#[derive(serde::Deserialize)]
614616
struct Raw<'data> {
615617
#[cfg_attr(feature = "serde", serde(borrow))]
616-
last_primaries: ZeroVec<'data, u16>,
618+
concatenated: &'data ZeroSlice<u16>,
617619
numeric_primary: u8,
618620
}
619621

620622
let Raw {
621-
last_primaries,
623+
concatenated,
622624
numeric_primary,
623625
} = Raw::deserialize(deserializer)?;
624626

625-
// `variant_count` isn't stable yet:
626-
// https://github.com/rust-lang/rust/issues/73662
627-
if last_primaries.len() <= (MaxVariable::Currency as usize) {
627+
let Some((l, c)) = concatenated
628+
.as_ule_slice()
629+
// `variant_count` isn't stable yet:
630+
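// https://github.com/rust-lang/rust/issues/73662
// NOTE(review): `MaxVariable::Currency as usize` looks like the index of the last variant, i.e. one less than the 4 entries documented for `last_primaries`; confirm whether the split point should be `MaxVariable::Currency as usize + 1`.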
631+
.split_at_checked(MaxVariable::Currency as usize)
632+
else {
628633
return Err(serde::de::Error::custom("invalid"));
634+
};
635+
636+
let last_primaries = ZeroSlice::from_ule_slice(l).as_zerovec();
637+
let mut compressible_bytes = ZeroSlice::from_ule_slice(c).as_zerovec();
638+
639+
if c.len() != 16 {
640+
compressible_bytes = zerovec::zerovec!(
641+
u16; <u16 as AsULE>::ULE::from_unsigned; [
642+
0b0000_0000_0000_0000,
643+
0b0000_0000_0000_0000,
644+
0b0000_0000_0000_0000,
645+
0b0000_0000_0000_0000,
646+
0b0000_0000_0000_0000,
647+
0b0000_0000_0000_0000,
648+
0b1111_1111_1111_1110,
649+
0b1111_1111_1111_1111,
650+
0b0000_0000_0000_0001,
651+
0b0000_0000_0000_0000,
652+
0b0000_0000_0000_0000,
653+
0b0000_0000_0000_0000,
654+
0b0000_0000_0000_0000,
655+
0b0000_0000_0000_0000,
656+
0b0000_0000_0000_0000,
657+
0b0100_0000_0000_0000,
658+
]);
629659
}
630660

631661
Ok(Self {
632662
last_primaries,
633663
numeric_primary,
664+
compressible_bytes,
634665
})
635666
}
636667
}
637668

638-
#[derive(Debug, PartialEq, Clone, yoke::Yokeable, zerofrom::ZeroFrom)]
639-
pub(crate) struct CollationSpecialPrimariesValidated<'data> {
640-
/// The primaries corresponding to `MaxVariable`
641-
/// character classes packed so that each fits in
642-
/// 16 bits. Length must match the number of enum
643-
/// variants in `MaxVariable`, currently 4.
644-
pub last_primaries: ZeroVec<'data, u16>,
645-
/// The high 8 bits of the numeric primary
646-
pub numeric_primary: u8,
647-
/// 256 bits (packed in 16 u16s) to classify every possible
648-
/// byte into compressible or non-compressible.
649-
pub compressible_bytes: &'data [<u16 as AsULE>::ULE; 16],
650-
}
651-
652-
impl<'a> CollationSpecialPrimaries<'a> {
653-
const HARDCODED_COMPRESSIBLE_BYTES_FALLBACK: &'static [<u16 as AsULE>::ULE; 16] = &[
654-
<u16 as AsULE>::ULE::from_unsigned(0b0000_0000_0000_0000),
655-
<u16 as AsULE>::ULE::from_unsigned(0b0000_0000_0000_0000),
656-
<u16 as AsULE>::ULE::from_unsigned(0b0000_0000_0000_0000),
657-
<u16 as AsULE>::ULE::from_unsigned(0b0000_0000_0000_0000),
658-
<u16 as AsULE>::ULE::from_unsigned(0b0000_0000_0000_0000),
659-
<u16 as AsULE>::ULE::from_unsigned(0b0000_0000_0000_0000),
660-
<u16 as AsULE>::ULE::from_unsigned(0b1111_1111_1111_1110),
661-
<u16 as AsULE>::ULE::from_unsigned(0b1111_1111_1111_1111),
662-
<u16 as AsULE>::ULE::from_unsigned(0b0000_0000_0000_0001),
663-
<u16 as AsULE>::ULE::from_unsigned(0b0000_0000_0000_0000),
664-
<u16 as AsULE>::ULE::from_unsigned(0b0000_0000_0000_0000),
665-
<u16 as AsULE>::ULE::from_unsigned(0b0000_0000_0000_0000),
666-
<u16 as AsULE>::ULE::from_unsigned(0b0000_0000_0000_0000),
667-
<u16 as AsULE>::ULE::from_unsigned(0b0000_0000_0000_0000),
668-
<u16 as AsULE>::ULE::from_unsigned(0b0000_0000_0000_0000),
669-
<u16 as AsULE>::ULE::from_unsigned(0b0100_0000_0000_0000),
670-
];
671-
672-
pub(crate) fn validated(self) -> CollationSpecialPrimariesValidated<'a> {
673-
let (last_primaries, compressible_bytes) =
674-
if let Some(borrowed) = self.last_primaries.as_maybe_borrowed() {
675-
let (l, c) = borrowed
676-
.as_ule_slice()
677-
// by invariant
678-
.split_at(MaxVariable::Currency as usize);
679-
(
680-
l,
681-
c.try_into()
682-
.unwrap_or(Self::HARDCODED_COMPRESSIBLE_BYTES_FALLBACK),
683-
)
684-
} else {
685-
(
686-
self.last_primaries.as_slice().as_ule_slice(),
687-
Self::HARDCODED_COMPRESSIBLE_BYTES_FALLBACK,
688-
)
689-
};
690-
691-
let last_primaries_truncate_len = last_primaries.len();
692-
CollationSpecialPrimariesValidated {
693-
last_primaries: self.last_primaries.truncated(last_primaries_truncate_len),
694-
numeric_primary: self.numeric_primary,
695-
compressible_bytes,
669+
#[cfg(feature = "datagen")]
670+
impl serde::Serialize for CollationSpecialPrimaries<'_> {
671+
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
672+
where
673+
S: serde::Serializer,
674+
{
675+
#[derive(serde::Serialize)]
676+
struct Raw {
677+
concatenated: ZeroVec<'static, u16>,
678+
numeric_primary: u8,
696679
}
697-
}
698680

699-
pub(crate) const fn const_validated(&'static self) -> CollationSpecialPrimariesValidated<'a> {
700-
let borrowed = self.last_primaries.as_slice();
701-
let (last_primaries, compressible_bytes) = borrowed
702-
.as_ule_slice()
703-
// by invariant
704-
.split_at(MaxVariable::Currency as usize);
705-
// TODO: use c.as_array() on MSRV 1.93
706-
let compressible_bytes = if compressible_bytes.len() == 16 {
707-
unsafe { &*(compressible_bytes.as_ptr() as *const [<u16 as AsULE>::ULE; 16]) }
708-
} else {
709-
Self::HARDCODED_COMPRESSIBLE_BYTES_FALLBACK
710-
};
711-
712-
CollationSpecialPrimariesValidated {
713-
last_primaries: ZeroSlice::from_ule_slice(last_primaries).as_zerovec(),
681+
Raw {
682+
concatenated: self
683+
.last_primaries
684+
.iter()
685+
.chain(self.compressible_bytes.iter())
686+
.collect(),
714687
numeric_primary: self.numeric_primary,
715-
compressible_bytes,
716688
}
689+
.serialize(serializer)
717690
}
718691
}
719692

720-
#[test]
721-
fn compressible_bytes() {
722-
assert_eq!(
723-
Baked::SINGLETON_COLLATION_SPECIAL_PRIMARIES_V1
724-
.clone()
725-
.validated(),
726-
Baked::SINGLETON_COLLATION_SPECIAL_PRIMARIES_V1.const_validated(),
727-
);
728-
}
729-
730693
icu_provider::data_struct!(
731694
CollationSpecialPrimaries<'_>,
732695
#[cfg(feature = "datagen")]
733696
);
734697

735-
impl CollationSpecialPrimariesValidated<'_> {
698+
impl CollationSpecialPrimaries<'_> {
736699
#[expect(clippy::unwrap_used)]
737700
pub(crate) fn last_primary_for_group(&self, max_variable: MaxVariable) -> u32 {
738701
// `unwrap` is OK, because `Collator::try_new` validates the length.
@@ -744,11 +707,10 @@ impl CollationSpecialPrimariesValidated<'_> {
744707

745708
#[allow(dead_code)]
746709
pub(crate) fn is_compressible(&self, b: u8) -> bool {
747-
// Indexing slicing OK by construction and pasting this
748-
// into Compiler Explorer shows that the panic
749-
// is optimized away.
750-
#[expect(clippy::indexing_slicing)]
751-
let field = u16::from_unaligned(self.compressible_bytes[usize::from(b >> 4)]);
710+
let field = self
711+
.compressible_bytes
712+
.get(usize::from(b >> 4))
713+
.unwrap_or_default();
752714
let mask = 1 << (b & 0b1111);
753715
(field & mask) != 0
754716
}

provider/data/collator/data/collation_special_primaries_v1.rs.data

Lines changed: 2 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

provider/data/collator/fingerprints.csv

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -181,7 +181,7 @@ collation/reordering/v1, und-Hans, -> und-Hani/pinyin
181181
collation/reordering/v1, und-Hant, -> und-Hani/stroke
182182
collation/reordering/v1, ur, -> ar
183183
collation/root/v1, <singleton>, 131040B, 130923B, 787ce37ea65e1e9
184-
collation/special/primaries/v1, <singleton>, 72B, 42B, 46181a77c61fe445
184+
collation/special/primaries/v1, <singleton>, 96B, 42B, 46181a77c61fe445
185185
collation/tailoring/v1, <lookup>, 608B, 102 identifiers
186186
collation/tailoring/v1, <total>, 920210B, 908458B, 93 unique payloads
187187
collation/tailoring/v1, af, 1004B, 877B, 8e3ca7ba0c0efe4b

provider/data/collator/stubdata/collation_special_primaries_v1.rs.data

Lines changed: 2 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

provider/source/src/collator/mod.rs

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -352,13 +352,9 @@ impl TryInto<CollationSpecialPrimaries<'static>> for &collator_serde::CollationS
352352
}
353353

354354
Ok(CollationSpecialPrimaries {
355-
last_primaries: self
356-
.last_primaries
357-
.iter()
358-
.copied()
359-
.chain(packed_compressible_bytes)
360-
.collect(),
355+
last_primaries: self.last_primaries.iter().copied().collect(),
361356
numeric_primary: self.numeric_primary,
357+
compressible_bytes: packed_compressible_bytes.into_iter().collect(),
362358
})
363359
}
364360
}

0 commit comments

Comments
 (0)