Skip to content

Commit be9f63d

Browse files
committed
check struct invariants during deserialization
1 parent 0f348cc commit be9f63d

File tree

3 files changed

+101
-71
lines changed

3 files changed

+101
-71
lines changed

components/collator/src/comparison.rs

Lines changed: 24 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ use crate::elements::FALLBACK_CE32;
2121
use crate::elements::NON_ROUND_TRIP_MARKER;
2222
use crate::elements::{
2323
char_from_u32, CollationElement, CollationElements, NonPrimary, FFFD_CE32,
24-
HANGUL_SYLLABLE_MARKER, HIGH_ZEROS_MASK, JAMO_COUNT, LOW_ZEROS_MASK, NO_CE, NO_CE_PRIMARY,
24+
HANGUL_SYLLABLE_MARKER, HIGH_ZEROS_MASK, LOW_ZEROS_MASK, NO_CE, NO_CE_PRIMARY,
2525
NO_CE_QUATERNARY, NO_CE_SECONDARY, NO_CE_TERTIARY, OPTIMIZED_DIACRITICS_MAX_COUNT,
2626
QUATERNARY_MASK,
2727
};
@@ -43,7 +43,7 @@ use crate::provider::CollationSpecialPrimariesV1;
4343
use crate::provider::CollationSpecialPrimariesValidated;
4444
use crate::provider::CollationTailoringV1;
4545
use core::cmp::Ordering;
46-
use core::convert::{Infallible, TryFrom};
46+
use core::convert::Infallible;
4747
use icu_normalizer::provider::DecompositionData;
4848
use icu_normalizer::provider::DecompositionTables;
4949
use icu_normalizer::provider::NormalizerNfdDataV1;
@@ -55,7 +55,6 @@ use icu_provider::prelude::*;
5555
use smallvec::SmallVec;
5656
use utf16_iter::Utf16CharsEx;
5757
use utf8_iter::Utf8CharsEx;
58-
use zerovec::ule::AsULE;
5958

6059
// Special sort key bytes for all levels.
6160
const LEVEL_SEPARATOR_BYTE: u8 = 1;
@@ -649,16 +648,6 @@ impl Collator {
649648
let locale_dependent =
650649
LocaleSpecificDataHolder::try_new_unstable_internal(provider, prefs, options)?;
651650

652-
// TODO: redesign Korean search collation handling
653-
if jamo.get().ce32s.len() != JAMO_COUNT {
654-
return Err(DataError::custom("invalid").with_marker(CollationJamoV1::INFO));
655-
}
656-
657-
// `variant_count` isn't stable yet:
658-
// https://github.com/rust-lang/rust/issues/73662
659-
if special_primaries.get().last_primaries.len() <= (MaxVariable::Currency as usize) {
660-
return Err(DataError::custom("invalid").with_marker(CollationSpecialPrimariesV1::INFO));
661-
}
662651
let special_primaries = special_primaries.map_project(|csp, _| {
663652
let compressible_bytes = (csp.last_primaries.len()
664653
== MaxVariable::Currency as usize + 16)
@@ -758,25 +747,6 @@ impl CollatorBorrowed<'static> {
758747
let locale_dependent =
759748
LocaleSpecificDataHolder::try_new_unstable_internal(provider, prefs, options)?;
760749

761-
// TODO: redesign Korean search collation handling
762-
const _: () = assert!(
763-
crate::provider::Baked::SINGLETON_COLLATION_JAMO_V1
764-
.ce32s
765-
.as_slice()
766-
.len()
767-
== JAMO_COUNT
768-
);
769-
770-
// `variant_count` isn't stable yet:
771-
// https://github.com/rust-lang/rust/issues/73662
772-
const _: () = assert!(
773-
crate::provider::Baked::SINGLETON_COLLATION_SPECIAL_PRIMARIES_V1
774-
.last_primaries
775-
.as_slice()
776-
.len()
777-
> (MaxVariable::Currency as usize)
778-
);
779-
780750
let special_primaries = const {
781751
&CollationSpecialPrimariesValidated {
782752
last_primaries: zerovec::ZeroSlice::from_ule_slice(
@@ -791,7 +761,7 @@ impl CollatorBorrowed<'static> {
791761
numeric_primary: crate::provider::Baked::SINGLETON_COLLATION_SPECIAL_PRIMARIES_V1
792762
.numeric_primary,
793763
compressible_bytes: {
794-
const C: &[<u16 as AsULE>::ULE] =
764+
const C: &[<u16 as zerovec::ule::AsULE>::ULE] =
795765
crate::provider::Baked::SINGLETON_COLLATION_SPECIAL_PRIMARIES_V1
796766
.last_primaries
797767
.as_slice()
@@ -874,27 +844,25 @@ impl CollatorBorrowed<'static> {
874844
}
875845
}
876846

877-
macro_rules! collation_elements {
878-
($self:expr, $chars:expr, $tailoring:expr, $numeric_primary:expr) => {{
879-
let jamo = <&[<u32 as AsULE>::ULE; JAMO_COUNT]>::try_from($self.jamo.ce32s.as_ule_slice());
880-
881-
let jamo = jamo.unwrap();
882-
847+
impl<'a> CollatorBorrowed<'a> {
848+
fn collation_elements<C: Iterator<Item = char>>(
849+
&self,
850+
chars: C,
851+
tailoring: &'a CollationData<'a>,
852+
numeric_primary: Option<u8>,
853+
) -> CollationElements<'a, C> {
883854
CollationElements::new(
884-
$chars,
885-
$self.root,
886-
$tailoring,
887-
jamo,
888-
&$self.diacritics.secondaries,
889-
$self.decompositions,
890-
$self.tables,
891-
$numeric_primary,
892-
$self.lithuanian_dot_above,
855+
chars,
856+
self.root,
857+
tailoring,
858+
self.jamo.as_array(),
859+
&self.diacritics.secondaries,
860+
self.decompositions,
861+
self.tables,
862+
numeric_primary,
863+
self.lithuanian_dot_above,
893864
)
894-
}};
895-
}
896-
897-
impl CollatorBorrowed<'_> {
865+
}
898866
/// The resolved options showing how the default options, the requested options,
899867
/// and the options from locale data were combined.
900868
pub fn resolved_options(&self) -> ResolvedCollatorOptions {
@@ -971,7 +939,7 @@ impl CollatorBorrowed<'_> {
971939
);
972940

973941
#[inline(always)]
974-
fn tailoring_or_root(&self) -> &CollationData<'_> {
942+
fn tailoring_or_root(&self) -> &'a CollationData<'a> {
975943
if let Some(tailoring) = &self.tailoring {
976944
tailoring
977945
} else {
@@ -1052,8 +1020,8 @@ impl CollatorBorrowed<'_> {
10521020

10531021
let tailoring = self.tailoring_or_root();
10541022
let numeric_primary = self.numeric_primary();
1055-
let mut left = collation_elements!(self, left_chars, tailoring, numeric_primary);
1056-
let mut right = collation_elements!(self, right_chars, tailoring, numeric_primary);
1023+
let mut left = self.collation_elements(left_chars, tailoring, numeric_primary);
1024+
let mut right = self.collation_elements(right_chars, tailoring, numeric_primary);
10571025

10581026
// Start identical prefix
10591027

@@ -1922,7 +1890,7 @@ impl CollatorBorrowed<'_> {
19221890
let levels = self.sort_key_levels();
19231891

19241892
let mut iter =
1925-
collation_elements!(self, iter, self.tailoring_or_root(), self.numeric_primary());
1893+
self.collation_elements(iter, self.tailoring_or_root(), self.numeric_primary());
19261894
iter.init();
19271895
let variable_top = self.variable_top();
19281896

components/collator/src/elements.rs

Lines changed: 8 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1502,16 +1502,7 @@ where
15021502
if (decomposition & !(BACKWARD_COMBINING_MARKER | NON_ROUND_TRIP_MARKER)) == 0 {
15031503
// The character is its own decomposition
15041504
let jamo_index = (c as usize).wrapping_sub(HANGUL_L_BASE as usize);
1505-
// Attribute belongs on an inner expression, but
1506-
// https://github.com/rust-lang/rust/issues/15701
1507-
#[expect(clippy::indexing_slicing)]
1508-
if jamo_index >= self.jamo.len() {
1509-
ce32 = data.ce32_for_char(c);
1510-
if ce32 == FALLBACK_CE32 {
1511-
data = self.root;
1512-
ce32 = data.ce32_for_char(c);
1513-
}
1514-
} else {
1505+
if let Some(&jamo) = self.jamo.get(jamo_index) {
15151506
// The purpose of reading the CE32 from the jamo table instead
15161507
// of the trie even in this case is to make it unnecessary
15171508
// for all search collation tries to carry a copy of the Hangul
@@ -1531,7 +1522,13 @@ where
15311522
data = self.root;
15321523
// Index in range by construction above. Not using `get` with
15331524
// `if let` in order to put the likely branch first.
1534-
ce32 = CollationElement32::new_from_ule(self.jamo[jamo_index]);
1525+
ce32 = CollationElement32::new_from_ule(jamo);
1526+
} else {
1527+
ce32 = data.ce32_for_char(c);
1528+
if ce32 == FALLBACK_CE32 {
1529+
data = self.root;
1530+
ce32 = data.ce32_for_char(c);
1531+
}
15351532
}
15361533
if self.is_next_decomposition_starts_with_starter() {
15371534
if let Some(ce) = ce32.to_ce_simple_or_long_primary() {

components/collator/src/provider.rs

Lines changed: 69 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -323,14 +323,50 @@ icu_provider::data_struct!(
323323
#[derive(Debug, PartialEq, Clone, yoke::Yokeable, zerofrom::ZeroFrom)]
324324
#[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake))]
325325
#[cfg_attr(feature = "datagen", databake(path = icu_collator::provider))]
326-
#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
327326
pub struct CollationJamo<'data> {
328327
/// `CollationElement32`s (as `u32`s) for the Hangul Jamo Unicode Block.
329328
/// The length must be equal to the size of the block (256).
330-
#[cfg_attr(feature = "serde", serde(borrow))]
331329
pub ce32s: ZeroVec<'data, u32>,
332330
}
333331

332+
impl<'data> CollationJamo<'data> {
333+
pub(crate) fn as_array(
334+
&'data self,
335+
) -> &'data [<u32 as AsULE>::ULE; crate::elements::JAMO_COUNT] {
336+
#[allow(clippy::unwrap_used)] // by invariant
337+
self.ce32s.as_ule_slice().try_into().unwrap()
338+
}
339+
}
340+
341+
// TODO: redesign Korean search collation handling
342+
343+
#[cfg(feature = "compiled_data")]
344+
const _: () = assert!(
345+
Baked::SINGLETON_COLLATION_JAMO_V1.ce32s.as_slice().len() == crate::elements::JAMO_COUNT
346+
);
347+
348+
#[cfg(feature = "serde")]
349+
impl<'de> serde::Deserialize<'de> for CollationJamo<'de> {
350+
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
351+
where
352+
D: serde::Deserializer<'de>,
353+
{
354+
#[derive(serde::Deserialize)]
355+
struct Raw<'data> {
356+
#[cfg_attr(feature = "serde", serde(borrow))]
357+
ce32s: ZeroVec<'data, u32>,
358+
}
359+
360+
let Raw { ce32s } = Raw::deserialize(deserializer)?;
361+
362+
if ce32s.len() != crate::elements::JAMO_COUNT {
363+
return Err(serde::de::Error::custom("invalid"));
364+
}
365+
366+
Ok(Self { ce32s })
367+
}
368+
}
369+
334370
icu_provider::data_struct!(
335371
CollationJamo<'_>,
336372
#[cfg(feature = "datagen")]
@@ -554,7 +590,6 @@ impl CollationMetadata {
554590
#[derive(Debug, PartialEq, Clone, yoke::Yokeable, zerofrom::ZeroFrom)]
555591
#[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake))]
556592
#[cfg_attr(feature = "datagen", databake(path = icu_collator::provider))]
557-
#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
558593
pub struct CollationSpecialPrimaries<'data> {
559594
/// The primaries corresponding to `MaxVariable`
560595
/// character classes packed so that each fits in
@@ -564,12 +599,42 @@ pub struct CollationSpecialPrimaries<'data> {
564599
/// This is potentially followed by 256 bits
565600
/// (packed in 16 u16s) to classify every possible
566601
/// byte into compressible or non-compressible.
567-
#[cfg_attr(feature = "serde", serde(borrow))]
568602
pub last_primaries: ZeroVec<'data, u16>,
569603
/// The high 8 bits of the numeric primary
570604
pub numeric_primary: u8,
571605
}
572606

607+
#[cfg(feature = "serde")]
608+
impl<'de> serde::Deserialize<'de> for CollationSpecialPrimaries<'de> {
609+
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
610+
where
611+
D: serde::Deserializer<'de>,
612+
{
613+
#[derive(serde::Deserialize)]
614+
struct Raw<'data> {
615+
#[cfg_attr(feature = "serde", serde(borrow))]
616+
last_primaries: ZeroVec<'data, u16>,
617+
numeric_primary: u8,
618+
}
619+
620+
let Raw {
621+
last_primaries,
622+
numeric_primary,
623+
} = Raw::deserialize(deserializer)?;
624+
625+
// `variant_count` isn't stable yet:
626+
// https://github.com/rust-lang/rust/issues/73662
627+
if last_primaries.len() <= (MaxVariable::Currency as usize) {
628+
return Err(serde::de::Error::custom("invalid"));
629+
}
630+
631+
Ok(Self {
632+
last_primaries,
633+
numeric_primary,
634+
})
635+
}
636+
}
637+
573638
#[derive(Debug, PartialEq, Clone, yoke::Yokeable, zerofrom::ZeroFrom)]
574639
pub(crate) struct CollationSpecialPrimariesValidated<'data> {
575640
/// The primaries corresponding to `MaxVariable`

0 commit comments

Comments
 (0)