Skip to content

Commit d7abc2e

Browse files
committed
Fix TrailingCase::Unchanged handling for Dutch
1 parent 523b814 commit d7abc2e

File tree

2 files changed

+280
-158
lines changed

2 files changed

+280
-158
lines changed

components/casemap/src/internals.rs

Lines changed: 172 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -65,11 +65,38 @@ impl<'a, const IS_TITLE_CONTEXT: bool> Writeable for FullCaseWriteable<'a, '_, I
6565
let src = self.src;
6666
let mut mapping = self.mapping;
6767
let mut iter = src.char_indices();
68+
// Number of characters to titlecase at the beginning
69+
// for the Dutch IJ case.
70+
//
71+
// This cannot be handled in full_helper: this affects the trailing case logic. All other special casing
72+
// situations are able to cope by only titlecasing the first character; Dutch is the only place where
73+
// the first two base characters get uppercased.
74+
let mut dutch_titlecase_count = if IS_TITLE_CONTEXT && self.locale == CaseMapLocale::Dutch {
75+
dutch_ij_pair_at_beginning_count(src, self.data)
76+
} else {
77+
None
78+
};
6879
for (i, c) in &mut iter {
6980
let context = ContextIterator::new(&src[..i], &src[i..]);
7081
self.data
7182
.full_helper::<IS_TITLE_CONTEXT, W>(c, context, self.locale, mapping, sink)?;
7283
if IS_TITLE_CONTEXT {
84+
// Check if we're uppercasing a Dutch IJ
85+
if let Some(count) = dutch_titlecase_count {
86+
// If we are, we want to wait `count` characters
87+
// before we switch to lowercasing (or TrailingCase::Unchanged)
88+
if count > 1 {
89+
// We still have code points to process
90+
dutch_titlecase_count = Some(count - 1);
91+
// Continue the loop to skip the mode switching code below
92+
continue;
93+
} else {
94+
// We would have been down to zero. Time to continue the loop as normal.
95+
dutch_titlecase_count = None;
96+
}
97+
}
98+
99+
// If titlecasing, switch the mode to lowercasing/TrailingCase::Unchanged
73100
if self.titlecase_tail_casing == TrailingCase::Lower {
74101
mapping = MappingKind::Lower;
75102
} else {
@@ -211,18 +238,6 @@ impl<'data> CaseMap<'data> {
211238
!IS_TITLE_CONTEXT || kind == MappingKind::Title || kind == MappingKind::Lower
212239
);
213240

214-
// ICU4C's non-standard extension for Dutch IJ titlecasing
215-
// handled here instead of in full_lower_special_case because J does not have conditional
216-
// special casemapping.
217-
if IS_TITLE_CONTEXT && locale == CaseMapLocale::Dutch && kind == MappingKind::Lower {
218-
// When titlecasing, a J found immediately after an I at the beginning of the segment
219-
// should also uppercase. They are both allowed to have an acute accent but it must
220-
// be present on both letters or neither. They may not have any other combining marks.
221-
if (c == 'j' || c == 'J') && context.is_dutch_ij_pair_at_beginning(self) {
222-
return sink.write_char('J');
223-
}
224-
}
225-
226241
// ICU4C's non-standard extension for Greek uppercasing:
227242
// https://icu.unicode.org/design/case/greek-upper.
228243
// Effectively removes Greek accents from Greek vowels during uppercasing,
@@ -809,56 +824,156 @@ impl<'a> ContextIterator<'a> {
809824
}
810825
false
811826
}
827+
}
812828

813-
/// Checks the preceding and surrounding context of a j or J
814-
/// and returns true if it is preceded by an i or I at the start of the string.
815-
/// If one has an acute accent,
816-
/// both must have the accent for this to return true. No other accents are handled.
817-
fn is_dutch_ij_pair_at_beginning(&self, mapping: &CaseMap) -> bool {
818-
let mut before = self.before.chars().rev();
819-
let mut i_has_acute = false;
820-
loop {
821-
match before.next() {
822-
Some('i') | Some('I') => break,
823-
Some('í') | Some('Í') => {
824-
i_has_acute = true;
825-
break;
829+
/// Data on an `i`, `I`, `í`, or `Í` found at the beginning of a string.
///
/// This is the result type of `dutch_i_at_beginning`; it records how much of
/// the string the (possibly accented) i occupied and what follows it.
#[derive(PartialEq, Eq, Debug, Clone, Copy)]
struct DutchIData<'a> {
    /// The rest of the string after this i. When a combining acute accent
    /// directly following the i was consumed, it is excluded as well.
    rest: &'a str,
    /// Whether the i carries an acute accent, either as a precomposed
    /// `í`/`Í` or as a combining acute accent directly after a plain `i`/`I`.
    has_acute: bool,
    /// Number of code points consumed: 1 for a plain or precomposed i,
    /// 2 for an i followed by a combining acute accent.
    char_count: usize,
}
838+
839+
/// Is there an i at the beginning of the string which may be relevant
840+
/// for Dutch titlecasing?
841+
fn dutch_i_at_beginning(s: &'_ str) -> Option<DutchIData<'_>> {
842+
let mut chars = s.chars();
843+
match chars.next() {
844+
Some('i') | Some('I') => {
845+
let rest = chars.as_str();
846+
match chars.next() {
847+
Some(ACUTE) => {
848+
// We have consumed an i and an acute accent.
849+
// So chars.as_str() will have the rest of the string
850+
return Some(DutchIData {
851+
rest: chars.as_str(),
852+
has_acute: true,
853+
char_count: 2,
854+
});
855+
}
856+
_ => {
857+
// We have consumed an i and a non-acute accent character.
858+
// So `rest`, from before our `.next()` call, will have the rest of the string
859+
return Some(DutchIData {
860+
rest,
861+
has_acute: false,
862+
char_count: 1,
863+
});
826864
}
827-
Some(ACUTE) => i_has_acute = true,
828-
_ => return false,
829865
}
830866
}
831-
832-
if before.next().is_some() {
833-
// not at the beginning of a string, doesn't matter
834-
return false;
835-
}
836-
let mut j_has_acute = false;
837-
for c in self.after.chars() {
838-
if c == ACUTE {
839-
j_has_acute = true;
840-
continue;
841-
}
842-
// We are supposed to check that `j` has no other combining marks aside
843-
// from potentially an acute accent. Once we hit the first non-combining mark
844-
// we are done.
845-
//
846-
// ICU4C checks for `gc=Mn` to determine if something is a combining mark,
847-
// however this requires extra data (and is the *only* point in the casemapping algorithm
848-
// where there is a direct dependency on properties data not mediated by the casemapping data trie).
849-
//
850-
// Instead, we can check for ccc via dot_type, the same way the rest of the algorithm does.
851-
//
852-
// See https://unicode-org.atlassian.net/browse/ICU-22429
853-
match mapping.dot_type(c) {
854-
// Not a combining character; ccc = 0
855-
DotType::NoDot | DotType::SoftDotted => break,
856-
// found combining character, bail
857-
_ => return false,
858-
}
867+
// We have consumed an i and an acute accent.
868+
// So chars.as_str() will have the rest of the string
869+
Some('í') | Some('Í') => {
870+
return Some(DutchIData {
871+
rest: chars.as_str(),
872+
has_acute: true,
873+
char_count: 1
874+
})
859875
}
876+
_ => return None,
877+
}
878+
}
860879

861-
// either both should have an acute accent, or none. this is an XNOR operation
862-
!(j_has_acute ^ i_has_acute)
880+
/// This checks for a Dutch-relevant IJ pair at the beginning of a string.
881+
/// This is an I followed by a J, with any casing, and no accents other than acute.
882+
/// Acute accents must be on both or neither.
883+
///
884+
/// This returns the number of characters (codepoint-wise, not code unit-wise)
885+
/// in the IJ pair, not including any combining characters on the J.
886+
///
887+
/// In dutch titlecasing mode, the first N characters should be uppercased:
888+
/// ijabc should titlecase to IJabc.
889+
fn dutch_ij_pair_at_beginning_count(s: &str, mapping: &CaseMap) -> Option<usize> {
890+
let i_at_beginning = dutch_i_at_beginning(s)?;
891+
892+
893+
let mut chars = i_at_beginning.rest.chars();
894+
895+
match chars.next() {
896+
Some('j' | 'J') => (),
897+
_ => return None,
898+
}
899+
900+
let mut j_has_acute = false;
901+
for c in chars {
902+
if c == ACUTE {
903+
j_has_acute = true;
904+
continue;
905+
}
906+
// We are supposed to check that `j` has no other combining marks aside
907+
// from potentially an acute accent. Once we hit the first non-combining mark
908+
// we are done.
909+
//
910+
// ICU4C checks for `gc=Mn` to determine if something is a combining mark,
911+
// however this requires extra data (and is the *only* point in the casemapping algorithm
912+
// where there is a direct dependency on properties data not mediated by the casemapping data trie).
913+
//
914+
// Instead, we can check for ccc via dot_type, the same way the rest of the algorithm does.
915+
//
916+
// See https://unicode-org.atlassian.net/browse/ICU-22429
917+
match mapping.dot_type(c) {
918+
// Not a combining character; ccc = 0
919+
DotType::NoDot | DotType::SoftDotted => break,
920+
// found combining character, bail
921+
_ => return None,
922+
}
923+
}
924+
925+
// either both should have an acute accent, or none. this is an XNOR operation
926+
if !(j_has_acute ^ i_at_beginning.has_acute) {
927+
// There were char_count characters in the i, and 1 more j character.
928+
// The accent won't be cased.
929+
Some(i_at_beginning.char_count + 1)
930+
} else {
931+
None
863932
}
864933
}
934+
935+
/// Exercises `dutch_i_at_beginning` on plain, precomposed, and decomposed
/// accented i's, plus inputs carrying other or extra combining marks.
#[test]
fn test_dutch_i_at_beginning() {
    /// Shorthand for the expected `Some(DutchIData { .. })` result.
    fn id(rest: &str, has_acute: bool, char_count: usize) -> Option<DutchIData<'_>> {
        Some(DutchIData {
            rest,
            has_acute,
            char_count,
        })
    }

    // Should remove the Dutch I (capital or lowercase, possibly accented I)
    // and return the rest of the string
    assert_eq!(dutch_i_at_beginning("iX"), id("X", false, 1));
    assert_eq!(dutch_i_at_beginning("íX"), id("X", true, 1));
    assert_eq!(dutch_i_at_beginning("i\u{301}X"), id("X", true, 2));
    assert_eq!(dutch_i_at_beginning("IX"), id("X", false, 1));
    assert_eq!(dutch_i_at_beginning("ÍX"), id("X", true, 1));
    assert_eq!(dutch_i_at_beginning("I\u{301}X"), id("X", true, 2));

    // Shouldn't get confused about other accent marks: ONLY acute accents, and only one of them.
    // Precomposed í followed by a second, combining acute: the extra accent stays in `rest`.
    assert_eq!(dutch_i_at_beginning("í\u{301}X"), id("\u{301}X", true, 1));
    assert_eq!(dutch_i_at_beginning("i\u{302}X"), id("\u{302}X", false, 1));
    // This is an acute accent that comes *after* but that's fine, other parts of the algorithm
    // will reject that.
    assert_eq!(
        dutch_i_at_beginning("i\u{302}\u{301}X"),
        id("\u{302}\u{301}X", false, 1)
    );

    // NOTE(review): the input literal was truncated in the source this was taken from;
    // a bare combining acute with no leading i is assumed — confirm against upstream.
    assert_eq!(dutch_i_at_beginning("\u{301}X"), None);
}
964+
965+
/// Exercises `dutch_ij_pair_at_beginning_count` on every casing of the IJ pair,
/// matching and mismatching acute accents, and inputs that have no leading pair.
#[test]
fn test_dutch_ij_at_beginning() {
    let data = crate::CaseMapperBorrowed::new().data;

    // Any casing of a plain IJ pair counts as two code points.
    assert_eq!(dutch_ij_pair_at_beginning_count("ijabcd", data), Some(2));
    assert_eq!(dutch_ij_pair_at_beginning_count("iJabcd", data), Some(2));
    assert_eq!(dutch_ij_pair_at_beginning_count("IJabcd", data), Some(2));
    assert_eq!(dutch_ij_pair_at_beginning_count("Ijabcd", data), Some(2));
    // A pair with nothing after it is still a pair.
    assert_eq!(dutch_ij_pair_at_beginning_count("ij", data), Some(2));

    // Acute accents on both letters: the accent on the J is never counted,
    // a combining accent on the I is.
    assert_eq!(dutch_ij_pair_at_beginning_count("íj\u{301}abcd", data), Some(2));
    assert_eq!(dutch_ij_pair_at_beginning_count("ÍJ\u{301}abcd", data), Some(2));
    assert_eq!(dutch_ij_pair_at_beginning_count("i\u{301}J\u{301}abcd", data), Some(3));

    // An acute accent on only one of the two letters is rejected.
    assert_eq!(dutch_ij_pair_at_beginning_count("i\u{301}Jabcd", data), None);
    assert_eq!(dutch_ij_pair_at_beginning_count("íJabcd", data), None);

    // No pair at the very beginning of the string.
    assert_eq!(dutch_ij_pair_at_beginning_count("abcdijk", data), None);
    assert_eq!(dutch_ij_pair_at_beginning_count("i", data), None);
    assert_eq!(dutch_ij_pair_at_beginning_count("", data), None);
}

0 commit comments

Comments
 (0)