@@ -65,11 +65,38 @@ impl<'a, const IS_TITLE_CONTEXT: bool> Writeable for FullCaseWriteable<'a, '_, I
6565 let src = self . src ;
6666 let mut mapping = self . mapping ;
6767 let mut iter = src. char_indices ( ) ;
68+ // Number of characters to titlecase at the beginning
69+ // for the dutch IJ case.
70+ //
71+ // This cannot be handled in full_helper: this affects the trailing case logic. All other special casing
72+ // situations are able to cope by only titlecasing the first character, Dutch is the only place where
73+ // the first two base characters get uppercased.
74+ let mut dutch_titlecase_count = if IS_TITLE_CONTEXT && self . locale == CaseMapLocale :: Dutch {
75+ dutch_ij_pair_at_beginning_count ( src, self . data )
76+ } else {
77+ None
78+ } ;
6879 for ( i, c) in & mut iter {
6980 let context = ContextIterator :: new ( & src[ ..i] , & src[ i..] ) ;
7081 self . data
7182 . full_helper :: < IS_TITLE_CONTEXT , W > ( c, context, self . locale , mapping, sink) ?;
7283 if IS_TITLE_CONTEXT {
84+ // Check if we're uppercasing a dutch IJ
85+ if let Some ( count) = dutch_titlecase_count {
86+ // If we are, we want to wait `count` characters
87+ // before we switch to lowercasing (or TrailingCase::Unchanged)
88+ if count > 1 {
89+ // We still have code points to process
90+ dutch_titlecase_count = Some ( count - 1 ) ;
91+ // Continue the loop to skip the mode switching code below
92+ continue ;
93+ } else {
94+ // We would have been down to zero. Time to continue the loop as normal.
95+ dutch_titlecase_count = None ;
96+ }
97+ }
98+
99+ // If titlecasing, switch the mode to lowercasing/TrailingCase::Unchanged
73100 if self . titlecase_tail_casing == TrailingCase :: Lower {
74101 mapping = MappingKind :: Lower ;
75102 } else {
@@ -211,18 +238,6 @@ impl<'data> CaseMap<'data> {
211238 !IS_TITLE_CONTEXT || kind == MappingKind :: Title || kind == MappingKind :: Lower
212239 ) ;
213240
214- // ICU4C's non-standard extension for Dutch IJ titlecasing
215- // handled here instead of in full_lower_special_case because J does not have conditional
216- // special casemapping.
217- if IS_TITLE_CONTEXT && locale == CaseMapLocale :: Dutch && kind == MappingKind :: Lower {
218- // When titlecasing, a J found immediately after an I at the beginning of the segment
219- // should also uppercase. They are both allowed to have an acute accent but it must
220- // be present on both letters or neither. They may not have any other combining marks.
221- if ( c == 'j' || c == 'J' ) && context. is_dutch_ij_pair_at_beginning ( self ) {
222- return sink. write_char ( 'J' ) ;
223- }
224- }
225-
226241 // ICU4C's non-standard extension for Greek uppercasing:
227242 // https://icu.unicode.org/design/case/greek-upper.
228243 // Effectively removes Greek accents from Greek vowels during uppercasing,
@@ -809,56 +824,156 @@ impl<'a> ContextIterator<'a> {
809824 }
810825 false
811826 }
827+ }
812828
813- /// Checks the preceding and surrounding context of a j or J
814- /// and returns true if it is preceded by an i or I at the start of the string.
815- /// If one has an acute accent,
816- /// both must have the accent for this to return true. No other accents are handled.
817- fn is_dutch_ij_pair_at_beginning ( & self , mapping : & CaseMap ) -> bool {
818- let mut before = self . before . chars ( ) . rev ( ) ;
819- let mut i_has_acute = false ;
820- loop {
821- match before. next ( ) {
822- Some ( 'i' ) | Some ( 'I' ) => break ,
823- Some ( 'í' ) | Some ( 'Í' ) => {
824- i_has_acute = true ;
825- break ;
/// Describes an `i`, `I`, `í`, or `Í` found at the very start of a string.
///
/// Produced by `dutch_i_at_beginning` while looking for a Dutch `IJ` digraph.
#[derive(Clone, Debug, PartialEq, Eq)]
struct DutchIData<'a> {
    /// Remainder of the string after the `i` (and after its combining acute
    /// accent, when one was consumed).
    rest: &'a str,
    /// Whether the `i` carries an acute accent, either precomposed (`í`/`Í`)
    /// or as a combining mark directly following a plain `i`/`I`.
    has_acute: bool,
    /// Number of code points consumed for the `i`: 1 for a plain or
    /// precomposed letter, 2 for a plain letter plus a combining acute.
    char_count: usize,
}
838+
839+ /// Is there an i at the beginning of the string which may be relevant
840+ /// for Dutch titlecasing?
841+ fn dutch_i_at_beginning ( s : & ' _ str ) -> Option < DutchIData < ' _ > > {
842+ let mut chars = s. chars ( ) ;
843+ match chars. next ( ) {
844+ Some ( 'i' ) | Some ( 'I' ) => {
845+ let rest = chars. as_str ( ) ;
846+ match chars. next ( ) {
847+ Some ( ACUTE ) => {
848+ // We have consumed an i and an acute accent.
849+ // So chars.as_str() will have the rest of the string
850+ return Some ( DutchIData {
851+ rest : chars. as_str ( ) ,
852+ has_acute : true ,
853+ char_count : 2 ,
854+ } ) ;
855+ }
856+ _ => {
857+ // We have consumed an i and a non-acute accent character.
858+ // So `rest`, from before our `.next()` call, will have the rest of the string
859+ return Some ( DutchIData {
860+ rest,
861+ has_acute : false ,
862+ char_count : 1 ,
863+ } ) ;
826864 }
827- Some ( ACUTE ) => i_has_acute = true ,
828- _ => return false ,
829865 }
830866 }
831-
832- if before. next ( ) . is_some ( ) {
833- // not at the beginning of a string, doesn't matter
834- return false ;
835- }
836- let mut j_has_acute = false ;
837- for c in self . after . chars ( ) {
838- if c == ACUTE {
839- j_has_acute = true ;
840- continue ;
841- }
842- // We are supposed to check that `j` has no other combining marks aside
843- // from potentially an acute accent. Once we hit the first non-combining mark
844- // we are done.
845- //
846- // ICU4C checks for `gc=Mn` to determine if something is a combining mark,
847- // however this requires extra data (and is the *only* point in the casemapping algorithm
848- // where there is a direct dependency on properties data not mediated by the casemapping data trie).
849- //
850- // Instead, we can check for ccc via dot_type, the same way the rest of the algorithm does.
851- //
852- // See https://unicode-org.atlassian.net/browse/ICU-22429
853- match mapping. dot_type ( c) {
854- // Not a combining character; ccc = 0
855- DotType :: NoDot | DotType :: SoftDotted => break ,
856- // found combining character, bail
857- _ => return false ,
858- }
867+ // We have consumed an i and an acute accent.
868+ // So chars.as_str() will have the rest of the string
869+ Some ( 'í' ) | Some ( 'Í' ) => {
870+ return Some ( DutchIData {
871+ rest : chars. as_str ( ) ,
872+ has_acute : true ,
873+ char_count : 1
874+ } )
859875 }
876+ _ => return None ,
877+ }
878+ }
860879
861- // either both should have an acute accent, or none. this is an XNOR operation
862- !( j_has_acute ^ i_has_acute)
880+ /// This checks for a Dutch-relevant IJ pair at the beginning of a string.
881+ /// This is an I followed by a J, with any casing, and no accents other than acute.
882+ /// Acute accents must be on both or neither.
883+ ///
884+ /// This returns the number of characters (codepoint-wise, not code unit-wise)
885+ /// in the IJ pair, not including any combining characters on the J.
886+ ///
887+ /// In dutch titlecasing mode, the first N characters should be uppercased:
888+ /// ijabc should titlecase to IJabc.
889+ fn dutch_ij_pair_at_beginning_count ( s : & str , mapping : & CaseMap ) -> Option < usize > {
890+ let i_at_beginning = dutch_i_at_beginning ( s) ?;
891+
892+
893+ let mut chars = i_at_beginning. rest . chars ( ) ;
894+
895+ match chars. next ( ) {
896+ Some ( 'j' | 'J' ) => ( ) ,
897+ _ => return None ,
898+ }
899+
900+ let mut j_has_acute = false ;
901+ for c in chars {
902+ if c == ACUTE {
903+ j_has_acute = true ;
904+ continue ;
905+ }
906+ // We are supposed to check that `j` has no other combining marks aside
907+ // from potentially an acute accent. Once we hit the first non-combining mark
908+ // we are done.
909+ //
910+ // ICU4C checks for `gc=Mn` to determine if something is a combining mark,
911+ // however this requires extra data (and is the *only* point in the casemapping algorithm
912+ // where there is a direct dependency on properties data not mediated by the casemapping data trie).
913+ //
914+ // Instead, we can check for ccc via dot_type, the same way the rest of the algorithm does.
915+ //
916+ // See https://unicode-org.atlassian.net/browse/ICU-22429
917+ match mapping. dot_type ( c) {
918+ // Not a combining character; ccc = 0
919+ DotType :: NoDot | DotType :: SoftDotted => break ,
920+ // found combining character, bail
921+ _ => return None ,
922+ }
923+ }
924+
925+ // either both should have an acute accent, or none. this is an XNOR operation
926+ if !( j_has_acute ^ i_at_beginning. has_acute ) {
927+ // There were char_count characters in the i, and 1 more j character.
928+ // The accent won't be cased.
929+ Some ( i_at_beginning. char_count + 1 )
930+ } else {
931+ None
863932 }
864933}
934+
/// Exercises `dutch_i_at_beginning` on plain, precomposed, and decomposed
/// forms of the leading `i`, plus inputs it must not over-consume or accept.
#[test]
fn test_dutch_i_at_beginning() {
    // Shorthand for the expected `Some(DutchIData { .. })` value.
    fn id(rest: &str, has_acute: bool, char_count: usize) -> Option<DutchIData<'_>> {
        Some(DutchIData {
            rest,
            has_acute,
            char_count,
        })
    }

    // Should remove the Dutch I (capital or lowercase, possibly accented)
    // and return the rest of the string.
    assert_eq!(dutch_i_at_beginning("iX"), id("X", false, 1));
    assert_eq!(dutch_i_at_beginning("íX"), id("X", true, 1));
    assert_eq!(dutch_i_at_beginning("i\u{301}X"), id("X", true, 2));
    assert_eq!(dutch_i_at_beginning("IX"), id("X", false, 1));
    assert_eq!(dutch_i_at_beginning("ÍX"), id("X", true, 1));
    assert_eq!(dutch_i_at_beginning("I\u{301}X"), id("X", true, 2));

    // Shouldn't get confused about other accent marks: ONLY acute accents, and only one of them.
    assert_eq!(dutch_i_at_beginning("í\u{301}X"), id("\u{301}X", true, 1));
    assert_eq!(dutch_i_at_beginning("i\u{302}X"), id("\u{302}X", false, 1));
    // This is an acute accent that comes *after* but that's fine, other parts of the algorithm
    // will reject that.
    assert_eq!(
        dutch_i_at_beginning("i\u{302}\u{301}X"),
        id("\u{302}\u{301}X", false, 1)
    );

    // A precomposed i with a different accent is not a Dutch I at all.
    assert_eq!(dutch_i_at_beginning("ï\u{301}X"), None);

    // Boundaries: nothing to find, nothing after the letter, wrong first letter.
    assert_eq!(dutch_i_at_beginning(""), None);
    assert_eq!(dutch_i_at_beginning("i"), id("", false, 1));
    assert_eq!(dutch_i_at_beginning("Xi"), None);
}
964+
/// Exercises `dutch_ij_pair_at_beginning_count`: every case combination of the
/// pair, acute accents on both letters (precomposed and combining), and inputs
/// that must be rejected (accent on only one letter, pair not at the start).
#[test]
fn test_dutch_ij_at_beginning() {
    let data = crate::CaseMapperBorrowed::new().data;

    // Unaccented pair in every case combination: two code points.
    assert_eq!(dutch_ij_pair_at_beginning_count("ijabcd", data), Some(2));
    assert_eq!(dutch_ij_pair_at_beginning_count("iJabcd", data), Some(2));
    assert_eq!(dutch_ij_pair_at_beginning_count("IJabcd", data), Some(2));
    assert_eq!(dutch_ij_pair_at_beginning_count("Ijabcd", data), Some(2));

    // Acute on both letters. The combining acute on the j is never counted,
    // while a combining acute on the i is.
    assert_eq!(dutch_ij_pair_at_beginning_count("íj\u{301}abcd", data), Some(2));
    assert_eq!(dutch_ij_pair_at_beginning_count("ÍJ\u{301}abcd", data), Some(2));
    assert_eq!(dutch_ij_pair_at_beginning_count("i\u{301}J\u{301}abcd", data), Some(3));

    // Acute on the i only: not a valid pair.
    assert_eq!(dutch_ij_pair_at_beginning_count("i\u{301}Jabcd", data), None);
    assert_eq!(dutch_ij_pair_at_beginning_count("íJabcd", data), None);

    // The pair must be at the very beginning of the string.
    assert_eq!(dutch_ij_pair_at_beginning_count("abcdijk", data), None);
}
0 commit comments