@@ -563,6 +563,9 @@ struct SpmUgmTokenizer {
563563
564564 std::string_view input_view (input);
565565 int consumed = 0 ;
566+ size_t original_index = 0 ;
567+ bool pending_title_marker = false ;
568+ bool pending_upper_marker = false ;
566569
567570 while (!input_view.empty ()) {
568571 auto p = case_encoder_->NormalizePrefix (input_view);
@@ -576,6 +579,38 @@ struct SpmUgmTokenizer {
576579 normalized.append (space);
577580 is_space_prepended = true ;
578581 }
582+
583+ // Detect Titlecase/Acronym at the start of a word using original input (ASCII heuristic)
584+ // Titlecase: first letter uppercase followed by lowercase
585+ // Acronym: 2+ consecutive uppercase letters
586+ auto is_upper_ascii = [](unsigned char ch) { return ch >= ' A' && ch <= ' Z' ; };
587+ auto is_lower_ascii = [](unsigned char ch) { return ch >= ' a' && ch <= ' z' ; };
588+ size_t idx = original_index;
589+ while (idx < input.size () && (input[idx] == ' ' || input[idx] == ' \t ' || input[idx] == ' \n ' || input[idx] == ' \r ' )) idx++;
590+ if (idx < input.size () && is_upper_ascii (static_cast <unsigned char >(input[idx]))) {
591+ size_t j = idx;
592+ size_t upper_count = 0 ;
593+ while (j < input.size () && is_upper_ascii (static_cast <unsigned char >(input[j]))) {
594+ ++upper_count;
595+ ++j;
596+ }
597+ if (upper_count >= 2 ) {
598+ pending_upper_marker = true ;
599+ } else if (upper_count == 1 ) {
600+ if (j < input.size () && is_lower_ascii (static_cast <unsigned char >(input[j]))) {
601+ pending_title_marker = true ;
602+ }
603+ }
604+ }
605+ }
606+ if (pending_upper_marker) {
607+ // Insert All-Uppercase marker before the first non-space char of the word
608+ normalized.push_back (normalizer::cAllUppercase);
609+ pending_upper_marker = false ;
610+ } else if (pending_title_marker) {
611+ // Insert Titlecase marker before the first non-space char of the word
612+ normalized.push_back (normalizer::cTitlecase);
613+ pending_title_marker = false ;
579614 }
580615 normalized.push_back (c);
581616 } else {
@@ -590,6 +625,7 @@ struct SpmUgmTokenizer {
590625
591626 consumed += p.second ;
592627 input_view.remove_prefix (p.second );
628+ original_index += static_cast <size_t >(p.second );
593629 }
594630
595631 case_encoder_->PostProcess (&normalized, &norm_to_orig);
@@ -988,14 +1024,18 @@ class SpmUgmDecoder {
9881024 if (first_char == normalizer::cUppercase || first_char == normalizer::cAllUppercase ||
9891025 first_char == normalizer::cTitlecase || first_char == normalizer::cLowercase ||
9901026 first_char == normalizer::cPunctuation) {
1027+ // If we have an active Uppercase signature, ignore any leading Lowercase/Punctuation markers
1028+ bool active_upper = ((*state)->signature_ == normalizer::cUppercase || (*state)->signature_ == normalizer::cAllUppercase);
9911029 token.erase (0 , 1 ); // Remove signature character
9921030 // Update signature state: persist for AllUppercase/Uppercase; one-shot for Titlecase
9931031 if (first_char == normalizer::cAllUppercase || first_char == normalizer::cUppercase) {
9941032 (*state)->signature_ = first_char;
9951033 } else if (first_char == normalizer::cTitlecase) {
9961034 (*state)->signature_ = 0 ; // one-shot
997- } else { // Lowercase or Punctuation reset any prior signature
998- (*state)->signature_ = 0 ;
1035+ } else { // Lowercase or Punctuation
1036+ if (!active_upper) {
1037+ (*state)->signature_ = 0 ; // only reset if not in uppercase mode
1038+ }
9991039 }
10001040 switch (first_char) {
10011041 case normalizer::cUppercase:
@@ -1005,7 +1045,13 @@ class SpmUgmDecoder {
10051045 case normalizer::cTitlecase:
10061046 TitlecaseFirstCharacter (token);
10071047 break ;
1008- // For cLowercase and cPunctuation, no transformation needed
1048+ case normalizer::cLowercase:
1049+ case normalizer::cPunctuation:
1050+ // If active uppercase mode, keep token uppercased
1051+ if (active_upper) {
1052+ UppercaseUTF8 (token);
1053+ }
1054+ break ;
10091055 }
10101056 }
10111057 }
0 commit comments