Add titlecase/all-uppercase marker injection; keep uppercase mode in decoder

apsonawane · apsonawane · commit 90460f3c9d1c · 2025-12-18T16:50:19.000-08:00
diff --git a/operators/tokenizer/ugm_kernels.hpp b/operators/tokenizer/ugm_kernels.hpp
@@ -563,6 +563,9 @@ struct SpmUgmTokenizer {
 
     std::string_view input_view(input);
     int consumed = 0;
+    size_t original_index = 0;
+    bool pending_title_marker = false;
+    bool pending_upper_marker = false;
 
     while (!input_view.empty()) {
       auto p = case_encoder_->NormalizePrefix(input_view);
@@ -576,6 +579,38 @@ struct SpmUgmTokenizer {
               normalized.append(space);
               is_space_prepended = true;
             }
+
+            // Detect Titlecase/Acronym at the start of a word by scanning the original input (ASCII-only heuristic)
+            // Titlecase: exactly one uppercase letter immediately followed by a lowercase letter
+            // Acronym: 2+ consecutive uppercase letters (emits the All-Uppercase marker)
+            auto is_upper_ascii = [](unsigned char ch) { return ch >= 'A' && ch <= 'Z'; };
+            auto is_lower_ascii = [](unsigned char ch) { return ch >= 'a' && ch <= 'z'; };
+            size_t idx = original_index;
+            while (idx < input.size() && (input[idx] == ' ' || input[idx] == '\t' || input[idx] == '\n' || input[idx] == '\r')) idx++;
+            if (idx < input.size() && is_upper_ascii(static_cast<unsigned char>(input[idx]))) {
+              size_t j = idx;
+              size_t upper_count = 0;
+              while (j < input.size() && is_upper_ascii(static_cast<unsigned char>(input[j]))) {
+                ++upper_count;
+                ++j;
+              }
+              if (upper_count >= 2) {
+                pending_upper_marker = true;
+              } else if (upper_count == 1) {
+                if (j < input.size() && is_lower_ascii(static_cast<unsigned char>(input[j]))) {
+                  pending_title_marker = true;
+                }
+              }
+            }
+          }
+          if (pending_upper_marker) {
+            // Insert All-Uppercase marker before the first non-space char of the word
+            normalized.push_back(normalizer::cAllUppercase);
+            pending_upper_marker = false;
+          } else if (pending_title_marker) {
+            // Insert Titlecase marker before the first non-space char of the word
+            normalized.push_back(normalizer::cTitlecase);
+            pending_title_marker = false;
           }
           normalized.push_back(c);
         } else {
@@ -590,6 +625,7 @@ struct SpmUgmTokenizer {
 
       consumed += p.second;
       input_view.remove_prefix(p.second);
+      original_index += static_cast<size_t>(p.second);
     }
 
     case_encoder_->PostProcess(&normalized, &norm_to_orig);
@@ -988,14 +1024,18 @@ class SpmUgmDecoder {
       if (first_char == normalizer::cUppercase || first_char == normalizer::cAllUppercase ||
           first_char == normalizer::cTitlecase || first_char == normalizer::cLowercase ||
           first_char == normalizer::cPunctuation) {
+        // Record whether an Uppercase/AllUppercase signature was active before this token; if so, a leading Lowercase/Punctuation marker is still stripped but does not reset it
+        bool active_upper = ((*state)->signature_ == normalizer::cUppercase || (*state)->signature_ == normalizer::cAllUppercase);
         token.erase(0, 1);  // Remove signature character
         // Update signature state: persist for AllUppercase/Uppercase; one-shot for Titlecase
         if (first_char == normalizer::cAllUppercase || first_char == normalizer::cUppercase) {
           (*state)->signature_ = first_char;
         } else if (first_char == normalizer::cTitlecase) {
           (*state)->signature_ = 0;  // one-shot
-        } else {  // Lowercase or Punctuation reset any prior signature
-          (*state)->signature_ = 0;
+        } else {  // Lowercase or Punctuation
+          if (!active_upper) {
+            (*state)->signature_ = 0;  // only reset if not in uppercase mode
+          }
         }
         switch (first_char) {
           case normalizer::cUppercase:
@@ -1005,7 +1045,13 @@ class SpmUgmDecoder {
           case normalizer::cTitlecase:
             TitlecaseFirstCharacter(token);
             break;
-            // For cLowercase and cPunctuation, no transformation needed
+          case normalizer::cLowercase:
+          case normalizer::cPunctuation:
+            // If active uppercase mode, keep token uppercased
+            if (active_upper) {
+              UppercaseUTF8(token);
+            }
+            break;
         }
       }
     }