Fix character capitalization issue in marian tokenizer

apsonawane · apsonawane · commit d2424079668a · 2025-12-18T16:16:57.000-08:00
diff --git a/operators/tokenizer/ugm_kernels.hpp b/operators/tokenizer/ugm_kernels.hpp
@@ -879,6 +879,43 @@ class SpmUgmDecoder {
     token = prefix + suffix;
   }
 
+  // Uppercases `text` in place, one UTF-8 code point at a time. Bytes that fail
+  // to decode as UTF-8 are copied through unchanged.
+  void UppercaseUTF8(std::string& text) const {
+    if (text.empty()) return;
+
+    std::string result;
+    result.reserve(text.size());
+
+    size_t i = 0;
+    while (i < text.size()) {
+      wchar_t codepoint = 0;
+      size_t char_len = 0;
+
+      // NOTE(review): substr copies the whole tail per code point; bound it if the decoder signature allows.
+      std::string remaining = text.substr(i);
+      if (!DecodeFirstUTF8Codepoint(remaining, codepoint, char_len) || char_len == 0) {
+        result.push_back(text[i]);  // undecodable byte: keep it verbatim and resync on the next byte
+        ++i;
+        continue;
+      }
+
+      // Cyrillic is mapped by hand (presumably because towupper depends on the C locale); \u escapes keep the bounds independent of source-file encoding.
+      if (codepoint >= L'\u0430' && codepoint <= L'\u044F') {  // U+0430..U+044F, Cyrillic small a..ya
+        codepoint = static_cast<wchar_t>(codepoint - 0x20);     // -> U+0410..U+042F, the capital forms
+      } else if (codepoint == L'\u0451') {  // Cyrillic small io, outside the contiguous range
+        codepoint = L'\u0401';              // Cyrillic capital io
+      } else {
+        codepoint = std::towupper(codepoint);
+      }
+
+      result += EncodeUTF8(codepoint);
+      i += char_len;
+    }
+
+    text.swap(result);
+  }
+
   OrtxStatus Id2Token(extTokenId_t id, std::string& token, TokenizerDecodingState** state, bool skip_special_tokens /* only used by BPE; placeholder for UGM */ = true) const {
     std::unique_ptr<TokenizerDecodingState> decoding_state;
     if (*state == nullptr) {
@@ -932,7 +969,7 @@ class SpmUgmDecoder {
       switch (signature) {
         case normalizer::cUppercase:
         case normalizer::cAllUppercase:
-          std::transform(token.begin(), token.end(), token.begin(), ::toupper);
+          UppercaseUTF8(token);
           break;
         case normalizer::cTitlecase:
           TitlecaseFirstCharacter(token);
@@ -953,7 +990,7 @@ class SpmUgmDecoder {
         switch (first_char) {
           case normalizer::cUppercase:
           case normalizer::cAllUppercase:
-            std::transform(token.begin(), token.end(), token.begin(), ::toupper);
+            UppercaseUTF8(token);
             break;
           case normalizer::cTitlecase:
             TitlecaseFirstCharacter(token);