@@ -802,7 +802,7 @@ class SpmUgmDecoder {
802802 }
803803
804804 // Helper: Decode first UTF-8 codepoint
805- bool DecodeFirstUTF8Codepoint (const std::string& utf8, wchar_t & codepoint, size_t & char_len) const {
805+ bool DecodeFirstUTF8Codepoint (const std::string& utf8, uint32_t & codepoint, size_t & char_len) const {
806806 unsigned char lead = static_cast <unsigned char >(utf8[0 ]);
807807 if (lead < 0x80 ) {
808808 codepoint = lead;
@@ -830,13 +830,10 @@ class SpmUgmDecoder {
830830 return true ;
831831 }
832832
833- // Helper: Encode a wchar_t as UTF-8
834- std::string EncodeUTF8 (wchar_t wc ) const {
833+ // Helper: Encode a uint32_t codepoint as UTF-8
834+ std::string EncodeUTF8 (uint32_t u ) const {
835835 std::string out;
836836
837- // Promote wchar_t to uint32_t to avoid data loss from shift operations and silence warning C4333
838- uint32_t u = static_cast <uint32_t >(wc);
839-
840837 if (u < 0x80 ) {
841838 out += static_cast <char >(u);
842839 } else if (u < 0x800 ) {
@@ -856,23 +853,68 @@ class SpmUgmDecoder {
856853 return out;
857854 }
858855
856+ // Helper: Convert UTF-8 string to uppercase (Unicode-aware)
857+ std::string UTF8ToUpper (const std::string& input) const {
858+ std::string result;
859+ size_t pos = 0 ;
860+
861+ while (pos < input.length ()) {
862+ uint32_t codepoint;
863+ size_t char_len = 0 ;
864+ std::string char_str = input.substr (pos);
865+
866+ if (!DecodeFirstUTF8Codepoint (char_str, codepoint, char_len)) {
867+ // If decode fails, just copy the byte as-is
868+ result += input[pos];
869+ pos++;
870+ continue ;
871+ }
872+
873+ // Apply uppercasing rules
874+ if (codepoint >= ' a' && codepoint <= ' z' ) {
875+ codepoint = codepoint - (' a' - ' A' ); // ASCII uppercase
876+ } else if (codepoint >= 0x0430 && codepoint <= 0x044F ) { // Cyrillic а-я
877+ codepoint = codepoint - (0x0430 - 0x0410 ); // Convert to А-Я
878+ } else if (codepoint == 0x0451 ) { // Cyrillic ё
879+ codepoint = 0x0401 ; // Ё
880+ } else if (codepoint >= 0x00E0 && codepoint <= 0x00FF ) {
881+ // Latin-1 Supplement lowercase letters
882+ if ((codepoint >= 0x00E0 && codepoint <= 0x00F6 ) || (codepoint >= 0x00F8 && codepoint <= 0x00FE )) {
883+ codepoint = codepoint - 0x20 ;
884+ }
885+ }
886+ // For other characters, leave unchanged
887+
888+ result += EncodeUTF8 (codepoint);
889+ pos += char_len;
890+ }
891+
892+ return result;
893+ }
894+
859895 // Updated titlecase logic (basic toupper does not work for a lot of languages)
860896 void TitlecaseFirstCharacter (std::string& token) const {
861897 if (token.empty ()) return ;
862898
863- wchar_t codepoint;
899+ uint32_t codepoint;
864900 size_t char_len = 0 ;
865901
866902 if (!DecodeFirstUTF8Codepoint (token, codepoint, char_len)) return ;
867903
868904 // Unicode-aware titlecasing for Cyrillic
869- if (codepoint >= L' а' && codepoint <= L' я' ) {
870- codepoint = codepoint - (L' а' - L' А' ); // Convert to uppercase
871- } else if (codepoint == L' ё' ) {
872- codepoint = L' Ё' ; // Special case
873- } else {
874- codepoint = std::towupper (codepoint); // Fallback (Latin, etc.)
905+ if (codepoint >= 0x0430 && codepoint <= 0x044F ) { // а-я
906+ codepoint = codepoint - (0x0430 - 0x0410 ); // Convert to uppercase (А-Я)
907+ } else if (codepoint == 0x0451 ) { // ё
908+ codepoint = 0x0401 ; // Ё
909+ } else if (codepoint >= ' a' && codepoint <= ' z' ) {
910+ codepoint = codepoint - (' a' - ' A' ); // ASCII uppercase
911+ } else if (codepoint >= 0x00E0 && codepoint <= 0x00FF ) {
912+ // Latin-1 Supplement lowercase letters
913+ if ((codepoint >= 0x00E0 && codepoint <= 0x00F6 ) || (codepoint >= 0x00F8 && codepoint <= 0x00FE )) {
914+ codepoint = codepoint - 0x20 ;
915+ }
875916 }
917+ // For other characters, leave unchanged (more comprehensive Unicode support would require ICU or similar)
876918
877919 std::string prefix = EncodeUTF8 (codepoint);
878920 std::string suffix = token.substr (char_len);
@@ -932,7 +974,7 @@ class SpmUgmDecoder {
932974 switch (signature) {
933975 case normalizer::cUppercase:
934976 case normalizer::cAllUppercase:
935- std::transform ( token. begin (), token. end (), token. begin (), ::toupper );
977+ token = UTF8ToUpper (token );
936978 break ;
937979 case normalizer::cTitlecase:
938980 TitlecaseFirstCharacter (token);
@@ -953,7 +995,7 @@ class SpmUgmDecoder {
953995 switch (first_char) {
954996 case normalizer::cUppercase:
955997 case normalizer::cAllUppercase:
956- std::transform ( token. begin (), token. end (), token. begin (), ::toupper );
998+ token = UTF8ToUpper (token );
957999 break ;
9581000 case normalizer::cTitlecase:
9591001 TitlecaseFirstCharacter (token);
0 commit comments