Skip to content

Commit c997999

Browse files
committed
fix character capitalization
1 parent 7387a4e commit c997999

File tree

2 files changed

+59
-17
lines changed

2 files changed

+59
-17
lines changed

operators/tokenizer/case_encoder.h

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -129,8 +129,8 @@ class CaseEncoder {
129129

130130
sp.remove_prefix(1);
131131
buffer(sp);
132-
buffer_queue_.front().first[0] = cUppercase;
133-
buffer_[0] = cUppercase;
132+
buffer_queue_.back().first[0] = cUppercase;
133+
buffer_[buffer_.size() - sp.size()] = cUppercase;
134134
state_ = 2;
135135
ret = null(consumed);
136136

operators/tokenizer/ugm_kernels.hpp

Lines changed: 57 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -802,7 +802,7 @@ class SpmUgmDecoder {
802802
}
803803

804804
// Helper: Decode first UTF-8 codepoint
805-
bool DecodeFirstUTF8Codepoint(const std::string& utf8, wchar_t& codepoint, size_t& char_len) const {
805+
bool DecodeFirstUTF8Codepoint(const std::string& utf8, uint32_t& codepoint, size_t& char_len) const {
806806
unsigned char lead = static_cast<unsigned char>(utf8[0]);
807807
if (lead < 0x80) {
808808
codepoint = lead;
@@ -830,13 +830,10 @@ class SpmUgmDecoder {
830830
return true;
831831
}
832832

833-
// Helper: Encode a wchar_t as UTF-8
834-
std::string EncodeUTF8(wchar_t wc) const {
833+
// Helper: Encode a uint32_t codepoint as UTF-8
834+
std::string EncodeUTF8(uint32_t u) const {
835835
std::string out;
836836

837-
// Promote wchar_t to uint32_t to avoid data loss from shift operations and silence warning C4333
838-
uint32_t u = static_cast<uint32_t>(wc);
839-
840837
if (u < 0x80) {
841838
out += static_cast<char>(u);
842839
} else if (u < 0x800) {
@@ -856,23 +853,68 @@ class SpmUgmDecoder {
856853
return out;
857854
}
858855

856+
// Helper: Convert UTF-8 string to uppercase (Unicode-aware)
857+
std::string UTF8ToUpper(const std::string& input) const {
858+
std::string result;
859+
size_t pos = 0;
860+
861+
while (pos < input.length()) {
862+
uint32_t codepoint;
863+
size_t char_len = 0;
864+
std::string char_str = input.substr(pos);
865+
866+
if (!DecodeFirstUTF8Codepoint(char_str, codepoint, char_len)) {
867+
// If decode fails, just copy the byte as-is
868+
result += input[pos];
869+
pos++;
870+
continue;
871+
}
872+
873+
// Apply uppercasing rules
874+
if (codepoint >= 'a' && codepoint <= 'z') {
875+
codepoint = codepoint - ('a' - 'A'); // ASCII uppercase
876+
} else if (codepoint >= 0x0430 && codepoint <= 0x044F) { // Cyrillic а-я
877+
codepoint = codepoint - (0x0430 - 0x0410); // Convert to А-Я
878+
} else if (codepoint == 0x0451) { // Cyrillic ё
879+
codepoint = 0x0401; // Ё
880+
} else if (codepoint >= 0x00E0 && codepoint <= 0x00FF) {
881+
// Latin-1 Supplement lowercase letters
882+
if ((codepoint >= 0x00E0 && codepoint <= 0x00F6) || (codepoint >= 0x00F8 && codepoint <= 0x00FE)) {
883+
codepoint = codepoint - 0x20;
884+
}
885+
}
886+
// For other characters, leave unchanged
887+
888+
result += EncodeUTF8(codepoint);
889+
pos += char_len;
890+
}
891+
892+
return result;
893+
}
894+
859895
// Updated titlecase logic (basic toupper does not work for a lot of languages)
860896
void TitlecaseFirstCharacter(std::string& token) const {
861897
if (token.empty()) return;
862898

863-
wchar_t codepoint;
899+
uint32_t codepoint;
864900
size_t char_len = 0;
865901

866902
if (!DecodeFirstUTF8Codepoint(token, codepoint, char_len)) return;
867903

868904
// Unicode-aware titlecasing for Cyrillic
869-
if (codepoint >= L'а' && codepoint <= L'я') {
870-
codepoint = codepoint - (L'а' - L'А'); // Convert to uppercase
871-
} else if (codepoint == L'ё') {
872-
codepoint = L'Ё'; // Special case
873-
} else {
874-
codepoint = std::towupper(codepoint); // Fallback (Latin, etc.)
905+
if (codepoint >= 0x0430 && codepoint <= 0x044F) { // а-я
906+
codepoint = codepoint - (0x0430 - 0x0410); // Convert to uppercase (А-Я)
907+
} else if (codepoint == 0x0451) { // ё
908+
codepoint = 0x0401; // Ё
909+
} else if (codepoint >= 'a' && codepoint <= 'z') {
910+
codepoint = codepoint - ('a' - 'A'); // ASCII uppercase
911+
} else if (codepoint >= 0x00E0 && codepoint <= 0x00FF) {
912+
// Latin-1 Supplement lowercase letters
913+
if ((codepoint >= 0x00E0 && codepoint <= 0x00F6) || (codepoint >= 0x00F8 && codepoint <= 0x00FE)) {
914+
codepoint = codepoint - 0x20;
915+
}
875916
}
917+
// For other characters, leave unchanged (more comprehensive Unicode support would require ICU or similar)
876918

877919
std::string prefix = EncodeUTF8(codepoint);
878920
std::string suffix = token.substr(char_len);
@@ -932,7 +974,7 @@ class SpmUgmDecoder {
932974
switch (signature) {
933975
case normalizer::cUppercase:
934976
case normalizer::cAllUppercase:
935-
std::transform(token.begin(), token.end(), token.begin(), ::toupper);
977+
token = UTF8ToUpper(token);
936978
break;
937979
case normalizer::cTitlecase:
938980
TitlecaseFirstCharacter(token);
@@ -953,7 +995,7 @@ class SpmUgmDecoder {
953995
switch (first_char) {
954996
case normalizer::cUppercase:
955997
case normalizer::cAllUppercase:
956-
std::transform(token.begin(), token.end(), token.begin(), ::toupper);
998+
token = UTF8ToUpper(token);
957999
break;
9581000
case normalizer::cTitlecase:
9591001
TitlecaseFirstCharacter(token);

0 commit comments

Comments
 (0)