Skip to content

Commit 90460f3

Browse files
committed
Add titlecase marker injection
1 parent a78fbd5 commit 90460f3

File tree

1 file changed

+49
-3
lines changed

1 file changed

+49
-3
lines changed

operators/tokenizer/ugm_kernels.hpp

Lines changed: 49 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -563,6 +563,9 @@ struct SpmUgmTokenizer {
563563

564564
std::string_view input_view(input);
565565
int consumed = 0;
566+
size_t original_index = 0;
567+
bool pending_title_marker = false;
568+
bool pending_upper_marker = false;
566569

567570
while (!input_view.empty()) {
568571
auto p = case_encoder_->NormalizePrefix(input_view);
@@ -576,6 +579,38 @@ struct SpmUgmTokenizer {
576579
normalized.append(space);
577580
is_space_prepended = true;
578581
}
582+
583+
// Detect Titlecase/Acronym at the start of a word using original input (ASCII heuristic)
584+
// Titlecase: first letter uppercase followed by lowercase
585+
// Acronym: 2+ consecutive uppercase letters
586+
// ASCII-only predicate: true exactly for the bytes 'A' through 'Z'; every other byte value yields false.
auto is_upper_ascii = [](unsigned char octet) -> bool {
    return !(octet < 'A' || octet > 'Z');
};
587+
// ASCII-only predicate: true exactly for the bytes 'a' through 'z'; every other byte value yields false.
auto is_lower_ascii = [](unsigned char octet) -> bool {
    return !(octet < 'a' || octet > 'z');
};
588+
size_t idx = original_index;
589+
while (idx < input.size() && (input[idx] == ' ' || input[idx] == '\t' || input[idx] == '\n' || input[idx] == '\r')) idx++;
590+
if (idx < input.size() && is_upper_ascii(static_cast<unsigned char>(input[idx]))) {
591+
size_t j = idx;
592+
size_t upper_count = 0;
593+
while (j < input.size() && is_upper_ascii(static_cast<unsigned char>(input[j]))) {
594+
++upper_count;
595+
++j;
596+
}
597+
if (upper_count >= 2) {
598+
pending_upper_marker = true;
599+
} else if (upper_count == 1) {
600+
if (j < input.size() && is_lower_ascii(static_cast<unsigned char>(input[j]))) {
601+
pending_title_marker = true;
602+
}
603+
}
604+
}
605+
}
606+
if (pending_upper_marker) {
607+
// Insert All-Uppercase marker before the first non-space char of the word
608+
normalized.push_back(normalizer::cAllUppercase);
609+
pending_upper_marker = false;
610+
} else if (pending_title_marker) {
611+
// Insert Titlecase marker before the first non-space char of the word
612+
normalized.push_back(normalizer::cTitlecase);
613+
pending_title_marker = false;
579614
}
580615
normalized.push_back(c);
581616
} else {
@@ -590,6 +625,7 @@ struct SpmUgmTokenizer {
590625

591626
consumed += p.second;
592627
input_view.remove_prefix(p.second);
628+
original_index += static_cast<size_t>(p.second);
593629
}
594630

595631
case_encoder_->PostProcess(&normalized, &norm_to_orig);
@@ -988,14 +1024,18 @@ class SpmUgmDecoder {
9881024
if (first_char == normalizer::cUppercase || first_char == normalizer::cAllUppercase ||
9891025
first_char == normalizer::cTitlecase || first_char == normalizer::cLowercase ||
9901026
first_char == normalizer::cPunctuation) {
1027+
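// Record whether an Uppercase/AllUppercase signature is active before this token's marker is handled; while it is, a leading Lowercase/Punctuation marker is still stripped but does not reset the signature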
1028+
bool active_upper = ((*state)->signature_ == normalizer::cUppercase || (*state)->signature_ == normalizer::cAllUppercase);
9911029
token.erase(0, 1); // Remove signature character
9921030
// Update signature state: persist for AllUppercase/Uppercase; one-shot for Titlecase
9931031
if (first_char == normalizer::cAllUppercase || first_char == normalizer::cUppercase) {
9941032
(*state)->signature_ = first_char;
9951033
} else if (first_char == normalizer::cTitlecase) {
9961034
(*state)->signature_ = 0; // one-shot
997-
} else { // Lowercase or Punctuation reset any prior signature
998-
(*state)->signature_ = 0;
1035+
} else { // Lowercase or Punctuation
1036+
if (!active_upper) {
1037+
(*state)->signature_ = 0; // only reset if not in uppercase mode
1038+
}
9991039
}
10001040
switch (first_char) {
10011041
case normalizer::cUppercase:
@@ -1005,7 +1045,13 @@ class SpmUgmDecoder {
10051045
case normalizer::cTitlecase:
10061046
TitlecaseFirstCharacter(token);
10071047
break;
1008-
// For cLowercase and cPunctuation, no transformation needed
1048+
case normalizer::cLowercase:
1049+
case normalizer::cPunctuation:
1050+
// If uppercase mode is active, keep the token uppercased
1051+
if (active_upper) {
1052+
UppercaseUTF8(token);
1053+
}
1054+
break;
10091055
}
10101056
}
10111057
}

0 commit comments

Comments
 (0)