Skip to content

Commit 0feeebc

Browse files
committed
Add titlecase marker injection
1 parent a78fbd5 commit 0feeebc

File tree

1 file changed

+36
-0
lines changed

1 file changed

+36
-0
lines changed

operators/tokenizer/ugm_kernels.hpp

Lines changed: 36 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -563,6 +563,9 @@ struct SpmUgmTokenizer {
563563

564564
std::string_view input_view(input);
565565
int consumed = 0;
566+
size_t original_index = 0;
567+
bool pending_title_marker = false;
568+
bool pending_upper_marker = false;
566569

567570
while (!input_view.empty()) {
568571
auto p = case_encoder_->NormalizePrefix(input_view);
@@ -576,6 +579,38 @@ struct SpmUgmTokenizer {
576579
normalized.append(space);
577580
is_space_prepended = true;
578581
}
582+
583+
// Detect Titlecase/Acronym at the start of a word using original input (ASCII heuristic)
584+
// Titlecase: first letter uppercase followed by lowercase
585+
// Acronym: 2+ consecutive uppercase letters
586+
auto is_upper_ascii = [](unsigned char ch) { return ch >= 'A' && ch <= 'Z'; };
587+
auto is_lower_ascii = [](unsigned char ch) { return ch >= 'a' && ch <= 'z'; };
588+
size_t idx = original_index;
589+
while (idx < input.size() && (input[idx] == ' ' || input[idx] == '\t' || input[idx] == '\n' || input[idx] == '\r')) idx++;
590+
if (idx < input.size() && is_upper_ascii(static_cast<unsigned char>(input[idx]))) {
591+
size_t j = idx;
592+
size_t upper_count = 0;
593+
while (j < input.size() && is_upper_ascii(static_cast<unsigned char>(input[j]))) {
594+
++upper_count;
595+
++j;
596+
}
597+
if (upper_count >= 2) {
598+
pending_upper_marker = true;
599+
} else if (upper_count == 1) {
600+
if (j < input.size() && is_lower_ascii(static_cast<unsigned char>(input[j]))) {
601+
pending_title_marker = true;
602+
}
603+
}
604+
}
605+
}
606+
if (pending_upper_marker) {
607+
// Insert All-Uppercase marker before the first non-space char of the word
608+
normalized.push_back(normalizer::cAllUppercase);
609+
pending_upper_marker = false;
610+
} else if (pending_title_marker) {
611+
// Insert Titlecase marker before the first non-space char of the word
612+
normalized.push_back(normalizer::cTitlecase);
613+
pending_title_marker = false;
579614
}
580615
normalized.push_back(c);
581616
} else {
@@ -590,6 +625,7 @@ struct SpmUgmTokenizer {
590625

591626
consumed += p.second;
592627
input_view.remove_prefix(p.second);
628+
original_index += static_cast<size_t>(p.second);
593629
}
594630

595631
case_encoder_->PostProcess(&normalized, &norm_to_orig);

0 commit comments

Comments
 (0)