Skip to content

Commit f03ea8c

Browse files
committed
fix the unit test.
1 parent d9d7b20 commit f03ea8c

File tree

2 files changed

+10
-9
lines changed

2 files changed

+10
-9
lines changed

operators/tokenizer/bpe_utils.hpp

Lines changed: 8 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -370,17 +370,17 @@ class PreTokenizerWithRegEx {
370370

371371
size_t j = i;
372372
// [\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*
373-
ufal::unilib::unicode::category_t categories1 = ufal::unilib::unicode::Lu | ufal::unilib::unicode::Lt |
374-
ufal::unilib::unicode::Lm | ufal::unilib::unicode::Lo |
375-
ufal::unilib::unicode::M;
373+
const ufal::unilib::unicode::category_t categories1 = ufal::unilib::unicode::Lu | ufal::unilib::unicode::Lt |
374+
ufal::unilib::unicode::Lm | ufal::unilib::unicode::Lo |
375+
ufal::unilib::unicode::M;
376376
if (IsCategory(m_text[i], categories1)) {
377377
for (; j < m_text.size(); ++j) {
378378
if (!IsCategory(m_text[j], categories1)) break;
379379
}
380380
}
381381

382382
// [\p{Ll}\p{Lm}\p{Lo}\p{M}]+
383-
ufal::unilib::unicode::category_t categories2 =
383+
const ufal::unilib::unicode::category_t categories2 =
384384
ufal::unilib::unicode::Ll | ufal::unilib::unicode::Lm | ufal::unilib::unicode::Lo | ufal::unilib::unicode::M;
385385

386386
if (IsCategory(m_text[j], categories2)) {
@@ -428,9 +428,9 @@ class PreTokenizerWithRegEx {
428428
}
429429

430430
// [\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+
431-
ufal::unilib::unicode::category_t categories1 = ufal::unilib::unicode::Lu | ufal::unilib::unicode::Lt |
432-
ufal::unilib::unicode::Lm | ufal::unilib::unicode::Lo |
433-
ufal::unilib::unicode::M;
431+
const ufal::unilib::unicode::category_t categories1 = ufal::unilib::unicode::Lu | ufal::unilib::unicode::Lt |
432+
ufal::unilib::unicode::Lm | ufal::unilib::unicode::Lo |
433+
ufal::unilib::unicode::M;
434434
if (IsCategory(m_text[i], categories1)) {
435435
for (; i < m_text.size(); ++i) {
436436
if (!IsCategory(m_text[i], categories1)) break;
@@ -440,7 +440,7 @@ class PreTokenizerWithRegEx {
440440
}
441441

442442
// [\p{Ll}\p{Lm}\p{Lo}\p{M}]*
443-
ufal::unilib::unicode::category_t categories2 =
443+
const ufal::unilib::unicode::category_t categories2 =
444444
ufal::unilib::unicode::Ll | ufal::unilib::unicode::Lm | ufal::unilib::unicode::Lo | ufal::unilib::unicode::M;
445445
if (IsCategory(m_text[i], categories2)) {
446446
for (; i < m_text.size(); ++i) {

test/pp_api_test/test_tokenizer.cc

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -70,7 +70,7 @@ TEST(OrtxTokenizerTest, RegexTest) {
7070
auto reg_splitter = std::make_unique<ort_extensions::bpe::PreTokenizerWithRegEx>();
7171

7272
std::vector<std::u32string> res;
73-
std::vector<std::u32string> out_tokens = {U"You'll", U" enjoy", U" the", U" concert", U"."};
73+
std::vector<std::u32string> out_tokens = {U"You'll", U" enjoy", U" the", U" concert"};
7474

7575
int64_t max_length = out_tokens.size();
7676
reg_splitter->Set(str.c_str());
@@ -81,6 +81,7 @@ TEST(OrtxTokenizerTest, RegexTest) {
8181
std::u32string_view tok = reg_splitter->GetNextToken();
8282
res.push_back(ustring(tok));
8383
}
84+
8485
EXPECT_EQ(res, out_tokens);
8586
}
8687

0 commit comments

Comments
 (0)