microsoft · sayanshaw24 · Jan 18, 2025 · Jan 16, 2025 · Jan 16, 2025 · Jan 16, 2025
@@ -359,6 +359,112 @@ class PreTokenizerWithRegEx {
     return {};
   }
 
+  // Advances `index` past the run of consecutive code points in m_text, starting at `index`,
+  // whose Unicode general category belongs to `categories`. `index` is left unchanged when the
+  // code point at `index` is not in the set or when `index` is already at/after the end of m_text.
+  //
+  // The set is folded into one bitmask before scanning: UniLib category_t values are bit flags, and
+  // group constants such as `M` are unions of several single-category bits, so an equality lookup
+  // in the set would never match a group constant against the single bit reported for a code point.
+  void CategoryMatch(size_t& index, std::set<ufal::unilib::unicode::category_t>& categories){
+    ufal::unilib::unicode::category_t mask = 0;
+    for (const auto category : categories) {
+      mask |= category;
+    }
+
+    // The bounds check comes first so the text is never indexed at or past its end.
+    while (index < m_text.size() && (ufal::unilib::unicode::category(m_text[index]) & mask) != 0) {
+      ++index;
+    }
+  }
+
+  // [^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?
+  //
+  // Hand-written matcher for the regex above, anchored at the start of m_text.
+  // Returns the matched text and removes it from the front of m_text. When the pattern does not
+  // match, returns an empty view and leaves m_text untouched, so nothing is consumed by a failed
+  // attempt.
+  std::u32string_view Match_PHI4_Pattern_1() {
+    if (m_text.empty()) {
+      return {};
+    }
+
+    // [\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*
+    std::set<ufal::unilib::unicode::category_t> categories1 = {ufal::unilib::unicode::Lu,
+                                                               ufal::unilib::unicode::Lt,
+                                                               ufal::unilib::unicode::Lm,
+                                                               ufal::unilib::unicode::Lo,
+                                                               ufal::unilib::unicode::M};
+    // [\p{Ll}\p{Lm}\p{Lo}\p{M}]+
+    std::set<ufal::unilib::unicode::category_t> categories2 = {ufal::unilib::unicode::Ll,
+                                                               ufal::unilib::unicode::Lm,
+                                                               ufal::unilib::unicode::Lo,
+                                                               ufal::unilib::unicode::M};
+
+    constexpr size_t kNoMatch = std::u32string_view::npos;
+
+    // Matches `class1* class2+` beginning at `start`; yields the end offset or kNoMatch.
+    auto match_letters = [&](size_t start) -> size_t {
+      if (start >= m_text.size()) {
+        return kNoMatch;
+      }
+
+      size_t upper_end = start;
+      CategoryMatch(upper_end, categories1);
+
+      size_t end = upper_end;
+      CategoryMatch(end, categories2);
+      if (end > upper_end) {
+        return end;
+      }
+
+      // The greedy `*` run left nothing for the mandatory `+` run. The two classes overlap
+      // (Lm, Lo, M), so give characters back one at a time, as a backtracking regex engine
+      // would, until one of them can start the `+` run.
+      for (size_t pos = upper_end; pos > start; --pos) {
+        size_t probe = pos - 1;
+        CategoryMatch(probe, categories2);
+        if (probe > pos - 1) {
+          return probe;
+        }
+      }
+      return kNoMatch;
+    };
+
+    // [^\r\n\p{L}\p{N}]? -- try with the optional leading character first, then without it.
+    const char32_t first = m_text[0];
+    const bool has_prefix = !IsRN(first) && !IsN(first) && !IsL(first);
+    size_t i = has_prefix ? match_letters(1) : kNoMatch;
+    if (i == kNoMatch) {
+      i = match_letters(0);
+    }
+    if (i == kNoMatch) {
+      // The '+' class needs at least one occurrence; report "no match" without consuming text.
+      return {};
+    }
+
+    // (?i:'s|'t|'re|'ve|'m|'ll|'d)?
+    auto ascii_lower = [](char32_t ch) -> char32_t {
+      return (ch >= U'A' && ch <= U'Z') ? static_cast<char32_t>(ch + (U'a' - U'A')) : ch;
+    };
+    // Check the length before indexing: `i` may already be at the end of the text.
+    if ((i + 1) < m_text.size() && m_text[i] == U'\'') {
+      const char32_t c1 = ascii_lower(m_text[i + 1]);
+      if (c1 == U's' || c1 == U't' || c1 == U'm' || c1 == U'd') {
+        i += 2;
+      } else if ((i + 2) < m_text.size()) {
+        const char32_t c2 = ascii_lower(m_text[i + 2]);
+        if ((c1 == U'r' && c2 == U'e') || (c1 == U'v' && c2 == U'e') || (c1 == U'l' && c2 == U'l')) {
+          i += 3;
+        }
+      }
+    }
+
+    std::u32string_view res = m_text.substr(0, i);
+    m_text = m_text.substr(i);
+    return res;
+  }
+
+  // [^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?
+  //
+  // Hand-written matcher for the regex above, anchored at the start of m_text.
+  // Returns the matched text and removes it from the front of m_text. When the pattern does not
+  // match, returns an empty view and leaves m_text untouched, so nothing is consumed by a failed
+  // attempt.
+  std::u32string_view Match_PHI4_Pattern_2() {
+    if (m_text.empty()) {
+      return {};
+    }
+
+    // [\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+
+    std::set<ufal::unilib::unicode::category_t> categories1 = {ufal::unilib::unicode::Lu,
+                                                               ufal::unilib::unicode::Lt,
+                                                               ufal::unilib::unicode::Lm,
+                                                               ufal::unilib::unicode::Lo,
+                                                               ufal::unilib::unicode::M};
+    // [\p{Ll}\p{Lm}\p{Lo}\p{M}]*
+    std::set<ufal::unilib::unicode::category_t> categories2 = {ufal::unilib::unicode::Ll,
+                                                               ufal::unilib::unicode::Lm,
+                                                               ufal::unilib::unicode::Lo,
+                                                               ufal::unilib::unicode::M};
+
+    constexpr size_t kNoMatch = std::u32string_view::npos;
+
+    // Matches `class1+ class2*` beginning at `start`; yields the end offset or kNoMatch.
+    auto match_letters = [&](size_t start) -> size_t {
+      if (start >= m_text.size()) {
+        return kNoMatch;
+      }
+
+      size_t end = start;
+      CategoryMatch(end, categories1);
+      if (end == start) {
+        // The '+' class needs at least one occurrence.
+        return kNoMatch;
+      }
+
+      CategoryMatch(end, categories2);
+      return end;
+    };
+
+    // [^\r\n\p{L}\p{N}]? -- try with the optional leading character first, then without it.
+    const char32_t first = m_text[0];
+    const bool has_prefix = !IsRN(first) && !IsN(first) && !IsL(first);
+    size_t i = has_prefix ? match_letters(1) : kNoMatch;
+    if (i == kNoMatch) {
+      i = match_letters(0);
+    }
+    if (i == kNoMatch) {
+      // Report "no match" without consuming the optional leading character.
+      return {};
+    }
+
+    // (?i:'s|'t|'re|'ve|'m|'ll|'d)?
+    auto ascii_lower = [](char32_t ch) -> char32_t {
+      return (ch >= U'A' && ch <= U'Z') ? static_cast<char32_t>(ch + (U'a' - U'A')) : ch;
+    };
+    // Check the length before indexing: `i` may already be at the end of the text.
+    if ((i + 1) < m_text.size() && m_text[i] == U'\'') {
+      const char32_t c1 = ascii_lower(m_text[i + 1]);
+      if (c1 == U's' || c1 == U't' || c1 == U'm' || c1 == U'd') {
+        i += 2;
+      } else if ((i + 2) < m_text.size()) {
+        const char32_t c2 = ascii_lower(m_text[i + 2]);
+        if ((c1 == U'r' && c2 == U'e') || (c1 == U'v' && c2 == U'e') || (c1 == U'l' && c2 == U'l')) {
+          i += 3;
+        }
+      }
+    }
+
+    std::u32string_view res = m_text.substr(0, i);
+    m_text = m_text.substr(i);
+    return res;
+  }
+
   // "(\p{N})"
   std::u32string_view Match_General_Pattern_1() {
     if (IsN(m_text[0])) {
@@ -387,6 +493,9 @@ class PreTokenizerWithRegEx {
         {R"(\s+(?!\S)|\s+)", &PreTokenizerWithRegEx::Match_GPT2_Pattern_4},
         {R"([\p{L}]+|[\p{N}])", &PreTokenizerWithRegEx::Match_CLIP_Pattern_1},
         {R"([^\s\p{L}\p{N}]+)", &PreTokenizerWithRegEx::Match_CLIP_Pattern_2},
+        {R"([^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?)", &PreTokenizerWithRegEx::Match_PHI4_Pattern_1},
+        {R"([^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?)", &PreTokenizerWithRegEx::Match_PHI4_Pattern_2},
+        {R"(?[^\s\p{L}\p{N}]+[\r\n/]*)", &PreTokenizerWithRegEx::Match_LLAMA3_Pattern_4},
         {R"(\p{N})", &PreTokenizerWithRegEx::Match_General_Pattern_1},
     };
 

@@ -65,6 +65,25 @@ TEST(CApiTest, StreamApiTest) {
   OrtxDispose(&tokenizer);
 }
 
+// Compiles the first PHI-4 word pattern on its own and checks that the first token split from
+// "You'll enjoy the concert." keeps the contraction attached, i.e. is "You'll".
+TEST(OrtxTokenizerTest, RegexTest) {
+  std::u32string str = U"You'll enjoy the concert.";
+  auto reg_splitter = std::make_unique<ort_extensions::bpe::PreTokenizerWithRegEx>();
+
+  std::vector<std::u32string> res;
+  std::vector<std::u32string> out_tokens = {U"You'll"};
+
+  reg_splitter->Set(str.c_str());
+  auto status = reg_splitter->Compile(R"([^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?)");
+  // Use a gtest assertion instead of assert(): assert() is compiled out when NDEBUG is defined,
+  // and when active it aborts the whole test binary instead of failing just this test.
+  ASSERT_TRUE(status.IsOk());
+
+  // Only as many tokens as are listed in out_tokens are pulled from the splitter.
+  while (res.size() < out_tokens.size()) {
+    std::u32string_view tok = reg_splitter->GetNextToken();
+    res.push_back(ustring(tok));
+  }
+  EXPECT_EQ(res, out_tokens);
+}
+
 TEST(OrtxTokenizerTest, ClipTokenizer) {
   auto tokenizer = std::make_unique<ort_extensions::TokenizerImpl>();
   auto status = tokenizer->Load("data/tokenizer/clip");

@@ -120,7 +120,7 @@ def test_llama3_2_image_processing(self):
                 a_image.save(f"{self.temp_dir}/a_{idx}_{i}.png")
 
     # test sentence for tokenizer
-    tokenizer_test_sentence = "I like walking my cute dog\n and\x17 then 生活的真谛是 \t\t\t\t \n\n61"
+    tokenizer_test_sentence = "I like walking my cute dog\n and\x17 then 生活的真谛是 \t\t\t\t \n\n61. You'll enjoy the concert."
 
     def test_OLMa_tokenizer(self):
         test_sentence = [self.tokenizer_test_sentence + " |||IP_ADDRESS|||"]