Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
1341a31
Merge pull request #2 from pavaris-pm/improve-pos-tag-transformers
pavaris-pm Nov 14, 2023
38f71b5
Merge branch 'PyThaiNLP:dev' into dev
pavaris-pm Dec 1, 2023
41d79c2
add phayathaibert core engine
pavaris-pm Dec 1, 2023
cb9e27a
add data augmentation engine
pavaris-pm Dec 4, 2023
473af52
update engine properties
pavaris-pm Dec 4, 2023
0c3efd0
update augmentation properties
pavaris-pm Dec 4, 2023
245e99e
Merge branch 'PyThaiNLP:dev' into dev
pavaris-pm Dec 4, 2023
dd2b834
change license
pavaris-pm Dec 4, 2023
d1b9c99
add er engine
pavaris-pm Dec 4, 2023
cbb7c8e
Update __init__.py
bact Dec 4, 2023
b71ebda
Merge branch 'PyThaiNLP:dev' into dev
pavaris-pm Dec 10, 2023
348dc1f
add documentation and credit model builder
pavaris-pm Dec 10, 2023
c7b6900
Merge branch 'dev' into dev
pavaris-pm Dec 10, 2023
a55168a
update pep8
pavaris-pm Dec 10, 2023
536f493
resolve conflict
pavaris-pm Dec 10, 2023
76b49c3
update pep8
pavaris-pm Dec 10, 2023
22daf2d
update pep8
pavaris-pm Dec 10, 2023
84de5c4
Update core.py: sort imports, remove duplicated lines
bact Dec 10, 2023
a2fd4d3
Update phayathaibert.py: sort imports, remove duplicated lines
bact Dec 10, 2023
7e24d3f
Reexport NamedEntityTagger
bact Dec 10, 2023
826cfed
Fix minor types
bact Dec 10, 2023
72e2bd5
Update __init__.py
bact Dec 10, 2023
dec62c1
Use MAX_NUM_AUGS constant for max num_augs limit
bact Dec 11, 2023
9999f90
Merge branch 'PyThaiNLP:dev' into dev
pavaris-pm Dec 11, 2023
e7ef6ce
Update phayathaibert.py
bact Dec 11, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions pythainlp/phayathaibert/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# -*- coding: utf-8 -*-
# Copyright (C) 2016-2023 PyThaiNLP Project
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Public API of the pythainlp.phayathaibert package:
# re-export the PhayaThaiBERT-based part-of-speech tagger and the
# subword segmenter defined in pythainlp.phayathaibert.core.
__all__ = [
    "PartOfSpeechTagger",
    "segment",
]

from pythainlp.phayathaibert.core import PartOfSpeechTagger, segment
52 changes: 52 additions & 0 deletions pythainlp/phayathaibert/core.py
Comment thread
bact marked this conversation as resolved.
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# -*- coding: utf-8 -*-
# Copyright (C) 2016-2023 PyThaiNLP Project
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List, Tuple, Union
import re
import warnings
from transformers import (
CamembertTokenizer,
)


# HuggingFace Hub model id for the base PhayaThaiBERT model.
_model_name = "clicknext/phayathaibert"
# NOTE(review): created at import time — importing this module triggers a
# tokenizer download/load from the Hub on first use; consider lazy
# initialization inside segment() to avoid the import-time side effect.
_tokenizer = CamembertTokenizer.from_pretrained(_model_name)


class PartOfSpeechTagger:
    """Part-of-speech tagger backed by a PhayaThaiBERT
    token-classification model from the HuggingFace Hub.
    """

    def __init__(self, model: str = "lunarlist/pos_thai_phayathai") -> None:
        """Load the tokenizer and model weights.

        :param str model: HuggingFace Hub id of a token-classification
            model fine-tuned for Thai POS tagging
        """
        # Imported locally so transformers is only required
        # when this tagger is actually instantiated.
        from transformers import (
            AutoModelForTokenClassification,
            AutoTokenizer,
        )

        self.tokenizer = AutoTokenizer.from_pretrained(model)
        self.model = AutoModelForTokenClassification.from_pretrained(model)

    def get_tag(
        self, sentence: str, strategy: str = "simple"
    ) -> List[List[Tuple[str, str]]]:
        """Tag each word of *sentence* with its part of speech.

        :param str sentence: input text to tag
        :param str strategy: aggregation strategy forwarded to the
            transformers pipeline (e.g. "simple", "first", "max")
        :return: a single-element list containing the list of
            (word, POS tag) pairs for the sentence
        """
        from transformers import TokenClassificationPipeline

        pipeline = TokenClassificationPipeline(
            model=self.model,
            tokenizer=self.tokenizer,
            aggregation_strategy=strategy,
        )
        outputs = pipeline(sentence)
        # Outer list kept to preserve the historical return shape.
        return [[(item["word"], item["entity_group"]) for item in outputs]]

def segment(sentence: str) -> List[str]:
    """Split *sentence* into subword tokens with the PhayaThaiBERT tokenizer.

    :param str sentence: text to tokenize
    :return: list of subword tokens; empty list for empty or non-string input
    """
    # Guard clause: anything falsy or not a string yields no tokens.
    if not isinstance(sentence, str) or not sentence:
        return []

    return _tokenizer.tokenize(sentence)
1 change: 1 addition & 0 deletions pythainlp/tag/pos_tag.py
Original file line number Diff line number Diff line change
Expand Up @@ -224,6 +224,7 @@ def pos_tag_transformers(

_blackboard_support_engine = {
"bert" : "lunarlist/pos_thai",
"phayathai" : "lunarlist/pos_thai_phayathai",
}

_pud_support_engine = {
Expand Down
2 changes: 2 additions & 0 deletions pythainlp/tokenize/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -616,6 +616,8 @@ def subword_tokenize(
from pythainlp.tokenize.tltk import syllable_tokenize as segment
elif engine == "han_solo":
from pythainlp.tokenize.han_solo import segment
elif engine == "phayathai":
from pythainlp.phayathaibert import segment
else:
raise ValueError(
f"""Tokenizer \"{engine}\" not found.
Expand Down
12 changes: 12 additions & 0 deletions tests/test_tokenize.py
Original file line number Diff line number Diff line change
Expand Up @@ -431,6 +431,18 @@ def test_subword_tokenize(self):
"า" in subword_tokenize("สวัสดีดาวอังคาร", engine="tltk")
)
self.assertIsInstance(subword_tokenize("โควิด19", engine="tltk"), list)

self.assertEqual(subword_tokenize(None, engine="phayathai"), [])
self.assertEqual(subword_tokenize("", engine="phayathai"), [])
self.assertIsInstance(
subword_tokenize("สวัสดิีดาวอังคาร", engine="phayathai"), list
)
self.assertFalse(
"า" in subword_tokenize("สวัสดีดาวอังคาร", engine="phayathai")
)
self.assertIsInstance(
subword_tokenize("โควิด19", engine="phayathai"), list
)
with self.assertRaises(ValueError):
subword_tokenize("นกแก้ว", engine="XX") # engine does not exist

Expand Down