From 41d79c2c3892d6102ab82502459bd0a155015d49 Mon Sep 17 00:00:00 2001
From: Pavarissy <pavaris.ruang@protonmail.com>
Date: Fri, 1 Dec 2023 14:32:11 +0000
Subject: [PATCH 01/18] add phayathaibert core engine

---
 pythainlp/phayathaibert/__init__.py | 20 +++++++++++
 pythainlp/phayathaibert/core.py     | 52 +++++++++++++++++++++++++++++
 pythainlp/tag/pos_tag.py            |  1 +
 pythainlp/tokenize/core.py          |  2 ++
 tests/test_tokenize.py              | 12 +++++++
 5 files changed, 87 insertions(+)
 create mode 100644 pythainlp/phayathaibert/__init__.py
 create mode 100644 pythainlp/phayathaibert/core.py

diff --git a/pythainlp/phayathaibert/__init__.py b/pythainlp/phayathaibert/__init__.py
new file mode 100644
index 000000000..426a1d3be
--- /dev/null
+++ b/pythainlp/phayathaibert/__init__.py
@@ -0,0 +1,20 @@
+# -*- coding: utf-8 -*-
+# Copyright (C) 2016-2023 PyThaiNLP Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+__all__ = [
+    "PartOfSpeechTagger",
+    "segment",
+]
+
+from pythainlp.phayathaibert.core import PartOfSpeechTagger, segment
\ No newline at end of file
diff --git a/pythainlp/phayathaibert/core.py b/pythainlp/phayathaibert/core.py
new file mode 100644
index 000000000..983d4a203
--- /dev/null
+++ b/pythainlp/phayathaibert/core.py
@@ -0,0 +1,52 @@
+# -*- coding: utf-8 -*-
+# Copyright (C) 2016-2023 PyThaiNLP Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import List, Tuple, Union
+import re
+import warnings
+from transformers import (
+    CamembertTokenizer,
+)
+
+
+_model_name = "clicknext/phayathaibert"
+_tokenizer = CamembertTokenizer.from_pretrained(_model_name)
+
+
+class PartOfSpeechTagger:
+    def __init__(self, model: str="lunarlist/pos_thai_phayathai") -> None:
+        # Load model directly
+        from transformers import (
+            AutoTokenizer, 
+            AutoModelForTokenClassification,
+        )
+        self.tokenizer = AutoTokenizer.from_pretrained(model)
+        self.model = AutoModelForTokenClassification.from_pretrained(model)
+
+    def get_tag(self, sentence: str, strategy: str='simple')->List[List[Tuple[str, str]]]:
+        from transformers import TokenClassificationPipeline
+        pipeline = TokenClassificationPipeline(
+            model=self.model, 
+            tokenizer=self.tokenizer, 
+            aggregation_strategy=strategy,
+        )
+        outputs = pipeline(sentence)
+        word_tags = [[(tag['word'], tag['entity_group']) for tag in outputs]]
+        return word_tags
+    
+def segment(sentence: str)->List[str]:
+    if not sentence or not isinstance(sentence, str):
+        return []
+
+    return _tokenizer.tokenize(sentence)
\ No newline at end of file
diff --git a/pythainlp/tag/pos_tag.py b/pythainlp/tag/pos_tag.py
index ee2a2b478..eb6ab877b 100644
--- a/pythainlp/tag/pos_tag.py
+++ b/pythainlp/tag/pos_tag.py
@@ -224,6 +224,7 @@ def pos_tag_transformers(
 
     _blackboard_support_engine = {
         "bert" : "lunarlist/pos_thai",
+        "phayathai" : "lunarlist/pos_thai_phayathai",
     }
 
     _pud_support_engine = {
diff --git a/pythainlp/tokenize/core.py b/pythainlp/tokenize/core.py
index a19082981..234bccfe5 100644
--- a/pythainlp/tokenize/core.py
+++ b/pythainlp/tokenize/core.py
@@ -616,6 +616,8 @@ def subword_tokenize(
         from pythainlp.tokenize.tltk import syllable_tokenize as segment
     elif engine == "han_solo":
         from pythainlp.tokenize.han_solo import segment
+    elif engine == "phayathai":
+        from pythainlp.phayathaibert import segment
     else:
         raise ValueError(
             f"""Tokenizer \"{engine}\" not found.
diff --git a/tests/test_tokenize.py b/tests/test_tokenize.py
index 1537b62c9..1f1ed4ac5 100644
--- a/tests/test_tokenize.py
+++ b/tests/test_tokenize.py
@@ -431,6 +431,18 @@ def test_subword_tokenize(self):
             "า" in subword_tokenize("สวัสดีดาวอังคาร", engine="tltk")
         )
         self.assertIsInstance(subword_tokenize("โควิด19", engine="tltk"), list)
+
+        self.assertEqual(subword_tokenize(None, engine="phayathai"), [])
+        self.assertEqual(subword_tokenize("", engine="phayathai"), [])
+        self.assertIsInstance(
+            subword_tokenize("สวัสดิีดาวอังคาร", engine="phayathai"), list
+        )
+        self.assertFalse(
+            "า" in subword_tokenize("สวัสดีดาวอังคาร", engine="phayathai")
+        )
+        self.assertIsInstance(
+            subword_tokenize("โควิด19", engine="phayathai"), list
+        )
         with self.assertRaises(ValueError):
             subword_tokenize("นกแก้ว", engine="XX")  # engine does not exist
 

From cb9e27aed1ec4a7dad8d6cccc3fe34ff2f84b283 Mon Sep 17 00:00:00 2001
From: Pavarissy <pavaris.ruang@protonmail.com>
Date: Mon, 4 Dec 2023 10:04:12 +0000
Subject: [PATCH 02/18] add data augmentation engine

---
 pythainlp/augment/lm/__init__.py      |   2 +
 pythainlp/augment/lm/phayathaibert.py |  73 +++++++++
 pythainlp/phayathaibert/__init__.py   |   9 +-
 pythainlp/phayathaibert/core.py       | 219 +++++++++++++++++++++++++-
 tests/test_augment.py                 |   5 +
 5 files changed, 306 insertions(+), 2 deletions(-)
 create mode 100644 pythainlp/augment/lm/phayathaibert.py

diff --git a/pythainlp/augment/lm/__init__.py b/pythainlp/augment/lm/__init__.py
index 968380438..2798bc3b9 100644
--- a/pythainlp/augment/lm/__init__.py
+++ b/pythainlp/augment/lm/__init__.py
@@ -19,7 +19,9 @@
 __all__ = [
     "FastTextAug",
     "Thai2transformersAug",
+    "ThaiTextAugmenter",
 ]
 
 from pythainlp.augment.lm.fasttext import FastTextAug
 from pythainlp.augment.lm.wangchanberta import Thai2transformersAug
+from pythainlp.augment.lm.phayathaibert import ThaiTextAugmenter
diff --git a/pythainlp/augment/lm/phayathaibert.py b/pythainlp/augment/lm/phayathaibert.py
new file mode 100644
index 000000000..e9303616e
--- /dev/null
+++ b/pythainlp/augment/lm/phayathaibert.py
@@ -0,0 +1,73 @@
+# -*- coding: utf-8 -*-
+# Copyright (C) 2016-2023 PyThaiNLP Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import List
+import re
+import random
+from pythainlp.phayathaibert.core import ThaiTextProcessor
+
+
+_model_name = "clicknext/phayathaibert"
+
+class ThaiTextAugmenter:
+    def __init__(self,)->None:
+        from transformers import (AutoTokenizer, 
+                                  AutoModelForMaskedLM, 
+                                  pipeline,)
+        self.tokenizer = AutoTokenizer.from_pretrained(_model_name)
+        self.model_for_masked_lm = AutoModelForMaskedLM.from_pretrained(_model_name)
+        self.model = pipeline("fill-mask", tokenizer=self.tokenizer, model=self.model_for_masked_lm)
+        self.processor = ThaiTextProcessor()
+
+    def generate(self,
+                 sample_text: str, 
+                 word_rank: int, 
+                 max_length: int=3,
+                 sample: bool=False
+                 )->str:
+        sample_txt = sample_text
+        final_text = ""
+        for j in range(max_length):
+            input = self.processor.preprocess(sample_txt)
+            if sample:
+                random_word_idx = random.randint(0, 4)
+                output = self.model(input)[random_word_idx]['sequence']
+            else:
+                output = self.model(input)[word_rank]['sequence']
+            sample_txt = output+"<mask>"
+            final_text = sample_txt
+
+        gen_txt = re.sub("<mask>","",final_text)
+        return gen_txt
+    
+
+    def augment(self,
+                text: str, 
+                num_augs: int, 
+                sample: bool=False)->List[str]:
+        augment_list = []
+        if "<mask>" not in text:
+            text = text+"<mask>" 
+        if num_augs <= 5:
+            for rank in range(num_augs):
+                gen_text = self.generate(text, rank, sample=sample)
+                processed_text = re.sub("<_>", " ", self.processor.preprocess(gen_text))
+                augment_list.append(processed_text)
+
+            return augment_list
+        else:
+            raise ValueError(
+                f"augmentation of more than {num_augs} is exceeded the default limit"
+            )
+
diff --git a/pythainlp/phayathaibert/__init__.py b/pythainlp/phayathaibert/__init__.py
index 426a1d3be..8d579d760 100644
--- a/pythainlp/phayathaibert/__init__.py
+++ b/pythainlp/phayathaibert/__init__.py
@@ -13,8 +13,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 __all__ = [
+    "ThaiTextProcessor",
+    "ThaiTextAugmenter",
     "PartOfSpeechTagger",
     "segment",
 ]
 
-from pythainlp.phayathaibert.core import PartOfSpeechTagger, segment
\ No newline at end of file
+from pythainlp.phayathaibert.core import (
+    ThaiTextProcessor, 
+    ThaiTextAugmenter, 
+    PartOfSpeechTagger, 
+    segment,
+)
\ No newline at end of file
diff --git a/pythainlp/phayathaibert/core.py b/pythainlp/phayathaibert/core.py
index 983d4a203..913487ac5 100644
--- a/pythainlp/phayathaibert/core.py
+++ b/pythainlp/phayathaibert/core.py
@@ -12,9 +12,11 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from typing import List, Tuple, Union
+from typing import List, Tuple, Collection, Callable
 import re
 import warnings
+import random
+from pythainlp.tokenize import word_tokenize
 from transformers import (
     CamembertTokenizer,
 )
@@ -23,6 +25,221 @@
 _model_name = "clicknext/phayathaibert"
 _tokenizer = CamembertTokenizer.from_pretrained(_model_name)
 
+class ThaiTextProcessor:
+    def __init__(self):
+        self._TK_UNK, self._TK_REP, self._TK_WREP, self._TK_URL, self._TK_END = "<unk> <rep> <wrep> <url> </s>".split()
+        self.SPACE_SPECIAL_TOKEN = "<_>"
+
+
+    def replace_url(self, text: str) -> str:
+        """
+            Replace url in `text` with TK_URL (https://stackoverflow.com/a/6041965)
+            :param str text: text to replace url
+            :return: text where urls  are replaced
+            :rtype: str
+            :Example:
+                >>> replace_url("go to https://github.com")
+                go to <url>
+        """
+        URL_PATTERN = r"(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?"
+        return re.sub(URL_PATTERN, self._TK_URL, text)
+
+    def rm_brackets(text: str) -> str:
+        """
+            Remove all empty brackets and artifacts within brackets from `text`.
+            :param str text: text to remove useless brackets
+            :return: text where all useless brackets are removed
+            :rtype: str
+            :Example:
+                >>> rm_brackets("hey() whats[;] up{*&} man(hey)")
+                hey whats up man(hey)
+        """
+        # remove empty brackets
+        new_line = re.sub(r"\(\)", "", text)
+        new_line = re.sub(r"\{\}", "", new_line)
+        new_line = re.sub(r"\[\]", "", new_line)
+        # brakets with only punctuations
+        new_line = re.sub(r"\([^a-zA-Z0-9ก-๙]+\)", "", new_line)
+        new_line = re.sub(r"\{[^a-zA-Z0-9ก-๙]+\}", "", new_line)
+        new_line = re.sub(r"\[[^a-zA-Z0-9ก-๙]+\]", "", new_line)
+        # artifiacts after (
+        new_line = re.sub(r"(?<=\()[^a-zA-Z0-9ก-๙]+(?=[a-zA-Z0-9ก-๙])", "", new_line)
+        new_line = re.sub(r"(?<=\{)[^a-zA-Z0-9ก-๙]+(?=[a-zA-Z0-9ก-๙])", "", new_line)
+        new_line = re.sub(r"(?<=\[)[^a-zA-Z0-9ก-๙]+(?=[a-zA-Z0-9ก-๙])", "", new_line)
+        # artifacts before )
+        new_line = re.sub(r"(?<=[a-zA-Z0-9ก-๙])[^a-zA-Z0-9ก-๙]+(?=\))", "", new_line)
+        new_line = re.sub(r"(?<=[a-zA-Z0-9ก-๙])[^a-zA-Z0-9ก-๙]+(?=\})", "", new_line)
+        new_line = re.sub(r"(?<=[a-zA-Z0-9ก-๙])[^a-zA-Z0-9ก-๙]+(?=\])", "", new_line)
+        return new_line
+
+    def replace_newlines(text: str) -> str:
+        """
+            Replace newlines in `text` with spaces.
+            :param str text: text to replace all newlines with spaces
+            :return: text where all newlines are replaced with spaces
+            :rtype: str
+            :Example:
+                >>> rm_useless_spaces("hey whats\n\nup")
+                hey whats  up
+        """
+
+        return re.sub(r"[\n]", " ", text.strip())
+
+    def rm_useless_spaces(text: str) -> str:
+        """
+            Remove multiple spaces in `text`. (code from `fastai`)
+            :param str text: text to replace useless spaces
+            :return: text where all spaces are reduced to one
+            :rtype: str
+            :Example:
+                >>> rm_useless_spaces("oh         no")
+                oh no
+        """
+        return re.sub(" {2,}", " ", text)
+
+    def replace_spaces(text: str, space_token: str = SPACE_SPECIAL_TOKEN) -> str:
+        """
+            Replace spaces with _
+            :param str text: text to replace spaces
+            :return: text where all spaces replaced with _
+            :rtype: str
+            :Example:
+                >>> replace_spaces("oh no")
+                oh_no
+        """
+        return re.sub(" ", space_token, text)
+
+    def replace_rep_after(text: str) -> str:
+        """
+        Replace repetitions at the character level in `text`
+        :param str text: input text to replace character repetition
+        :return: text with repetitive tokens removed.
+        :rtype: str
+        :Example:
+            >>> text = "กาาาาาาา"
+            >>> replace_rep_after(text)
+            'กา'
+        """
+
+        def _replace_rep(m):
+            c, cc = m.groups()
+            return f"{c}"
+
+        re_rep = re.compile(r"(\S)(\1{3,})")
+        return re_rep.sub(_replace_rep, text)
+
+    def replace_wrep_post(toks: Collection[str]) -> Collection[str]:
+        """
+        Replace reptitive words post tokenization;
+        fastai `replace_wrep` does not work well with Thai.
+        :param Collection[str] toks: list of tokens
+        :return: list of tokens where repetitive words are removed.
+        :rtype: Collection[str]
+        :Example:
+            >>> toks = ["กา", "น้ำ", "น้ำ", "น้ำ", "น้ำ"]
+            >>> replace_wrep_post(toks)
+            ['กา', 'น้ำ']
+        """
+        previous_word = None
+        rep_count = 0
+        res = []
+        for current_word in toks + [self._TK_END]:
+            if current_word == previous_word:
+                rep_count += 1
+            elif (current_word != previous_word) & (rep_count > 0):
+                res += [previous_word]
+                rep_count = 0
+            else:
+                res.append(previous_word)
+            previous_word = current_word
+        return res[1:]
+
+    def remove_space(toks: Collection[str]) -> Collection[str]:
+        """
+        Do not include space for bag-of-word models.
+        :param Collection[str] toks: list of tokens
+        :return: Collection of tokens where space tokens (" ") are filtered out
+        :rtype: Collection[str]
+        :Example:
+            >>> toks = ['ฉัน','เดิน',' ','กลับ','บ้าน']
+            >>> remove_space(toks)
+            ['ฉัน','เดิน','กลับ','บ้าน']
+        """
+        res = []
+        for t in toks:
+            t = t.strip()
+            if t:
+                res.append(t)
+        return res
+
+    # combine them together
+    def preprocess(
+        self,
+        text: str,
+        pre_rules: Collection[Callable] = [
+            rm_brackets,
+            replace_newlines,
+            rm_useless_spaces,
+            replace_spaces,
+            replace_rep_after,
+        ],
+        tok_func: Callable = word_tokenize,
+    ) -> str:
+        text = text.lower()
+        for rule in pre_rules:
+            text = rule(text)
+        toks = tok_func(text)
+        return "".join(toks)
+
+
+
+class ThaiTextAugmenter:
+    def __init__(self)->None:
+        from transformers import (AutoTokenizer, 
+                                  AutoModelForMaskedLM, 
+                                  pipeline,)
+        self.tokenizer = AutoTokenizer.from_pretrained(_model_name)
+        self.model_for_masked_lm = AutoModelForMaskedLM.from_pretrained(_model_name)
+        self.model = pipeline("fill-mask", tokenizer=self.tokenizer, model=self.model_for_masked_lm)
+        self.processor = ThaiTextProcessor()
+
+    def generate(self,
+                 sample_text: str, 
+                 word_rank: int, 
+                 max_length: int=3,
+                 sample: bool=False
+                 )->str:
+        sample_txt = sample_text
+        final_text = ""
+        for j in range(max_length):
+            input = self.processor.preprocess(sample_txt)
+            if sample:
+                random_word_idx = random.randint(0, 4)
+                output = self.model(input)[random_word_idx]['sequence']
+            else:
+                output = self.model(input)[word_rank]['sequence']
+            sample_txt = output+"<mask>"
+            final_text = sample_txt
+
+        gen_txt = re.sub("<mask>","",final_text)
+        return gen_txt
+    
+
+    def augment(self,
+                text: str, 
+                num_augs: int, 
+                sample: bool=False
+                )->List[str]:
+        augment_list = []
+        if num_augs <= 5: # since huggingface transformers pipeline default was set to 5 generated text
+            for rank in range(num_augs):
+                gen_text = self.generate(text, rank, sample=sample)
+                processed_text = re.sub("<_>", " ", self.processor.preprocess(gen_text))
+                augment_list.append(processed_text)
+
+            return augment_list
+
+
 
 class PartOfSpeechTagger:
     def __init__(self, model: str="lunarlist/pos_thai_phayathai") -> None:
diff --git a/tests/test_augment.py b/tests/test_augment.py
index 51dc89082..5542d96c3 100644
--- a/tests/test_augment.py
+++ b/tests/test_augment.py
@@ -5,6 +5,7 @@
 from pythainlp.augment import WordNetAug
 from pythainlp.augment.wordnet import postype2wordnet
 # from pythainlp.augment.lm import Thai2transformersAug
+# from pythainlp.augment.lm.phayathaibert import ThaiTextAugmenter
 from pythainlp.augment.word2vec.bpemb_wv import BPEmbAug
 from pythainlp.augment.word2vec import (
     LTW2VAug
@@ -43,3 +44,7 @@ def test_LTW2VAug(self):
     # def test_Thai2transformersAug(self):
     #     _aug = Thai2transformersAug()
     #     self.assertIsNotNone(_aug.augment(self.text2, num_replace_tokens=1))
+
+     # def test_ThaiTextAugmenter(self):
+    #     _aug = ThaiTextAugmenter()
+    #     self.assertIsNotNone(_aug.augment(self.text2, num__augs=3))
\ No newline at end of file

From 473af52ed0afdf2df04f772e2ffa03a4f3ed6d7c Mon Sep 17 00:00:00 2001
From: Pavarissy <pavaris.ruang@protonmail.com>
Date: Mon, 4 Dec 2023 10:11:44 +0000
Subject: [PATCH 03/18] update engine properties

---
 pythainlp/augment/lm/phayathaibert.py | 2 +-
 pythainlp/phayathaibert/core.py       | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/pythainlp/augment/lm/phayathaibert.py b/pythainlp/augment/lm/phayathaibert.py
index e9303616e..40d0877a1 100644
--- a/pythainlp/augment/lm/phayathaibert.py
+++ b/pythainlp/augment/lm/phayathaibert.py
@@ -54,7 +54,7 @@ def generate(self,
 
     def augment(self,
                 text: str, 
-                num_augs: int, 
+                num_augs: int=3, 
                 sample: bool=False)->List[str]:
         augment_list = []
         if "<mask>" not in text:
diff --git a/pythainlp/phayathaibert/core.py b/pythainlp/phayathaibert/core.py
index 913487ac5..d25e7b796 100644
--- a/pythainlp/phayathaibert/core.py
+++ b/pythainlp/phayathaibert/core.py
@@ -97,7 +97,7 @@ def rm_useless_spaces(text: str) -> str:
         """
         return re.sub(" {2,}", " ", text)
 
-    def replace_spaces(text: str, space_token: str = SPACE_SPECIAL_TOKEN) -> str:
+    def replace_spaces(text: str, space_token: str = self.SPACE_SPECIAL_TOKEN) -> str:
         """
             Replace spaces with _
             :param str text: text to replace spaces
@@ -227,7 +227,7 @@ def generate(self,
 
     def augment(self,
                 text: str, 
-                num_augs: int, 
+                num_augs: int=3, 
                 sample: bool=False
                 )->List[str]:
         augment_list = []

From 0c3efd0b4f9b5e150b3092c2e506c47b2d8f370c Mon Sep 17 00:00:00 2001
From: Pavarissy <pavaris.ruang@protonmail.com>
Date: Mon, 4 Dec 2023 10:26:08 +0000
Subject: [PATCH 04/18] update augmentation properties

---
 pythainlp/phayathaibert/core.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/pythainlp/phayathaibert/core.py b/pythainlp/phayathaibert/core.py
index d25e7b796..8fd5229af 100644
--- a/pythainlp/phayathaibert/core.py
+++ b/pythainlp/phayathaibert/core.py
@@ -14,7 +14,6 @@
 # limitations under the License.
 from typing import List, Tuple, Collection, Callable
 import re
-import warnings
 import random
 from pythainlp.tokenize import word_tokenize
 from transformers import (
@@ -97,7 +96,7 @@ def rm_useless_spaces(text: str) -> str:
         """
         return re.sub(" {2,}", " ", text)
 
-    def replace_spaces(text: str, space_token: str = self.SPACE_SPECIAL_TOKEN) -> str:
+    def replace_spaces(text: str, space_token: str = "<_>") -> str:
         """
             Replace spaces with _
             :param str text: text to replace spaces
@@ -128,7 +127,7 @@ def _replace_rep(m):
         re_rep = re.compile(r"(\S)(\1{3,})")
         return re_rep.sub(_replace_rep, text)
 
-    def replace_wrep_post(toks: Collection[str]) -> Collection[str]:
+    def replace_wrep_post(self, toks: Collection[str]) -> Collection[str]:
         """
         Replace reptitive words post tokenization;
         fastai `replace_wrep` does not work well with Thai.

From dd2b8347c6ffff6f8a0b93e5d495147c85525abc Mon Sep 17 00:00:00 2001
From: Pavarissy <pavaris.ruang@protonmail.com>
Date: Mon, 4 Dec 2023 10:43:21 +0000
Subject: [PATCH 05/18] change license

---
 pythainlp/augment/lm/phayathaibert.py | 15 ++-------------
 pythainlp/phayathaibert/__init__.py   | 15 ++-------------
 pythainlp/phayathaibert/core.py       | 15 ++-------------
 3 files changed, 6 insertions(+), 39 deletions(-)

diff --git a/pythainlp/augment/lm/phayathaibert.py b/pythainlp/augment/lm/phayathaibert.py
index 40d0877a1..e0dbc9d39 100644
--- a/pythainlp/augment/lm/phayathaibert.py
+++ b/pythainlp/augment/lm/phayathaibert.py
@@ -1,17 +1,6 @@
 # -*- coding: utf-8 -*-
-# Copyright (C) 2016-2023 PyThaiNLP Project
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
+# SPDX-FileCopyrightText: Copyright 2016-2023 PyThaiNLP Project
+# SPDX-License-Identifier: Apache-2.0
 from typing import List
 import re
 import random
diff --git a/pythainlp/phayathaibert/__init__.py b/pythainlp/phayathaibert/__init__.py
index 8d579d760..7e2dba7f1 100644
--- a/pythainlp/phayathaibert/__init__.py
+++ b/pythainlp/phayathaibert/__init__.py
@@ -1,17 +1,6 @@
 # -*- coding: utf-8 -*-
-# Copyright (C) 2016-2023 PyThaiNLP Project
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
+# SPDX-FileCopyrightText: Copyright 2016-2023 PyThaiNLP Project
+# SPDX-License-Identifier: Apache-2.0
 __all__ = [
     "ThaiTextProcessor",
     "ThaiTextAugmenter",
diff --git a/pythainlp/phayathaibert/core.py b/pythainlp/phayathaibert/core.py
index 8fd5229af..3f35e144f 100644
--- a/pythainlp/phayathaibert/core.py
+++ b/pythainlp/phayathaibert/core.py
@@ -1,17 +1,6 @@
 # -*- coding: utf-8 -*-
-# Copyright (C) 2016-2023 PyThaiNLP Project
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
+# SPDX-FileCopyrightText: Copyright 2016-2023 PyThaiNLP Project
+# SPDX-License-Identifier: Apache-2.0
 from typing import List, Tuple, Collection, Callable
 import re
 import random

From d1b9c99b4b03ccb72176616ab98de0e4e1dc7aa0 Mon Sep 17 00:00:00 2001
From: Pavarissy <pavaris.ruang@protonmail.com>
Date: Mon, 4 Dec 2023 16:52:40 +0000
Subject: [PATCH 06/18] add NER engine

---
 pythainlp/phayathaibert/__init__.py |  1 +
 pythainlp/phayathaibert/core.py     | 45 ++++++++++++++++++++++++++++-
 pythainlp/tag/named_entity.py       |  4 +++
 3 files changed, 49 insertions(+), 1 deletion(-)

diff --git a/pythainlp/phayathaibert/__init__.py b/pythainlp/phayathaibert/__init__.py
index 7e2dba7f1..232399079 100644
--- a/pythainlp/phayathaibert/__init__.py
+++ b/pythainlp/phayathaibert/__init__.py
@@ -11,6 +11,7 @@
 from pythainlp.phayathaibert.core import (
     ThaiTextProcessor, 
     ThaiTextAugmenter, 
+    NamedEntityTagger,
     PartOfSpeechTagger, 
     segment,
 )
\ No newline at end of file
diff --git a/pythainlp/phayathaibert/core.py b/pythainlp/phayathaibert/core.py
index 3f35e144f..5ecebe740 100644
--- a/pythainlp/phayathaibert/core.py
+++ b/pythainlp/phayathaibert/core.py
@@ -1,9 +1,10 @@
 # -*- coding: utf-8 -*-
 # SPDX-FileCopyrightText: Copyright 2016-2023 PyThaiNLP Project
 # SPDX-License-Identifier: Apache-2.0
-from typing import List, Tuple, Collection, Callable
+from typing import List, Union, Tuple, Collection, Callable
 import re
 import random
+import warnings
 from pythainlp.tokenize import word_tokenize
 from transformers import (
     CamembertTokenizer,
@@ -250,6 +251,48 @@ def get_tag(self, sentence: str, strategy: str='simple')->List[List[Tuple[str, s
         word_tags = [[(tag['word'], tag['entity_group']) for tag in outputs]]
         return word_tags
     
+class NamedEntityTagger:
+     def __init__(self, model: str="Pavarissy/phayathaibert-thainer") -> None:
+        from transformers import (
+            AutoTokenizer, 
+            AutoModelForTokenClassification,
+        )
+        self.tokenizer = AutoTokenizer.from_pretrained(model)
+        self.model = AutoModelForTokenClassification.from_pretrained(model)
+     def get_ner(self,
+                text: str,
+                tag: bool=False, 
+                pos: bool=False,
+                strategy: str="simple"
+                )->Union[List[Tuple[str, str]], List[Tuple[str, str, str]], str]:
+        from transformers import TokenClassificationPipeline
+        if pos:
+            warnings.warn("This model doesn't support output postag and It doesn't output the postag.")
+        sample_output = []
+        tag_text_list = []
+        current_pos = 0
+        pipeline = TokenClassificationPipeline(
+            model=self.model, 
+            tokenizer=self.tokenizer, 
+            aggregation_strategy=strategy,
+        )
+        outputs = pipeline(text)
+        for token in outputs:
+            ner_tag = token['entity_group']
+            begin_pos, end_pos = token['start'], token['end']
+            if current_pos == 0:
+                text_tag = text[:begin_pos]+f"<{ner_tag}>"+text[begin_pos:end_pos]+f"</{ner_tag}>"
+            else:
+                text_tag = text[current_pos:begin_pos]+f"<{ner_tag}>"+text[begin_pos:end_pos]+f"</{ner_tag}>"
+            tag_text_list.append(text_tag)
+            sample_output.append((token['word'], token['entity_group']))
+            current_pos = end_pos
+        if tag:
+            return str("".join(tag_text_list))
+        else:
+            return sample_output
+
+    
 def segment(sentence: str)->List[str]:
     if not sentence or not isinstance(sentence, str):
         return []
diff --git a/pythainlp/tag/named_entity.py b/pythainlp/tag/named_entity.py
index 109e0faa1..0ab087cb6 100644
--- a/pythainlp/tag/named_entity.py
+++ b/pythainlp/tag/named_entity.py
@@ -46,6 +46,10 @@ def load_engine(self, engine: str, corpus: str) -> None:
             from pythainlp.wangchanberta import ThaiNameTagger
 
             self.engine = ThaiNameTagger(dataset_name=corpus)
+        elif engine=="phayathaibert" and corpus == "thainer-v2":
+            from pythainlp.phayathaibert.core import NamedEntityTagger
+
+            self.engine = NamedEntityTagger()
         else:
             raise ValueError(
                 "NER class not support {0} engine or {1} corpus.".format(

From cbb7c8e52787b8ca795f4cd28046029cd7c4b3fa Mon Sep 17 00:00:00 2001
From: Arthit Suriyawongkul <arthit@gmail.com>
Date: Mon, 4 Dec 2023 17:48:59 +0000
Subject: [PATCH 07/18] Update __init__.py

---
 pythainlp/augment/lm/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pythainlp/augment/lm/__init__.py b/pythainlp/augment/lm/__init__.py
index 48e8c6c5e..febe833f5 100644
--- a/pythainlp/augment/lm/__init__.py
+++ b/pythainlp/augment/lm/__init__.py
@@ -12,5 +12,5 @@
 ]
 
 from pythainlp.augment.lm.fasttext import FastTextAug
-from pythainlp.augment.lm.wangchanberta import Thai2transformersAug
 from pythainlp.augment.lm.phayathaibert import ThaiTextAugmenter
+from pythainlp.augment.lm.wangchanberta import Thai2transformersAug

From 348dc1f0f74ec2873f8fe55cac33d677c64de017 Mon Sep 17 00:00:00 2001
From: Pavarissy <pavaris.ruang@protonmail.com>
Date: Sun, 10 Dec 2023 12:25:58 +0000
Subject: [PATCH 08/18] add documentation and credit model builder

Co-authored-by: MpolaarbearM <notari5555@gmail.com>
---
 pythainlp/augment/lm/phayathaibert.py | 25 +++++++++
 pythainlp/phayathaibert/core.py       | 77 ++++++++++++++++++++++++++-
 pythainlp/tag/pos_tag.py              |  1 +
 3 files changed, 102 insertions(+), 1 deletion(-)

diff --git a/pythainlp/augment/lm/phayathaibert.py b/pythainlp/augment/lm/phayathaibert.py
index e0dbc9d39..6197e5116 100644
--- a/pythainlp/augment/lm/phayathaibert.py
+++ b/pythainlp/augment/lm/phayathaibert.py
@@ -45,6 +45,31 @@ def augment(self,
                 text: str, 
                 num_augs: int=3, 
                 sample: bool=False)->List[str]:
+        """
+        Text Augment from phayathaibert
+
+        :param str text: thai text
+        :param int num_augs: an amount of augmentation text needed as an output
+        :param bool sample: whether to sample the text as an output or not, true if more word diversity is needed
+
+        :return: list of text augment
+        :rtype: List[str]
+
+        :Example:
+        ::
+
+            from pythainlp.augment.lm import ThaiTextAugmenter
+
+            aug=ThaiTextAugmenter()
+            aug = ThaiTextAugmenter()
+            aug.augment("ช้างมีทั้งหมด 50 ตัว บน", num_augs=5)
+
+            # output = ['ช้างมีทั้งหมด 50 ตัว บนโลกใบนี้ครับ.',
+                'ช้างมีทั้งหมด 50 ตัว บนพื้นดินครับ...',
+                'ช้างมีทั้งหมด 50 ตัว บนท้องฟ้าครับ...',
+                'ช้างมีทั้งหมด 50 ตัว บนดวงจันทร์.‼',
+                'ช้างมีทั้งหมด 50 ตัว บนเขาค่ะ😁']
+        """
         augment_list = []
         if "<mask>" not in text:
             text = text+"<mask>" 
diff --git a/pythainlp/phayathaibert/core.py b/pythainlp/phayathaibert/core.py
index 5ecebe740..1cce1b203 100644
--- a/pythainlp/phayathaibert/core.py
+++ b/pythainlp/phayathaibert/core.py
@@ -219,8 +219,33 @@ def augment(self,
                 num_augs: int=3, 
                 sample: bool=False
                 )->List[str]:
+        """
+        Text Augment from phayathaibert
+
+        :param str text: thai text
+        :param int num_augs: an amount of augmentation text needed as an output
+        :param bool sample: whether to sample the text as an output or not, true if more word diversity is needed
+
+        :return: list of text augment
+        :rtype: List[str]
+
+        :Example:
+        ::
+
+            from pythainlp.augment.lm import ThaiTextAugmenter
+
+            aug=ThaiTextAugmenter()
+            aug = ThaiTextAugmenter()
+            aug.augment("ช้างมีทั้งหมด 50 ตัว บน", num_augs=5)
+
+            # output = ['ช้างมีทั้งหมด 50 ตัว บนโลกใบนี้ครับ.',
+                'ช้างมีทั้งหมด 50 ตัว บนพื้นดินครับ...',
+                'ช้างมีทั้งหมด 50 ตัว บนท้องฟ้าครับ...',
+                'ช้างมีทั้งหมด 50 ตัว บนดวงจันทร์.‼',
+                'ช้างมีทั้งหมด 50 ตัว บนเขาค่ะ😁']
+        """
         augment_list = []
-        if num_augs <= 5: # since huggingface transformers pipeline default was set to 5 generated text
+        if num_augs <= 5: 
             for rank in range(num_augs):
                 gen_text = self.generate(text, rank, sample=sample)
                 processed_text = re.sub("<_>", " ", self.processor.preprocess(gen_text))
@@ -241,6 +266,24 @@ def __init__(self, model: str="lunarlist/pos_thai_phayathai") -> None:
         self.model = AutoModelForTokenClassification.from_pretrained(model)
 
     def get_tag(self, sentence: str, strategy: str='simple')->List[List[Tuple[str, str]]]:
+        """
+    Marks sentences with part-of-speech (POS) tags.
+
+    :param str sentence: text to be tagged
+    :return: a list of lists of tuples (word, POS tag)
+    :rtype: list[list[tuple[str, str]]]
+
+    :Example:
+
+    Labels POS for given sentence::
+
+        from pythainlp.phayathaibert.core import PartOfSpeechTagger
+
+        tagger = PartOfSpeechTagger()
+        tagger.get_tag("แมวทำอะไรตอนห้าโมงเช้า")
+        # output:
+        # [[('แมว', 'NOUN'), ('ทําอะไร', 'VERB'), ('ตอนห้าโมงเช้า', 'NOUN')]]
+        """
         from transformers import TokenClassificationPipeline
         pipeline = TokenClassificationPipeline(
             model=self.model, 
@@ -265,6 +308,31 @@ def get_ner(self,
                 pos: bool=False,
                 strategy: str="simple"
                 )->Union[List[Tuple[str, str]], List[Tuple[str, str, str]], str]:
+        """
+        This function tags named entities in text in IOB format.
+
+        :param str text: text in Thai to be tagged
+        :param bool pos: output with part-of-speech tags.\
+            (phayathaibert is supported in PartOfSpeechTagger)
+        :return: a list of tuples associated with tokenized words, NER tags,
+                 POS tags (if the parameter `pos` is specified as `True`),
+                 and output HTML-like tags (if the parameter `tag` is
+                 specified as `True`).
+                 Otherwise, return a list of tuples associated with tokenized
+                 words and NER tags
+        :rtype: Union[List[Tuple[str, str]], List[Tuple[str, str, str]], str]
+        :Example:
+
+            >>> from pythainlp.phayathaibert.core import NamedEntityTagger
+            >>>
+            >>> tagger = NamedEntityTagger()
+            >>> tagger.get_ner("ทดสอบนายปวริศ เรืองจุติโพธิ์พานจากประเทศไทย")
+            [('นายปวริศ เรืองจุติโพธิ์พานจากประเทศไทย', 'PERSON'),
+            ('จาก', 'LOCATION'),
+            ('ประเทศไทย', 'LOCATION')]
+            >>> ner.tag("ทดสอบนายปวริศ เรืองจุติโพธิ์พานจากประเทศไทย", tag=True)
+            'ทดสอบ<PERSON>นายปวริศ เรืองจุติโพธิ์พาน</PERSON><LOCATION>จาก</LOCATION><LOCATION>ประเทศไทย</LOCATION>'
+        """
         from transformers import TokenClassificationPipeline
         if pos:
             warnings.warn("This model doesn't support output postag and It doesn't output the postag.")
@@ -294,6 +362,13 @@ def get_ner(self,
 
     
 def segment(sentence: str)->List[str]:
+    """
+    Subword tokenize of phayathaibert, sentencepiece from wangchanberta model with Vocabulary Expansion.
+
+    :param str text: text to be tokenized
+    :return: list of subwords
+    :rtype: list[str]
+    """
     if not sentence or not isinstance(sentence, str):
         return []
 
diff --git a/pythainlp/tag/pos_tag.py b/pythainlp/tag/pos_tag.py
index f850b404b..92f437262 100644
--- a/pythainlp/tag/pos_tag.py
+++ b/pythainlp/tag/pos_tag.py
@@ -180,6 +180,7 @@ def pos_tag_transformers(
     :param str engine:
         * *bert* -  BERT: Bidirectional Encoder Representations from Transformers (default)
         * *wangchanberta* - fine-tuned version of airesearch/wangchanberta-base-att-spm-uncased on pud corpus (support PUD cotpus only)
+        * *phayathaibert* - fine-tuned version of clicknext/phayathaibert on blackboard corpus (support blackboard cotpus only)
         * *mdeberta* - mDeBERTa: Multilingual Decoding-enhanced BERT with disentangled attention (support PUD corpus only)
     :param str corpus: the corpus that is used to create the language model for tagger
         * *blackboard* - `blackboard treebank (support bert engine only) <https://bitbucket.org/kaamanita/blackboard-treebank/src/master/>`_

From a55168a3d8f6ac6363547bd99d6af38d6acff769 Mon Sep 17 00:00:00 2001
From: Pavarissy <pavaris.ruang@protonmail.com>
Date: Sun, 10 Dec 2023 13:15:42 +0000
Subject: [PATCH 09/18] update pep8

---
 pythainlp/augment/lm/phayathaibert.py | 36 ++++++-----
 pythainlp/phayathaibert/__init__.py   | 13 ++--
 pythainlp/phayathaibert/core.py       | 91 ++++++++++++++-------------
 pythainlp/tag/named_entity.py         |  4 +-
 pythainlp/tag/pos_tag.py              | 13 ++--
 tests/test_augment.py                 |  4 +-
 6 files changed, 83 insertions(+), 78 deletions(-)

diff --git a/pythainlp/augment/lm/phayathaibert.py b/pythainlp/augment/lm/phayathaibert.py
index 6197e5116..375a9e9d2 100644
--- a/pythainlp/augment/lm/phayathaibert.py
+++ b/pythainlp/augment/lm/phayathaibert.py
@@ -9,10 +9,11 @@
 
 _model_name = "clicknext/phayathaibert"
 
+
 class ThaiTextAugmenter:
-    def __init__(self,)->None:
-        from transformers import (AutoTokenizer, 
-                                  AutoModelForMaskedLM, 
+    def __init__(self,) -> None:
+        from transformers import (AutoTokenizer,
+                                  AutoModelForMaskedLM,
                                   pipeline,)
         self.tokenizer = AutoTokenizer.from_pretrained(_model_name)
         self.model_for_masked_lm = AutoModelForMaskedLM.from_pretrained(_model_name)
@@ -20,11 +21,11 @@ def __init__(self,)->None:
         self.processor = ThaiTextProcessor()
 
     def generate(self,
-                 sample_text: str, 
-                 word_rank: int, 
-                 max_length: int=3,
-                 sample: bool=False
-                 )->str:
+                 sample_text: str,
+                 word_rank: int,
+                 max_length: int = 3,
+                 sample: bool = False
+                 ) -> str:
         sample_txt = sample_text
         final_text = ""
         for j in range(max_length):
@@ -34,23 +35,24 @@ def generate(self,
                 output = self.model(input)[random_word_idx]['sequence']
             else:
                 output = self.model(input)[word_rank]['sequence']
-            sample_txt = output+"<mask>"
+            sample_txt = output + "<mask>"
             final_text = sample_txt
-
-        gen_txt = re.sub("<mask>","",final_text)
+        gen_txt = re.sub("<mask>", "", final_text)
         return gen_txt
-    
+
 
     def augment(self,
-                text: str, 
-                num_augs: int=3, 
-                sample: bool=False)->List[str]:
+                text: str,
+                num_augs: int = 3,
+                sample: bool = False
+                )->List[str]:
         """
         Text Augment from phayathaibert
 
         :param str text: thai text
         :param int num_augs: an amount of augmentation text needed as an output
-        :param bool sample: whether to sample the text as an output or not, true if more word diversity is needed
+        :param bool sample: whether to sample the text as an output or not, \
+                            true if more word diversity is needed
 
         :return: list of text augment
         :rtype: List[str]
@@ -72,7 +74,7 @@ def augment(self,
         """
         augment_list = []
         if "<mask>" not in text:
-            text = text+"<mask>" 
+            text = text + "<mask>" 
         if num_augs <= 5:
             for rank in range(num_augs):
                 gen_text = self.generate(text, rank, sample=sample)
diff --git a/pythainlp/phayathaibert/__init__.py b/pythainlp/phayathaibert/__init__.py
index 232399079..352f503b0 100644
--- a/pythainlp/phayathaibert/__init__.py
+++ b/pythainlp/phayathaibert/__init__.py
@@ -8,10 +8,9 @@
     "segment",
 ]
 
-from pythainlp.phayathaibert.core import (
-    ThaiTextProcessor, 
-    ThaiTextAugmenter, 
-    NamedEntityTagger,
-    PartOfSpeechTagger, 
-    segment,
-)
\ No newline at end of file
+from pythainlp.phayathaibert.core import (ThaiTextProcessor,
+                                          ThaiTextAugmenter,
+                                          NamedEntityTagger,
+                                          PartOfSpeechTagger,
+                                          segment,
+)
diff --git a/pythainlp/phayathaibert/core.py b/pythainlp/phayathaibert/core.py
index 1cce1b203..ac8bab523 100644
--- a/pythainlp/phayathaibert/core.py
+++ b/pythainlp/phayathaibert/core.py
@@ -14,9 +14,11 @@
 _model_name = "clicknext/phayathaibert"
 _tokenizer = CamembertTokenizer.from_pretrained(_model_name)
 
+
 class ThaiTextProcessor:
     def __init__(self):
-        self._TK_UNK, self._TK_REP, self._TK_WREP, self._TK_URL, self._TK_END = "<unk> <rep> <wrep> <url> </s>".split()
+        self._TK_UNK, self._TK_REP, self._TK_WREP, self._TK_URL, self._TK_END = \
+            "<unk> <rep> <wrep> <url> </s>".split()
         self.SPACE_SPECIAL_TOKEN = "<_>"
 
 
@@ -30,7 +32,8 @@ def replace_url(self, text: str) -> str:
                 >>> replace_url("go to https://github.com")
                 go to <url>
         """
-        URL_PATTERN = r"(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?"
+        URL_PATTERN = \
+            r"(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?"
         return re.sub(URL_PATTERN, self._TK_URL, text)
 
     def rm_brackets(text: str) -> str:
@@ -181,22 +184,21 @@ def preprocess(
         return "".join(toks)
 
 
-
 class ThaiTextAugmenter:
-    def __init__(self)->None:
-        from transformers import (AutoTokenizer, 
-                                  AutoModelForMaskedLM, 
+    def __init__(self) -> None:
+        from transformers import (AutoTokenizer,
+                                  AutoModelForMaskedLM,
                                   pipeline,)
         self.tokenizer = AutoTokenizer.from_pretrained(_model_name)
         self.model_for_masked_lm = AutoModelForMaskedLM.from_pretrained(_model_name)
-        self.model = pipeline("fill-mask", tokenizer=self.tokenizer, model=self.model_for_masked_lm)
+        self.model = pipeline("fill-mask", tokenizer = self.tokenizer, model = self.model_for_masked_lm)
         self.processor = ThaiTextProcessor()
 
     def generate(self,
                  sample_text: str, 
                  word_rank: int, 
-                 max_length: int=3,
-                 sample: bool=False
+                 max_length: int = 3,
+                 sample: bool = False,
                  )->str:
         sample_txt = sample_text
         final_text = ""
@@ -207,24 +209,25 @@ def generate(self,
                 output = self.model(input)[random_word_idx]['sequence']
             else:
                 output = self.model(input)[word_rank]['sequence']
-            sample_txt = output+"<mask>"
+            sample_txt = output + "<mask>"
             final_text = sample_txt
 
-        gen_txt = re.sub("<mask>","",final_text)
+        gen_txt = re.sub("<mask>", "", final_text)
         return gen_txt
     
 
     def augment(self,
                 text: str, 
-                num_augs: int=3, 
-                sample: bool=False
+                num_augs: int = 3, 
+                sample: bool = False,
                 )->List[str]:
         """
         Text Augment from phayathaibert
 
         :param str text: thai text
         :param int num_augs: an amount of augmentation text needed as an output
-        :param bool sample: whether to sample the text as an output or not, true if more word diversity is needed
+        :param bool sample: whether to sample the text as an output or not,\
+              true if more word diversity is needed
 
         :return: list of text augment
         :rtype: List[str]
@@ -247,25 +250,26 @@ def augment(self,
         augment_list = []
         if num_augs <= 5: 
             for rank in range(num_augs):
-                gen_text = self.generate(text, rank, sample=sample)
+                gen_text = self.generate(text, rank, sample = sample)
                 processed_text = re.sub("<_>", " ", self.processor.preprocess(gen_text))
                 augment_list.append(processed_text)
 
             return augment_list
 
 
-
 class PartOfSpeechTagger:
-    def __init__(self, model: str="lunarlist/pos_thai_phayathai") -> None:
+    def __init__(self, model: str = "lunarlist/pos_thai_phayathai") -> None:
         # Load model directly
-        from transformers import (
-            AutoTokenizer, 
-            AutoModelForTokenClassification,
-        )
+        from transformers import (AutoTokenizer,
+                                  AutoModelForTokenClassification,
+                                  )
         self.tokenizer = AutoTokenizer.from_pretrained(model)
         self.model = AutoModelForTokenClassification.from_pretrained(model)
 
-    def get_tag(self, sentence: str, strategy: str='simple')->List[List[Tuple[str, str]]]:
+    def get_tag(self,
+                sentence: str,
+                strategy: str = 'simple'
+                ) -> List[List[Tuple[str, str]]]:
         """
     Marks sentences with part-of-speech (POS) tags.
 
@@ -285,28 +289,26 @@ def get_tag(self, sentence: str, strategy: str='simple')->List[List[Tuple[str, s
         # [[('แมว', 'NOUN'), ('ทําอะไร', 'VERB'), ('ตอนห้าโมงเช้า', 'NOUN')]]
         """
         from transformers import TokenClassificationPipeline
-        pipeline = TokenClassificationPipeline(
-            model=self.model, 
-            tokenizer=self.tokenizer, 
-            aggregation_strategy=strategy,
-        )
+        pipeline = TokenClassificationPipeline(model=self.model,
+                                               tokenizer=self.tokenizer,
+                                               aggregation_strategy=strategy,
+                                               )
         outputs = pipeline(sentence)
         word_tags = [[(tag['word'], tag['entity_group']) for tag in outputs]]
         return word_tags
     
 class NamedEntityTagger:
-     def __init__(self, model: str="Pavarissy/phayathaibert-thainer") -> None:
-        from transformers import (
-            AutoTokenizer, 
-            AutoModelForTokenClassification,
-        )
+     def __init__(self, model: str = "Pavarissy/phayathaibert-thainer") -> None:
+        from transformers import (AutoTokenizer,
+                                  AutoModelForTokenClassification,
+                                  )
         self.tokenizer = AutoTokenizer.from_pretrained(model)
         self.model = AutoModelForTokenClassification.from_pretrained(model)
      def get_ner(self,
-                text: str,
-                tag: bool=False, 
-                pos: bool=False,
-                strategy: str="simple"
+                 text: str,
+                 tag: bool = False,
+                 pos: bool = False,
+                 strategy: str = "simple",
                 )->Union[List[Tuple[str, str]], List[Tuple[str, str, str]], str]:
         """
         This function tags named entities in text in IOB format.
@@ -339,19 +341,20 @@ def get_ner(self,
         sample_output = []
         tag_text_list = []
         current_pos = 0
-        pipeline = TokenClassificationPipeline(
-            model=self.model, 
-            tokenizer=self.tokenizer, 
-            aggregation_strategy=strategy,
-        )
+        pipeline = TokenClassificationPipeline(model = self.model,
+                                               tokenizer = self.tokenizer,
+                                               aggregation_strategy = strategy,
+                                               )
         outputs = pipeline(text)
         for token in outputs:
             ner_tag = token['entity_group']
             begin_pos, end_pos = token['start'], token['end']
             if current_pos == 0:
-                text_tag = text[:begin_pos]+f"<{ner_tag}>"+text[begin_pos:end_pos]+f"</{ner_tag}>"
+                text_tag = text[:begin_pos] + f"<{ner_tag}>" \
+                    + text[begin_pos:end_pos] + f"</{ner_tag}>"
             else:
-                text_tag = text[current_pos:begin_pos]+f"<{ner_tag}>"+text[begin_pos:end_pos]+f"</{ner_tag}>"
+                text_tag = text[current_pos:begin_pos] + f"<{ner_tag}>" \
+                    + text[begin_pos:end_pos] + f"</{ner_tag}>"
             tag_text_list.append(text_tag)
             sample_output.append((token['word'], token['entity_group']))
             current_pos = end_pos
@@ -361,7 +364,7 @@ def get_ner(self,
             return sample_output
 
     
-def segment(sentence: str)->List[str]:
+def segment(sentence: str) -> List[str]:
     """
     Subword tokenize of phayathaibert, sentencepiece from wangchanberta model with Vocabulary Expansion.
 
diff --git a/pythainlp/tag/named_entity.py b/pythainlp/tag/named_entity.py
index 0ab087cb6..888edda74 100644
--- a/pythainlp/tag/named_entity.py
+++ b/pythainlp/tag/named_entity.py
@@ -46,7 +46,7 @@ def load_engine(self, engine: str, corpus: str) -> None:
             from pythainlp.wangchanberta import ThaiNameTagger
 
             self.engine = ThaiNameTagger(dataset_name=corpus)
-        elif engine=="phayathaibert" and corpus == "thainer-v2":
+        elif engine == "phayathaibert" and corpus == "thainer-v2":
             from pythainlp.phayathaibert.core import NamedEntityTagger
 
             self.engine = NamedEntityTagger()
@@ -58,7 +58,7 @@ def load_engine(self, engine: str, corpus: str) -> None:
             )
 
     def tag(
-        self, text, pos=False, tag=False
+        self, text, pos = False, tag = False
     ) -> Union[List[Tuple[str, str]], List[Tuple[str, str, str]], str]:
         """
         This function tags named entities in text in IOB format.
diff --git a/pythainlp/tag/pos_tag.py b/pythainlp/tag/pos_tag.py
index 92f437262..0c9092808 100644
--- a/pythainlp/tag/pos_tag.py
+++ b/pythainlp/tag/pos_tag.py
@@ -180,7 +180,8 @@ def pos_tag_transformers(
     :param str engine:
         * *bert* -  BERT: Bidirectional Encoder Representations from Transformers (default)
         * *wangchanberta* - fine-tuned version of airesearch/wangchanberta-base-att-spm-uncased on pud corpus (support PUD cotpus only)
-        * *phayathaibert* - fine-tuned version of clicknext/phayathaibert on blackboard corpus (support blackboard cotpus only)
+        * *phayathaibert* - fine-tuned version of clicknext/phayathaibert \
+            on blackboard corpus (support blackboard cotpus only)
         * *mdeberta* - mDeBERTa: Multilingual Decoding-enhanced BERT with disentangled attention (support PUD corpus only)
     :param str corpus: the corpus that is used to create the language model for tagger
         * *blackboard* - `blackboard treebank (support bert engine only) <https://bitbucket.org/kaamanita/blackboard-treebank/src/master/>`_
@@ -213,13 +214,13 @@ def pos_tag_transformers(
         return []
 
     _blackboard_support_engine = {
-        "bert" : "lunarlist/pos_thai",
-        "phayathai" : "lunarlist/pos_thai_phayathai",
+        "bert": "lunarlist/pos_thai",
+        "phayathai": "lunarlist/pos_thai_phayathai",
     }
 
     _pud_support_engine = {
-        "wangchanberta" : "Pavarissy/wangchanberta-ud-thai-pud-upos",
-        "mdeberta" : "Pavarissy/mdeberta-v3-ud-thai-pud-upos",
+        "wangchanberta": "Pavarissy/wangchanberta-ud-thai-pud-upos",
+        "mdeberta": "Pavarissy/mdeberta-v3-ud-thai-pud-upos",
     }
 
     if corpus == 'blackboard' and engine in _blackboard_support_engine.keys():
@@ -237,7 +238,7 @@ def pos_tag_transformers(
             )
         )
 
-    pipeline = TokenClassificationPipeline(model=model, tokenizer=tokenizer, aggregation_strategy="simple")
+    pipeline = TokenClassificationPipeline(model = model, tokenizer = tokenizer, aggregation_strategy = "simple")
 
     outputs = pipeline(sentence)
     word_tags = [[(tag['word'], tag['entity_group']) for tag in outputs]]
diff --git a/tests/test_augment.py b/tests/test_augment.py
index 5542d96c3..2721b056b 100644
--- a/tests/test_augment.py
+++ b/tests/test_augment.py
@@ -45,6 +45,6 @@ def test_LTW2VAug(self):
     #     _aug = Thai2transformersAug()
     #     self.assertIsNotNone(_aug.augment(self.text2, num_replace_tokens=1))
 
-     # def test_ThaiTextAugmenter(self):
+    # def test_ThaiTextAugmenter(self):
     #     _aug = ThaiTextAugmenter()
-    #     self.assertIsNotNone(_aug.augment(self.text2, num__augs=3))
\ No newline at end of file
+    #     self.assertIsNotNone(_aug.augment(self.text2, num__augs=3))

From 76b49c319121d0867a280b2ca0c5c907a6b8d65d Mon Sep 17 00:00:00 2001
From: Pavarissy <pavaris.ruang@protonmail.com>
Date: Sun, 10 Dec 2023 13:41:00 +0000
Subject: [PATCH 10/18] update pep8

---
 pythainlp/augment/lm/phayathaibert.py |  4 +-
 pythainlp/phayathaibert/__init__.py   |  2 +-
 pythainlp/phayathaibert/core.py       | 57 ++++++++++++++++-----------
 pythainlp/tag/named_entity.py         |  7 ++--
 pythainlp/tag/pos_tag.py              |  8 ++--
 5 files changed, 43 insertions(+), 35 deletions(-)

diff --git a/pythainlp/augment/lm/phayathaibert.py b/pythainlp/augment/lm/phayathaibert.py
index 375a9e9d2..e9ea4b7df 100644
--- a/pythainlp/augment/lm/phayathaibert.py
+++ b/pythainlp/augment/lm/phayathaibert.py
@@ -40,12 +40,11 @@ def generate(self,
         gen_txt = re.sub("<mask>", "", final_text)
         return gen_txt
 
-
     def augment(self,
                 text: str,
                 num_augs: int = 3,
                 sample: bool = False
-                )->List[str]:
+                ) -> List[str]:
         """
         Text Augment from phayathaibert
 
@@ -86,4 +85,3 @@ def augment(self,
             raise ValueError(
                 f"augmentation of more than {num_augs} is exceeded the default limit"
             )
-
diff --git a/pythainlp/phayathaibert/__init__.py b/pythainlp/phayathaibert/__init__.py
index 352f503b0..405ce1843 100644
--- a/pythainlp/phayathaibert/__init__.py
+++ b/pythainlp/phayathaibert/__init__.py
@@ -13,4 +13,4 @@
                                           NamedEntityTagger,
                                           PartOfSpeechTagger,
                                           segment,
-)
+                                          )
diff --git a/pythainlp/phayathaibert/core.py b/pythainlp/phayathaibert/core.py
index ac8bab523..926046076 100644
--- a/pythainlp/phayathaibert/core.py
+++ b/pythainlp/phayathaibert/core.py
@@ -21,7 +21,6 @@ def __init__(self):
             "<unk> <rep> <wrep> <url> </s>".split()
         self.SPACE_SPECIAL_TOKEN = "<_>"
 
-
     def replace_url(self, text: str) -> str:
         """
             Replace url in `text` with TK_URL (https://stackoverflow.com/a/6041965)
@@ -191,15 +190,18 @@ def __init__(self) -> None:
                                   pipeline,)
         self.tokenizer = AutoTokenizer.from_pretrained(_model_name)
         self.model_for_masked_lm = AutoModelForMaskedLM.from_pretrained(_model_name)
-        self.model = pipeline("fill-mask", tokenizer = self.tokenizer, model = self.model_for_masked_lm)
+        self.model = pipeline("fill-mask",
+                              tokenizer = self.tokenizer,
+                              model = self.model_for_masked_lm,
+                              )
         self.processor = ThaiTextProcessor()
 
     def generate(self,
-                 sample_text: str, 
-                 word_rank: int, 
+                 sample_text: str,
+                 word_rank: int,
                  max_length: int = 3,
                  sample: bool = False,
-                 )->str:
+                 ) -> str:
         sample_txt = sample_text
         final_text = ""
         for j in range(max_length):
@@ -214,13 +216,12 @@ def generate(self,
 
         gen_txt = re.sub("<mask>", "", final_text)
         return gen_txt
-    
 
     def augment(self,
-                text: str, 
-                num_augs: int = 3, 
+                text: str,
+                num_augs: int = 3,
                 sample: bool = False,
-                )->List[str]:
+                ) -> List[str]:
         """
         Text Augment from phayathaibert
 
@@ -248,9 +249,12 @@ def augment(self,
                 'ช้างมีทั้งหมด 50 ตัว บนเขาค่ะ😁']
         """
         augment_list = []
-        if num_augs <= 5: 
+        if num_augs <= 5:
             for rank in range(num_augs):
-                gen_text = self.generate(text, rank, sample = sample)
+                gen_text = self.generate(text,
+                                         rank,
+                                         sample = sample,
+                                         )
                 processed_text = re.sub("<_>", " ", self.processor.preprocess(gen_text))
                 augment_list.append(processed_text)
 
@@ -296,20 +300,22 @@ def get_tag(self,
         outputs = pipeline(sentence)
         word_tags = [[(tag['word'], tag['entity_group']) for tag in outputs]]
         return word_tags
-    
+
+
 class NamedEntityTagger:
-     def __init__(self, model: str = "Pavarissy/phayathaibert-thainer") -> None:
+    def __init__(self, model: str = "Pavarissy/phayathaibert-thainer") -> None:
         from transformers import (AutoTokenizer,
                                   AutoModelForTokenClassification,
                                   )
         self.tokenizer = AutoTokenizer.from_pretrained(model)
         self.model = AutoModelForTokenClassification.from_pretrained(model)
-     def get_ner(self,
-                 text: str,
-                 tag: bool = False,
-                 pos: bool = False,
-                 strategy: str = "simple",
-                )->Union[List[Tuple[str, str]], List[Tuple[str, str, str]], str]:
+
+    def get_ner(self,
+                text: str,
+                tag: bool = False,
+                pos: bool = False,
+                strategy: str = "simple",
+                ) -> Union[List[Tuple[str, str]], List[Tuple[str, str, str]], str]:
         """
         This function tags named entities in text in IOB format.
 
@@ -333,11 +339,13 @@ def get_ner(self,
             ('จาก', 'LOCATION'),
             ('ประเทศไทย', 'LOCATION')]
             >>> ner.tag("ทดสอบนายปวริศ เรืองจุติโพธิ์พานจากประเทศไทย", tag=True)
-            'ทดสอบ<PERSON>นายปวริศ เรืองจุติโพธิ์พาน</PERSON><LOCATION>จาก</LOCATION><LOCATION>ประเทศไทย</LOCATION>'
+            'ทดสอบ<PERSON>นายปวริศ เรืองจุติโพธิ์พาน</PERSON>\
+                <LOCATION>จาก</LOCATION><LOCATION>ประเทศไทย</LOCATION>'
         """
         from transformers import TokenClassificationPipeline
         if pos:
-            warnings.warn("This model doesn't support output postag and It doesn't output the postag.")
+            warnings.warn("This model doesn't support output \
+                          postag and It doesn't output the postag.")
         sample_output = []
         tag_text_list = []
         current_pos = 0
@@ -363,10 +371,11 @@ def get_ner(self,
         else:
             return sample_output
 
-    
+
 def segment(sentence: str) -> List[str]:
     """
-    Subword tokenize of phayathaibert, sentencepiece from wangchanberta model with Vocabulary Expansion.
+    Subword tokenize of phayathaibert, \
+    sentencepiece from wangchanberta model with Vocabulary Expansion.
 
     :param str text: text to be tokenized
     :return: list of subwords
@@ -375,4 +384,4 @@ def segment(sentence: str) -> List[str]:
     if not sentence or not isinstance(sentence, str):
         return []
 
-    return _tokenizer.tokenize(sentence)
\ No newline at end of file
+    return _tokenizer.tokenize(sentence)
diff --git a/pythainlp/tag/named_entity.py b/pythainlp/tag/named_entity.py
index 888edda74..9b18ff5d0 100644
--- a/pythainlp/tag/named_entity.py
+++ b/pythainlp/tag/named_entity.py
@@ -57,9 +57,10 @@ def load_engine(self, engine: str, corpus: str) -> None:
                 )
             )
 
-    def tag(
-        self, text, pos = False, tag = False
-    ) -> Union[List[Tuple[str, str]], List[Tuple[str, str, str]], str]:
+    def tag(self,
+            text, pos = False,
+            tag = False
+            ) -> Union[List[Tuple[str, str]], List[Tuple[str, str, str]], str]:
         """
         This function tags named entities in text in IOB format.
 
diff --git a/pythainlp/tag/pos_tag.py b/pythainlp/tag/pos_tag.py
index e5d773d88..ea45297d7 100644
--- a/pythainlp/tag/pos_tag.py
+++ b/pythainlp/tag/pos_tag.py
@@ -241,10 +241,10 @@ def pos_tag_transformers(
             )
         )
 
-
-    pipeline = TokenClassificationPipeline(
-        model = model, tokenizer = tokenizer, aggregation_strategy = "simple"
-    )
+    pipeline = TokenClassificationPipeline(model = model,
+                                           tokenizer = tokenizer,
+                                           aggregation_strategy = "simple",
+                                           )
 
     outputs = pipeline(sentence)
     word_tags = [[(tag["word"], tag["entity_group"]) for tag in outputs]]

From 22daf2df5ccd9b9c0b946655ece210908d67ed2a Mon Sep 17 00:00:00 2001
From: Pavarissy <pavaris.ruang@protonmail.com>
Date: Sun, 10 Dec 2023 13:46:22 +0000
Subject: [PATCH 11/18] update pep8

---
 pythainlp/augment/lm/phayathaibert.py |  2 +-
 pythainlp/phayathaibert/core.py       | 12 ++++++------
 pythainlp/tag/named_entity.py         |  5 +++--
 pythainlp/tag/pos_tag.py              |  6 +++---
 4 files changed, 13 insertions(+), 12 deletions(-)

diff --git a/pythainlp/augment/lm/phayathaibert.py b/pythainlp/augment/lm/phayathaibert.py
index e9ea4b7df..439b9ca98 100644
--- a/pythainlp/augment/lm/phayathaibert.py
+++ b/pythainlp/augment/lm/phayathaibert.py
@@ -73,7 +73,7 @@ def augment(self,
         """
         augment_list = []
         if "<mask>" not in text:
-            text = text + "<mask>" 
+            text = text + "<mask>"
         if num_augs <= 5:
             for rank in range(num_augs):
                 gen_text = self.generate(text, rank, sample=sample)
diff --git a/pythainlp/phayathaibert/core.py b/pythainlp/phayathaibert/core.py
index 926046076..3569a40cf 100644
--- a/pythainlp/phayathaibert/core.py
+++ b/pythainlp/phayathaibert/core.py
@@ -191,8 +191,8 @@ def __init__(self) -> None:
         self.tokenizer = AutoTokenizer.from_pretrained(_model_name)
         self.model_for_masked_lm = AutoModelForMaskedLM.from_pretrained(_model_name)
         self.model = pipeline("fill-mask",
-                              tokenizer = self.tokenizer,
-                              model = self.model_for_masked_lm,
+                              tokenizer=self.tokenizer,
+                              model=self.model_for_masked_lm,
                               )
         self.processor = ThaiTextProcessor()
 
@@ -253,7 +253,7 @@ def augment(self,
             for rank in range(num_augs):
                 gen_text = self.generate(text,
                                          rank,
-                                         sample = sample,
+                                         sample=sample,
                                          )
                 processed_text = re.sub("<_>", " ", self.processor.preprocess(gen_text))
                 augment_list.append(processed_text)
@@ -349,9 +349,9 @@ def get_ner(self,
         sample_output = []
         tag_text_list = []
         current_pos = 0
-        pipeline = TokenClassificationPipeline(model = self.model,
-                                               tokenizer = self.tokenizer,
-                                               aggregation_strategy = strategy,
+        pipeline = TokenClassificationPipeline(model=self.model,
+                                               tokenizer=self.tokenizer,
+                                               aggregation_strategy=strategy,
                                                )
         outputs = pipeline(text)
         for token in outputs:
diff --git a/pythainlp/tag/named_entity.py b/pythainlp/tag/named_entity.py
index 9b18ff5d0..88b290f29 100644
--- a/pythainlp/tag/named_entity.py
+++ b/pythainlp/tag/named_entity.py
@@ -58,8 +58,9 @@ def load_engine(self, engine: str, corpus: str) -> None:
             )
 
     def tag(self,
-            text, pos = False,
-            tag = False
+            text,
+            pos=False,
+            tag=False
             ) -> Union[List[Tuple[str, str]], List[Tuple[str, str, str]], str]:
         """
         This function tags named entities in text in IOB format.
diff --git a/pythainlp/tag/pos_tag.py b/pythainlp/tag/pos_tag.py
index ea45297d7..a195f8b69 100644
--- a/pythainlp/tag/pos_tag.py
+++ b/pythainlp/tag/pos_tag.py
@@ -241,9 +241,9 @@ def pos_tag_transformers(
             )
         )
 
-    pipeline = TokenClassificationPipeline(model = model,
-                                           tokenizer = tokenizer,
-                                           aggregation_strategy = "simple",
+    pipeline = TokenClassificationPipeline(model=model,
+                                           tokenizer=tokenizer,
+                                           aggregation_strategy="simple",
                                            )
 
     outputs = pipeline(sentence)

From 84de5c442665ff641ae10a5ee5b6df38b0ff8d52 Mon Sep 17 00:00:00 2001
From: Arthit Suriyawongkul <arthit@gmail.com>
Date: Sun, 10 Dec 2023 17:24:23 +0000
Subject: [PATCH 12/18] Update core.py: sort imports, remove duplicated lines

---
 pythainlp/phayathaibert/core.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/pythainlp/phayathaibert/core.py b/pythainlp/phayathaibert/core.py
index 3569a40cf..d1977ee4a 100644
--- a/pythainlp/phayathaibert/core.py
+++ b/pythainlp/phayathaibert/core.py
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 # SPDX-FileCopyrightText: Copyright 2016-2023 PyThaiNLP Project
 # SPDX-License-Identifier: Apache-2.0
-from typing import List, Union, Tuple, Collection, Callable
+from typing import Callable, Collection, List, Tuple, Union
 import re
 import random
 import warnings
@@ -238,7 +238,6 @@ def augment(self,
 
             from pythainlp.augment.lm import ThaiTextAugmenter
 
-            aug=ThaiTextAugmenter()
             aug = ThaiTextAugmenter()
             aug.augment("ช้างมีทั้งหมด 50 ตัว บน", num_args=5)
 

From a2fd4d3732f96ff7d1b78d829b775ded92c8d703 Mon Sep 17 00:00:00 2001
From: Arthit Suriyawongkul <arthit@gmail.com>
Date: Sun, 10 Dec 2023 17:26:07 +0000
Subject: [PATCH 13/18] Update phayathaibert.py: sort imports, remove
 duplicated lines

---
 pythainlp/augment/lm/phayathaibert.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/pythainlp/augment/lm/phayathaibert.py b/pythainlp/augment/lm/phayathaibert.py
index 439b9ca98..370c9dac1 100644
--- a/pythainlp/augment/lm/phayathaibert.py
+++ b/pythainlp/augment/lm/phayathaibert.py
@@ -2,8 +2,8 @@
 # SPDX-FileCopyrightText: Copyright 2016-2023 PyThaiNLP Project
 # SPDX-License-Identifier: Apache-2.0
 from typing import List
-import re
 import random
+import re
 from pythainlp.phayathaibert.core import ThaiTextProcessor
 
 
@@ -32,9 +32,9 @@ def generate(self,
             input = self.processor.preprocess(sample_txt)
             if sample:
                 random_word_idx = random.randint(0, 4)
-                output = self.model(input)[random_word_idx]['sequence']
+                output = self.model(input)[random_word_idx]["sequence"]
             else:
-                output = self.model(input)[word_rank]['sequence']
+                output = self.model(input)[word_rank]["sequence"]
             sample_txt = output + "<mask>"
             final_text = sample_txt
         gen_txt = re.sub("<mask>", "", final_text)
@@ -48,7 +48,7 @@ def augment(self,
         """
         Text Augment from phayathaibert
 
-        :param str text: thai text
+        :param str text: Thai text
         :param int num_augs: an amount of augmentation text needed as an output
         :param bool sample: whether to sample the text as an output or not, \
                             true if more word diversity is needed
@@ -61,7 +61,6 @@ def augment(self,
 
             from pythainlp.augment.lm import ThaiTextAugmenter
 
-            aug=ThaiTextAugmenter()
             aug = ThaiTextAugmenter()
             aug.augment("ช้างมีทั้งหมด 50 ตัว บน", num_args=5)
 

From 7e24d3fd278001de69665a9a774c2399a993331c Mon Sep 17 00:00:00 2001
From: Arthit Suriyawongkul <arthit@gmail.com>
Date: Sun, 10 Dec 2023 17:28:28 +0000
Subject: [PATCH 14/18] Reexport NamedEntityTagger

---
 pythainlp/phayathaibert/__init__.py | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/pythainlp/phayathaibert/__init__.py b/pythainlp/phayathaibert/__init__.py
index 405ce1843..bf0f847bf 100644
--- a/pythainlp/phayathaibert/__init__.py
+++ b/pythainlp/phayathaibert/__init__.py
@@ -2,15 +2,17 @@
 # SPDX-FileCopyrightText: Copyright 2016-2023 PyThaiNLP Project
 # SPDX-License-Identifier: Apache-2.0
 __all__ = [
-    "ThaiTextProcessor",
-    "ThaiTextAugmenter",
+    "NamedEntityTagger",
     "PartOfSpeechTagger",
+    "ThaiTextAugmenter",
+    "ThaiTextProcessor",
     "segment",
 ]
 
-from pythainlp.phayathaibert.core import (ThaiTextProcessor,
-                                          ThaiTextAugmenter,
-                                          NamedEntityTagger,
-                                          PartOfSpeechTagger,
-                                          segment,
-                                          )
+from pythainlp.phayathaibert.core import (
+    NamedEntityTagger,
+    PartOfSpeechTagger,
+    ThaiTextAugmenter,
+    ThaiTextProcessor,
+    segment,
+)

From 826cfed0898838ca61a7c61d6665e68ce8787c43 Mon Sep 17 00:00:00 2001
From: Arthit Suriyawongkul <arthit@gmail.com>
Date: Sun, 10 Dec 2023 17:33:22 +0000
Subject: [PATCH 15/18] Fix minor typos

---
 pythainlp/phayathaibert/core.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/pythainlp/phayathaibert/core.py b/pythainlp/phayathaibert/core.py
index d1977ee4a..579d24ddc 100644
--- a/pythainlp/phayathaibert/core.py
+++ b/pythainlp/phayathaibert/core.py
@@ -2,9 +2,10 @@
 # SPDX-FileCopyrightText: Copyright 2016-2023 PyThaiNLP Project
 # SPDX-License-Identifier: Apache-2.0
 from typing import Callable, Collection, List, Tuple, Union
-import re
 import random
+import re
 import warnings
+
 from pythainlp.tokenize import word_tokenize
 from transformers import (
     CamembertTokenizer,
@@ -49,7 +50,7 @@ def rm_brackets(text: str) -> str:
         new_line = re.sub(r"\(\)", "", text)
         new_line = re.sub(r"\{\}", "", new_line)
         new_line = re.sub(r"\[\]", "", new_line)
-        # brakets with only punctuations
+        # brackets with only punctuations
         new_line = re.sub(r"\([^a-zA-Z0-9ก-๙]+\)", "", new_line)
         new_line = re.sub(r"\{[^a-zA-Z0-9ก-๙]+\}", "", new_line)
         new_line = re.sub(r"\[[^a-zA-Z0-9ก-๙]+\]", "", new_line)
@@ -225,7 +226,7 @@ def augment(self,
         """
         Text Augment from phayathaibert
 
-        :param str text: thai text
+        :param str text: Thai text
         :param int num_augs: an amount of augmentation text needed as an output
         :param bool sample: whether to sample the text as an output or not,\
               true if more word diversity is needed

From 72e2bd557c2b43557143dc9734356644070023e8 Mon Sep 17 00:00:00 2001
From: Arthit Suriyawongkul <arthit@gmail.com>
Date: Sun, 10 Dec 2023 17:34:35 +0000
Subject: [PATCH 16/18] Update __init__.py

---
 pythainlp/augment/lm/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pythainlp/augment/lm/__init__.py b/pythainlp/augment/lm/__init__.py
index febe833f5..f0265b136 100644
--- a/pythainlp/augment/lm/__init__.py
+++ b/pythainlp/augment/lm/__init__.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright 2016-2023 PyThaiNLP Project
 # SPDX-License-Identifier: Apache-2.0
 """
-LM
+Language Models
 """
 
 __all__ = [

From dec62c1cf0d2dd4f129a5850116edc3feb9c3989 Mon Sep 17 00:00:00 2001
From: Arthit Suriyawongkul <arthit@gmail.com>
Date: Mon, 11 Dec 2023 03:30:19 +0000
Subject: [PATCH 17/18] Use MAX_NUM_AUGS constant for max num_augs limit

---
 pythainlp/augment/lm/phayathaibert.py | 20 ++++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/pythainlp/augment/lm/phayathaibert.py b/pythainlp/augment/lm/phayathaibert.py
index 370c9dac1..90e067523 100644
--- a/pythainlp/augment/lm/phayathaibert.py
+++ b/pythainlp/augment/lm/phayathaibert.py
@@ -1,9 +1,11 @@
 # -*- coding: utf-8 -*-
 # SPDX-FileCopyrightText: Copyright 2016-2023 PyThaiNLP Project
 # SPDX-License-Identifier: Apache-2.0
+
 from typing import List
 import random
 import re
+
 from pythainlp.phayathaibert.core import ThaiTextProcessor
 
 
@@ -28,6 +30,7 @@ def generate(self,
                  ) -> str:
         sample_txt = sample_text
         final_text = ""
+
         for j in range(max_length):
             input = self.processor.preprocess(sample_txt)
             if sample:
@@ -37,7 +40,9 @@ def generate(self,
                 output = self.model(input)[word_rank]["sequence"]
             sample_txt = output + "<mask>"
             final_text = sample_txt
+
         gen_txt = re.sub("<mask>", "", final_text)
+
         return gen_txt
 
     def augment(self,
@@ -46,7 +51,7 @@ def augment(self,
                 sample: bool = False
                 ) -> List[str]:
         """
-        Text Augment from phayathaibert
+        Text augmentation from PhayaThaiBERT
 
         :param str text: Thai text
         :param int num_augs: an amount of augmentation text needed as an output
@@ -70,17 +75,20 @@ def augment(self,
                 'ช้างมีทั้งหมด 50 ตัว บนดวงจันทร์.‼',
                 'ช้างมีทั้งหมด 50 ตัว บนเขาค่ะ😁']
         """
+        MAX_NUM_AUGS = 5
         augment_list = []
+
         if "<mask>" not in text:
             text = text + "<mask>"
-        if num_augs <= 5:
+
+        if num_augs <= MAX_NUM_AUGS:
             for rank in range(num_augs):
                 gen_text = self.generate(text, rank, sample=sample)
                 processed_text = re.sub("<_>", " ", self.processor.preprocess(gen_text))
                 augment_list.append(processed_text)
 
             return augment_list
-        else:
-            raise ValueError(
-                f"augmentation of more than {num_augs} is exceeded the default limit"
-            )
+        
+        raise ValueError(
+            f"augmentation of more than {num_augs} is exceeded the default limit: {MAX_NUM_AUGS}"
+        )

From e7ef6ceb0c6906f6ed09679c719f73effdf158ef Mon Sep 17 00:00:00 2001
From: Arthit Suriyawongkul <arthit@gmail.com>
Date: Mon, 11 Dec 2023 14:00:07 +0000
Subject: [PATCH 18/18] Update phayathaibert.py

Use UPPERCASE for constant
---
 pythainlp/augment/lm/phayathaibert.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/pythainlp/augment/lm/phayathaibert.py b/pythainlp/augment/lm/phayathaibert.py
index 90e067523..47b43c219 100644
--- a/pythainlp/augment/lm/phayathaibert.py
+++ b/pythainlp/augment/lm/phayathaibert.py
@@ -9,7 +9,7 @@
 from pythainlp.phayathaibert.core import ThaiTextProcessor
 
 
-_model_name = "clicknext/phayathaibert"
+_MODEL_NAME = "clicknext/phayathaibert"
 
 
 class ThaiTextAugmenter:
@@ -17,8 +17,8 @@ def __init__(self,) -> None:
         from transformers import (AutoTokenizer,
                                   AutoModelForMaskedLM,
                                   pipeline,)
-        self.tokenizer = AutoTokenizer.from_pretrained(_model_name)
-        self.model_for_masked_lm = AutoModelForMaskedLM.from_pretrained(_model_name)
+        self.tokenizer = AutoTokenizer.from_pretrained(_MODEL_NAME)
+        self.model_for_masked_lm = AutoModelForMaskedLM.from_pretrained(_MODEL_NAME)
         self.model = pipeline("fill-mask", tokenizer=self.tokenizer, model=self.model_for_masked_lm)
         self.processor = ThaiTextProcessor()
 
@@ -88,7 +88,7 @@ def augment(self,
                 augment_list.append(processed_text)
 
             return augment_list
-        
+
         raise ValueError(
             f"augmentation of more than {num_augs} is exceeded the default limit: {MAX_NUM_AUGS}"
         )