From 41d79c2c3892d6102ab82502459bd0a155015d49 Mon Sep 17 00:00:00 2001 From: Pavarissy Date: Fri, 1 Dec 2023 14:32:11 +0000 Subject: [PATCH 01/18] add phayathaibert core engine --- pythainlp/phayathaibert/__init__.py | 20 +++++++++++ pythainlp/phayathaibert/core.py | 52 +++++++++++++++++++++++++++++ pythainlp/tag/pos_tag.py | 1 + pythainlp/tokenize/core.py | 2 ++ tests/test_tokenize.py | 12 +++++++ 5 files changed, 87 insertions(+) create mode 100644 pythainlp/phayathaibert/__init__.py create mode 100644 pythainlp/phayathaibert/core.py diff --git a/pythainlp/phayathaibert/__init__.py b/pythainlp/phayathaibert/__init__.py new file mode 100644 index 000000000..426a1d3be --- /dev/null +++ b/pythainlp/phayathaibert/__init__.py @@ -0,0 +1,20 @@ +# -*- coding: utf-8 -*- +# Copyright (C) 2016-2023 PyThaiNLP Project +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +__all__ = [ + "PartOfSpeechTagger", + "segment", +] + +from pythainlp.phayathaibert.core import PartOfSpeechTagger, segment \ No newline at end of file diff --git a/pythainlp/phayathaibert/core.py b/pythainlp/phayathaibert/core.py new file mode 100644 index 000000000..983d4a203 --- /dev/null +++ b/pythainlp/phayathaibert/core.py @@ -0,0 +1,52 @@ +# -*- coding: utf-8 -*- +# Copyright (C) 2016-2023 PyThaiNLP Project +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import List, Tuple, Union +import re +import warnings +from transformers import ( + CamembertTokenizer, +) + + +_model_name = "clicknext/phayathaibert" +_tokenizer = CamembertTokenizer.from_pretrained(_model_name) + + +class PartOfSpeechTagger: + def __init__(self, model: str="lunarlist/pos_thai_phayathai") -> None: + # Load model directly + from transformers import ( + AutoTokenizer, + AutoModelForTokenClassification, + ) + self.tokenizer = AutoTokenizer.from_pretrained(model) + self.model = AutoModelForTokenClassification.from_pretrained(model) + + def get_tag(self, sentence: str, strategy: str='simple')->List[List[Tuple[str, str]]]: + from transformers import TokenClassificationPipeline + pipeline = TokenClassificationPipeline( + model=self.model, + tokenizer=self.tokenizer, + aggregation_strategy=strategy, + ) + outputs = pipeline(sentence) + word_tags = [[(tag['word'], tag['entity_group']) for tag in outputs]] + return word_tags + +def segment(sentence: str)->List[str]: + if not sentence or not isinstance(sentence, str): + return [] + + return _tokenizer.tokenize(sentence) \ No newline at end of file diff --git a/pythainlp/tag/pos_tag.py b/pythainlp/tag/pos_tag.py index ee2a2b478..eb6ab877b 100644 --- a/pythainlp/tag/pos_tag.py +++ b/pythainlp/tag/pos_tag.py @@ -224,6 +224,7 @@ def pos_tag_transformers( _blackboard_support_engine = { "bert" : "lunarlist/pos_thai", + "phayathai" : "lunarlist/pos_thai_phayathai", } _pud_support_engine = { diff --git a/pythainlp/tokenize/core.py b/pythainlp/tokenize/core.py index a19082981..234bccfe5 100644 --- a/pythainlp/tokenize/core.py +++ b/pythainlp/tokenize/core.py @@ -616,6 +616,8 @@ def subword_tokenize( from pythainlp.tokenize.tltk import syllable_tokenize as segment elif engine == "han_solo": from pythainlp.tokenize.han_solo import segment + elif engine == "phayathai": + from pythainlp.phayathaibert import segment else: raise ValueError( f"""Tokenizer \"{engine}\" not found. diff --git a/tests/test_tokenize.py b/tests/test_tokenize.py index 1537b62c9..1f1ed4ac5 100644 --- a/tests/test_tokenize.py +++ b/tests/test_tokenize.py @@ -431,6 +431,18 @@ def test_subword_tokenize(self): "า" in subword_tokenize("สวัสดีดาวอังคาร", engine="tltk") ) self.assertIsInstance(subword_tokenize("โควิด19", engine="tltk"), list) + + self.assertEqual(subword_tokenize(None, engine="phayathai"), []) + self.assertEqual(subword_tokenize("", engine="phayathai"), []) + self.assertIsInstance( + subword_tokenize("สวัสดิีดาวอังคาร", engine="phayathai"), list + ) + self.assertFalse( + "า" in subword_tokenize("สวัสดีดาวอังคาร", engine="phayathai") + ) + self.assertIsInstance( + subword_tokenize("โควิด19", engine="phayathai"), list + ) with self.assertRaises(ValueError): subword_tokenize("นกแก้ว", engine="XX") # engine does not exist From cb9e27aed1ec4a7dad8d6cccc3fe34ff2f84b283 Mon Sep 17 00:00:00 2001 From: Pavarissy Date: Mon, 4 Dec 2023 10:04:12 +0000 Subject: [PATCH 02/18] add data augmentation engine --- pythainlp/augment/lm/__init__.py | 2 + pythainlp/augment/lm/phayathaibert.py | 73 +++++++++ pythainlp/phayathaibert/__init__.py | 9 +- pythainlp/phayathaibert/core.py | 219 +++++++++++++++++++++++++- tests/test_augment.py | 5 + 5 files changed, 306 insertions(+), 2 deletions(-) create mode 100644 pythainlp/augment/lm/phayathaibert.py diff --git a/pythainlp/augment/lm/__init__.py b/pythainlp/augment/lm/__init__.py index 968380438..2798bc3b9 100644 --- a/pythainlp/augment/lm/__init__.py +++ b/pythainlp/augment/lm/__init__.py @@ -19,7 +19,9 @@ __all__ = [ "FastTextAug", "Thai2transformersAug", + "ThaiTextAugmenter", ] from pythainlp.augment.lm.fasttext import FastTextAug from pythainlp.augment.lm.wangchanberta import Thai2transformersAug +from pythainlp.augment.lm.phayathaibert import ThaiTextAugmenter diff --git a/pythainlp/augment/lm/phayathaibert.py b/pythainlp/augment/lm/phayathaibert.py new file mode 100644 index 000000000..e9303616e --- /dev/null +++ b/pythainlp/augment/lm/phayathaibert.py @@ -0,0 +1,73 @@ +# -*- coding: utf-8 -*- +# Copyright (C) 2016-2023 PyThaiNLP Project +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import List +import re +import random +from pythainlp.phayathaibert.core import ThaiTextProcessor + + +_model_name = "clicknext/phayathaibert" + +class ThaiTextAugmenter: + def __init__(self,)->None: + from transformers import (AutoTokenizer, + AutoModelForMaskedLM, + pipeline,) + self.tokenizer = AutoTokenizer.from_pretrained(_model_name) + self.model_for_masked_lm = AutoModelForMaskedLM.from_pretrained(_model_name) + self.model = pipeline("fill-mask", tokenizer=self.tokenizer, model=self.model_for_masked_lm) + self.processor = ThaiTextProcessor() + + def generate(self, + sample_text: str, + word_rank: int, + max_length: int=3, + sample: bool=False + )->str: + sample_txt = sample_text + final_text = "" + for j in range(max_length): + input = self.processor.preprocess(sample_txt) + if sample: + random_word_idx = random.randint(0, 4) + output = self.model(input)[random_word_idx]['sequence'] + else: + output = self.model(input)[word_rank]['sequence'] + sample_txt = output+"" + final_text = sample_txt + + gen_txt = re.sub("","",final_text) + return gen_txt + + + def augment(self, + text: str, + num_augs: int, + sample: bool=False)->List[str]: + augment_list = [] + if "" not in text: + text = text+"" + if num_augs <= 5: + for rank in range(num_augs): + gen_text = self.generate(text, rank, sample=sample) + processed_text = re.sub("<_>", " ", self.processor.preprocess(gen_text)) + augment_list.append(processed_text) + + return augment_list + else: + raise ValueError( + f"augmentation of more than {num_augs} is exceeded the default limit" + ) + diff --git a/pythainlp/phayathaibert/__init__.py b/pythainlp/phayathaibert/__init__.py index 426a1d3be..8d579d760 100644 --- a/pythainlp/phayathaibert/__init__.py +++ b/pythainlp/phayathaibert/__init__.py @@ -13,8 +13,15 @@ # See the License for the specific language governing permissions and # limitations under the License. __all__ = [ + "ThaiTextProcessor", + "ThaiTextAugmenter", "PartOfSpeechTagger", "segment", ] -from pythainlp.phayathaibert.core import PartOfSpeechTagger, segment \ No newline at end of file +from pythainlp.phayathaibert.core import ( + ThaiTextProcessor, + ThaiTextAugmenter, + PartOfSpeechTagger, + segment, +) \ No newline at end of file diff --git a/pythainlp/phayathaibert/core.py b/pythainlp/phayathaibert/core.py index 983d4a203..913487ac5 100644 --- a/pythainlp/phayathaibert/core.py +++ b/pythainlp/phayathaibert/core.py @@ -12,9 +12,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from typing import List, Tuple, Union +from typing import List, Tuple, Collection, Callable import re import warnings +import random +from pythainlp.tokenize import word_tokenize from transformers import ( CamembertTokenizer, ) @@ -23,6 +25,221 @@ _model_name = "clicknext/phayathaibert" _tokenizer = CamembertTokenizer.from_pretrained(_model_name) +class ThaiTextProcessor: + def __init__(self): + self._TK_UNK, self._TK_REP, self._TK_WREP, self._TK_URL, self._TK_END = " ".split() + self.SPACE_SPECIAL_TOKEN = "<_>" + + + def replace_url(self, text: str) -> str: + """ + Replace url in `text` with TK_URL (https://stackoverflow.com/a/6041965) + :param str text: text to replace url + :return: text where urls are replaced + :rtype: str + :Example: + >>> replace_url("go to https://github.com") + go to + """ + URL_PATTERN = r"(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?" + return re.sub(URL_PATTERN, self._TK_URL, text) + + def rm_brackets(text: str) -> str: + """ + Remove all empty brackets and artifacts within brackets from `text`. + :param str text: text to remove useless brackets + :return: text where all useless brackets are removed + :rtype: str + :Example: + >>> rm_brackets("hey() whats[;] up{*&} man(hey)") + hey whats up man(hey) + """ + # remove empty brackets + new_line = re.sub(r"\(\)", "", text) + new_line = re.sub(r"\{\}", "", new_line) + new_line = re.sub(r"\[\]", "", new_line) + # brakets with only punctuations + new_line = re.sub(r"\([^a-zA-Z0-9ก-๙]+\)", "", new_line) + new_line = re.sub(r"\{[^a-zA-Z0-9ก-๙]+\}", "", new_line) + new_line = re.sub(r"\[[^a-zA-Z0-9ก-๙]+\]", "", new_line) + # artifiacts after ( + new_line = re.sub(r"(?<=\()[^a-zA-Z0-9ก-๙]+(?=[a-zA-Z0-9ก-๙])", "", new_line) + new_line = re.sub(r"(?<=\{)[^a-zA-Z0-9ก-๙]+(?=[a-zA-Z0-9ก-๙])", "", new_line) + new_line = re.sub(r"(?<=\[)[^a-zA-Z0-9ก-๙]+(?=[a-zA-Z0-9ก-๙])", "", new_line) + # artifacts before ) + new_line = re.sub(r"(?<=[a-zA-Z0-9ก-๙])[^a-zA-Z0-9ก-๙]+(?=\))", "", new_line) + new_line = re.sub(r"(?<=[a-zA-Z0-9ก-๙])[^a-zA-Z0-9ก-๙]+(?=\})", "", new_line) + new_line = re.sub(r"(?<=[a-zA-Z0-9ก-๙])[^a-zA-Z0-9ก-๙]+(?=\])", "", new_line) + return new_line + + def replace_newlines(text: str) -> str: + """ + Replace newlines in `text` with spaces. + :param str text: text to replace all newlines with spaces + :return: text where all newlines are replaced with spaces + :rtype: str + :Example: + >>> rm_useless_spaces("hey whats\n\nup") + hey whats up + """ + + return re.sub(r"[\n]", " ", text.strip()) + + def rm_useless_spaces(text: str) -> str: + """ + Remove multiple spaces in `text`. (code from `fastai`) + :param str text: text to replace useless spaces + :return: text where all spaces are reduced to one + :rtype: str + :Example: + >>> rm_useless_spaces("oh no") + oh no + """ + return re.sub(" {2,}", " ", text) + + def replace_spaces(text: str, space_token: str = SPACE_SPECIAL_TOKEN) -> str: + """ + Replace spaces with _ + :param str text: text to replace spaces + :return: text where all spaces replaced with _ + :rtype: str + :Example: + >>> replace_spaces("oh no") + oh_no + """ + return re.sub(" ", space_token, text) + + def replace_rep_after(text: str) -> str: + """ + Replace repetitions at the character level in `text` + :param str text: input text to replace character repetition + :return: text with repetitive tokens removed. + :rtype: str + :Example: + >>> text = "กาาาาาาา" + >>> replace_rep_after(text) + 'กา' + """ + + def _replace_rep(m): + c, cc = m.groups() + return f"{c}" + + re_rep = re.compile(r"(\S)(\1{3,})") + return re_rep.sub(_replace_rep, text) + + def replace_wrep_post(toks: Collection[str]) -> Collection[str]: + """ + Replace reptitive words post tokenization; + fastai `replace_wrep` does not work well with Thai. + :param Collection[str] toks: list of tokens + :return: list of tokens where repetitive words are removed. + :rtype: Collection[str] + :Example: + >>> toks = ["กา", "น้ำ", "น้ำ", "น้ำ", "น้ำ"] + >>> replace_wrep_post(toks) + ['กา', 'น้ำ'] + """ + previous_word = None + rep_count = 0 + res = [] + for current_word in toks + [self._TK_END]: + if current_word == previous_word: + rep_count += 1 + elif (current_word != previous_word) & (rep_count > 0): + res += [previous_word] + rep_count = 0 + else: + res.append(previous_word) + previous_word = current_word + return res[1:] + + def remove_space(toks: Collection[str]) -> Collection[str]: + """ + Do not include space for bag-of-word models. + :param Collection[str] toks: list of tokens + :return: Collection of tokens where space tokens (" ") are filtered out + :rtype: Collection[str] + :Example: + >>> toks = ['ฉัน','เดิน',' ','กลับ','บ้าน'] + >>> remove_space(toks) + ['ฉัน','เดิน','กลับ','บ้าน'] + """ + res = [] + for t in toks: + t = t.strip() + if t: + res.append(t) + return res + + # combine them together + def preprocess( + self, + text: str, + pre_rules: Collection[Callable] = [ + rm_brackets, + replace_newlines, + rm_useless_spaces, + replace_spaces, + replace_rep_after, + ], + tok_func: Callable = word_tokenize, + ) -> str: + text = text.lower() + for rule in pre_rules: + text = rule(text) + toks = tok_func(text) + return "".join(toks) + + + +class ThaiTextAugmenter: + def __init__(self)->None: + from transformers import (AutoTokenizer, + AutoModelForMaskedLM, + pipeline,) + self.tokenizer = AutoTokenizer.from_pretrained(_model_name) + self.model_for_masked_lm = AutoModelForMaskedLM.from_pretrained(_model_name) + self.model = pipeline("fill-mask", tokenizer=self.tokenizer, model=self.model_for_masked_lm) + self.processor = ThaiTextProcessor() + + def generate(self, + sample_text: str, + word_rank: int, + max_length: int=3, + sample: bool=False + )->str: + sample_txt = sample_text + final_text = "" + for j in range(max_length): + input = self.processor.preprocess(sample_txt) + if sample: + random_word_idx = random.randint(0, 4) + output = self.model(input)[random_word_idx]['sequence'] + else: + output = self.model(input)[word_rank]['sequence'] + sample_txt = output+"" + final_text = sample_txt + + gen_txt = re.sub("","",final_text) + return gen_txt + + + def augment(self, + text: str, + num_augs: int, + sample: bool=False + )->List[str]: + augment_list = [] + if num_augs <= 5: # since huggingface transformers pipeline default was set to 5 generated text + for rank in range(num_augs): + gen_text = self.generate(text, rank, sample=sample) + processed_text = re.sub("<_>", " ", self.processor.preprocess(gen_text)) + augment_list.append(processed_text) + + return augment_list + + class PartOfSpeechTagger: def __init__(self, model: str="lunarlist/pos_thai_phayathai") -> None: diff --git a/tests/test_augment.py b/tests/test_augment.py index 51dc89082..5542d96c3 100644 --- a/tests/test_augment.py +++ b/tests/test_augment.py @@ -5,6 +5,7 @@ from pythainlp.augment import WordNetAug from pythainlp.augment.wordnet import postype2wordnet # from pythainlp.augment.lm import Thai2transformersAug +# from pythainlp.augment.lm.phayathaibert import ThaiTextAugmenter from pythainlp.augment.word2vec.bpemb_wv import BPEmbAug from pythainlp.augment.word2vec import ( LTW2VAug @@ -43,3 +44,7 @@ def test_LTW2VAug(self): # def test_Thai2transformersAug(self): # _aug = Thai2transformersAug() # self.assertIsNotNone(_aug.augment(self.text2, num_replace_tokens=1)) + + # def test_ThaiTextAugmenter(self): + # _aug = ThaiTextAugmenter() + # self.assertIsNotNone(_aug.augment(self.text2, num__augs=3)) \ No newline at end of file From 473af52ed0afdf2df04f772e2ffa03a4f3ed6d7c Mon Sep 17 00:00:00 2001 From: Pavarissy Date: Mon, 4 Dec 2023 10:11:44 +0000 Subject: [PATCH 03/18] update engine properties --- pythainlp/augment/lm/phayathaibert.py | 2 +- pythainlp/phayathaibert/core.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pythainlp/augment/lm/phayathaibert.py b/pythainlp/augment/lm/phayathaibert.py index e9303616e..40d0877a1 100644 --- a/pythainlp/augment/lm/phayathaibert.py +++ b/pythainlp/augment/lm/phayathaibert.py @@ -54,7 +54,7 @@ def generate(self, def augment(self, text: str, - num_augs: int, + num_augs: int=3, sample: bool=False)->List[str]: augment_list = [] if "" not in text: diff --git a/pythainlp/phayathaibert/core.py b/pythainlp/phayathaibert/core.py index 913487ac5..d25e7b796 100644 --- a/pythainlp/phayathaibert/core.py +++ b/pythainlp/phayathaibert/core.py @@ -97,7 +97,7 @@ def rm_useless_spaces(text: str) -> str: """ return re.sub(" {2,}", " ", text) - def replace_spaces(text: str, space_token: str = SPACE_SPECIAL_TOKEN) -> str: + def replace_spaces(text: str, space_token: str = self.SPACE_SPECIAL_TOKEN) -> str: """ Replace spaces with _ :param str text: text to replace spaces @@ -227,7 +227,7 @@ def generate(self, def augment(self, text: str, - num_augs: int, + num_augs: int=3, sample: bool=False )->List[str]: augment_list = [] From 0c3efd0b4f9b5e150b3092c2e506c47b2d8f370c Mon Sep 17 00:00:00 2001 From: Pavarissy Date: Mon, 4 Dec 2023 10:26:08 +0000 Subject: [PATCH 04/18] updae augmentation properties --- pythainlp/phayathaibert/core.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pythainlp/phayathaibert/core.py b/pythainlp/phayathaibert/core.py index d25e7b796..8fd5229af 100644 --- a/pythainlp/phayathaibert/core.py +++ b/pythainlp/phayathaibert/core.py @@ -14,7 +14,6 @@ # limitations under the License. from typing import List, Tuple, Collection, Callable import re -import warnings import random from pythainlp.tokenize import word_tokenize from transformers import ( @@ -97,7 +96,7 @@ def rm_useless_spaces(text: str) -> str: """ return re.sub(" {2,}", " ", text) - def replace_spaces(text: str, space_token: str = self.SPACE_SPECIAL_TOKEN) -> str: + def replace_spaces(text: str, space_token: str = "<_>") -> str: """ Replace spaces with _ :param str text: text to replace spaces @@ -128,7 +127,7 @@ def _replace_rep(m): re_rep = re.compile(r"(\S)(\1{3,})") return re_rep.sub(_replace_rep, text) - def replace_wrep_post(toks: Collection[str]) -> Collection[str]: + def replace_wrep_post(self, toks: Collection[str]) -> Collection[str]: """ Replace reptitive words post tokenization; fastai `replace_wrep` does not work well with Thai. From dd2b8347c6ffff6f8a0b93e5d495147c85525abc Mon Sep 17 00:00:00 2001 From: Pavarissy Date: Mon, 4 Dec 2023 10:43:21 +0000 Subject: [PATCH 05/18] change license --- pythainlp/augment/lm/phayathaibert.py | 15 ++------------- pythainlp/phayathaibert/__init__.py | 15 ++------------- pythainlp/phayathaibert/core.py | 15 ++------------- 3 files changed, 6 insertions(+), 39 deletions(-) diff --git a/pythainlp/augment/lm/phayathaibert.py b/pythainlp/augment/lm/phayathaibert.py index 40d0877a1..e0dbc9d39 100644 --- a/pythainlp/augment/lm/phayathaibert.py +++ b/pythainlp/augment/lm/phayathaibert.py @@ -1,17 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright (C) 2016-2023 PyThaiNLP Project -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# SPDX-FileCopyrightText: Copyright 2016-2023 PyThaiNLP Project +# SPDX-License-Identifier: Apache-2.0 from typing import List import re import random diff --git a/pythainlp/phayathaibert/__init__.py b/pythainlp/phayathaibert/__init__.py index 8d579d760..7e2dba7f1 100644 --- a/pythainlp/phayathaibert/__init__.py +++ b/pythainlp/phayathaibert/__init__.py @@ -1,17 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright (C) 2016-2023 PyThaiNLP Project -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# SPDX-FileCopyrightText: Copyright 2016-2023 PyThaiNLP Project +# SPDX-License-Identifier: Apache-2.0 __all__ = [ "ThaiTextProcessor", "ThaiTextAugmenter", diff --git a/pythainlp/phayathaibert/core.py b/pythainlp/phayathaibert/core.py index 8fd5229af..3f35e144f 100644 --- a/pythainlp/phayathaibert/core.py +++ b/pythainlp/phayathaibert/core.py @@ -1,17 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright (C) 2016-2023 PyThaiNLP Project -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# SPDX-FileCopyrightText: Copyright 2016-2023 PyThaiNLP Project +# SPDX-License-Identifier: Apache-2.0 from typing import List, Tuple, Collection, Callable import re import random From d1b9c99b4b03ccb72176616ab98de0e4e1dc7aa0 Mon Sep 17 00:00:00 2001 From: Pavarissy Date: Mon, 4 Dec 2023 16:52:40 +0000 Subject: [PATCH 06/18] add er engine --- pythainlp/phayathaibert/__init__.py | 1 + pythainlp/phayathaibert/core.py | 45 ++++++++++++++++++++++++++++- pythainlp/tag/named_entity.py | 4 +++ 3 files changed, 49 insertions(+), 1 deletion(-) diff --git a/pythainlp/phayathaibert/__init__.py b/pythainlp/phayathaibert/__init__.py index 7e2dba7f1..232399079 100644 --- a/pythainlp/phayathaibert/__init__.py +++ b/pythainlp/phayathaibert/__init__.py @@ -11,6 +11,7 @@ from pythainlp.phayathaibert.core import ( ThaiTextProcessor, ThaiTextAugmenter, + NamedEntityTagger, PartOfSpeechTagger, segment, ) \ No newline at end of file diff --git a/pythainlp/phayathaibert/core.py b/pythainlp/phayathaibert/core.py index 3f35e144f..5ecebe740 100644 --- a/pythainlp/phayathaibert/core.py +++ b/pythainlp/phayathaibert/core.py @@ -1,9 +1,10 @@ # -*- coding: utf-8 -*- # SPDX-FileCopyrightText: Copyright 2016-2023 PyThaiNLP Project # SPDX-License-Identifier: Apache-2.0 -from typing import List, Tuple, Collection, Callable +from typing import List, Union, Tuple, Collection, Callable import re import random +import warnings from pythainlp.tokenize import word_tokenize from transformers import ( CamembertTokenizer, @@ -250,6 +251,48 @@ def get_tag(self, sentence: str, strategy: str='simple')->List[List[Tuple[str, s word_tags = [[(tag['word'], tag['entity_group']) for tag in outputs]] return word_tags +class NamedEntityTagger: + def __init__(self, model: str="Pavarissy/phayathaibert-thainer") -> None: + from transformers import ( + AutoTokenizer, + AutoModelForTokenClassification, + ) + self.tokenizer = AutoTokenizer.from_pretrained(model) + self.model = AutoModelForTokenClassification.from_pretrained(model) + def get_ner(self, + text: str, + tag: bool=False, + pos: bool=False, + strategy: str="simple" + )->Union[List[Tuple[str, str]], List[Tuple[str, str, str]], str]: + from transformers import TokenClassificationPipeline + if pos: + warnings.warn("This model doesn't support output postag and It doesn't output the postag.") + sample_output = [] + tag_text_list = [] + current_pos = 0 + pipeline = TokenClassificationPipeline( + model=self.model, + tokenizer=self.tokenizer, + aggregation_strategy=strategy, + ) + outputs = pipeline(text) + for token in outputs: + ner_tag = token['entity_group'] + begin_pos, end_pos = token['start'], token['end'] + if current_pos == 0: + text_tag = text[:begin_pos]+f"<{ner_tag}>"+text[begin_pos:end_pos]+f"" + else: + text_tag = text[current_pos:begin_pos]+f"<{ner_tag}>"+text[begin_pos:end_pos]+f"" + tag_text_list.append(text_tag) + sample_output.append((token['word'], token['entity_group'])) + current_pos = end_pos + if tag: + return str("".join(tag_text_list)) + else: + return sample_output + + def segment(sentence: str)->List[str]: if not sentence or not isinstance(sentence, str): return [] diff --git a/pythainlp/tag/named_entity.py b/pythainlp/tag/named_entity.py index 109e0faa1..0ab087cb6 100644 --- a/pythainlp/tag/named_entity.py +++ b/pythainlp/tag/named_entity.py @@ -46,6 +46,10 @@ def load_engine(self, engine: str, corpus: str) -> None: from pythainlp.wangchanberta import ThaiNameTagger self.engine = ThaiNameTagger(dataset_name=corpus) + elif engine=="phayathaibert" and corpus == "thainer-v2": + from pythainlp.phayathaibert.core import NamedEntityTagger + + self.engine = NamedEntityTagger() else: raise ValueError( "NER class not support {0} engine or {1} corpus.".format( From cbb7c8e52787b8ca795f4cd28046029cd7c4b3fa Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Mon, 4 Dec 2023 17:48:59 +0000 Subject: [PATCH 07/18] Update __init__.py --- pythainlp/augment/lm/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pythainlp/augment/lm/__init__.py b/pythainlp/augment/lm/__init__.py index 48e8c6c5e..febe833f5 100644 --- a/pythainlp/augment/lm/__init__.py +++ b/pythainlp/augment/lm/__init__.py @@ -12,5 +12,5 @@ ] from pythainlp.augment.lm.fasttext import FastTextAug -from pythainlp.augment.lm.wangchanberta import Thai2transformersAug from pythainlp.augment.lm.phayathaibert import ThaiTextAugmenter +from pythainlp.augment.lm.wangchanberta import Thai2transformersAug From 348dc1f0f74ec2873f8fe55cac33d677c64de017 Mon Sep 17 00:00:00 2001 From: Pavarissy Date: Sun, 10 Dec 2023 12:25:58 +0000 Subject: [PATCH 08/18] add documentation and credit model builder Co-authored-by: MpolaarbearM --- pythainlp/augment/lm/phayathaibert.py | 25 +++++++++ pythainlp/phayathaibert/core.py | 77 ++++++++++++++++++++++++++- pythainlp/tag/pos_tag.py | 1 + 3 files changed, 102 insertions(+), 1 deletion(-) diff --git a/pythainlp/augment/lm/phayathaibert.py b/pythainlp/augment/lm/phayathaibert.py index e0dbc9d39..6197e5116 100644 --- a/pythainlp/augment/lm/phayathaibert.py +++ b/pythainlp/augment/lm/phayathaibert.py @@ -45,6 +45,31 @@ def augment(self, text: str, num_augs: int=3, sample: bool=False)->List[str]: + """ + Text Augment from phayathaibert + + :param str text: thai text + :param int num_augs: an amount of augmentation text needed as an output + :param bool sample: whether to sample the text as an output or not, true if more word diversity is needed + + :return: list of text augment + :rtype: List[str] + + :Example: + :: + + from pythainlp.augment.lm import ThaiTextAugmenter + + aug=ThaiTextAugmenter() + aug = ThaiTextAugmenter() + aug.augment("ช้างมีทั้งหมด 50 ตัว บน", num_args=5) + + # output = ['ช้างมีทั้งหมด 50 ตัว บนโลกใบนี้ครับ.', + 'ช้างมีทั้งหมด 50 ตัว บนพื้นดินครับ...', + 'ช้างมีทั้งหมด 50 ตัว บนท้องฟ้าครับ...', + 'ช้างมีทั้งหมด 50 ตัว บนดวงจันทร์.‼', + 'ช้างมีทั้งหมด 50 ตัว บนเขาค่ะ😁'] + """ augment_list = [] if "" not in text: text = text+"" diff --git a/pythainlp/phayathaibert/core.py b/pythainlp/phayathaibert/core.py index 5ecebe740..1cce1b203 100644 --- a/pythainlp/phayathaibert/core.py +++ b/pythainlp/phayathaibert/core.py @@ -219,8 +219,33 @@ def augment(self, num_augs: int=3, sample: bool=False )->List[str]: + """ + Text Augment from phayathaibert + + :param str text: thai text + :param int num_augs: an amount of augmentation text needed as an output + :param bool sample: whether to sample the text as an output or not, true if more word diversity is needed + + :return: list of text augment + :rtype: List[str] + + :Example: + :: + + from pythainlp.augment.lm import ThaiTextAugmenter + + aug=ThaiTextAugmenter() + aug = ThaiTextAugmenter() + aug.augment("ช้างมีทั้งหมด 50 ตัว บน", num_args=5) + + # output = ['ช้างมีทั้งหมด 50 ตัว บนโลกใบนี้ครับ.', + 'ช้างมีทั้งหมด 50 ตัว บนพื้นดินครับ...', + 'ช้างมีทั้งหมด 50 ตัว บนท้องฟ้าครับ...', + 'ช้างมีทั้งหมด 50 ตัว บนดวงจันทร์.‼', + 'ช้างมีทั้งหมด 50 ตัว บนเขาค่ะ😁'] + """ augment_list = [] - if num_augs <= 5: # since huggingface transformers pipeline default was set to 5 generated text + if num_augs <= 5: for rank in range(num_augs): gen_text = self.generate(text, rank, sample=sample) processed_text = re.sub("<_>", " ", self.processor.preprocess(gen_text)) @@ -241,6 +266,24 @@ def __init__(self, model: str="lunarlist/pos_thai_phayathai") -> None: self.model = AutoModelForTokenClassification.from_pretrained(model) def get_tag(self, sentence: str, strategy: str='simple')->List[List[Tuple[str, str]]]: + """ + Marks sentences with part-of-speech (POS) tags. + + :param str sentence: a list of lists of tokenized words + :return: a list of lists of tuples (word, POS tag) + :rtype: list[list[tuple[str, str]]] + + :Example: + + Labels POS for given sentence:: + + from pythainlp.phayathaibert.core import PartOfSpeechTagger + + tagger = PartOfSpeechTagger() + tagger.get_tag("แมวทำอะไรตอนห้าโมงเช้า") + # output: + # [[('แมว', 'NOUN'), ('ทําอะไร', 'VERB'), ('ตอนห้าโมงเช้า', 'NOUN')]] + """ from transformers import TokenClassificationPipeline pipeline = TokenClassificationPipeline( model=self.model, @@ -265,6 +308,31 @@ def get_ner(self, pos: bool=False, strategy: str="simple" )->Union[List[Tuple[str, str]], List[Tuple[str, str, str]], str]: + """ + This function tags named entities in text in IOB format. + + :param str text: text in Thai to be tagged + :param bool pos: output with part-of-speech tags.\ + (phayathaibert is supported in PartOfSpeechTagger) + :return: a list of tuples associated with tokenized words, NER tags, + POS tags (if the parameter `pos` is specified as `True`), + and output HTML-like tags (if the parameter `tag` is + specified as `True`). + Otherwise, return a list of tuples associated with tokenized + words and NER tags + :rtype: Union[List[Tuple[str, str]], List[Tuple[str, str, str]], str] + :Example: + + >>> from pythainlp.phayathaibert.core import NamedEntityTagger + >>> + >>> tagger = NamedEntityTagger() + >>> tagger.get_ner("ทดสอบนายปวริศ เรืองจุติโพธิ์พานจากประเทศไทย") + [('นายปวริศ เรืองจุติโพธิ์พานจากประเทศไทย', 'PERSON'), + ('จาก', 'LOCATION'), + ('ประเทศไทย', 'LOCATION')] + >>> ner.tag("ทดสอบนายปวริศ เรืองจุติโพธิ์พานจากประเทศไทย", tag=True) + 'ทดสอบนายปวริศ เรืองจุติโพธิ์พานจากประเทศไทย' + """ from transformers import TokenClassificationPipeline if pos: warnings.warn("This model doesn't support output postag and It doesn't output the postag.") @@ -294,6 +362,13 @@ def get_ner(self, def segment(sentence: str)->List[str]: + """ + Subword tokenize of phayathaibert, sentencepiece from wangchanberta model with Vocabulary Expansion. + + :param str text: text to be tokenized + :return: list of subwords + :rtype: list[str] + """ if not sentence or not isinstance(sentence, str): return [] diff --git a/pythainlp/tag/pos_tag.py b/pythainlp/tag/pos_tag.py index f850b404b..92f437262 100644 --- a/pythainlp/tag/pos_tag.py +++ b/pythainlp/tag/pos_tag.py @@ -180,6 +180,7 @@ def pos_tag_transformers( :param str engine: * *bert* - BERT: Bidirectional Encoder Representations from Transformers (default) * *wangchanberta* - fine-tuned version of airesearch/wangchanberta-base-att-spm-uncased on pud corpus (support PUD cotpus only) + * *phayathaibert* - fine-tuned version of clicknext/phayathaibert on blackboard corpus (support blackboard cotpus only) * *mdeberta* - mDeBERTa: Multilingual Decoding-enhanced BERT with disentangled attention (support PUD corpus only) :param str corpus: the corpus that is used to create the language model for tagger * *blackboard* - `blackboard treebank (support bert engine only) `_ From a55168a3d8f6ac6363547bd99d6af38d6acff769 Mon Sep 17 00:00:00 2001 From: Pavarissy Date: Sun, 10 Dec 2023 13:15:42 +0000 Subject: [PATCH 09/18] update pep8 --- pythainlp/augment/lm/phayathaibert.py | 36 ++++++----- pythainlp/phayathaibert/__init__.py | 13 ++-- pythainlp/phayathaibert/core.py | 91 ++++++++++++++------------- pythainlp/tag/named_entity.py | 4 +- pythainlp/tag/pos_tag.py | 13 ++-- tests/test_augment.py | 4 +- 6 files changed, 83 insertions(+), 78 deletions(-) diff --git a/pythainlp/augment/lm/phayathaibert.py b/pythainlp/augment/lm/phayathaibert.py index 6197e5116..375a9e9d2 100644 --- a/pythainlp/augment/lm/phayathaibert.py +++ b/pythainlp/augment/lm/phayathaibert.py @@ -9,10 +9,11 @@ _model_name = "clicknext/phayathaibert" + class ThaiTextAugmenter: - def __init__(self,)->None: - from transformers import (AutoTokenizer, - AutoModelForMaskedLM, + def __init__(self,) -> None: + from transformers import (AutoTokenizer, + AutoModelForMaskedLM, pipeline,) self.tokenizer = AutoTokenizer.from_pretrained(_model_name) self.model_for_masked_lm = AutoModelForMaskedLM.from_pretrained(_model_name) @@ -20,11 +21,11 @@ def __init__(self,)->None: self.processor = ThaiTextProcessor() def generate(self, - sample_text: str, - word_rank: int, - max_length: int=3, - sample: bool=False - )->str: + sample_text: str, + word_rank: int, + max_length: int = 3, + sample: bool = False + ) -> str: sample_txt = sample_text final_text = "" for j in range(max_length): @@ -34,23 +35,24 @@ def generate(self, output = self.model(input)[random_word_idx]['sequence'] else: output = self.model(input)[word_rank]['sequence'] - sample_txt = output+"" + sample_txt = output + "" final_text = sample_txt - - gen_txt = re.sub("","",final_text) + gen_txt = re.sub("", "", final_text) return gen_txt - + def augment(self, - text: str, - num_augs: int=3, - sample: bool=False)->List[str]: + text: str, + num_augs: int = 3, + sample: bool = False + )->List[str]: """ Text Augment from phayathaibert :param str text: thai text :param int num_augs: an amount of augmentation text needed as an output - :param bool sample: whether to sample the text as an output or not, true if more word diversity is needed + :param bool sample: whether to sample the text as an output or not, \ + true if more word diversity is needed :return: list of text augment :rtype: List[str] @@ -72,7 +74,7 @@ def augment(self, """ augment_list = [] if "" not in text: - text = text+"" + text = text + "" if num_augs <= 5: for rank in range(num_augs): gen_text = self.generate(text, rank, sample=sample) diff --git a/pythainlp/phayathaibert/__init__.py b/pythainlp/phayathaibert/__init__.py index 232399079..352f503b0 100644 --- a/pythainlp/phayathaibert/__init__.py +++ b/pythainlp/phayathaibert/__init__.py @@ -8,10 +8,9 @@ "segment", ] -from pythainlp.phayathaibert.core import ( - ThaiTextProcessor, - ThaiTextAugmenter, - NamedEntityTagger, - PartOfSpeechTagger, - segment, -) \ No newline at end of file +from pythainlp.phayathaibert.core import (ThaiTextProcessor, + ThaiTextAugmenter, + NamedEntityTagger, + PartOfSpeechTagger, + segment, +) diff --git a/pythainlp/phayathaibert/core.py b/pythainlp/phayathaibert/core.py index 1cce1b203..ac8bab523 100644 --- a/pythainlp/phayathaibert/core.py +++ b/pythainlp/phayathaibert/core.py @@ -14,9 +14,11 @@ _model_name = "clicknext/phayathaibert" _tokenizer = CamembertTokenizer.from_pretrained(_model_name) + class ThaiTextProcessor: def __init__(self): - self._TK_UNK, self._TK_REP, self._TK_WREP, self._TK_URL, self._TK_END = " ".split() + self._TK_UNK, self._TK_REP, self._TK_WREP, self._TK_URL, self._TK_END = \ + " ".split() self.SPACE_SPECIAL_TOKEN = "<_>" @@ -30,7 +32,8 @@ def replace_url(self, text: str) -> str: >>> replace_url("go to https://github.com") go to """ - URL_PATTERN = r"(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?" + URL_PATTERN = \ + r"(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?" return re.sub(URL_PATTERN, self._TK_URL, text) def rm_brackets(text: str) -> str: @@ -181,22 +184,21 @@ def preprocess( return "".join(toks) - class ThaiTextAugmenter: - def __init__(self)->None: - from transformers import (AutoTokenizer, - AutoModelForMaskedLM, + def __init__(self) -> None: + from transformers import (AutoTokenizer, + AutoModelForMaskedLM, pipeline,) self.tokenizer = AutoTokenizer.from_pretrained(_model_name) self.model_for_masked_lm = AutoModelForMaskedLM.from_pretrained(_model_name) - self.model = pipeline("fill-mask", tokenizer=self.tokenizer, model=self.model_for_masked_lm) + self.model = pipeline("fill-mask", tokenizer = self.tokenizer, model = self.model_for_masked_lm) self.processor = ThaiTextProcessor() def generate(self, sample_text: str, word_rank: int, - max_length: int=3, - sample: bool=False + max_length: int = 3, + sample: bool = False, )->str: sample_txt = sample_text final_text = "" @@ -207,24 +209,25 @@ def generate(self, output = self.model(input)[random_word_idx]['sequence'] else: output = self.model(input)[word_rank]['sequence'] - sample_txt = output+"" + sample_txt = output + "" final_text = sample_txt - gen_txt = re.sub("","",final_text) + gen_txt = re.sub("", "", final_text) return gen_txt def augment(self, text: str, - num_augs: int=3, - sample: bool=False + num_augs: int = 3, + sample: bool = False, )->List[str]: """ Text Augment from phayathaibert :param str text: thai text :param int num_augs: an amount of augmentation text needed as an output - :param bool sample: whether to sample the text as an output or not, true if more word diversity is needed + :param bool sample: whether to sample the text as an output or not,\ + true if more word diversity is needed :return: list of text augment :rtype: List[str] @@ -247,25 +250,26 @@ def augment(self, augment_list = [] if num_augs <= 5: for rank in range(num_augs): - gen_text = self.generate(text, rank, sample=sample) + gen_text = self.generate(text, rank, sample = sample) processed_text = re.sub("<_>", " ", self.processor.preprocess(gen_text)) augment_list.append(processed_text) return augment_list - class PartOfSpeechTagger: - def __init__(self, model: str="lunarlist/pos_thai_phayathai") -> None: + def __init__(self, model: str = "lunarlist/pos_thai_phayathai") -> None: # Load model directly - from transformers import ( - AutoTokenizer, - AutoModelForTokenClassification, - ) + from transformers import (AutoTokenizer, + AutoModelForTokenClassification, + ) self.tokenizer = AutoTokenizer.from_pretrained(model) self.model = AutoModelForTokenClassification.from_pretrained(model) - def get_tag(self, sentence: str, strategy: str='simple')->List[List[Tuple[str, str]]]: + def get_tag(self, + sentence: str, + strategy: str = 'simple' + ) -> List[List[Tuple[str, str]]]: """ Marks sentences with part-of-speech (POS) tags. @@ -285,28 +289,26 @@ def get_tag(self, sentence: str, strategy: str='simple')->List[List[Tuple[str, s # [[('แมว', 'NOUN'), ('ทําอะไร', 'VERB'), ('ตอนห้าโมงเช้า', 'NOUN')]] """ from transformers import TokenClassificationPipeline - pipeline = TokenClassificationPipeline( - model=self.model, - tokenizer=self.tokenizer, - aggregation_strategy=strategy, - ) + pipeline = TokenClassificationPipeline(model=self.model, + tokenizer=self.tokenizer, + aggregation_strategy=strategy, + ) outputs = pipeline(sentence) word_tags = [[(tag['word'], tag['entity_group']) for tag in outputs]] return word_tags class NamedEntityTagger: - def __init__(self, model: str="Pavarissy/phayathaibert-thainer") -> None: - from transformers import ( - AutoTokenizer, - AutoModelForTokenClassification, - ) + def __init__(self, model: str = "Pavarissy/phayathaibert-thainer") -> None: + from transformers import (AutoTokenizer, + AutoModelForTokenClassification, + ) self.tokenizer = AutoTokenizer.from_pretrained(model) self.model = AutoModelForTokenClassification.from_pretrained(model) def get_ner(self, - text: str, - tag: bool=False, - pos: bool=False, - strategy: str="simple" + text: str, + tag: bool = False, + pos: bool = False, + strategy: str = "simple", )->Union[List[Tuple[str, str]], List[Tuple[str, str, str]], str]: """ This function tags named entities in text in IOB format. @@ -339,19 +341,20 @@ def get_ner(self, sample_output = [] tag_text_list = [] current_pos = 0 - pipeline = TokenClassificationPipeline( - model=self.model, - tokenizer=self.tokenizer, - aggregation_strategy=strategy, - ) + pipeline = TokenClassificationPipeline(model = self.model, + tokenizer = self.tokenizer, + aggregation_strategy = strategy, + ) outputs = pipeline(text) for token in outputs: ner_tag = token['entity_group'] begin_pos, end_pos = token['start'], token['end'] if current_pos == 0: - text_tag = text[:begin_pos]+f"<{ner_tag}>"+text[begin_pos:end_pos]+f"" + text_tag = text[:begin_pos] + f"<{ner_tag}>" \ + + text[begin_pos:end_pos] + f"" else: - text_tag = text[current_pos:begin_pos]+f"<{ner_tag}>"+text[begin_pos:end_pos]+f"" + text_tag = text[current_pos:begin_pos] + f"<{ner_tag}>" \ + + text[begin_pos:end_pos] + f"" tag_text_list.append(text_tag) sample_output.append((token['word'], token['entity_group'])) current_pos = end_pos @@ -361,7 +364,7 @@ def get_ner(self, return sample_output -def segment(sentence: str)->List[str]: +def segment(sentence: str) -> List[str]: """ Subword tokenize of phayathaibert, sentencepiece from wangchanberta model with Vocabulary Expansion. diff --git a/pythainlp/tag/named_entity.py b/pythainlp/tag/named_entity.py index 0ab087cb6..888edda74 100644 --- a/pythainlp/tag/named_entity.py +++ b/pythainlp/tag/named_entity.py @@ -46,7 +46,7 @@ def load_engine(self, engine: str, corpus: str) -> None: from pythainlp.wangchanberta import ThaiNameTagger self.engine = ThaiNameTagger(dataset_name=corpus) - elif engine=="phayathaibert" and corpus == "thainer-v2": + elif engine == "phayathaibert" and corpus == "thainer-v2": from pythainlp.phayathaibert.core import NamedEntityTagger self.engine = NamedEntityTagger() @@ -58,7 +58,7 @@ def load_engine(self, engine: str, corpus: str) -> None: ) def tag( - self, text, pos=False, tag=False + self, text, pos = False, tag = False ) -> Union[List[Tuple[str, str]], List[Tuple[str, str, str]], str]: """ This function tags named entities in text in IOB format. diff --git a/pythainlp/tag/pos_tag.py b/pythainlp/tag/pos_tag.py index 92f437262..0c9092808 100644 --- a/pythainlp/tag/pos_tag.py +++ b/pythainlp/tag/pos_tag.py @@ -180,7 +180,8 @@ def pos_tag_transformers( :param str engine: * *bert* - BERT: Bidirectional Encoder Representations from Transformers (default) * *wangchanberta* - fine-tuned version of airesearch/wangchanberta-base-att-spm-uncased on pud corpus (support PUD cotpus only) - * *phayathaibert* - fine-tuned version of clicknext/phayathaibert on blackboard corpus (support blackboard cotpus only) + * *phayathaibert* - fine-tuned version of clicknext/phayathaibert \ + on blackboard corpus (support blackboard cotpus only) * *mdeberta* - mDeBERTa: Multilingual Decoding-enhanced BERT with disentangled attention (support PUD corpus only) :param str corpus: the corpus that is used to create the language model for tagger * *blackboard* - `blackboard treebank (support bert engine only) `_ @@ -213,13 +214,13 @@ def pos_tag_transformers( return [] _blackboard_support_engine = { - "bert" : "lunarlist/pos_thai", - "phayathai" : "lunarlist/pos_thai_phayathai", + "bert": "lunarlist/pos_thai", + "phayathai": "lunarlist/pos_thai_phayathai", } _pud_support_engine = { - "wangchanberta" : "Pavarissy/wangchanberta-ud-thai-pud-upos", - "mdeberta" : "Pavarissy/mdeberta-v3-ud-thai-pud-upos", + "wangchanberta": "Pavarissy/wangchanberta-ud-thai-pud-upos", + "mdeberta": "Pavarissy/mdeberta-v3-ud-thai-pud-upos", } if corpus == 'blackboard' and engine in _blackboard_support_engine.keys(): @@ -237,7 +238,7 @@ def pos_tag_transformers( ) ) - pipeline = TokenClassificationPipeline(model=model, tokenizer=tokenizer, aggregation_strategy="simple") + pipeline = TokenClassificationPipeline(model = model, tokenizer = tokenizer, aggregation_strategy = "simple") outputs = pipeline(sentence) word_tags = [[(tag['word'], tag['entity_group']) for tag in outputs]] diff --git a/tests/test_augment.py b/tests/test_augment.py index 5542d96c3..2721b056b 100644 --- a/tests/test_augment.py +++ b/tests/test_augment.py @@ -45,6 +45,6 @@ def test_LTW2VAug(self): # _aug = Thai2transformersAug() # self.assertIsNotNone(_aug.augment(self.text2, num_replace_tokens=1)) - # def test_ThaiTextAugmenter(self): + # def test_ThaiTextAugmenter(self): # _aug = ThaiTextAugmenter() - # self.assertIsNotNone(_aug.augment(self.text2, num__augs=3)) \ No newline at end of file + # self.assertIsNotNone(_aug.augment(self.text2, num__augs=3)) From 76b49c319121d0867a280b2ca0c5c907a6b8d65d Mon Sep 17 00:00:00 2001 From: Pavarissy Date: Sun, 10 Dec 2023 13:41:00 +0000 Subject: [PATCH 10/18] update pep8 --- pythainlp/augment/lm/phayathaibert.py | 4 +- pythainlp/phayathaibert/__init__.py | 2 +- pythainlp/phayathaibert/core.py | 57 ++++++++++++++++----------- pythainlp/tag/named_entity.py | 7 ++-- pythainlp/tag/pos_tag.py | 8 ++-- 5 files changed, 43 insertions(+), 35 deletions(-) diff --git a/pythainlp/augment/lm/phayathaibert.py b/pythainlp/augment/lm/phayathaibert.py index 375a9e9d2..e9ea4b7df 100644 --- a/pythainlp/augment/lm/phayathaibert.py +++ b/pythainlp/augment/lm/phayathaibert.py @@ -40,12 +40,11 @@ def generate(self, gen_txt = re.sub("", "", final_text) return gen_txt - def augment(self, text: str, num_augs: int = 3, sample: bool = False - )->List[str]: + ) -> List[str]: """ Text Augment from phayathaibert @@ -86,4 +85,3 @@ def augment(self, raise ValueError( f"augmentation of more than {num_augs} is exceeded the default limit" ) - diff --git a/pythainlp/phayathaibert/__init__.py b/pythainlp/phayathaibert/__init__.py index 352f503b0..405ce1843 100644 --- a/pythainlp/phayathaibert/__init__.py +++ b/pythainlp/phayathaibert/__init__.py @@ -13,4 +13,4 @@ NamedEntityTagger, PartOfSpeechTagger, segment, -) + ) diff --git a/pythainlp/phayathaibert/core.py b/pythainlp/phayathaibert/core.py index ac8bab523..926046076 100644 --- a/pythainlp/phayathaibert/core.py +++ b/pythainlp/phayathaibert/core.py @@ -21,7 +21,6 @@ def __init__(self): " ".split() self.SPACE_SPECIAL_TOKEN = "<_>" - def replace_url(self, text: str) -> str: """ Replace url in `text` with TK_URL (https://stackoverflow.com/a/6041965) @@ -191,15 +190,18 @@ def __init__(self) -> None: pipeline,) self.tokenizer = AutoTokenizer.from_pretrained(_model_name) self.model_for_masked_lm = AutoModelForMaskedLM.from_pretrained(_model_name) - self.model = pipeline("fill-mask", tokenizer = self.tokenizer, model = self.model_for_masked_lm) + self.model = pipeline("fill-mask", + tokenizer = self.tokenizer, + model = self.model_for_masked_lm, + ) self.processor = ThaiTextProcessor() def generate(self, - sample_text: str, - word_rank: int, + sample_text: str, + word_rank: int, max_length: int = 3, sample: bool = False, - )->str: + ) -> str: sample_txt = sample_text final_text = "" for j in range(max_length): @@ -214,13 +216,12 @@ def generate(self, gen_txt = re.sub("", "", final_text) return gen_txt - def augment(self, - text: str, - num_augs: int = 3, + text: str, + num_augs: int = 3, sample: bool = False, - )->List[str]: + ) -> List[str]: """ Text Augment from phayathaibert @@ -248,9 +249,12 @@ def augment(self, 'ช้างมีทั้งหมด 50 ตัว บนเขาค่ะ😁'] """ augment_list = [] - if num_augs <= 5: + if num_augs <= 5: for rank in range(num_augs): - gen_text = self.generate(text, rank, sample = sample) + gen_text = self.generate(text, + rank, + sample = sample, + ) processed_text = re.sub("<_>", " ", self.processor.preprocess(gen_text)) augment_list.append(processed_text) @@ -296,20 +300,22 @@ def get_tag(self, outputs = pipeline(sentence) word_tags = [[(tag['word'], tag['entity_group']) for tag in outputs]] return word_tags - + + class NamedEntityTagger: - def __init__(self, model: str = "Pavarissy/phayathaibert-thainer") -> None: + def __init__(self, model: str = "Pavarissy/phayathaibert-thainer") -> None: from transformers import (AutoTokenizer, AutoModelForTokenClassification, ) self.tokenizer = AutoTokenizer.from_pretrained(model) self.model = AutoModelForTokenClassification.from_pretrained(model) - def get_ner(self, - text: str, - tag: bool = False, - pos: bool = False, - strategy: str = "simple", - )->Union[List[Tuple[str, str]], List[Tuple[str, str, str]], str]: + + def get_ner(self, + text: str, + tag: bool = False, + pos: bool = False, + strategy: str = "simple", + ) -> Union[List[Tuple[str, str]], List[Tuple[str, str, str]], str]: """ This function tags named entities in text in IOB format. @@ -333,11 +339,13 @@ def get_ner(self, ('จาก', 'LOCATION'), ('ประเทศไทย', 'LOCATION')] >>> ner.tag("ทดสอบนายปวริศ เรืองจุติโพธิ์พานจากประเทศไทย", tag=True) - 'ทดสอบนายปวริศ เรืองจุติโพธิ์พานจากประเทศไทย' + 'ทดสอบนายปวริศ เรืองจุติโพธิ์พาน\ + จากประเทศไทย' """ from transformers import TokenClassificationPipeline if pos: - warnings.warn("This model doesn't support output postag and It doesn't output the postag.") + warnings.warn("This model doesn't support output \ + postag and It doesn't output the postag.") sample_output = [] tag_text_list = [] current_pos = 0 @@ -363,10 +371,11 @@ def get_ner(self, else: return sample_output - + def segment(sentence: str) -> List[str]: """ - Subword tokenize of phayathaibert, sentencepiece from wangchanberta model with Vocabulary Expansion. + Subword tokenize of phayathaibert, \ + sentencepiece from wangchanberta model with Vocabulary Expansion. :param str text: text to be tokenized :return: list of subwords @@ -375,4 +384,4 @@ def segment(sentence: str) -> List[str]: if not sentence or not isinstance(sentence, str): return [] - return _tokenizer.tokenize(sentence) \ No newline at end of file + return _tokenizer.tokenize(sentence) diff --git a/pythainlp/tag/named_entity.py b/pythainlp/tag/named_entity.py index 888edda74..9b18ff5d0 100644 --- a/pythainlp/tag/named_entity.py +++ b/pythainlp/tag/named_entity.py @@ -57,9 +57,10 @@ def load_engine(self, engine: str, corpus: str) -> None: ) ) - def tag( - self, text, pos = False, tag = False - ) -> Union[List[Tuple[str, str]], List[Tuple[str, str, str]], str]: + def tag(self, + text, pos = False, + tag = False + ) -> Union[List[Tuple[str, str]], List[Tuple[str, str, str]], str]: """ This function tags named entities in text in IOB format. diff --git a/pythainlp/tag/pos_tag.py b/pythainlp/tag/pos_tag.py index e5d773d88..ea45297d7 100644 --- a/pythainlp/tag/pos_tag.py +++ b/pythainlp/tag/pos_tag.py @@ -241,10 +241,10 @@ def pos_tag_transformers( ) ) - - pipeline = TokenClassificationPipeline( - model = model, tokenizer = tokenizer, aggregation_strategy = "simple" - ) + pipeline = TokenClassificationPipeline(model = model, + tokenizer = tokenizer, + aggregation_strategy = "simple", + ) outputs = pipeline(sentence) word_tags = [[(tag["word"], tag["entity_group"]) for tag in outputs]] From 22daf2df5ccd9b9c0b946655ece210908d67ed2a Mon Sep 17 00:00:00 2001 From: Pavarissy Date: Sun, 10 Dec 2023 13:46:22 +0000 Subject: [PATCH 11/18] update pep8 --- pythainlp/augment/lm/phayathaibert.py | 2 +- pythainlp/phayathaibert/core.py | 12 ++++++------ pythainlp/tag/named_entity.py | 5 +++-- pythainlp/tag/pos_tag.py | 6 +++--- 4 files changed, 13 insertions(+), 12 deletions(-) diff --git a/pythainlp/augment/lm/phayathaibert.py b/pythainlp/augment/lm/phayathaibert.py index e9ea4b7df..439b9ca98 100644 --- a/pythainlp/augment/lm/phayathaibert.py +++ b/pythainlp/augment/lm/phayathaibert.py @@ -73,7 +73,7 @@ def augment(self, """ augment_list = [] if "" not in text: - text = text + "" + text = text + "" if num_augs <= 5: for rank in range(num_augs): gen_text = self.generate(text, rank, sample=sample) diff --git a/pythainlp/phayathaibert/core.py b/pythainlp/phayathaibert/core.py index 926046076..3569a40cf 100644 --- a/pythainlp/phayathaibert/core.py +++ b/pythainlp/phayathaibert/core.py @@ -191,8 +191,8 @@ def __init__(self) -> None: self.tokenizer = AutoTokenizer.from_pretrained(_model_name) self.model_for_masked_lm = AutoModelForMaskedLM.from_pretrained(_model_name) self.model = pipeline("fill-mask", - tokenizer = self.tokenizer, - model = self.model_for_masked_lm, + tokenizer=self.tokenizer, + model=self.model_for_masked_lm, ) self.processor = ThaiTextProcessor() @@ -253,7 +253,7 @@ def augment(self, for rank in range(num_augs): gen_text = self.generate(text, rank, - sample = sample, + sample=sample, ) processed_text = re.sub("<_>", " ", self.processor.preprocess(gen_text)) augment_list.append(processed_text) @@ -349,9 +349,9 @@ def get_ner(self, sample_output = [] tag_text_list = [] current_pos = 0 - pipeline = TokenClassificationPipeline(model = self.model, - tokenizer = self.tokenizer, - aggregation_strategy = strategy, + pipeline = TokenClassificationPipeline(model=self.model, + tokenizer=self.tokenizer, + aggregation_strategy=strategy, ) outputs = pipeline(text) for token in outputs: diff --git a/pythainlp/tag/named_entity.py b/pythainlp/tag/named_entity.py index 9b18ff5d0..88b290f29 100644 --- a/pythainlp/tag/named_entity.py +++ b/pythainlp/tag/named_entity.py @@ -58,8 +58,9 @@ def load_engine(self, engine: str, corpus: str) -> None: ) def tag(self, - text, pos = False, - tag = False + text, + pos=False, + tag=False ) -> Union[List[Tuple[str, str]], List[Tuple[str, str, str]], str]: """ This function tags named entities in text in IOB format. diff --git a/pythainlp/tag/pos_tag.py b/pythainlp/tag/pos_tag.py index ea45297d7..a195f8b69 100644 --- a/pythainlp/tag/pos_tag.py +++ b/pythainlp/tag/pos_tag.py @@ -241,9 +241,9 @@ def pos_tag_transformers( ) ) - pipeline = TokenClassificationPipeline(model = model, - tokenizer = tokenizer, - aggregation_strategy = "simple", + pipeline = TokenClassificationPipeline(model=model, + tokenizer=tokenizer, + aggregation_strategy="simple", ) outputs = pipeline(sentence) From 84de5c442665ff641ae10a5ee5b6df38b0ff8d52 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Sun, 10 Dec 2023 17:24:23 +0000 Subject: [PATCH 12/18] Update core.py: sort imports, remove duplicated lines --- pythainlp/phayathaibert/core.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pythainlp/phayathaibert/core.py b/pythainlp/phayathaibert/core.py index 3569a40cf..d1977ee4a 100644 --- a/pythainlp/phayathaibert/core.py +++ b/pythainlp/phayathaibert/core.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- # SPDX-FileCopyrightText: Copyright 2016-2023 PyThaiNLP Project # SPDX-License-Identifier: Apache-2.0 -from typing import List, Union, Tuple, Collection, Callable +from typing import Callable, Collection, List, Tuple, Union import re import random import warnings @@ -238,7 +238,6 @@ def augment(self, from pythainlp.augment.lm import ThaiTextAugmenter - aug=ThaiTextAugmenter() aug = ThaiTextAugmenter() aug.augment("ช้างมีทั้งหมด 50 ตัว บน", num_args=5) From a2fd4d3732f96ff7d1b78d829b775ded92c8d703 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Sun, 10 Dec 2023 17:26:07 +0000 Subject: [PATCH 13/18] Update phayathaibert.py: sort imports, remove duplicated lines --- pythainlp/augment/lm/phayathaibert.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/pythainlp/augment/lm/phayathaibert.py b/pythainlp/augment/lm/phayathaibert.py index 439b9ca98..370c9dac1 100644 --- a/pythainlp/augment/lm/phayathaibert.py +++ b/pythainlp/augment/lm/phayathaibert.py @@ -2,8 +2,8 @@ # SPDX-FileCopyrightText: Copyright 2016-2023 PyThaiNLP Project # SPDX-License-Identifier: Apache-2.0 from typing import List -import re import random +import re from pythainlp.phayathaibert.core import ThaiTextProcessor @@ -32,9 +32,9 @@ def generate(self, input = self.processor.preprocess(sample_txt) if sample: random_word_idx = random.randint(0, 4) - output = self.model(input)[random_word_idx]['sequence'] + output = self.model(input)[random_word_idx]["sequence"] else: - output = self.model(input)[word_rank]['sequence'] + output = self.model(input)[word_rank]["sequence"] sample_txt = output + "" final_text = sample_txt gen_txt = re.sub("", "", final_text) @@ -48,7 +48,7 @@ def augment(self, """ Text Augment from phayathaibert - :param str text: thai text + :param str text: Thai text :param int num_augs: an amount of augmentation text needed as an output :param bool sample: whether to sample the text as an output or not, \ true if more word diversity is needed @@ -61,7 +61,6 @@ def augment(self, from pythainlp.augment.lm import ThaiTextAugmenter - aug=ThaiTextAugmenter() aug = ThaiTextAugmenter() aug.augment("ช้างมีทั้งหมด 50 ตัว บน", num_args=5) From 7e24d3fd278001de69665a9a774c2399a993331c Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Sun, 10 Dec 2023 17:28:28 +0000 Subject: [PATCH 14/18] Reexport NamedEntityTagger --- pythainlp/phayathaibert/__init__.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/pythainlp/phayathaibert/__init__.py b/pythainlp/phayathaibert/__init__.py index 405ce1843..bf0f847bf 100644 --- a/pythainlp/phayathaibert/__init__.py +++ b/pythainlp/phayathaibert/__init__.py @@ -2,15 +2,17 @@ # SPDX-FileCopyrightText: Copyright 2016-2023 PyThaiNLP Project # SPDX-License-Identifier: Apache-2.0 __all__ = [ - "ThaiTextProcessor", - "ThaiTextAugmenter", + "NamedEntityTagger", "PartOfSpeechTagger", + "ThaiTextAugmenter", + "ThaiTextProcessor", "segment", ] -from pythainlp.phayathaibert.core import (ThaiTextProcessor, - ThaiTextAugmenter, - NamedEntityTagger, - PartOfSpeechTagger, - segment, - ) +from pythainlp.phayathaibert.core import ( + NamedEntityTagger, + PartOfSpeechTagger, + ThaiTextAugmenter, + ThaiTextProcessor, + segment, +) From 826cfed0898838ca61a7c61d6665e68ce8787c43 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Sun, 10 Dec 2023 17:33:22 +0000 Subject: [PATCH 15/18] Fix minor types --- pythainlp/phayathaibert/core.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pythainlp/phayathaibert/core.py b/pythainlp/phayathaibert/core.py index d1977ee4a..579d24ddc 100644 --- a/pythainlp/phayathaibert/core.py +++ b/pythainlp/phayathaibert/core.py @@ -2,9 +2,10 @@ # SPDX-FileCopyrightText: Copyright 2016-2023 PyThaiNLP Project # SPDX-License-Identifier: Apache-2.0 from typing import Callable, Collection, List, Tuple, Union -import re import random +import re import warnings + from pythainlp.tokenize import word_tokenize from transformers import ( CamembertTokenizer, @@ -49,7 +50,7 @@ def rm_brackets(text: str) -> str: new_line = re.sub(r"\(\)", "", text) new_line = re.sub(r"\{\}", "", new_line) new_line = re.sub(r"\[\]", "", new_line) - # brakets with only punctuations + # brackets with only punctuations new_line = re.sub(r"\([^a-zA-Z0-9ก-๙]+\)", "", new_line) new_line = re.sub(r"\{[^a-zA-Z0-9ก-๙]+\}", "", new_line) new_line = re.sub(r"\[[^a-zA-Z0-9ก-๙]+\]", "", new_line) @@ -225,7 +226,7 @@ def augment(self, """ Text Augment from phayathaibert - :param str text: thai text + :param str text: Thai text :param int num_augs: an amount of augmentation text needed as an output :param bool sample: whether to sample the text as an output or not,\ true if more word diversity is needed From 72e2bd557c2b43557143dc9734356644070023e8 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Sun, 10 Dec 2023 17:34:35 +0000 Subject: [PATCH 16/18] Update __init__.py --- pythainlp/augment/lm/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pythainlp/augment/lm/__init__.py b/pythainlp/augment/lm/__init__.py index febe833f5..f0265b136 100644 --- a/pythainlp/augment/lm/__init__.py +++ b/pythainlp/augment/lm/__init__.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright 2016-2023 PyThaiNLP Project # SPDX-License-Identifier: Apache-2.0 """ -LM +Language Models """ __all__ = [ From dec62c1cf0d2dd4f129a5850116edc3feb9c3989 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Mon, 11 Dec 2023 03:30:19 +0000 Subject: [PATCH 17/18] Use MAX_NUM_AUGS constant for max num_augs limit --- pythainlp/augment/lm/phayathaibert.py | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/pythainlp/augment/lm/phayathaibert.py b/pythainlp/augment/lm/phayathaibert.py index 370c9dac1..90e067523 100644 --- a/pythainlp/augment/lm/phayathaibert.py +++ b/pythainlp/augment/lm/phayathaibert.py @@ -1,9 +1,11 @@ # -*- coding: utf-8 -*- # SPDX-FileCopyrightText: Copyright 2016-2023 PyThaiNLP Project # SPDX-License-Identifier: Apache-2.0 + from typing import List import random import re + from pythainlp.phayathaibert.core import ThaiTextProcessor @@ -28,6 +30,7 @@ def generate(self, ) -> str: sample_txt = sample_text final_text = "" + for j in range(max_length): input = self.processor.preprocess(sample_txt) if sample: @@ -37,7 +40,9 @@ def generate(self, output = self.model(input)[word_rank]["sequence"] sample_txt = output + "" final_text = sample_txt + gen_txt = re.sub("", "", final_text) + return gen_txt def augment(self, @@ -46,7 +51,7 @@ def augment(self, sample: bool = False ) -> List[str]: """ - Text Augment from phayathaibert + Text augmentation from PhayaThaiBERT :param str text: Thai text :param int num_augs: an amount of augmentation text needed as an output @@ -70,17 +75,20 @@ def augment(self, 'ช้างมีทั้งหมด 50 ตัว บนดวงจันทร์.‼', 'ช้างมีทั้งหมด 50 ตัว บนเขาค่ะ😁'] """ + MAX_NUM_AUGS = 5 augment_list = [] + if "" not in text: text = text + "" - if num_augs <= 5: + + if num_augs <= MAX_NUM_AUGS: for rank in range(num_augs): gen_text = self.generate(text, rank, sample=sample) processed_text = re.sub("<_>", " ", self.processor.preprocess(gen_text)) augment_list.append(processed_text) return augment_list - else: - raise ValueError( - f"augmentation of more than {num_augs} is exceeded the default limit" - ) + + raise ValueError( + f"augmentation of more than {num_augs} is exceeded the default limit: {MAX_NUM_AUGS}" + ) From e7ef6ceb0c6906f6ed09679c719f73effdf158ef Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Mon, 11 Dec 2023 14:00:07 +0000 Subject: [PATCH 18/18] Update phayathaibert.py Use UPPERCASE for constant --- pythainlp/augment/lm/phayathaibert.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pythainlp/augment/lm/phayathaibert.py b/pythainlp/augment/lm/phayathaibert.py index 90e067523..47b43c219 100644 --- a/pythainlp/augment/lm/phayathaibert.py +++ b/pythainlp/augment/lm/phayathaibert.py @@ -9,7 +9,7 @@ from pythainlp.phayathaibert.core import ThaiTextProcessor -_model_name = "clicknext/phayathaibert" +_MODEL_NAME = "clicknext/phayathaibert" class ThaiTextAugmenter: @@ -17,8 +17,8 @@ def __init__(self,) -> None: from transformers import (AutoTokenizer, AutoModelForMaskedLM, pipeline,) - self.tokenizer = AutoTokenizer.from_pretrained(_model_name) - self.model_for_masked_lm = AutoModelForMaskedLM.from_pretrained(_model_name) + self.tokenizer = AutoTokenizer.from_pretrained(_MODEL_NAME) + self.model_for_masked_lm = AutoModelForMaskedLM.from_pretrained(_MODEL_NAME) self.model = pipeline("fill-mask", tokenizer=self.tokenizer, model=self.model_for_masked_lm) self.processor = ThaiTextProcessor() @@ -88,7 +88,7 @@ def augment(self, augment_list.append(processed_text) return augment_list - + raise ValueError( f"augmentation of more than {num_augs} is exceeded the default limit: {MAX_NUM_AUGS}" )