Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
1341a31
Merge pull request #2 from pavaris-pm/improve-pos-tag-transformers
pavaris-pm Nov 14, 2023
38f71b5
Merge branch 'PyThaiNLP:dev' into dev
pavaris-pm Dec 1, 2023
41d79c2
add phayathaibert core engine
pavaris-pm Dec 1, 2023
cb9e27a
add data augmentation engine
pavaris-pm Dec 4, 2023
473af52
update engine properties
pavaris-pm Dec 4, 2023
0c3efd0
update augmentation properties
pavaris-pm Dec 4, 2023
245e99e
Merge branch 'PyThaiNLP:dev' into dev
pavaris-pm Dec 4, 2023
dd2b834
change license
pavaris-pm Dec 4, 2023
d1b9c99
add er engine
pavaris-pm Dec 4, 2023
cbb7c8e
Update __init__.py
bact Dec 4, 2023
b71ebda
Merge branch 'PyThaiNLP:dev' into dev
pavaris-pm Dec 10, 2023
348dc1f
add documentation and credit model builder
pavaris-pm Dec 10, 2023
c7b6900
Merge branch 'dev' into dev
pavaris-pm Dec 10, 2023
a55168a
update pep8
pavaris-pm Dec 10, 2023
536f493
resolve conflict
pavaris-pm Dec 10, 2023
76b49c3
update pep8
pavaris-pm Dec 10, 2023
22daf2d
update pep8
pavaris-pm Dec 10, 2023
84de5c4
Update core.py: sort imports, remove duplicated lines
bact Dec 10, 2023
a2fd4d3
Update phayathaibert.py: sort imports, remove duplicated lines
bact Dec 10, 2023
7e24d3f
Reexport NamedEntityTagger
bact Dec 10, 2023
826cfed
Fix minor types
bact Dec 10, 2023
72e2bd5
Update __init__.py
bact Dec 10, 2023
dec62c1
Use MAX_NUM_AUGS constant for max num_augs limit
bact Dec 11, 2023
9999f90
Merge branch 'PyThaiNLP:dev' into dev
pavaris-pm Dec 11, 2023
e7ef6ce
Update phayathaibert.py
bact Dec 11, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions pythainlp/phayathaibert/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# -*- coding: utf-8 -*-
# Copyright (C) 2016-2023 PyThaiNLP Project
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Public API of the pythainlp.phayathaibert package:
# re-export the PhayaThaiBERT-based part-of-speech tagger and the
# subword segmenter defined in pythainlp.phayathaibert.core.
__all__ = [
    "PartOfSpeechTagger",
    "segment",
]

from pythainlp.phayathaibert.core import PartOfSpeechTagger, segment
52 changes: 52 additions & 0 deletions pythainlp/phayathaibert/core.py
Comment thread
bact marked this conversation as resolved.
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# -*- coding: utf-8 -*-
# Copyright (C) 2016-2023 PyThaiNLP Project
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List, Tuple, Union
import re
import warnings
from transformers import (
CamembertTokenizer,
)


# HuggingFace Hub model id for the base PhayaThaiBERT model.
_model_name = "clicknext/phayathaibert"
# NOTE(review): created at import time — importing this module triggers a
# tokenizer download/load from the Hub on first use; consider lazy
# initialization inside segment() to avoid the import-time side effect.
_tokenizer = CamembertTokenizer.from_pretrained(_model_name)


class PartOfSpeechTagger:
    """Part-of-speech tagger backed by a PhayaThaiBERT
    token-classification model from the HuggingFace Hub.
    """

    def __init__(self, model: str = "lunarlist/pos_thai_phayathai") -> None:
        """Load the tokenizer and model weights.

        :param str model: HuggingFace Hub id of a token-classification
            model fine-tuned for Thai POS tagging
        """
        # Imported locally so transformers is only required
        # when this tagger is actually instantiated.
        from transformers import (
            AutoModelForTokenClassification,
            AutoTokenizer,
        )

        self.tokenizer = AutoTokenizer.from_pretrained(model)
        self.model = AutoModelForTokenClassification.from_pretrained(model)

    def get_tag(
        self, sentence: str, strategy: str = "simple"
    ) -> List[List[Tuple[str, str]]]:
        """Tag each word of *sentence* with its part of speech.

        :param str sentence: input text to tag
        :param str strategy: aggregation strategy forwarded to the
            transformers pipeline (e.g. "simple", "first", "max")
        :return: a single-element list containing the list of
            (word, POS tag) pairs for the sentence
        """
        from transformers import TokenClassificationPipeline

        pipeline = TokenClassificationPipeline(
            model=self.model,
            tokenizer=self.tokenizer,
            aggregation_strategy=strategy,
        )
        outputs = pipeline(sentence)
        # Outer list kept to preserve the historical return shape.
        return [[(item["word"], item["entity_group"]) for item in outputs]]

def segment(sentence: str) -> List[str]:
    """Split *sentence* into subword tokens with the PhayaThaiBERT tokenizer.

    :param str sentence: text to tokenize
    :return: list of subword tokens; empty list for empty or non-string input
    """
    # Guard clause: anything falsy or not a string yields no tokens.
    if not isinstance(sentence, str) or not sentence:
        return []

    return _tokenizer.tokenize(sentence)
1 change: 1 addition & 0 deletions pythainlp/tag/pos_tag.py
Original file line number Diff line number Diff line change
Expand Up @@ -224,6 +224,7 @@ def pos_tag_transformers(

_blackboard_support_engine = {
"bert" : "lunarlist/pos_thai",
"phayathai" : "lunarlist/pos_thai_phayathai",
}

_pud_support_engine = {
Expand Down
2 changes: 2 additions & 0 deletions pythainlp/tokenize/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -616,6 +616,8 @@ def subword_tokenize(
from pythainlp.tokenize.tltk import syllable_tokenize as segment
elif engine == "han_solo":
from pythainlp.tokenize.han_solo import segment
elif engine == "phayathai":
from pythainlp.phayathaibert import segment
else:
raise ValueError(
f"""Tokenizer \"{engine}\" not found.
Expand Down
12 changes: 12 additions & 0 deletions tests/test_tokenize.py
Original file line number Diff line number Diff line change
Expand Up @@ -431,6 +431,18 @@ def test_subword_tokenize(self):
"า" in subword_tokenize("สวัสดีดาวอังคาร", engine="tltk")
)
self.assertIsInstance(subword_tokenize("โควิด19", engine="tltk"), list)

self.assertEqual(subword_tokenize(None, engine="phayathai"), [])
self.assertEqual(subword_tokenize("", engine="phayathai"), [])
self.assertIsInstance(
subword_tokenize("สวัสดิีดาวอังคาร", engine="phayathai"), list
)
self.assertFalse(
"า" in subword_tokenize("สวัสดีดาวอังคาร", engine="phayathai")
)
self.assertIsInstance(
subword_tokenize("โควิด19", engine="phayathai"), list
)
with self.assertRaises(ValueError):
subword_tokenize("นกแก้ว", engine="XX") # engine does not exist

Expand Down