Skip to content

Commit 3b494c5

Browse files
committed
6.0.0
1 parent ee5392e commit 3b494c5

36 files changed

+1721
-114
lines changed

Diff for: README.md

+1,385-1
Large diffs are not rendered by default.

Diff for: bench/preprocessing/test_preprocess.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
normalization_type="NFC",
1414
allow_doubled_spaces=False,
1515
allow_html_tags=False,
16-
allow_html_unescape=False,
16+
allow_html_escape=False,
1717
allow_halfwidth_hangul=False,
1818
reduce_char_repeats_over=5,
1919
reduce_emoticon_repeats_over=2,
@@ -47,7 +47,7 @@
4747
max_ellipsis_ratio=0.5,
4848
min_hangul_ratio=0.4,
4949
max_hangul_ratio=1.0,
50-
max_hangul_non_completed_form_ratio=5,
50+
max_hangul_incompleted_form_ratio=5,
5151
max_words_length=1000,
5252
max_line_repeats=4,
5353
max_line_by_char_repeats=10,

Diff for: kss/__init__.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -133,8 +133,8 @@ def __init__(self, module: str):
133133
def __call__(self, *args, **kwargs):
134134
return self.module(*args, **kwargs)
135135

136-
def usage(self):
137-
return self.module.__doc__
136+
def help(self):
137+
print(self.module.__doc__.strip())
138138

139139
@staticmethod
140140
def available():

Diff for: kss/_modules/augmentation/augment.py

+15-11
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from functools import partial
1+
from functools import partial, lru_cache
22
from typing import Union, List, Tuple
33

44
from kss._modules.augmentation.replacement import SynonymReplacement
@@ -8,23 +8,24 @@
88
from kss._utils.sanity_checks import _check_text, _check_type, _check_num_workers, _check_backend_mecab_pecab_only
99

1010

11+
@lru_cache(maxsize=500)
1112
def augment(
1213
text: Union[str, List[str], Tuple[str]],
1314
replacement_ratio: float = 0.3,
14-
josa_normalization: bool = True,
15+
josa_correction: bool = True,
1516
num_workers: Union[int, str] = "auto",
1617
backend: str = "auto",
1718
verbose: bool = False,
1819
) -> Union[str, List[str]]:
1920
"""
20-
Augment text with synonym replacement method and,
21-
postprocess with space normalization and josa normalization.
21+
Augments text with synonym replacement method and,
22+
optionally postprocesses the text by correcting josa.
23+
For this, Kss uses the Korean wordnet from KAIST.
2224
2325
Args:
2426
text (Union[str, List[str], Tuple[str]]): single text or list of texts
2527
replacement_ratio (float): ratio of words to be replaced
26-
space_normalization (bool): whether to normalize spaces or not
27-
josa_normalization (bool): whether to normalize josa or not
28+
josa_correction (bool): whether to correct josa or not
2829
num_workers (Union[int, str]): the number of multiprocessing workers
2930
backend (str): morpheme analyzer backend. 'mecab', 'pecab' are supported
3031
verbose (bool): whether to print verbose outputs or not
@@ -38,15 +39,18 @@ def augment(
3839
>>> text = "앞서 지난해 11월, 보이저 1호는 명령을 수신하고 수행하는 데엔 문제가 없었지만 통신 장치에 문제가 생겨 과학·엔지니어링 데이터가 지구로 전송되지 않았던 바 있다. 당시 그들은 컴퓨터 시스템을 재시작하고 문제의 근본적인 원인을 파악하기 위해 명령을 보내려고 시도했고, 이달 1일 '포크'라는 명령을 보냈다."
3940
>>> output = augment(text)
4041
>>> print(output)
41-
"앞서 지난해 11월, 보이저 1호는 명령을 수신하고 시행하는 데엔 문제가 없었지만 송신 장비에 문제가 생겨 과학·엔지니어링 데이터가 지구로 전송되지 않았던 바 있다. 당시 그들은 컴퓨터 시스템을 재시작하고 문제의 근본적인 원인을 파악하기 위해 명령을 보내려고 시도했고, 이달 1일'포크'라는 명령을 보냈다."
42+
"앞서 지난해 11월, 보이저 1호는 명령을 수신하고 시행하는 데엔 문제가 없었지만 송신 장비에 문제가 생겨 과학·엔지니어링 데이터가 지구로 전송되지 않았던 바 있다. 당시 그들은 컴퓨터 시스템을 재시작하고 문제의 근본적인 원인을 파악하기 위해 명령을 보내려고 시도했고, 이달 1일 '포크'라는 명령을 보냈다."
43+
44+
References:
45+
This was copied from [KoEDA](https://github.com/toriving/KoEDA) and modified by Kss
4246
"""
4347
text, finish = _check_text(text)
4448

4549
if finish:
4650
return text
4751

4852
replacement_ratio = _check_type(replacement_ratio, "replacement_ratio", float)
49-
josa_normalization = _check_type(josa_normalization, "josa_normalization", bool)
53+
josa_correction = _check_type(josa_correction, "josa_correction", bool)
5054
verbose = _check_type(verbose, "verbose", bool)
5155
num_workers = _check_num_workers(text, num_workers)
5256
_check_backend_mecab_pecab_only(backend)
@@ -62,7 +66,7 @@ def augment(
6266
func=partial(
6367
_augment,
6468
replacement_ratio=replacement_ratio,
65-
josa_normalization=josa_normalization,
69+
josa_correction=josa_correction,
6670
backend=backend,
6771
verbose=verbose,
6872
),
@@ -74,15 +78,15 @@ def augment(
7478
def _augment(
7579
text: str,
7680
replacement_ratio: float = 0.3,
77-
josa_normalization: bool = True,
81+
josa_correction: bool = True,
7882
backend: str = "auto",
7983
verbose: bool = False,
8084
):
8185
orig_text = text
8286
replacement = SynonymReplacement(backend=backend) # I want WSD...
8387
text = replacement(text, p=replacement_ratio, verbose=verbose)
8488

85-
if josa_normalization:
89+
if josa_correction:
8690
text = correct_josa(text)
8791

8892
if verbose:

Diff for: kss/_modules/augmentation/replacement.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# This is copied from KoEDA [https://github.com/toriving/KoEDA]
1+
# This was copied from KoEDA [https://github.com/toriving/KoEDA]
22
# And modified by Hyunwoong Ko [https://github.com/hyunwoongko]
33

44
import random

Diff for: kss/_modules/augmentation/utils.py

+3
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,6 @@
1+
# This was copied from KoEDA [https://github.com/toriving/KoEDA]
2+
# And modified by Hyunwoong Ko [https://github.com/hyunwoongko]
3+
14
import json
25
import os
36

Diff for: kss/_modules/collocation/collocate.py

+8-4
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from functools import partial
1+
from functools import partial, lru_cache
22
from typing import Union, List, Tuple
33

44
from kollocate import Kollocate
@@ -10,22 +10,23 @@
1010
kollocate_obj = Kollocate()
1111

1212

13+
@lru_cache(maxsize=500)
1314
def collocate(
1415
text: Union[str, List[str], Tuple[str]],
1516
num_workers: Union[int, str] = "auto",
1617
verbose: bool = False,
1718
) -> Union[dict, List[dict]]:
1819
"""
19-
Get collocation (연어) of words in text.
20+
This returns collocation (연어) of given words.
2021
The collocation is a set of words that frequently appear together.
2122
2223
Args:
23-
text (Union[str, List[str], Tuple[str]]): single text or list of texts
24+
text (Union[str, List[str], Tuple[str]]): single word or list of words
2425
num_workers (Union[int, str]): the number of multiprocessing workers
2526
verbose (bool): whether to print verbose outputs or not
2627
2728
Returns:
28-
Union[dict, List[dict]]: collocation of words in text or list of collocations
29+
Union[dict, List[dict]]: collocations and frequencies of words in text or list of collocations and frequencies
2930
3031
Examples:
3132
>>> from kss import Kss
@@ -34,6 +35,9 @@ def collocate(
3435
>>> output = collocate(text)
3536
>>> print(output)
3637
{'verb': {'noun': [('것', 39), ('수', 29), ('음식', 23), ('등', 16), ('고기', 14), ('먹이', 14), ('때', 12), ('식물', 12), ('개고기', 9), ('젖', 9), ('겁', 7), ('시작', 7), ('후', 7), ('밥', 7), ('요리', 7), ('경우', 6), ('풀', 6), ('사람', 5), ('자살', 5), ('과일', 4), ('늑대', 4), ('마음', 4), ('나이', 4), ('애', 4), ('생선', 3), ('개', 3), ('죽', 3), ('양', 3), ('나무', 3), ('만큼', 3), ('물', 3), ('방법', 3), ('알', 3), ('떡볶이', 3), ('식사', 3), ('아침', 3), ('사과', 3), ('라면', 3), ('자기', 3), ('약', 3), ('점심', 3), ('때문', 3), ('조리', 3), ('떡', 2), ('접시', 2), ('국수', 2), ('일반적', 2), ('무엇', 2), ('파이', 2), ('만', 2), ('다음', 2), ('이후', 2), ('무리', 2), ('기록', 2), ('풍습', 2), ('동물', 2), ('식물질', 2), ('곤충', 2), ('이', 2), ('유제류', 2), ('새끼', 2), ('불고기', 2), ('한국', 2), ('한국식', 2), ('동안', 2), ('몸', 2), ('돼지고기', 2), ('잡식성', 2), ('기관', 2), ('제사장', 2), ('끼', 2), ('운동', 2), ('곡식', 2), ('궁중', 2), ('젖소', 2), ('우유', 2), ('고', 2), ('이야기', 2), ('정도', 2), ('일', 2), ('자리', 2), ('지역', 2), ('소화', 2), ('도중', 2), ('쓰레기', 2), ('저녁', 2), ('그', 2), ('뒤', 2), ('조', 2), ('고구마', 1), ('가지', 1), ('가시', 1), ('가지도', 1), ('아이', 1), ('쌈', 1), ('노출', 1), ('그다음', 1), ('근육', 1), ('아침상', 1), ('대로', 1), ('솔잎', 1), ('생', 1), ('중세', 1), ('어른', 1), ('성소', 1), ('집중적', 1), ('한번', 1), ('멜론', 1), ('입', 1), ('나뭇가지', 1), ('혀', 1), ('무시', 1), ('발견', 1), ('전후', 1), ('운반', 1), ('찬밥', 1), ('벌레', 1), ('남편', 1), ('하', 1), ('부인', 1), ('플랑크톤', 1), ('4', 1), ('미역국', 1), ('벨', 1), ('잔가지', 1), ('그중', 1), ('말', 1), ('소시지', 1), ('사냥', 1), ('카스', 1), ('따위', 1), ('레', 1), ('새', 1), ('내서', 1), ('인류', 1), ('소문', 1), ('수라', 1), ('곳', 1), ('항원', 1), ('구지가', 1), ('되새김', 1), ('사료', 1), ('하루', 1), ('문제', 1), ('큰곰', 1), ('비율', 1), ('6', 1), ('보리', 1), ('메조', 1), ('광', 1), ('물냉면', 1), ('살', 1), ('여공', 1), ('수영', 1), ('주변', 1), ('파리', 1), ('체액', 1), ('가운데', 1), ('시체', 1), ('구기', 1), ('해초', 1), ('호두', 1), ('엽전', 1), ('고추장', 1), ('300', 1), ('배설', 1), ('곰국', 1), ('노루', 1), ('대나무', 1), ('초식성', 1), ('부육', 1), ('종류', 1), ('남', 1), ('포장마차', 1), ('어묵', 1), ('특효', 1), ('약물', 1), ('노래', 1), ('어미', 1), ('맥', 1), 
('털가죽', 1), ('뇌', 1), ('돌', 1), ('방식', 1), ('비빔밥', 1), ('진화', 1), ('콩', 1), ('탈', 1), ('신의주', 1), ('뜰', 1), ('하느님', 1), ('공화제', 1), ('물질', 1), ('초식', 1), ('간', 1), ('흑', 1), ('식품', 1), ('덮밥', 1), ('탐사선', 1), ('해산물', 1), ('내부', 1), ('3', 1), ('공격', 1), ('정', 1), ('대부분', 1), ('데', 1), ('연습', 1), ('오이', 1), ('성장', 1), ('볶음', 1), ('설', 1), ('스탈린', 1), ('형식', 1), ('골목', 1), ('나뭇잎', 1), ('주간', 1), ('잎', 1), ('된장찌개', 1), ('자매', 1), ('시간당', 1), ('말복', 1), ('구어', 1), ('말고기', 1), ('당나귀', 1), ('모습', 1), ('박테리아', 1), ('영양분', 1), ('부분', 1), ('가축', 1), ('그것', 1), ('가공품', 1), ('구이', 1), ('목', 1), ('기름', 1), ('휴식', 1), ('소스', 1), ('나무껍질', 1), ('쇠고기', 1), ('고착', 1), ('밝기', 1), ('짜장면', 1), ('날', 1), ('수액', 1), ('배달', 1), ('장면', 1), ('단어', 1), ('떡국', 1), ('찜', 1), ('문화권', 1), ('소형', 1), ('이전', 1), ('로', 1), ('노년', 1), ('난자', 1), ('돼지', 1), ('채집', 1), ('나라', 1), ('전', 1), ('육식성', 1), ('격려', 1), ('반복', 1), ('유협', 1), ('디저트', 1), ('분위기', 1), ('일과', 1), ('쌀', 1), ('고관', 1), ('밑반찬', 1), ('떼', 1), ('개월', 1), ('다양', 1), ('본격적', 1), ('생각', 1), ('수면제', 1), ('학교', 1), ('칭', 1), ('고깃국', 1), ('충격', 1), ('복사본', 1), ('미', 1), ('감로', 1), ('비', 1), ('독약', 1), ('우두', 1), ('도', 1), ('농작물', 1), ('자장면', 1), ('인터넷', 1), ('술', 1), ('기회', 1), ('풀뿌리', 1), ('게걸', 1), ('외상술', 1), ('약밥', 1), ('풍속', 1), ('한방', 1), ('치질', 1), ('나', 1), ('허기', 1), ('어린이', 1), ('나물밥', 1), ('사약', 1), ('가죽', 1), ('서부', 1), ('명', 1), ('이승만', 1), ('가래떡', 1), ('절대', 1), ('숲', 1), ('깨', 1), ('봄맞이', 1), ('곡', 1), ('계속', 1), ('자연', 1), ('목축', 1), ('시간', 1), ('서로', 1), ('얼', 1), ('가능성', 1), ('빵', 1), ('뿌리', 1), ('독립', 1), ('혼자', 1), ('잠', 1), ('덴푸라', 1), ('둥', 1), ('성', 1), ('가열', 1), ('전자', 1), ('창신', 1), ('탈진', 1), ('맘', 1), ('자신', 1)], 'verb': [('하', 33), ('않', 21), ('살', 17), ('즐기', 11), ('굽', 9), ('곁들이', 7), ('썰', 6), ('찍', 5), ('섞', 5), ('있', 5), ('치우', 4), ('잡', 4), ('나누', 4), ('비비', 4), ('되', 3), ('싸', 3), ('어', 3), ('익히', 3), ('버리', 3), ('만들', 3), ('집', 3), ('보', 3), ('죽', 3), ('들', 2), ('튀기', 2), ('삭히', 2), ('마시', 2), ('말', 2), 
('달이', 2), ('끓이', 2), ('뜯', 2), ('사', 2), ('자라', 2), ('뿌리', 2), ('넣', 2), ('시키', 2), ('위하', 2), ('찌', 1), ('씹', 1), ('뒤집', 1), ('바꾸', 1), ('부르', 1), ('다니', 1), ('담구', 1), ('도망치', 1), ('재우', 1), ('파', 1), ('오', 1), ('덜', 1), ('발르', 1), ('늘', 1), ('자르', 1), ('얹', 1), ('묻히', 1), ('볶', 1), ('담그', 1), ('태어나', 1), ('죽이', 1), ('줍', 1), ('갉', 1), ('생겨나', 1), ('찾', 1), ('아', 1), ('부숴뜨리', 1), ('짜', 1), ('부리', 1), ('훔치', 1), ('쓰', 1), ('나르', 1), ('베', 1), ('대', 1), ('둘러앉', 1), ('걸르', 1), ('맞', 1), ('넘', 1), ('부', 1), ('두들기', 1), ('팔', 1), ('따', 1)], 'adverb': [('많이', 10), ('주로', 7), ('다', 5), ('같이', 4), ('잘', 4), ('함께', 3), ('못', 3), ('가장', 3), ('우연히', 1), ('자주', 1), ('하지만', 1), ('그냥', 1), ('씩', 1), ('또한', 1), ('너무', 1), ('채', 1), ('내내', 1), ('어찌', 1), ('적당히', 1), ('대체로', 1), ('가끔', 1), ('특히', 1), ('흥청망청', 1), ('적이', 1), ('흔히', 1), ('상관없이', 1), ('또', 1), ('통째로', 1), ('날로', 1), ('빨리', 1), ('때로', 1), ('전혀', 1), ('배불리', 1), ('겨우', 1), ('안', 1), ('간단히', 1), ('달리', 1)], 'determiner': [('다른', 5), ('그', 2), ('여러', 1), ('세', 1), ('몇몇', 1), ('새', 1)], 'adjective': [('싶', 5), ('어리', 1), ('편하', 1), ('작', 1), ('좋', 1), ('손쉽', 1), ('못하', 1)]}, 'noun': {'noun': [('붓', 3), ('종이', 2), ('묘선', 1), ('청자', 1), ('은장도', 1), ('제조', 1), ('벼루', 1), ('농담', 1), ('윤', 1)], 'verb': [('의하', 1), ('그리', 1), ('찍', 1), ('차', 1), ('늘어놓', 1)], 'adverb': [('하지만', 1)]}}
38+
39+
References:
40+
This was copied from [Kollocate](https://github.com/Kyubyong/kollocate) and modified by Kss
3741
"""
3842
text, finish = _check_text(text)
3943

Diff for: kss/_modules/g2p/english.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# This code is copied from g2pk [https://github.com/kyubyong/g2pK]
1+
# This code was copied from g2pk [https://github.com/kyubyong/g2pK]
22
# And modified by Hyunwoong Ko [https://github.com/hyunwoongko]
33

44

Diff for: kss/_modules/g2p/g2p.py

+7-3
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
1-
# This code is copied from g2pk [https://github.com/kyubyong/g2pK]
1+
# This code was copied from g2pk [https://github.com/kyubyong/g2pK]
22
# And modified by Hyunwoong Ko [https://github.com/hyunwoongko]
33

44
import re
5-
from functools import partial
5+
from functools import partial, lru_cache
66
from typing import Union, List, Tuple
77

88
from kss._modules.g2p.english import convert_eng
@@ -35,6 +35,7 @@
3535
from kss._utils.sanity_checks import _check_text, _check_num_workers, _check_type, _check_backend_mecab_pecab_only
3636

3737

38+
@lru_cache(maxsize=500)
3839
def g2p(
3940
text: Union[str, List[str], Tuple[str]],
4041
descriptive: bool = False,
@@ -62,7 +63,7 @@ def g2p(
6263
verbose (bool): whether to print verbose outputs or not
6364
6465
Returns:
65-
Union[str, List[str]]: phoneme string or list of phoneme strings from the given text (graphemes)
66+
Union[str, List[str]]: phoneme string or list of phoneme strings from the given text
6667
6768
Examples:
6869
>>> from kss import Kss
@@ -71,6 +72,9 @@ def g2p(
7172
>>> output = g2p(text)
7273
>>> print(output)
7374
"어제는 말간는데 오느른 흐리다."
75+
76+
References:
77+
This was copied from [g2pk](https://github.com/Kyubyong/g2pk) and modified by Kss
7478
"""
7579
text, finish = _check_text(text)
7680

Diff for: kss/_modules/g2p/numerals.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# This code is copied from g2pk [https://github.com/kyubyong/g2pK]
1+
# This code was copied from g2pk [https://github.com/kyubyong/g2pK]
22
# And modified by Hyunwoong Ko [https://github.com/hyunwoongko]
33

44
import re

Diff for: kss/_modules/g2p/regular.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# This code is copied from g2pk [https://github.com/kyubyong/g2pK]
1+
# This code was copied from g2pk [https://github.com/kyubyong/g2pK]
22
# And modified by Hyunwoong Ko [https://github.com/hyunwoongko]
33

44
from kss._modules.g2p.utils import rule_id2text, gloss

Diff for: kss/_modules/g2p/special.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# This code is copied from g2pk [https://github.com/kyubyong/g2pK]
1+
# This code was copied from g2pk [https://github.com/kyubyong/g2pK]
22
# And modified by Hyunwoong Ko [https://github.com/hyunwoongko]
33

44
import re

Diff for: kss/_modules/g2p/utils.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# This code is copied from g2pk [https://github.com/kyubyong/g2pK]
1+
# This code was copied from g2pk [https://github.com/kyubyong/g2pK]
22
# And modified by Hyunwoong Ko [https://github.com/hyunwoongko]
33

44
import os

Diff for: kss/_modules/hangulization/hangulization.py

+6-2
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from functools import partial
1+
from functools import partial, lru_cache
22
from typing import Union, List, Tuple
33

44
import distance
@@ -48,13 +48,14 @@
4848
}
4949

5050

51+
@lru_cache(maxsize=500)
5152
def hangulize(
5253
text: Union[str, List[str], Tuple[str]],
5354
lang: str,
5455
num_workers: Union[int, str] = "auto",
5556
) -> Union[str, List[str]]:
5657
"""
57-
Convert the given text to Hangul pronunciation.
58+
This converts the given text to Hangul pronunciation.
5859
5960
Args:
6061
text (Union[str, List[str], Tuple[str]]): single text or list of texts
@@ -71,6 +72,9 @@ def hangulize(
7172
>>> output = hangulize(text, lang="ita")
7273
>>> print(output)
7374
"지로 디탈리아"
75+
76+
References:
77+
This was copied from [hangulize](https://github.com/sublee/hangulize) and modified by Kss
7478
"""
7579
text, finish = _check_text(text)
7680

0 commit comments

Comments
 (0)