hyunwoongko
diff --git a/‎README.md
Lines changed: 1385 additions & 1 deletion b/‎README.md
Lines changed: 1385 additions & 1 deletion
diff --git a/‎bench/preprocessing/test_preprocess.py
Lines changed: 2 additions & 2 deletions b/‎bench/preprocessing/test_preprocess.py
Lines changed: 2 additions & 2 deletions
diff --git a/‎kss/__init__.py
Lines changed: 2 additions & 2 deletions b/‎kss/__init__.py
Lines changed: 2 additions & 2 deletions
diff --git a/‎kss/_modules/augmentation/augment.py
Lines changed: 15 additions & 11 deletions b/‎kss/_modules/augmentation/augment.py
Lines changed: 15 additions & 11 deletions
diff --git a/‎kss/_modules/augmentation/replacement.py
Lines changed: 1 addition & 1 deletion b/‎kss/_modules/augmentation/replacement.py
Lines changed: 1 addition & 1 deletion
diff --git a/‎kss/_modules/augmentation/utils.py
Lines changed: 3 additions & 0 deletions b/‎kss/_modules/augmentation/utils.py
Lines changed: 3 additions & 0 deletions
diff --git a/‎kss/_modules/collocation/collocate.py
Lines changed: 8 additions & 4 deletions b/‎kss/_modules/collocation/collocate.py
Lines changed: 8 additions & 4 deletions
diff --git a/‎kss/_modules/g2p/english.py
Lines changed: 1 addition & 1 deletion b/‎kss/_modules/g2p/english.py
Lines changed: 1 addition & 1 deletion
diff --git a/‎kss/_modules/g2p/g2p.py
Lines changed: 7 additions & 3 deletions b/‎kss/_modules/g2p/g2p.py
Lines changed: 7 additions & 3 deletions
diff --git a/‎kss/_modules/g2p/numerals.py
Lines changed: 1 addition & 1 deletion b/‎kss/_modules/g2p/numerals.py
Lines changed: 1 addition & 1 deletion
@@ -13,7 +13,7 @@
         normalization_type="NFC",
         allow_doubled_spaces=False,
         allow_html_tags=False,
-        allow_html_unescape=False,
+        allow_html_escape=False,
         allow_halfwidth_hangul=False,
         reduce_char_repeats_over=5,
         reduce_emoticon_repeats_over=2,
@@ -47,7 +47,7 @@
         max_ellipsis_ratio=0.5,
         min_hangul_ratio=0.4,
         max_hangul_ratio=1.0,
-        max_hangul_non_completed_form_ratio=5,
+        max_hangul_incompleted_form_ratio=5,
         max_words_length=1000,
         max_line_repeats=4,
         max_line_by_char_repeats=10,
 
@@ -133,8 +133,8 @@ def __init__(self, module: str):
     def __call__(self, *args, **kwargs):
         return self.module(*args, **kwargs)
 
-    def usage(self):
-        return self.module.__doc__
+    def help(self):
+        print(self.module.__doc__.strip())
 
     @staticmethod
     def available():
 
@@ -1,4 +1,4 @@
-from functools import partial
+from functools import partial, lru_cache
 from typing import Union, List, Tuple
 
 from kss._modules.augmentation.replacement import SynonymReplacement
@@ -8,23 +8,24 @@
 from kss._utils.sanity_checks import _check_text, _check_type, _check_num_workers, _check_backend_mecab_pecab_only
 
 
+@lru_cache(maxsize=500)
 def augment(
     text: Union[str, List[str], Tuple[str]],
     replacement_ratio: float = 0.3,
-    josa_normalization: bool = True,
+    josa_correction: bool = True,
     num_workers: Union[int, str] = "auto",
     backend: str = "auto",
     verbose: bool = False,
 ) -> Union[str, List[str]]:
     """
-    Augment text with synonym replacement method and,
-    postprocess with space normalization and josa normalization.
+    Augments text using the synonym replacement method and
+    optionally postprocesses the result by correcting josa.
+    For this, Kss uses the Korean wordnet from KAIST.
 
     Args:
         text (Union[str, List[str], Tuple[str]]): single text or list of texts
         replacement_ratio (float): ratio of words to be replaced
-        space_normalization (bool): whether to normalize spaces or not
-        josa_normalization (bool): whether to normalize josa or not
+        josa_correction (bool): whether to correct josa or not
         num_workers (Union[int, str]): the number of multiprocessing workers
         backend (str): morpheme analyzer backend. 'mecab', 'pecab' are supported
         verbose (bool): whether to print verbose outputs or not
@@ -38,15 +39,18 @@ def augment(
         >>> text = "앞서 지난해 11월, 보이저 1호는 명령을 수신하고 수행하는 데엔 문제가 없었지만 통신 장치에 문제가 생겨 과학·엔지니어링 데이터가 지구로 전송되지 않았던 바 있다. 당시 그들은 컴퓨터 시스템을 재시작하고 문제의 근본적인 원인을 파악하기 위해 명령을 보내려고 시도했고, 이달 1일 '포크'라는 명령을 보냈다."
         >>> output = augment(text)
         >>> print(output)
-        "앞서 지난해 11월, 보이저 1호는 명령을 수신하고 시행하는 데엔 문제가 없었지만 송신 장비에 문제가 생겨 과학·엔지니어링 데이터가 지구로 전송되지 않았던 바 있다. 당시 그들은 컴퓨터 시스템을 재시작하고 문제의 근본적인 원인을 파악하기 위해 명령을 보내려고 시도했고, 이달 1일'포크'라는 명령을 보냈다."
+        "앞서 지난해 11월, 보이저 1호는 명령을 수신하고 시행하는 데엔 문제가 없었지만 송신 장비에 문제가 생겨 과학·엔지니어링 데이터가 지구로 전송되지 않았던 바 있다. 당시 그들은 컴퓨터 시스템을 재시작하고 문제의 근본적인 원인을 파악하기 위해 명령을 보내려고 시도했고, 이달 1일 '포크'라는 명령을 보냈다."
+
+    References:
+        This was copied from [KoEDA](https://github.com/toriving/KoEDA) and modified by Kss
     """
     text, finish = _check_text(text)
 
     if finish:
         return text
 
     replacement_ratio = _check_type(replacement_ratio, "replacement_ratio", float)
-    josa_normalization = _check_type(josa_normalization, "josa_normalization", bool)
+    josa_correction = _check_type(josa_correction, "josa_correction", bool)
     verbose = _check_type(verbose, "verbose", bool)
     num_workers = _check_num_workers(text, num_workers)
     _check_backend_mecab_pecab_only(backend)
@@ -62,7 +66,7 @@ def augment(
         func=partial(
             _augment,
             replacement_ratio=replacement_ratio,
-            josa_normalization=josa_normalization,
+            josa_correction=josa_correction,
             backend=backend,
             verbose=verbose,
         ),
@@ -74,15 +78,15 @@ def augment(
 def _augment(
     text: str,
     replacement_ratio: float = 0.3,
-    josa_normalization: bool = True,
+    josa_correction: bool = True,
     backend: str = "auto",
     verbose: bool = False,
 ):
     orig_text = text
     replacement = SynonymReplacement(backend=backend)  # I want WSD...
     text = replacement(text, p=replacement_ratio, verbose=verbose)
 
-    if josa_normalization:
+    if josa_correction:
         text = correct_josa(text)
 
     if verbose:
 
@@ -1,4 +1,4 @@
-# This is copied from KoEDA [https://github.com/toriving/KoEDA]
+# This was copied from KoEDA [https://github.com/toriving/KoEDA]
 # And modified by Hyunwoong Ko [https://github.com/hyunwoongko]
 
 import random
 
@@ -1,3 +1,6 @@
+# This was copied from KoEDA [https://github.com/toriving/KoEDA]
+# And modified by Hyunwoong Ko [https://github.com/hyunwoongko]
+
 import json
 import os
 
 
@@ -1,4 +1,4 @@
-from functools import partial
+from functools import partial, lru_cache
 from typing import Union, List, Tuple
 
 from kollocate import Kollocate
@@ -10,22 +10,23 @@
 kollocate_obj = Kollocate()
 
 
+@lru_cache(maxsize=500)
 def collocate(
     text: Union[str, List[str], Tuple[str]],
     num_workers: Union[int, str] = "auto",
     verbose: bool = False,
 ) -> Union[dict, List[dict]]:
     """
-    Get collocation (연어) of words in text.
+    Returns the collocations (연어) of the given words.
     The collocation is a set of words that frequently appear together.
 
     Args:
-        text (Union[str, List[str], Tuple[str]]): single text or list of texts
+        text (Union[str, List[str], Tuple[str]]): single word or list of words
         num_workers (Union[int, str]): the number of multiprocessing workers
         verbose (bool): whether to print verbose outputs or not
 
     Returns:
-        Union[dict, List[dict]]: collocation of words in text or list of collocations
+        Union[dict, List[dict]]: collocations and frequencies of words in text or list of collocations and frequencies
 
     Examples:
         >>> from kss import Kss
@@ -34,6 +35,9 @@ def collocate(
         >>> output = collocate(text)
         >>> print(output)
         {'verb': {'noun': [('것', 39), ('수', 29), ('음식', 23), ('등', 16), ('고기', 14), ('먹이', 14), ('때', 12), ('식물', 12), ('개고기', 9), ('젖', 9), ('겁', 7), ('시작', 7), ('후', 7), ('밥', 7), ('요리', 7), ('경우', 6), ('풀', 6), ('사람', 5), ('자살', 5), ('과일', 4), ('늑대', 4), ('마음', 4), ('나이', 4), ('애', 4), ('생선', 3), ('개', 3), ('죽', 3), ('양', 3), ('나무', 3), ('만큼', 3), ('물', 3), ('방법', 3), ('알', 3), ('떡볶이', 3), ('식사', 3), ('아침', 3), ('사과', 3), ('라면', 3), ('자기', 3), ('약', 3), ('점심', 3), ('때문', 3), ('조리', 3), ('떡', 2), ('접시', 2), ('국수', 2), ('일반적', 2), ('무엇', 2), ('파이', 2), ('만', 2), ('다음', 2), ('이후', 2), ('무리', 2), ('기록', 2), ('풍습', 2), ('동물', 2), ('식물질', 2), ('곤충', 2), ('이', 2), ('유제류', 2), ('새끼', 2), ('불고기', 2), ('한국', 2), ('한국식', 2), ('동안', 2), ('몸', 2), ('돼지고기', 2), ('잡식성', 2), ('기관', 2), ('제사장', 2), ('끼', 2), ('운동', 2), ('곡식', 2), ('궁중', 2), ('젖소', 2), ('우유', 2), ('고', 2), ('이야기', 2), ('정도', 2), ('일', 2), ('자리', 2), ('지역', 2), ('소화', 2), ('도중', 2), ('쓰레기', 2), ('저녁', 2), ('그', 2), ('뒤', 2), ('조', 2), ('고구마', 1), ('가지', 1), ('가시', 1), ('가지도', 1), ('아이', 1), ('쌈', 1), ('노출', 1), ('그다음', 1), ('근육', 1), ('아침상', 1), ('대로', 1), ('솔잎', 1), ('생', 1), ('중세', 1), ('어른', 1), ('성소', 1), ('집중적', 1), ('한번', 1), ('멜론', 1), ('입', 1), ('나뭇가지', 1), ('혀', 1), ('무시', 1), ('발견', 1), ('전후', 1), ('운반', 1), ('찬밥', 1), ('벌레', 1), ('남편', 1), ('하', 1), ('부인', 1), ('플랑크톤', 1), ('4', 1), ('미역국', 1), ('벨', 1), ('잔가지', 1), ('그중', 1), ('말', 1), ('소시지', 1), ('사냥', 1), ('카스', 1), ('따위', 1), ('레', 1), ('새', 1), ('내서', 1), ('인류', 1), ('소문', 1), ('수라', 1), ('곳', 1), ('항원', 1), ('구지가', 1), ('되새김', 1), ('사료', 1), ('하루', 1), ('문제', 1), ('큰곰', 1), ('비율', 1), ('6', 1), ('보리', 1), ('메조', 1), ('광', 1), ('물냉면', 1), ('살', 1), ('여공', 1), ('수영', 1), ('주변', 1), ('파리', 1), ('체액', 1), ('가운데', 1), ('시체', 1), ('구기', 1), ('해초', 1), ('호두', 1), ('엽전', 1), ('고추장', 1), ('300', 1), ('배설', 1), ('곰국', 1), ('노루', 1), ('대나무', 1), ('초식성', 1), ('부육', 1), ('종류', 1), ('남', 1), ('포장마차', 1), ('어묵', 1), ('특효', 1), ('약물', 1), ('노래', 1), ('어미', 1), 
('맥', 1), ('털가죽', 1), ('뇌', 1), ('돌', 1), ('방식', 1), ('비빔밥', 1), ('진화', 1), ('콩', 1), ('탈', 1), ('신의주', 1), ('뜰', 1), ('하느님', 1), ('공화제', 1), ('물질', 1), ('초식', 1), ('간', 1), ('흑', 1), ('식품', 1), ('덮밥', 1), ('탐사선', 1), ('해산물', 1), ('내부', 1), ('3', 1), ('공격', 1), ('정', 1), ('대부분', 1), ('데', 1), ('연습', 1), ('오이', 1), ('성장', 1), ('볶음', 1), ('설', 1), ('스탈린', 1), ('형식', 1), ('골목', 1), ('나뭇잎', 1), ('주간', 1), ('잎', 1), ('된장찌개', 1), ('자매', 1), ('시간당', 1), ('말복', 1), ('구어', 1), ('말고기', 1), ('당나귀', 1), ('모습', 1), ('박테리아', 1), ('영양분', 1), ('부분', 1), ('가축', 1), ('그것', 1), ('가공품', 1), ('구이', 1), ('목', 1), ('기름', 1), ('휴식', 1), ('소스', 1), ('나무껍질', 1), ('쇠고기', 1), ('고착', 1), ('밝기', 1), ('짜장면', 1), ('날', 1), ('수액', 1), ('배달', 1), ('장면', 1), ('단어', 1), ('떡국', 1), ('찜', 1), ('문화권', 1), ('소형', 1), ('이전', 1), ('로', 1), ('노년', 1), ('난자', 1), ('돼지', 1), ('채집', 1), ('나라', 1), ('전', 1), ('육식성', 1), ('격려', 1), ('반복', 1), ('유협', 1), ('디저트', 1), ('분위기', 1), ('일과', 1), ('쌀', 1), ('고관', 1), ('밑반찬', 1), ('떼', 1), ('개월', 1), ('다양', 1), ('본격적', 1), ('생각', 1), ('수면제', 1), ('학교', 1), ('칭', 1), ('고깃국', 1), ('충격', 1), ('복사본', 1), ('미', 1), ('감로', 1), ('비', 1), ('독약', 1), ('우두', 1), ('도', 1), ('농작물', 1), ('자장면', 1), ('인터넷', 1), ('술', 1), ('기회', 1), ('풀뿌리', 1), ('게걸', 1), ('외상술', 1), ('약밥', 1), ('풍속', 1), ('한방', 1), ('치질', 1), ('나', 1), ('허기', 1), ('어린이', 1), ('나물밥', 1), ('사약', 1), ('가죽', 1), ('서부', 1), ('명', 1), ('이승만', 1), ('가래떡', 1), ('절대', 1), ('숲', 1), ('깨', 1), ('봄맞이', 1), ('곡', 1), ('계속', 1), ('자연', 1), ('목축', 1), ('시간', 1), ('서로', 1), ('얼', 1), ('가능성', 1), ('빵', 1), ('뿌리', 1), ('독립', 1), ('혼자', 1), ('잠', 1), ('덴푸라', 1), ('둥', 1), ('성', 1), ('가열', 1), ('전자', 1), ('창신', 1), ('탈진', 1), ('맘', 1), ('자신', 1)], 'verb': [('하', 33), ('않', 21), ('살', 17), ('즐기', 11), ('굽', 9), ('곁들이', 7), ('썰', 6), ('찍', 5), ('섞', 5), ('있', 5), ('치우', 4), ('잡', 4), ('나누', 4), ('비비', 4), ('되', 3), ('싸', 3), ('어', 3), ('익히', 3), ('버리', 3), ('만들', 3), ('집', 3), ('보', 3), ('죽', 3), ('들', 2), ('튀기', 2), ('삭히', 2), ('마시', 2), 
('말', 2), ('달이', 2), ('끓이', 2), ('뜯', 2), ('사', 2), ('자라', 2), ('뿌리', 2), ('넣', 2), ('시키', 2), ('위하', 2), ('찌', 1), ('씹', 1), ('뒤집', 1), ('바꾸', 1), ('부르', 1), ('다니', 1), ('담구', 1), ('도망치', 1), ('재우', 1), ('파', 1), ('오', 1), ('덜', 1), ('발르', 1), ('늘', 1), ('자르', 1), ('얹', 1), ('묻히', 1), ('볶', 1), ('담그', 1), ('태어나', 1), ('죽이', 1), ('줍', 1), ('갉', 1), ('생겨나', 1), ('찾', 1), ('아', 1), ('부숴뜨리', 1), ('짜', 1), ('부리', 1), ('훔치', 1), ('쓰', 1), ('나르', 1), ('베', 1), ('대', 1), ('둘러앉', 1), ('걸르', 1), ('맞', 1), ('넘', 1), ('부', 1), ('두들기', 1), ('팔', 1), ('따', 1)], 'adverb': [('많이', 10), ('주로', 7), ('다', 5), ('같이', 4), ('잘', 4), ('함께', 3), ('못', 3), ('가장', 3), ('우연히', 1), ('자주', 1), ('하지만', 1), ('그냥', 1), ('씩', 1), ('또한', 1), ('너무', 1), ('채', 1), ('내내', 1), ('어찌', 1), ('적당히', 1), ('대체로', 1), ('가끔', 1), ('특히', 1), ('흥청망청', 1), ('적이', 1), ('흔히', 1), ('상관없이', 1), ('또', 1), ('통째로', 1), ('날로', 1), ('빨리', 1), ('때로', 1), ('전혀', 1), ('배불리', 1), ('겨우', 1), ('안', 1), ('간단히', 1), ('달리', 1)], 'determiner': [('다른', 5), ('그', 2), ('여러', 1), ('세', 1), ('몇몇', 1), ('새', 1)], 'adjective': [('싶', 5), ('어리', 1), ('편하', 1), ('작', 1), ('좋', 1), ('손쉽', 1), ('못하', 1)]}, 'noun': {'noun': [('붓', 3), ('종이', 2), ('묘선', 1), ('청자', 1), ('은장도', 1), ('제조', 1), ('벼루', 1), ('농담', 1), ('윤', 1)], 'verb': [('의하', 1), ('그리', 1), ('찍', 1), ('차', 1), ('늘어놓', 1)], 'adverb': [('하지만', 1)]}}
+
+    References:
+        This was copied from [Kollocate](https://github.com/Kyubyong/kollocate) and modified by Kss
     """
     text, finish = _check_text(text)
 
 
@@ -1,4 +1,4 @@
-# This code is copied from g2pk [https://github.com/kyubyong/g2pK]
+# This code was copied from g2pk [https://github.com/kyubyong/g2pK]
 # And modified by Hyunwoong Ko [https://github.com/hyunwoongko]
 
 
 
@@ -1,8 +1,8 @@
-# This code is copied from g2pk [https://github.com/kyubyong/g2pK]
+# This code was copied from g2pk [https://github.com/kyubyong/g2pK]
 # And modified by Hyunwoong Ko [https://github.com/hyunwoongko]
 
 import re
-from functools import partial
+from functools import partial, lru_cache
 from typing import Union, List, Tuple
 
 from kss._modules.g2p.english import convert_eng
@@ -35,6 +35,7 @@
 from kss._utils.sanity_checks import _check_text, _check_num_workers, _check_type, _check_backend_mecab_pecab_only
 
 
+@lru_cache(maxsize=500)
 def g2p(
     text: Union[str, List[str], Tuple[str]],
     descriptive: bool = False,
@@ -62,7 +63,7 @@ def g2p(
         verbose (bool): whether to print verbose outputs or not
 
     Returns:
-        Union[str, List[str]]: phoneme string or list of phoneme strings from the given text (graphemes)
+        Union[str, List[str]]: phoneme string or list of phoneme strings from the given text
 
     Examples:
         >>> from kss import Kss
@@ -71,6 +72,9 @@ def g2p(
         >>> output = g2p(text)
         >>> print(output)
         "어제는 말간는데 오느른 흐리다."
+
+    References:
+        This was copied from [g2pk](https://github.com/Kyubyong/g2pk) and modified by Kss
     """
     text, finish = _check_text(text)
 
 
@@ -1,4 +1,4 @@
-# This code is copied from g2pk [https://github.com/kyubyong/g2pK]
+# This code was copied from g2pk [https://github.com/kyubyong/g2pK]
 # And modified by Hyunwoong Ko [https://github.com/hyunwoongko]
 
 import re
Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-# This is copied from KoEDA [https://github.com/toriving/KoEDA]`
	`1`	`+# This was copied from KoEDA [https://github.com/toriving/KoEDA]`
`2`	`2`	`# And modified by Hyunwoong Ko [https://github.com/hyunwoongko]`
`3`	`3`
`4`	`4`	`import random`
Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-# This code is copied from g2pk [https://github.com/kyubyong/g2pK]`
	`1`	`+# This code was copied from g2pk [https://github.com/kyubyong/g2pK]`
`2`	`2`	`# And modified by Hyunwoong Ko [https://github.com/hyunwoongko]`
`3`	`3`
`4`	`4`