Skip to content

Commit 91355e2

Browse files
authored
Merge pull request #70 from hyunwoongko/6.0.0
6.0.0
2 parents 79b3956 + a3b3e07 commit 91355e2

File tree

168 files changed

+261887
-906
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

168 files changed

+261887
-906
lines changed

.DS_Store

8 KB
Binary file not shown.

.github/workflows/test_macos.yaml

+24
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
name: macos-latest
2+
on: push
3+
4+
jobs:
5+
test:
6+
runs-on: macos-latest
7+
steps:
8+
- name: Checkout repository
9+
uses: actions/checkout@v2
10+
11+
- name: Setup Python
12+
uses: actions/setup-python@v2
13+
with:
14+
python-version: 3.10.11
15+
16+
- name: Install kss locally
17+
run: |
18+
pip3 install -e .
19+
- name: Install pytest
20+
run: |
21+
python3 -m pip install pytest
22+
- name: Run the test suite
23+
run: |
24+
cd tests && pytest -v

.github/workflows/test_ubuntu.yaml

+24
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
name: ubuntu-latest
2+
on: push
3+
4+
jobs:
5+
test:
6+
runs-on: ubuntu-latest
7+
steps:
8+
- name: Checkout repository
9+
uses: actions/checkout@v2
10+
11+
- name: Setup Python
12+
uses: actions/setup-python@v2
13+
with:
14+
python-version: 3.10.11
15+
16+
- name: Install kss locally
17+
run: |
18+
pip3 install -e .
19+
- name: Install pytest
20+
run: |
21+
python3 -m pip install pytest
22+
- name: Run the test suite
23+
run: |
24+
cd tests && pytest -v

.github/workflows/test_windows.yaml

+24
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
name: windows-latest
2+
on: push
3+
4+
jobs:
5+
test:
6+
runs-on: windows-latest
7+
steps:
8+
- name: Checkout repository
9+
uses: actions/checkout@v2
10+
11+
- name: Setup Python
12+
uses: actions/setup-python@v2
13+
with:
14+
python-version: 3.10.11
15+
16+
- name: Install kss locally
17+
run: |
18+
pip3 install -e .
19+
- name: Install pytest
20+
run: |
21+
python3 -m pip install pytest
22+
- name: Run the test suite
23+
run: |
24+
cd tests && pytest -v

MANIFEST.in

+4
Original file line numberDiff line numberDiff line change
@@ -4,4 +4,8 @@ include csrc/kss_cython.pyx
44
include csrc/sentence_splitter.cpp
55
include csrc/sentence_splitter.h
66
include csrc/__init__.py
7+
include kss/_modules/g2p/assets/rules.txt
8+
include kss/_modules/g2p/assets/idiom.txt
9+
include kss/_modules/g2p/assets/table.csv
10+
include kss/_modules/augmentation/assets/wordnet.json
711
include setup.py

README.md

+1,190-823
Large diffs are not rendered by default.

bench/.DS_Store

10 KB
Binary file not shown.

bench/__init__.py

+165
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,165 @@
1+
UNICODE_TO_REMOVE = {
2+
"\u0000": "", # Null
3+
"\u0001": "", # Start of Heading
4+
"\u0002": "", # Start of Text
5+
"\u0003": "", # End of Text
6+
"\u0004": "", # End of Transmission
7+
"\u0005": "", # Enquiry
8+
"\u0006": "", # Acknowledge
9+
"\u0007": "", # Bell
10+
"\u0008": "", # Backspace
11+
"\u0009": "", # Horizontal Tab
12+
"\u000B": "", # Vertical Tab
13+
"\u000C": "", # Form Feed
14+
"\u000D": "", # Carriage Return
15+
"\u000E": "", # Shift Out
16+
"\u000F": "", # Shift In
17+
"\u0010": "", # Data Link Escape
18+
"\u0011": "", # Device Control 1
19+
"\u0012": "", # Device Control 2
20+
"\u0013": "", # Device Control 3
21+
"\u0014": "", # Device Control 4
22+
"\u0015": "", # Negative Acknowledge
23+
"\u0016": "", # Synchronous Idle
24+
"\u0017": "", # End of Transmission Block
25+
"\u0018": "", # Cancel
26+
"\u0019": "", # End of Medium
27+
"\u001A": "", # Substitute
28+
"\u001B": "", # Escape
29+
"\u001C": "", # File Separator
30+
"\u001D": "", # Group Separator
31+
"\u001E": "", # Record Separator
32+
"\u001F": "", # Unit Separator
33+
"\u007F": "", # Delete
34+
"\u0080": "", # Padding Character
35+
"\u0081": "", # High Octet Preset
36+
"\u0082": "", # Break Permitted Here
37+
"\u0083": "", # No Break Here
38+
"\u0084": "", # Index
39+
"\u0085": "", # Next Line
40+
"\u0086": "", # Start of Selected Area
41+
"\u0087": "", # End of Selected Area
42+
"\u0088": "", # Character Tabulation Set
43+
"\u0089": "", # Character Tabulation with Justification
44+
"\u008A": "", # Line Tabulation Set
45+
"\u008B": "", # Partial Line Forward
46+
"\u008C": "", # Partial Line Backward
47+
"\u008D": "", # Reverse Line Feed
48+
"\u008E": "", # Single-Shift Two
49+
"\u008F": "", # Single-Shift Three
50+
"\u0090": "", # Device Control String
51+
"\u0091": "", # Private Use 1
52+
"\u0092": "", # Private Use 2
53+
"\u0093": "", # Set Transmit State
54+
"\u0094": "", # Cancel Character
55+
"\u0095": "", # Message Waiting
56+
"\u0096": "", # Start of Guarded Area
57+
"\u0097": "", # End of Guarded Area
58+
"\u0098": "", # Start of String
59+
"\u0099": "", # Single Graphic Character Introducer
60+
"\u009A": "", # Single Character Introducer
61+
"\u009B": "", # Control Sequence Introducer
62+
"\u009C": "", # String Terminator
63+
"\u009D": "", # Operating System Command
64+
"\u009E": "", # Privacy Message
65+
"\u009F": "", # Application Program Command
66+
"\u00A0": "", # No-Break Space
67+
"\u00AD": "", # Soft Hyphen
68+
"\u061C": "", # Arabic Letter Mark
69+
"\u115f": "", # Hangul Choseong Filler
70+
"\u1160": "", # Hangul Jungseong Filler
71+
"\u1680": "", # Ogham Space Mark
72+
"\u17B4": "", # Khmer Vowel Inherent AQ
73+
"\u17B5": "", # Khmer Vowel Inherent AA
74+
"\u180B": "", # Mongolian Free Variation Selector One
75+
"\u180C": "", # Mongolian Free Variation Selector Two
76+
"\u180D": "", # Mongolian Free Variation Selector Three
77+
"\u180E": "", # Mongolian Vowel Separator
78+
"\u2000": "", # En Quad
79+
"\u2001": "", # Em Quad
80+
"\u2002": "", # En Space
81+
"\u2003": "", # Em Space
82+
"\u2004": "", # Three-Per-Em Space
83+
"\u2005": "", # Four-Per-Em Space
84+
"\u2006": "", # Six-Per-Em Space
85+
"\u2007": "", # Figure Space
86+
"\u2008": "", # Punctuation Space
87+
"\u2009": "", # Thin Space
88+
"\u200A": "", # Hair Space
89+
"\u200B": "", # Zero Width Space
90+
"\u200C": "", # Zero Width Non-Joiner
91+
"\u200D": "", # Zero Width Joiner
92+
"\u200E": "", # Left-to-Right Mark
93+
"\u200F": "", # Right-to-Left Mark
94+
"\u202A": "", # Left-to-Right Embedding
95+
"\u202B": "", # Right-to-Left Embedding
96+
"\u202C": "", # Pop Directional Formatting
97+
"\u202D": "", # Left-to-Right Override
98+
"\u202E": "", # Right-to-Left Override
99+
"\u202F": "", # Narrow No-Break Space
100+
"\u2060": "", # Word Joiner
101+
"\u2061": "", # Function Application
102+
"\u2062": "", # Invisible Times
103+
"\u2063": "", # Invisible Separator
104+
"\u2064": "", # Invisible Plus
105+
"\u2066": "", # Left-to-Right Isolate
106+
"\u2067": "", # Right-to-Left Isolate
107+
"\u2068": "", # First Strong Isolate
108+
"\u2069": "", # Pop Directional Isolate
109+
"\u206A": "", # Inhibit Symmetric Swapping
110+
"\u206B": "", # Activate Symmetric Swapping
111+
"\u206C": "", # Inhibit Arabic Form Shaping
112+
"\u206D": "", # Activate Arabic Form Shaping
113+
"\u206E": "", # National Digit Shapes
114+
"\u206F": "", # Nominal Digit Shapes
115+
"\u3164": "", # Hangul Filler
116+
"\uFEFF": "", # Zero Width No-Break Space
117+
"\uFFA0": "", # Halfwidth Hangul Filler
118+
"\uFFFC": "", # Object Replacement Character
119+
"\uFFFE": "", # Byte Order Mark
120+
"\uFFFF": "", # Non character
121+
"\U0001307B": "", # Egyptian Hieroglyph Z015B
122+
"\U0001BCA0": "", # Shorthand Format Letter Overlap
123+
}
124+
# The original single range(0xE0000, 0xF8FF + 1) had start > stop and was
# therefore empty, so neither group of characters was ever registered.
# Register the two intended ranges explicitly.
UNICODE_TO_REMOVE.update({chr(char): "" for char in range(0xE000, 0xF8FF + 1)})  # PUA characters (BMP Private Use Area)
UNICODE_TO_REMOVE.update({chr(char): "" for char in range(0xE0000, 0xE007F + 1)})  # Tag characters
125+
UNICODE_TO_REMOVE.update({chr(char): "" for char in range(0xF0000, 0xFFFFF + 1)}) # PUA characters
126+
UNICODE_TO_REMOVE.update({chr(char): "" for char in range(0x100000, 0x10FFFF + 1)}) # PUA characters
127+
UNICODE_TO_REMOVE.update({chr(char): "" for char in range(0x1D100, 0x1D1FF + 1)}) # Musical symbols
128+
129+
TO_REPLACE = {
130+
"\u0020": " ", # Space
131+
"\u0009": "\t", # Horizontal Tab
132+
"\u000A": "\n", # Line Feed
133+
"\u034F": "\u034F", # Combining Grapheme Joiner
134+
"\u2028": "\n", # Line Separator
135+
"\u2029": "\n\n", # Paragraph Separator
136+
"\u2000": " ", # En Quad
137+
"\u2001": " ", # Em Quad
138+
"\u2002": " ", # En Space
139+
"\u2003": " ", # Em Space
140+
"\u2004": " ", # Three-Per-Em Space
141+
"\u2005": " ", # Four-Per-Em Space
142+
"\u2006": " ", # Six-Per-Em Space
143+
"\u2007": " ", # Figure Space
144+
"\u2008": " ", # Punctuation Space
145+
"\u2009": " ", # Thin Space
146+
"\u200A": " ", # Hair Space
147+
"\u205F": " ", # Medium Mathematical Space
148+
"\u3000": " ", # Ideographic Space
149+
"\u2800": "\u2800", # Braille Pattern Blank
150+
"\u200D": "\u200D", # Zero Width Joiner
151+
}
152+
# TO_REPLACE.update({chr(char): chr(char) for char in range(0x02B0, 0x02FF + 1)}) # Modifier letters
153+
# TO_REPLACE.update({chr(char): chr(char) for char in range(0x0300, 0x036F + 1)}) # Combining Diacritical Marks
154+
# TO_REPLACE.update({chr(char): chr(char) for char in range(0xFE00, 0xFE0F + 1)}) # Variation Selectors
155+
# TO_REPLACE.update({chr(char): chr(char) for char in range(0xE0100, 0xE01EF + 1)}) # Variation Selectors Supplement
156+
157+
if __name__ == '__main__':
158+
text = "string\u0000\u0001\u0002\u0003\u0004\u0005\u0006\u0007\u0008\u0009\u000B\u000C\u000D\u000E\u000F\u0010\u0011\u0012\u0013\u0014\u0015\u0016\u0017\u0018\u0019\u001A\u001B\u001C\u001D\u001E\u001F\u007F\u0080\u0081\u0082\u0083\u0084\u0085\u0086\u0087\u0088\u0089\u008A\u008B\u008C\u008D\u008E\u008F\u0090\u0091\u0092\u0093\u0094\u0095\u0096\u0097\u0098\u0099\u009A\u009B\u009C\u009D\u009E\u009F\u00A0\u00AD\u061C\u115f\u1160\u1680\u17B4\u17B5\u180B\u180C\u180D\u180E\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200A\u200B\u200C\u200D\u200E\u200F\u202A\u202B\u202C\u202D\u202E\u202F\u2060\u2061\u2062\u2063\u2064\u2066\u2067\u2068\u2069\u206A\u206B\u206C\u206D\u206E\u206F\u3164\uFEFF\uFFA0\uFFFC\uFFFE\uFFFF\U0001307B\U0001BCA0"
159+
# The original single range(0xE0000, 0xF8FF + 1) had start > stop and was
# therefore empty, so it contributed no characters to the sample text.
text += "".join(chr(char) for char in range(0xE000, 0xF8FF + 1))  # PUA characters (BMP Private Use Area)
text += "".join(chr(char) for char in range(0xE0000, 0xE007F + 1))  # Tag characters
160+
text += "".join(chr(char) for char in range(0xF0000, 0xFFFFF + 1)) # PUA characters
161+
text += "".join(chr(char) for char in range(0x100000, 0x10FFFF + 1)) # PUA characters
162+
text += "".join(chr(char) for char in range(0x1D100, 0x1D1FF + 1)) # Musical symbols
163+
text += "\u0020\u0009\u000A\u034F\u2028\u2029\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200A\u205F\u3000\u2800\u200D"
164+
print(len(text))
165+
print(text)

bench/preprocessing/.DS_Store

6 KB
Binary file not shown.

0 commit comments

Comments
 (0)