Skip to content

Commit 0db05c4

Browse files
authored
Merge pull request #165 from bab2min/dev_script_type
Add `ScriptType`
2 parents 95f1e24 + 230188e commit 0db05c4

File tree

8 files changed

+1087
-5
lines changed

8 files changed

+1087
-5
lines changed

Diff for: CMakeLists.txt

+1
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ set ( CORE_SRCS
5050
src/KTrie.cpp
5151
src/PatternMatcher.cpp
5252
src/search.cpp
53+
src/ScriptType.cpp
5354
src/SwTokenizer.cpp
5455
src/TagUtils.cpp
5556
src/TypoTransformer.cpp

Diff for: include/kiwi/ScriptType.h

+245
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,245 @@
1+
#pragma once
2+
#include <cstdint>
3+
4+
namespace kiwi
5+
{
6+
/**
 * Category assigned to a character based on the Unicode region it belongs to.
 *
 * The underlying type is uint8_t, so a value fits in a single byte.
 * `unknown` is the first enumerator and therefore has the value 0; the remaining
 * enumerators take consecutive values in declaration order, so reordering or
 * inserting entries changes their numeric values.
 * NOTE(review): enumerator names appear to follow Unicode block names — confirm
 * against the table in ScriptType.cpp.
 */
enum class ScriptType : uint8_t
7+
{
8+
unknown,
9+
latin,
10+
ipa_extensions,
11+
spacing_modifier_letters,
12+
combining_diacritical_marks,
13+
greek_and_coptic,
14+
cyrillic,
15+
armenian,
16+
hebrew,
17+
arabic,
18+
syriac,
19+
thaana,
20+
nko,
21+
samaritan,
22+
mandaic,
23+
devanagari,
24+
bengali,
25+
gurmukhi,
26+
gujarati,
27+
oriya,
28+
tamil,
29+
telugu,
30+
kannada,
31+
malayalam,
32+
sinhala,
33+
thai,
34+
lao,
35+
tibetan,
36+
myanmar,
37+
georgian,
38+
hangul,
39+
ethiopic,
40+
cherokee,
41+
unified_canadian_aboriginal_syllabics,
42+
ogham,
43+
runic,
44+
tagalog,
45+
hanunoo,
46+
buhid,
47+
tagbanwa,
48+
khmer,
49+
mongolian,
50+
limbu,
51+
tai_le,
52+
new_tai_lue,
53+
khmer_symbols,
54+
buginese,
55+
tai_tham,
56+
balinese,
57+
sundanese,
58+
batak,
59+
lepcha,
60+
ol_chiki,
61+
phonetic_extensions,
62+
punctuation,
63+
superscripts_and_subscripts,
64+
currency_symbols,
65+
combining_diacritical_marks_for_symbols,
66+
letterlike_symbols,
67+
number_forms,
68+
arrows,
69+
mathematical,
70+
miscellaneous_technical,
71+
control_pictures,
72+
optical_character_recognition,
73+
enclosed_alphanumerics,
74+
box_drawing,
75+
block_elements,
76+
geometric_shapes,
77+
miscellaneous_symbols,
78+
dingbats,
79+
braille_patterns,
80+
glagolitic,
81+
tifinagh,
82+
hanja,
83+
ideographic_description_characters,
84+
kana,
85+
bopomofo,
86+
kanbun,
87+
yijing_hexagram_symbols,
88+
yi,
89+
lisu,
90+
vai,
91+
bamum,
92+
modifier_tone_letters,
93+
syloti_nagri,
94+
common_indic_number_forms,
95+
phags_pa,
96+
saurashtra,
97+
kayah_li,
98+
rejang,
99+
javanese,
100+
cham,
101+
tai_viet,
102+
meetei_mayek,
103+
private_use_area,
104+
alphabetic_presentation_forms,
105+
arabic_presentation_forms_a,
106+
variation_selectors,
107+
vertical_forms,
108+
combining_half_marks,
109+
small_form_variants,
110+
arabic_presentation_forms_b,
111+
halfwidth_and_fullwidth_forms,
112+
specials,
113+
linear_b,
114+
aegean_numbers,
115+
ancient_greek_numbers,
116+
ancient_symbols,
117+
phaistos_disc,
118+
lycian,
119+
carian,
120+
coptic_epact_numbers,
121+
old_italic,
122+
gothic,
123+
old_permic,
124+
ugaritic,
125+
old_persian,
126+
deseret,
127+
shavian,
128+
osmanya,
129+
osage,
130+
elbasan,
131+
caucasian_albanian,
132+
vithkuqi,
133+
linear_a,
134+
cypriot_syllabary,
135+
imperial_aramaic,
136+
palmyrene,
137+
nabataean,
138+
hatran,
139+
phoenician,
140+
lydian,
141+
meroitic_hieroglyphs,
142+
meroitic_cursive,
143+
kharoshthi,
144+
old_south_arabian,
145+
old_north_arabian,
146+
manichaean,
147+
avestan,
148+
inscriptional_parthian,
149+
inscriptional_pahlavi,
150+
psalter_pahlavi,
151+
old_turkic,
152+
old_hungarian,
153+
hanifi_rohingya,
154+
rumi_numeral_symbols,
155+
yezidi,
156+
old_sogdian,
157+
sogdian,
158+
old_uyghur,
159+
chorasmian,
160+
elymaic,
161+
brahmi,
162+
kaithi,
163+
sora_sompeng,
164+
chakma,
165+
mahajani,
166+
sharada,
167+
sinhala_archaic_numbers,
168+
khojki,
169+
multani,
170+
khudawadi,
171+
grantha,
172+
newa,
173+
tirhuta,
174+
siddham,
175+
modi,
176+
takri,
177+
ahom,
178+
dogra,
179+
warang_citi,
180+
dives_akuru,
181+
nandinagari,
182+
zanabazar_square,
183+
soyombo,
184+
pau_cin_hau,
185+
bhaiksuki,
186+
marchen,
187+
masaram_gondi,
188+
gunjala_gondi,
189+
makasar,
190+
kawi,
191+
cuneiform,
192+
early_dynastic_cuneiform,
193+
cypro_minoan,
194+
egyptian_hieroglyphs,
195+
anatolian_hieroglyphs,
196+
mro,
197+
tangsa,
198+
bassa_vah,
199+
pahawh_hmong,
200+
medefaidrin,
201+
miao,
202+
ideographic_symbols_and_punctuation,
203+
tangut,
204+
khitan_small_script,
205+
nushu,
206+
duployan,
207+
shorthand_format_controls,
208+
znamenny_musical_notation,
209+
byzantine_musical_symbols,
210+
musical_symbols,
211+
ancient_greek_musical_notation,
212+
kaktovik_numerals,
213+
mayan_numerals,
214+
tai_xuan_jing_symbols,
215+
counting_rod_numerals,
216+
mathematical_alphanumeric_symbols,
217+
sutton_signwriting,
218+
nyiakeng_puachue_hmong,
219+
toto,
220+
wancho,
221+
nag_mundari,
222+
mende_kikakui,
223+
adlam,
224+
indic_siyaq_numbers,
225+
ottoman_siyaq_numbers,
226+
arabic_mathematical_alphabetic_symbols,
227+
mahjong_tiles,
228+
domino_tiles,
229+
playing_cards,
230+
enclosed_ideographic_supplement,
231+
symbols_and_pictographs,
232+
emoticons,
233+
transport_and_map_symbols,
234+
alchemical_symbols,
235+
chess_symbols,
236+
symbols_for_legacy_computing,
237+
tags,
238+
};
239+
240+
/**
 * Returns the ScriptType assigned to the character `c`.
 *
 * `c` is a char32_t; the callers in this change pass a complete code point
 * (a UTF-16 surrogate pair is merged into one value before the call).
 */
ScriptType chr2ScriptType(char32_t c);
241+
242+
/**
 * Returns a C string associated with the given ScriptType.
 *
 * NOTE(review): presumably a human-readable script name backed by static
 * storage (caller must not free it) — confirm against ScriptType.cpp.
 */
const char* getScriptName(ScriptType type);
243+
244+
/**
 * Reports whether the given character(s) are treated as an emoji.
 *
 * `c1` is optional and defaults to 0.
 * NOTE(review): presumably `c1` is the code point following `c0` (for
 * two-code-point sequences) and 0 means "none" — confirm against ScriptType.cpp.
 */
bool isEmoji(char32_t c0, char32_t c1 = 0);
245+
}

Diff for: include/kiwi/Types.h

+5-1
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
#endif
2727

2828
#include "TemplateUtils.hpp"
29+
#include "ScriptType.h"
2930

3031
#define KIWI_DEFINE_ENUM_FLAG_OPERATORS(Type) \
3132
inline Type operator~(Type a)\
@@ -306,7 +307,10 @@ namespace kiwi
306307
uint32_t lineNumber = 0; /**< 줄 번호*/
307308
uint16_t length = 0; /**< 길이(UTF16 문자 기준) */
308309
POSTag tag = POSTag::unknown; /**< 품사 태그 */
309-
uint8_t senseId = 0; /**< 의미 번호 */
310+
union {
311+
uint8_t senseId = 0; /**< 의미 번호 */
312+
ScriptType script; /**< 유니코드 영역에 기반한 문자 타입 */
313+
};
310314
float score = 0; /**< 해당 형태소의 언어모델 점수 */
311315
float typoCost = 0; /**< 오타가 교정된 경우 오타 비용. 그렇지 않은 경우 0 */
312316
uint32_t typoFormId = 0; /**< 교정 전 오타의 형태에 대한 정보 (typoCost가 0인 경우 PreTokenizedSpan의 ID값) */

Diff for: src/KTrie.cpp

+24-1
Original file line numberDiff line numberDiff line change
@@ -527,6 +527,16 @@ namespace kiwi
527527
}
528528
}
529529

530+
/**
 * Decides whether the current character breaks the run started by the previous one.
 *
 * When both tags are one of sl, sh or sw, the tags themselves are not compared:
 * the run is broken only where the script type changes. In every other case the
 * run is broken wherever the two tags differ.
 *
 * @param prevTag    tag of the previous character
 * @param curTag     tag of the current character
 * @param prevScript script type of the previous character
 * @param curScript  script type of the current character
 * @return true if the current character does not continue the previous run
 */
inline bool isDiscontinuous(POSTag prevTag, POSTag curTag, ScriptType prevScript, ScriptType curScript)
{
	// Tags for which the script type, not the tag, determines run boundaries.
	const auto comparesByScript = [](POSTag t)
	{
		return t == POSTag::sl || t == POSTag::sh || t == POSTag::sw;
	};

	const bool bothScriptBased = comparesByScript(prevTag) && comparesByScript(curTag);
	return bothScriptBased ? (prevScript != curScript) : (prevTag != curTag);
}
539+
530540
template<ArchType arch, bool typoTolerant, bool continualTypoTolerant>
531541
size_t kiwi::splitByTrie(
532542
Vector<KGraphNode>& ret,
@@ -585,6 +595,7 @@ size_t kiwi::splitByTrie(
585595

586596
size_t lastSpecialEndPos = 0, specialStartPos = 0;
587597
POSTag chrType, lastChrType = POSTag::unknown, lastMatchedPattern = POSTag::unknown;
598+
ScriptType scriptType, lastScriptType = ScriptType::unknown;
588599
auto flushBranch = [&](size_t unkFormEndPos = 0, size_t unkFormEndPosWithSpace = 0, bool specialMatched = false)
589600
{
590601
if (!candidates.empty())
@@ -836,8 +847,17 @@ size_t kiwi::splitByTrie(
836847
}
837848

838849
chrType = identifySpecialChr(c32);
850+
scriptType = chr2ScriptType(c32);
851+
if (lastChrType == POSTag::sw &&
852+
(c32 == 0x200d || // zero width joiner
853+
(0x1f3fb <= c32 && c32 <= 0x1f3ff) || // skin color modifier
854+
scriptType == ScriptType::variation_selectors)) // variation selectors
855+
{
856+
chrType = lastChrType;
857+
scriptType = lastScriptType;
858+
}
839859

840-
if (lastChrType != chrType || lastChrType == POSTag::sso || lastChrType == POSTag::ssc)
860+
if (isDiscontinuous(lastChrType, chrType, lastScriptType, scriptType) || lastChrType == POSTag::sso || lastChrType == POSTag::ssc)
841861
{
842862
// sequence of special characters found
843863
if (lastChrType != POSTag::max && lastChrType != POSTag::unknown && lastChrType != lastMatchedPattern)
@@ -875,13 +895,15 @@ size_t kiwi::splitByTrie(
875895
if (!isSpace(str[n - 3]) && !isSpace(str[n - 2]))
876896
{
877897
lastChrType = chrType;
898+
lastScriptType = scriptType;
878899
break;
879900
}
880901
}
881902
// 혹은 공백 문자가 아예 없는 경우 너무 길어지는 것을 방지하기 위해 강제로 중단
882903
else if (n >= 8192)
883904
{
884905
lastChrType = chrType;
906+
lastScriptType = scriptType;
885907
break;
886908
}
887909

@@ -1021,6 +1043,7 @@ size_t kiwi::splitByTrie(
10211043
}
10221044
continueFor:
10231045
lastChrType = chrType;
1046+
lastScriptType = scriptType;
10241047
}
10251048

10261049
// sequence of special characters found

Diff for: src/Kiwi.cpp

+19
Original file line numberDiff line numberDiff line change
@@ -598,6 +598,23 @@ namespace kiwi
598598
return ++first;
599599
}
600600

601+
/**
 * Sets `info.script` from the first character of `info.str` for tokens tagged
 * sl, sh or sw, and retags the token as sl when that character is Latin.
 *
 * The token is left untouched when:
 *  - its tag is not one of sl, sh, sw;
 *  - its morph has a non-empty `kform`;
 *  - its surface string is empty.
 *
 * NOTE(review): `script` appears to share storage with `senseId` (anonymous
 * union in Types.h), so writing it replaces the sense id the caller stored just
 * before this call — presumably why morphs with a non-empty kform are skipped;
 * confirm.
 *
 * @param info token to update in place
 */
inline void updateTokenInfoScript(TokenInfo& info)
{
	if (!(info.tag == POSTag::sl || info.tag == POSTag::sh || info.tag == POSTag::sw)) return;
	if (info.morph && info.morph->kform && !info.morph->kform->empty()) return;
	if (info.str.empty()) return;

	char32_t c = info.str[0];
	// Merge a surrogate pair only when a second code unit actually exists.
	// A lone trailing high surrogate is classified as-is instead of being
	// combined with whatever lies past the end of the string.
	if (isHighSurrogate(c) && info.str.size() > 1)
	{
		c = mergeSurrogate(c, info.str[1]);
	}

	info.script = chr2ScriptType(c);
	if (info.script == ScriptType::latin)
	{
		info.tag = POSTag::sl;
	}
}
617+
601618
inline void insertPathIntoResults(
602619
vector<TokenResult>& ret,
603620
Vector<SpecialState>& spStatesByRet,
@@ -718,6 +735,8 @@ namespace kiwi
718735
token.score = s.wordScore;
719736
token.typoCost = s.typoCost;
720737
token.typoFormId = s.typoFormId;
738+
token.senseId = s.morph->senseId;
739+
updateTokenInfoScript(token);
721740
auto ptId = nodeInWhichPretokenized[s.nodeId] + 1;
722741
if (ptId)
723742
{

0 commit comments

Comments
 (0)