Skip to content

Commit 2b204f6

Browse files
authored
Merge pull request #198 from bab2min/dev/saisiot
사이시옷 분석 기능 추가
2 parents e371e0d + 2901ad8 commit 2b204f6

19 files changed

+512
-364
lines changed

Diff for: ModelGenerator/morphemes.txt

+298-296
Large diffs are not rendered by default.

Diff for: include/kiwi/Form.h

+6-3
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,7 @@ namespace kiwi
109109
/**< 선행형태소의 모음조화 조건 */
110110
CondPolarity polar() const { return static_cast<CondPolarity>((vpPack >> 4) & 0x7); }
111111

112+
/**< 추가 분석이 가능한 형태소인지(파생어나 사이시옷이 포함된 합성명사 등) */
112113
bool complex() const { return !!(vpPack & 0x80); }
113114

114115
void setVowel(CondVowel v)
@@ -141,8 +142,9 @@ namespace kiwi
141142
const KString* kform = nullptr;
142143
POSTag tag = POSTag::unknown;
143144
CondVowel vowel : 4;
144-
CondPolarity polar : 3;
145+
CondPolarity polar : 2;
145146
bool complex : 1;
147+
bool saisiot : 1;
146148
uint8_t senseId = 0;
147149
uint8_t combineSocket = 0;
148150
int32_t combined = 0;
@@ -205,7 +207,8 @@ namespace kiwi
205207
CondVowel vowel = CondVowel::none;
206208
CondPolarity polar = CondPolarity::none;
207209
uint8_t formHash = 0;
208-
uint8_t zCodaAppendable = 0;
210+
uint8_t zCodaAppendable : 1;
211+
uint8_t zSiotAppendable : 1;
209212

210213
Form();
211214
~Form();
@@ -251,7 +254,7 @@ namespace kiwi
251254
* @param morphBase 형태소 배열의 시작 위치
252255
* @return 최적화된 형태 정보
253256
*/
254-
Form bake(const FormRaw& o, const Morpheme* morphBase, bool zCodaAppendable, const Vector<uint32_t>& additionalCands = {});
257+
Form bake(const FormRaw& o, const Morpheme* morphBase, bool zCodaAppendable, bool zSiotAppendable, const Vector<uint32_t>& additionalCands = {});
255258

256259
/**
257260
* @brief 변경 가능한 형태소 정보를 bake하여 최적화한다.

Diff for: include/kiwi/PatternMatcher.h

+2
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,8 @@ namespace kiwi
2525
splitComplex = 1 << 22, /**< 더 작은 단위로 분할될 수 있는 형태소는 더 분할하여 매칭한다 */
2626
zCoda = 1 << 23, /**< 어미 및 조사에 덧붙은 받침이 있는 경우 이를 분리하여 z_coda 태그로 매칭한다 */
2727
compatibleJamo = 1 << 24, /**< 출력시 한글 첫가끝 자모를 호환가능한 자모로 변환한다. */
28+
splitSaisiot = 1 << 25, /**< 사이시옷이 포함된 합성명사를 분리하여 매칭한다. */
29+
mergeSaisiot = 1 << 26, /**< 사이시옷이 포함된 것으로 추정되는 명사를 결합하여 매칭한다. */
2830
joinVSuffix = joinVerbSuffix | joinAdjSuffix,
2931
joinAffix = joinNounPrefix | joinNounSuffix | joinVerbSuffix | joinAdjSuffix | joinAdvSuffix,
3032
all = url | email | hashtag | mention | serial | emoji | zCoda,

Diff for: include/kiwi/TagUtils.h

+6-1
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,12 @@ namespace kiwi
3131
{
3232
return POSTag::jks <= tag && tag <= POSTag::jc;
3333
}
34-
34+
35+
/**
 * @brief Tests whether a tag falls in the inclusive POSTag range [nng, nnb].
 *
 * Elsewhere in this commit it replaces the hand-written
 * `POSTag::nng <= tag && tag <= POSTag::nnb` checks, which the token-joining
 * rule comments abbreviate as "NN.".
 *
 * @param tag part-of-speech tag to test; compared as-is.
 * @return true when `tag` compares between POSTag::nng and POSTag::nnb, both ends included.
 * @note The result depends on the declaration order of POSTag. The enumerators
 *       lying between nng and nnb are not visible here; verify against Types.h.
 */
inline bool isNNClass(POSTag tag)
36+
{
37+
return POSTag::nng <= tag && tag <= POSTag::nnb;
38+
}
39+
3540
inline bool isSuffix(POSTag tag)
3641
{
3742
tag = clearIrregular(tag);

Diff for: include/kiwi/Types.h

+2-2
Original file line numberDiff line numberDiff line change
@@ -211,7 +211,7 @@ namespace kiwi
211211
w_url, w_email, w_mention, w_hashtag, w_serial, w_emoji,
212212
jks, jkc, jkg, jko, jkb, jkv, jkq, jx, jc,
213213
ep, ef, ec, etn, etm,
214-
z_coda,
214+
z_coda, z_siot,
215215
user0, user1, user2, user3, user4,
216216
p, /**< 분할된 동사/형용사를 나타내는데 사용됨 */
217217
max, /**< POSTag의 총 개수를 나타내는 용도 */
@@ -275,7 +275,7 @@ namespace kiwi
275275
* @brief 선행 형태소의 양/음성 조건(모음 조화)과 관련된 열거형
276276
*
277277
*/
278-
enum class CondPolarity : char
278+
enum class CondPolarity : uint8_t
279279
{
280280
none, /**< 조건이 설정되지 않음 */
281281
positive, /**< 선행 형태소가 양성(ㅏ,ㅑ,ㅗ)인 경우만 등장 가능 */

Diff for: models/base/default.dict

-2
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,6 @@
1717
# * (점수)는 생략시 0으로 처리됩니다.
1818
# 예) 사겼다 사귀/VV + 었/EP + 다/EF -1.0
1919
#
20-
# 현재는 공백을 포함하는 다어절 형태를 등록할 수 없습니다.
21-
#
2220
# <규칙 기반의 변형된 이형태를 추가하는 경우>
2321
# (형태 규칙)$ \t (변형된 형태/품사태그) \t (점수)
2422
# 예) 요$ 용/EF -5

Diff for: models/base/sj.knlm

+2-2
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
version https://git-lfs.github.com/spec/v1
2-
oid sha256:52d50761ed4aeea82e3be9f6fbb4724b75f526e56368bdc76dd530049ef9a07e
3-
size 35828400
2+
oid sha256:00f93f6abdc6bc31b3995c063564ffd558c475c2e5f5ea1e2ac38b64b4e06842
3+
size 35836336

Diff for: models/base/sj.morph

+2-2
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
version https://git-lfs.github.com/spec/v1
2-
oid sha256:1c8a9999841059a00ef2c76dc190f651e698c0516209da524930c9818f01279c
3-
size 3581294
2+
oid sha256:125fb05ad20c0d8d7ebb45591b8acaadcea0e740197aceff1ee2d14e8c8195e4
3+
size 3586754

Diff for: models/base/skipbigram.mdl

+2-2
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
version https://git-lfs.github.com/spec/v1
2-
oid sha256:ae4f0ee268f516bf4dfe282337f8773ab900ce081fefc1167b5c29259b1c465b
3-
size 3186444
2+
oid sha256:3f5239d9542be89970c454336538375c8c59df2a678988ab79c512bed2301e78
3+
size 3186448

Diff for: models/base/typo.dict

+1
Original file line numberDiff line numberDiff line change
@@ -15,3 +15,4 @@
1515
편찬 편찮/VA -5
1616
귀찬 귀찮/VA -5
1717
하찬 하찮/VA -5
18+
시끄러 시끄럽/VA-I + 어/EF -5

Diff for: src/Form.cpp

+14-4
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
#include <cassert>
1+
#include <cassert>
22
#include <algorithm>
33
#include <kiwi/Utils.h>
44
#include <kiwi/Form.h>
@@ -70,7 +70,10 @@ namespace kiwi
7070

7171
DEFINE_SERIALIZER_OUTSIDE(FormRaw, form, candidate);
7272

73-
Form::Form() = default;
73+
// Default constructor: explicitly clears the zCodaAppendable and
// zSiotAppendable flags. Form.h declares both as 1-bit bit-fields, and a
// bit-field cannot carry a default member initializer before C++20, so they
// are zeroed in the initializer list here. Members not listed here are left
// to their in-class initializers.
Form::Form()
74+
: zCodaAppendable(0), zSiotAppendable(0)
75+
{
76+
}
7477

7578
Form::~Form() = default;
7679

@@ -87,7 +90,7 @@ namespace kiwi
8790
return ComparatorIgnoringSpace::less(form, o.form);
8891
}
8992

90-
Form bake(const FormRaw& o, const Morpheme* morphBase, bool zCodaAppendable, const Vector<uint32_t>& additionalCands)
93+
Form bake(const FormRaw& o, const Morpheme* morphBase, bool zCodaAppendable, bool zSiotAppendable, const Vector<uint32_t>& additionalCands)
9194
{
9295
Form ret;
9396
ret.numSpaces = count(o.form.begin(), o.form.end(), u' ');
@@ -102,6 +105,7 @@ namespace kiwi
102105
ret.candidate[i + o.candidate.size()] = morphBase + additionalCands[i];
103106
}
104107
ret.zCodaAppendable = zCodaAppendable ? 1 : 0;
108+
ret.zSiotAppendable = zSiotAppendable ? 1 : 0;
105109
return ret;
106110
}
107111

@@ -112,19 +116,25 @@ namespace kiwi
112116
ret.tag = o.tag;
113117
ret.vowel = o.vowel();
114118
ret.polar = o.polar();
115-
ret.complex = o.complex();
116119
ret.combineSocket = o.combineSocket;
117120
ret.combined = o.combined;
118121
ret.userScore = o.userScore;
119122
ret.lmMorphemeId = o.lmMorphemeId;
120123
ret.origMorphemeId = o.origMorphemeId;
121124
ret.senseId = o.senseId;
122125
ret.chunks = FixedPairVector<const Morpheme*, std::pair<uint8_t, uint8_t>>{ o.chunks.size() };
126+
127+
bool hasSaisiot = false;
123128
for (size_t i = 0; i < o.chunks.size(); ++i)
124129
{
125130
ret.chunks[i] = morphBase + o.chunks[i];
126131
ret.chunks.getSecond(i) = o.chunkPositions[i];
132+
hasSaisiot = hasSaisiot || (morphBase[o.chunks[i]].tag == POSTag::z_siot);
127133
}
134+
// 사이시옷이 포함된 경우는 saisiot을 true로, 그 외에는 complex를 true로 설정
135+
ret.complex = o.complex() && !hasSaisiot;
136+
ret.saisiot = o.complex() && hasSaisiot;
137+
128138
return ret;
129139
}
130140

Diff for: src/Joiner.cpp

+1
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ namespace kiwi
2727
if (l == POSTag::sso || l == POSTag::ssc) return false;
2828
if (r == POSTag::sso) return true;
2929
if ((isJClass(l) || isEClass(l)) && r == POSTag::ss) return true;
30+
if (l == POSTag::z_siot && isNNClass(r)) return false;
3031

3132
if (r == POSTag::vx && rform.size() == 1 && (rform[0] == u'' || rform[0] == u'')) return false;
3233

Diff for: src/KTrie.cpp

+17-26
Original file line numberDiff line numberDiff line change
@@ -158,19 +158,16 @@ namespace kiwi
158158
};
159159

160160
template<bool typoTolerant>
161-
bool getZCodaAppendable(
162-
const Form* foundCand,
163-
const Form* formBase
164-
)
161+
const Form& getForm(const Form* foundCand, const Form* formBase)
165162
{
166163
if (typoTolerant)
167164
{
168165
auto tCand = reinterpret_cast<const TypoForm*>(foundCand);
169-
return tCand->form(formBase).zCodaAppendable;
166+
return tCand->form(formBase);
170167
}
171168
else
172169
{
173-
return foundCand->zCodaAppendable;
170+
return *foundCand;
174171
}
175172
}
176173

@@ -229,23 +226,6 @@ namespace kiwi
229226
return true;
230227
}
231228

232-
template<bool typoTolerant>
233-
size_t getFormLength(
234-
const Form* form,
235-
const Form* formBase
236-
)
237-
{
238-
if (typoTolerant)
239-
{
240-
auto tCand = reinterpret_cast<const TypoForm*>(form);
241-
return tCand->form(formBase).form.size();
242-
}
243-
else
244-
{
245-
return form->form.size();
246-
}
247-
}
248-
249229
inline void removeUnconnected(Vector<KGraphNode>& ret, const Vector<KGraphNode>& graph, const Vector<std::pair<uint32_t, uint32_t>>& endPosMap)
250230
{
251231
thread_local Vector<uint8_t> connectedList;
@@ -549,7 +529,7 @@ namespace kiwi
549529
if (!cand) break;
550530
else if (!trie.hasSubmatch(cand))
551531
{
552-
if (getFormLength<typoTolerant>(cand, formBase) <= 1) break;
532+
if (getForm<typoTolerant>(cand, formBase).form.size() <= 1) break;
553533
inserted = true;
554534
if (!insertCandidates(candidates, cand, formBase, typoPtrs, str, nonSpaces, 0, boundary, continualTypoCost / 2)) break;
555535
}
@@ -753,7 +733,7 @@ size_t kiwi::splitByTrie(
753733
}
754734
};
755735

756-
bool zCodaFollowable = false;
736+
bool zCodaFollowable = false, zSiotFollowable = false;
757737
const Form* const fallbackFormBegin = trie.value((size_t)POSTag::nng);
758738
const Form* const fallbackFormEnd = trie.value((size_t)POSTag::max);
759739
for (; n < str.size(); ++n)
@@ -1006,7 +986,12 @@ size_t kiwi::splitByTrie(
1006986
{
1007987
candidates.emplace_back(formBase + defaultTagSize + (c - 0x11A8) - 1, 0, nonSpaces.size() - 1);
1008988
}
989+
else if (!!(matchOptions & (Match::splitSaisiot | Match::mergeSaisiot)) && zSiotFollowable && c == 0x11BA && n + 1 < str.size() && isHangulSyllable(str[n + 1]))
990+
{
991+
candidates.emplace_back(formBase + defaultTagSize + (0x11BA - 0x11A8) - 1, 0, nonSpaces.size() - 1);
992+
}
1009993
zCodaFollowable = false;
994+
zSiotFollowable = false;
1010995

1011996
// invalidate typo nodes
1012997
if (continualTypoTolerant)
@@ -1107,7 +1092,12 @@ size_t kiwi::splitByTrie(
11071092
{
11081093
candidates.emplace_back(formBase + defaultTagSize + (c - 0x11A8) - 1, 0, nonSpaces.size() - 1);
11091094
}
1095+
else if (!!(matchOptions & (Match::splitSaisiot | Match::mergeSaisiot)) && zSiotFollowable && c == 0x11BA && n + 1 < str.size() && isHangulSyllable(str[n + 1]))
1096+
{
1097+
candidates.emplace_back(formBase + defaultTagSize + (0x11BA - 0x11A8) - 1, 0, nonSpaces.size() - 1);
1098+
}
11101099
zCodaFollowable = false;
1100+
zSiotFollowable = false;
11111101

11121102
if (continualTypoTolerant && lastChrType == POSTag::max)
11131103
{
@@ -1128,7 +1118,8 @@ size_t kiwi::splitByTrie(
11281118
if (!cand) break;
11291119
else if (!trie.hasSubmatch(cand))
11301120
{
1131-
zCodaFollowable = zCodaFollowable || getZCodaAppendable<typoTolerant>(cand, formBase);
1121+
zCodaFollowable = zCodaFollowable || getForm<typoTolerant>(cand, formBase).zCodaAppendable;
1122+
zSiotFollowable = zSiotFollowable || getForm<typoTolerant>(cand, formBase).zSiotAppendable;
11321123
if (!insertCandidates(candidates, cand, formBase, typoPtrs, str, nonSpaces)) break;
11331124
}
11341125
}

Diff for: src/Kiwi.cpp

+24-6
Original file line numberDiff line numberDiff line change
@@ -541,7 +541,12 @@ namespace kiwi
541541
template<class TokenInfoIt>
542542
TokenInfoIt joinAffixTokens(TokenInfoIt first, TokenInfoIt last, Match matchOptions)
543543
{
544-
if (!(matchOptions & (Match::joinNounPrefix | Match::joinNounSuffix | Match::joinVerbSuffix | Match::joinAdjSuffix | Match::joinAdvSuffix))) return last;
544+
if (!(matchOptions & (Match::joinNounPrefix
545+
| Match::joinNounSuffix
546+
| Match::joinVerbSuffix
547+
| Match::joinAdjSuffix
548+
| Match::joinAdvSuffix
549+
| Match::mergeSaisiot))) return last;
545550
if (std::distance(first, last) < 2) return last;
546551

547552
auto next = first;
@@ -554,7 +559,7 @@ namespace kiwi
554559
// XPN + (NN. | SN) => (NN. | SN)
555560
if (!!(matchOptions & Match::joinNounPrefix)
556561
&& current.tag == POSTag::xpn
557-
&& ((POSTag::nng <= nextToken.tag && nextToken.tag <= POSTag::nnb) || nextToken.tag == POSTag::sn)
562+
&& (isNNClass(nextToken.tag) || nextToken.tag == POSTag::sn)
558563
)
559564
{
560565
concatTokens(current, nextToken, nextToken.tag);
@@ -563,7 +568,7 @@ namespace kiwi
563568
// (NN. | SN) + XSN => (NN. | SN)
564569
else if (!!(matchOptions & Match::joinNounSuffix)
565570
&& nextToken.tag == POSTag::xsn
566-
&& ((POSTag::nng <= current.tag && current.tag <= POSTag::nnb) || current.tag == POSTag::sn)
571+
&& (isNNClass(current.tag) || current.tag == POSTag::sn)
567572
)
568573
{
569574
concatTokens(current, nextToken, current.tag);
@@ -572,7 +577,7 @@ namespace kiwi
572577
// (NN. | XR) + XSV => VV
573578
else if (!!(matchOptions & Match::joinVerbSuffix)
574579
&& clearIrregular(nextToken.tag) == POSTag::xsv
575-
&& ((POSTag::nng <= current.tag && current.tag <= POSTag::nnb) || current.tag == POSTag::xr)
580+
&& (isNNClass(current.tag) || current.tag == POSTag::xr)
576581
)
577582
{
578583
concatTokens(current, nextToken, setIrregular(POSTag::vv, isIrregular(nextToken.tag)));
@@ -581,7 +586,7 @@ namespace kiwi
581586
// (NN. | XR) + XSA => VA
582587
else if (!!(matchOptions & Match::joinAdjSuffix)
583588
&& clearIrregular(nextToken.tag) == POSTag::xsa
584-
&& ((POSTag::nng <= current.tag && current.tag <= POSTag::nnb) || current.tag == POSTag::xr)
589+
&& (isNNClass(current.tag) || current.tag == POSTag::xr)
585590
)
586591
{
587592
concatTokens(current, nextToken, setIrregular(POSTag::va, isIrregular(nextToken.tag)));
@@ -590,12 +595,24 @@ namespace kiwi
590595
// (NN. | XR) + XSM => MAG
591596
else if (!!(matchOptions & Match::joinAdvSuffix)
592597
&& nextToken.tag == POSTag::xsm
593-
&& ((POSTag::nng <= current.tag && current.tag <= POSTag::nnb) || current.tag == POSTag::xr)
598+
&& (isNNClass(current.tag) || current.tag == POSTag::xr)
594599
)
595600
{
596601
concatTokens(current, nextToken, POSTag::mag);
597602
++next;
598603
}
604+
// NN. + Z_SIOT + NN. => NN
605+
else if (!!(matchOptions & Match::mergeSaisiot)
606+
&& nextToken.tag == POSTag::z_siot
607+
&& isNNClass(current.tag)
608+
&& next + 1 != last
609+
&& isNNClass((next + 1)->tag))
610+
{
611+
current.str.back() += (0x11BA - 0x11A7);
612+
concatTokens(current, *(next + 1), POSTag::nng);
613+
++next;
614+
++next;
615+
}
599616
else
600617
{
601618
++first;
@@ -1047,6 +1064,7 @@ namespace kiwi
10471064
topN,
10481065
false,
10491066
!!(matchOptions & Match::splitComplex),
1067+
!!(matchOptions & Match::splitSaisiot),
10501068
blocklist
10511069
);
10521070
insertPathIntoResults(ret, spStatesByRet, res, topN, matchOptions, integrateAllomorph, positionTable, wordPositions, pretokenizedGroup, nodeInWhichPretokenized);

0 commit comments

Comments
 (0)