Skip to content

Commit a9ee8c3

Browse files
authored
Merge pull request #199 from bab2min/dev/saisiot
사이시옷 분석 기능 보강
2 parents 2a5291f + 2c162c7 commit a9ee8c3

File tree

14 files changed: +57 / -33 lines changed

Diff for: CMakeLists.txt

+1-1
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
11
cmake_minimum_required(VERSION 3.12)
22

3-
project(kiwi VERSION 0.19.1 DESCRIPTION "Kiwi, Korean Intelligent Word Identifier")
3+
project(kiwi VERSION 0.20.0 DESCRIPTION "Kiwi, Korean Intelligent Word Identifier")
44

55
set ( CMAKE_CXX_STANDARD 14 )
66
set ( CMAKE_VERBOSE_MAKEFILE true )

Diff for: ModelGenerator/morphemes.txt

+4-4
Original file line number | Diff line number | Diff line change
@@ -6370,7 +6370,7 @@
63706370
와인 NNG 270
63716371
은혜 NNG 270
63726372
공평 NNG 270
6373-
횟수 NNG 270
6373+
횟수 NNG 270 complex 회/NNG ᆺ/Z_SIOT 수/NNG 010112
63746374
반짝이 VV 270 complex 반짝/MAG 이/XSV 0223
63756375
서랍 NNG 270
63766376
허무 NNG 270
@@ -14689,7 +14689,7 @@ LG화학 NNP 82
1468914689
조흥은행 NNP 75
1469014690
노라 EC 75
1469114691
영양가 NNG 75
14692-
툇마루 NNG 75
14692+
툇마루 NNG 75 complex 퇴/NNG ᆺ/Z_SIOT 마루/NNG 010113
1469314693
오묘 XR 75
1469414694
의미심장 XR 75
1469514695
주인집 NNG 75
@@ -16670,7 +16670,7 @@ LG화학 NNP 82
1667016670
막중 XR 61
1667116671
엄중 XR 61
1667216672
경박 XR 61
16673-
셋방 NNG 61
16673+
셋방 NNG 61 complex 세/NNG ᆺ/Z_SIOT 방/NNG 010112
1667416674
애무 NNG 61
1667516675
천진 NNG 61
1667616676
맞아들이 VV 61 complex 맞/VV 어/EC 들이/VV 011224
@@ -23962,7 +23962,7 @@ SK그룹 NNP 33
2396223962
판매자 NNG 33
2396323963
차두리 NNP 33
2396423964
자필 NNG 33
23965-
곳간 NNG 33
23965+
곳간 NNG 33 complex 고/NNG ᆺ/Z_SIOT 간/NNB 010112
2396623966
에베레스트 NNP 33
2396723967
국전 NNG 33
2396823968
온존 NNG 33

Diff for: bindings/java/kr/pe/bab2min/Kiwi.java

+1-1
Original file line number | Diff line number | Diff line change
@@ -12,7 +12,7 @@
1212

1313
public class Kiwi implements AutoCloseable {
1414
private long _inst;
15-
final private static String _version = "0.19.1";
15+
final private static String _version = "0.20.0";
1616

1717
public static class Match {
1818
final static public int none = 0,

Diff for: include/kiwi/Form.h

+1-1
Original file line number | Diff line number | Diff line change
@@ -2,7 +2,7 @@
22
* @file Form.h
33
* @author bab2min ([email protected])
44
* @brief 형태 및 형태소에 관한 정보를 담는 구조체들이 선언된 헤더
5-
* @version 0.19.0
5+
* @version 0.20.0
66
* @date 2024-07-01
77
*
88
*

Diff for: include/kiwi/Kiwi.h

+1-1
Original file line number | Diff line number | Diff line change
@@ -2,7 +2,7 @@
22
* @file Kiwi.h
33
* @author bab2min ([email protected])
44
* @brief Kiwi C++ API를 담고 있는 헤더 파일
5-
* @version 0.19.0
5+
* @version 0.20.0
66
* @date 2024-07-01
77
*
88
*

Diff for: include/kiwi/Macro.h

+2-2
Original file line number | Diff line number | Diff line change
@@ -4,7 +4,7 @@
44
#define KIWI_STR(x) KIWI_STR_HELPER(x)
55

66
#define KIWI_VERSION_MAJOR 0
7-
#define KIWI_VERSION_MINOR 19
8-
#define KIWI_VERSION_PATCH 1
7+
#define KIWI_VERSION_MINOR 20
8+
#define KIWI_VERSION_PATCH 0
99

1010
#define KIWI_VERSION_STRING KIWI_STR(KIWI_VERSION_MAJOR) "." KIWI_STR(KIWI_VERSION_MINOR) "." KIWI_STR(KIWI_VERSION_PATCH)

Diff for: include/kiwi/SwTokenizer.h

+1-1
Original file line number | Diff line number | Diff line change
@@ -2,7 +2,7 @@
22
* @file SwTokenizer.h
33
* @author bab2min ([email protected])
44
* @brief Subword Tokenizer
5-
* @version 0.19.0
5+
* @version 0.20.0
66
* @date 2024-07-01
77
*
88
*

Diff for: include/kiwi/Types.h

+1-1
Original file line number | Diff line number | Diff line change
@@ -2,7 +2,7 @@
22
* @file Types.h
33
* @author bab2min ([email protected])
44
* @brief Kiwi C++ API에 쓰이는 주요 타입들을 모아놓은 헤더 파일
5-
* @version 0.19.0
5+
* @version 0.20.0
66
* @date 2024-07-01
77
*
88
*

Diff for: include/kiwi/TypoTransformer.h

+1-1
Original file line number | Diff line number | Diff line change
@@ -2,7 +2,7 @@
22
* @file TypoTransformer.h
33
* @author bab2min ([email protected])
44
* @brief 오타 교정에 사용되는 TypoTransformer 및 관련 클래스들을 정의합니다.
5-
* @version 0.19.0
5+
* @version 0.20.0
66
* @date 2024-09-15
77
*
88
*

Diff for: include/kiwi/capi.h

+1-1
Original file line number | Diff line number | Diff line change
@@ -2,7 +2,7 @@
22
* @file capi.h
33
* @author bab2min ([email protected])
44
* @brief Kiwi C API를 담고 있는 헤더 파일
5-
* @version 0.19.0
5+
* @version 0.20.0
66
* @date 2024-07-01
77
*
88
*

Diff for: models/base/sj.morph

+2-2
Original file line number | Diff line number | Diff line change
@@ -1,3 +1,3 @@
11
version https://git-lfs.github.com/spec/v1
2-
oid sha256:125fb05ad20c0d8d7ebb45591b8acaadcea0e740197aceff1ee2d14e8c8195e4
3-
size 3586754
2+
oid sha256:8f92b96709467b4941a8d98efda10a803f7a7457bb5c7d9d18b8466ec3ededb6
3+
size 3586826

Diff for: src/Kiwi.cpp

+1
Original file line number | Diff line number | Diff line change
@@ -1065,6 +1065,7 @@ namespace kiwi
10651065
false,
10661066
!!(matchOptions & Match::splitComplex),
10671067
!!(matchOptions & Match::splitSaisiot),
1068+
!!(matchOptions & Match::mergeSaisiot),
10681069
blocklist
10691070
);
10701071
insertPathIntoResults(ret, spStatesByRet, res, topN, matchOptions, integrateAllomorph, positionTable, wordPositions, pretokenizedGroup, nodeInWhichPretokenized);

Diff for: src/PathEvaluator.hpp

+29-17
Original file line number | Diff line number | Diff line change
@@ -119,6 +119,7 @@ namespace kiwi
119119
bool openEnd,
120120
bool splitComplex = false,
121121
bool splitSaisiot = false,
122+
bool mergeSaisiot = false,
122123
const std::unordered_set<const Morpheme*>* blocklist = nullptr
123124
);
124125

@@ -136,6 +137,7 @@ namespace kiwi
136137
const Vector<SpecialState>& prevSpStates,
137138
bool splitComplex = false,
138139
bool splitSaisiot = false,
140+
bool mergeSaisiot = false,
139141
const std::unordered_set<const Morpheme*>* blocklist = nullptr
140142
);
141143

@@ -525,7 +527,7 @@ namespace kiwi
525527

526528
// fill the rest information of resultOut
527529
newPath.wid = lastSeqId;
528-
if (curMorph->chunks.empty() || curMorph->complex)
530+
if (curMorph->chunks.empty() || curMorph->complex || curMorph->saisiot)
529531
{
530532
newPath.combineSocket = curMorph->combineSocket;
531533
newPath.ownFormId = ownFormId;
@@ -570,7 +572,7 @@ namespace kiwi
570572

571573
// fill the rest information of resultOut
572574
newPath.wid = lastSeqId;
573-
if (curMorph->chunks.empty() || curMorph->complex)
575+
if (curMorph->chunks.empty() || curMorph->complex || curMorph->saisiot)
574576
{
575577
newPath.combineSocket = curMorph->combineSocket;
576578
newPath.ownFormId = ownFormId;
@@ -622,7 +624,7 @@ namespace kiwi
622624

623625
// fill the rest information of resultOut
624626
newPath.wid = lastSeqId;
625-
if (curMorph->chunks.empty() || curMorph->complex)
627+
if (curMorph->chunks.empty() || curMorph->complex || curMorph->saisiot)
626628
{
627629
newPath.combineSocket = curMorph->combineSocket;
628630
newPath.ownFormId = ownFormId;
@@ -659,7 +661,7 @@ namespace kiwi
659661

660662
const Morpheme* lastMorph;
661663
Wid firstWid;
662-
if (curMorph->chunks.empty() || curMorph->complex)
664+
if (curMorph->chunks.empty() || curMorph->complex || curMorph->saisiot)
663665
{
664666
lastMorph = curMorph->getCombined() ? curMorph->getCombined() : curMorph;
665667
firstWid = curMorph->lmMorphemeId;
@@ -691,8 +693,10 @@ namespace kiwi
691693
{
692694
for (auto& prevPath : cache[prev - startNode])
693695
{
694-
// 사이시옷 뒤에 명사가 아닌 태그가 오는 경우 제외
695-
if (prevPath.morpheme->tag == POSTag::z_siot && !isNNClass(curMorph->tag))
696+
// 사이시옷 뒤에 명사가 아닌 태그가 오거나 공백이 있는 경우 제외
697+
if (prevPath.morpheme->tag == POSTag::z_siot && (
698+
!isNNClass(curMorph->tag) || prev->endPos < node->startPos
699+
))
696700
{
697701
continue;
698702
}
@@ -701,7 +705,7 @@ namespace kiwi
701705
if (prevPath.combineSocket)
702706
{
703707
// merge <v> <chunk> with only the same socket
704-
if (prevPath.combineSocket != curMorph->combineSocket || (curMorph->chunks.empty() || curMorph->complex))
708+
if (prevPath.combineSocket != curMorph->combineSocket || (curMorph->chunks.empty() || curMorph->complex || curMorph->saisiot))
705709
{
706710
continue;
707711
}
@@ -747,7 +751,7 @@ namespace kiwi
747751
}
748752

749753
auto cLmState = prevPath.lmState;
750-
if (curMorph->combineSocket && (curMorph->chunks.empty() || curMorph->complex))
754+
if (curMorph->combineSocket && (curMorph->chunks.empty() || curMorph->complex || curMorph->saisiot))
751755
{
752756
// no-op
753757
}
@@ -760,7 +764,7 @@ namespace kiwi
760764
}
761765
float ll = cLmState.next(langMdl, firstWid);
762766
candScore += ll;
763-
if (!(curMorph->chunks.empty() || curMorph->complex))
767+
if (!(curMorph->chunks.empty() || curMorph->complex || curMorph->saisiot))
764768
{
765769
for (size_t i = 1; i < curMorph->chunks.size(); ++i)
766770
{
@@ -833,6 +837,7 @@ namespace kiwi
833837
const Vector<SpecialState>& prevSpStates,
834838
bool splitComplex,
835839
bool splitSaisiot,
840+
bool mergeSaisiot,
836841
const std::unordered_set<const Morpheme*>* blocklist
837842
)
838843
{
@@ -893,6 +898,11 @@ namespace kiwi
893898
// 사이시옷(zSiot)을 위한 지름길
894899
if (curMorph->tag == POSTag::z_siot)
895900
{
901+
if (!(splitSaisiot || mergeSaisiot))
902+
{
903+
continue;
904+
}
905+
896906
for (auto* prev = node->getPrev(); prev; prev = prev->getSibling())
897907
{
898908
for (auto& p : cache[prev - startNode])
@@ -912,7 +922,7 @@ namespace kiwi
912922
}
913923

914924
// if the morpheme has chunk set
915-
if (!(curMorph->chunks.empty()|| curMorph->complex))
925+
if (!(curMorph->chunks.empty() || curMorph->complex || curMorph->saisiot))
916926
{
917927
// '하다/하게/하지'가 '다/게/지'로 축약된 경우인데 앞에 공백이 있는 경우는 탐색후보에서 제외
918928
if (node->prev && node[-(int)node->prev].endPos < node->startPos
@@ -1019,13 +1029,13 @@ namespace kiwi
10191029
float scoreDiff = cur->accScore - prev->accScore;
10201030
float typoCostDiff = cur->accTypoCost - prev->accTypoCost;
10211031
auto morpheme = cur->morpheme;
1022-
size_t numNewTokens = (morpheme->chunks.empty() || morpheme->complex) ? 1 : morpheme->chunks.size();
1032+
const size_t numNewTokens = (morpheme->chunks.empty() || morpheme->complex || morpheme->saisiot) ? 1 : morpheme->chunks.size();
10231033
auto& gNode = graph[csearcher(cur)];
10241034
scoreDiff += typoCostDiff * typoCostWeight;
10251035
scoreDiff /= numNewTokens;
10261036
typoCostDiff /= numNewTokens;
10271037

1028-
if (morpheme->chunks.empty() || morpheme->complex)
1038+
if (morpheme->chunks.empty() || morpheme->complex || morpheme->saisiot)
10291039
{
10301040
ret.emplace_back(
10311041
unifyMorpheme(morpheme),
@@ -1093,6 +1103,7 @@ namespace kiwi
10931103
bool openEnd,
10941104
bool splitComplex,
10951105
bool splitSaisiot,
1106+
bool mergeSaisiot,
10961107
const std::unordered_set<const Morpheme*>* blocklist
10971108
)
10981109
{
@@ -1148,24 +1159,24 @@ namespace kiwi
11481159
{
11491160
evalPath<LmState>(kw, startNode, node, topN, cache,
11501161
ownFormList, i, ownFormId, node->form->candidate,
1151-
false, uniqStates, splitComplex, splitSaisiot, blocklist);
1162+
false, uniqStates, splitComplex, splitSaisiot, mergeSaisiot, blocklist);
11521163
if (all_of(node->form->candidate.begin(), node->form->candidate.end(), [](const Morpheme* m)
11531164
{
1154-
return m->combineSocket || (!m->chunks.empty() && !m->complex);
1165+
return m->combineSocket || !(m->chunks.empty() || m->complex || m->saisiot);
11551166
}))
11561167
{
11571168
ownFormList.emplace_back(node->form->form);
11581169
ownFormId = ownFormList.size();
11591170
evalPath<LmState>(kw, startNode, node, topN, cache,
11601171
ownFormList, i, ownFormId, unknownNodeLCands,
1161-
true, uniqStates, splitComplex, splitSaisiot, blocklist);
1172+
true, uniqStates, splitComplex, splitSaisiot, mergeSaisiot, blocklist);
11621173
};
11631174
}
11641175
else
11651176
{
11661177
evalPath<LmState>(kw, startNode, node, topN, cache,
11671178
ownFormList, i, ownFormId, unknownNodeCands,
1168-
true, uniqStates, splitComplex, splitSaisiot, blocklist);
1179+
true, uniqStates, splitComplex, splitSaisiot, mergeSaisiot, blocklist);
11691180
}
11701181

11711182
#ifdef DEBUG_PRINT
@@ -1186,13 +1197,14 @@ namespace kiwi
11861197
for (auto& p : cache[prev - startNode])
11871198
{
11881199
if (p.combineSocket) continue;
1189-
if (!p.morpheme->chunks.empty() && !p.morpheme->complex)
1200+
if (!(p.morpheme->chunks.empty() || p.morpheme->complex || p.morpheme->saisiot))
11901201
{
11911202
if (p.morpheme->chunks.size() <= (p.morpheme->combineSocket ? 2 : 1))
11921203
{
11931204
if (!FeatureTestor::isMatched(nullptr, p.morpheme->vowel)) continue;
11941205
}
11951206
}
1207+
if (p.morpheme->tag == POSTag::z_siot) continue;
11961208

11971209
float c = p.accScore + (openEnd ? 0 : p.lmState.next(kw->langMdl, eosId));
11981210
if (p.spState.singleQuote) c -= 2;

Diff for: test/test_cpp.cpp

+11
Original file line number | Diff line number | Diff line change
@@ -994,13 +994,24 @@ TEST(KiwiCpp, ZSiot)
994994
auto resNone = kiwi.analyze(s, Match::allWithNormalizing);
995995
auto resSplit = kiwi.analyze(s, Match::allWithNormalizing | Match::splitSaisiot);
996996
auto resMerge = kiwi.analyze(s, Match::allWithNormalizing | Match::mergeSaisiot);
997+
EXPECT_FALSE(std::any_of(resNone.first.begin(), resNone.first.end(), [](const TokenInfo& token) { return token.tag == POSTag::z_siot; }));
997998
EXPECT_EQ(resSplit.first.size(), 3);
998999
EXPECT_EQ(resSplit.first[0].tag, POSTag::nng);
9991000
EXPECT_EQ(resSplit.first[1].tag, POSTag::z_siot);
10001001
EXPECT_EQ(resSplit.first[2].tag, POSTag::nng);
10011002
EXPECT_EQ(resMerge.first.size(), 1);
10021003
EXPECT_EQ(resMerge.first[0].tag, POSTag::nng);
10031004
}
1005+
1006+
for (auto s : {u"발렛 파킹", u"미닛"})
1007+
{
1008+
auto resNone = kiwi.analyze(s, Match::allWithNormalizing);
1009+
auto resSplit = kiwi.analyze(s, Match::allWithNormalizing | Match::splitSaisiot);
1010+
auto resMerge = kiwi.analyze(s, Match::allWithNormalizing | Match::mergeSaisiot);
1011+
EXPECT_EQ(resNone.second, resSplit.second);
1012+
EXPECT_EQ(resNone.second, resMerge.second);
1013+
EXPECT_FALSE(std::any_of(resSplit.first.begin(), resSplit.first.end(), [](const TokenInfo& token) { return token.tag == POSTag::z_siot; }));
1014+
}
10041015
}
10051016

10061017
TEST(KiwiCpp, AnalyzeWithWordPosition)

0 commit comments