Skip to content

Commit f07989e

Browse files
authored
Merge pull request #201 from bab2min/dev/minor_fix
Minor fix
2 parents d88f0e9 + 5a5cb41 commit f07989e

File tree

6 files changed: 37 additions (+37) and 8 deletions (-8).

Diff for: include/kiwi/TypoTransformer.h

+1
Original file line number | Diff line number | Diff line change
@@ -393,6 +393,7 @@ namespace kiwi
393393
continualTypoSet,
394394
basicTypoSetWithContinual,
395395
lengtheningTypoSet,
396+
basicTypoSetWithContinualAndLengthening,
396397
};
397398

398399
/**

Diff for: include/kiwi/capi.h

+3 -1
Original file line number | Diff line number | Diff line change
@@ -124,6 +124,7 @@ enum
124124
KIWI_MATCH_EMAIL = 2,
125125
KIWI_MATCH_HASHTAG = 4,
126126
KIWI_MATCH_MENTION = 8,
127+
KIWI_MATCH_SERIAL = 16,
127128

128129
KIWI_MATCH_NORMALIZE_CODA = 1 << 16,
129130
KIWI_MATCH_JOIN_NOUN_PREFIX = 1 << 17,
@@ -139,7 +140,7 @@ enum
139140
KIWI_MATCH_SPLIT_SAISIOT = 1 << 25,
140141
KIWI_MATCH_MERGE_SAISIOT = 1 << 26,
141142

142-
KIWI_MATCH_ALL = KIWI_MATCH_URL | KIWI_MATCH_EMAIL | KIWI_MATCH_HASHTAG | KIWI_MATCH_MENTION | KIWI_MATCH_Z_CODA,
143+
KIWI_MATCH_ALL = KIWI_MATCH_URL | KIWI_MATCH_EMAIL | KIWI_MATCH_HASHTAG | KIWI_MATCH_MENTION | KIWI_MATCH_SERIAL | KIWI_MATCH_Z_CODA,
143144
KIWI_MATCH_ALL_WITH_NORMALIZING = KIWI_MATCH_ALL | KIWI_MATCH_NORMALIZE_CODA,
144145
};
145146

@@ -361,6 +362,7 @@ enum
361362
KIWI_TYPO_CONTINUAL_TYPO_SET = 2,
362363
KIWI_TYPO_BASIC_TYPO_SET_WITH_CONTINUAL = 3,
363364
KIWI_TYPO_LENGTHENING_TYPO_SET = 4,
365+
KIWI_TYPO_BASIC_TYPO_SET_WITH_CONTINUAL_AND_LENGTHENING = 5,
364366
};
365367

366368
/**

Diff for: src/KTrie.cpp

+2 -2
Original file line number | Diff line number | Diff line change
@@ -1090,11 +1090,11 @@ size_t kiwi::splitByTrie(
10901090

10911091
if (!!(matchOptions & Match::zCoda) && zCodaFollowable && isHangulCoda(c) && (n + 1 >= str.size() || !isHangulSyllable(str[n + 1])))
10921092
{
1093-
candidates.emplace_back(formBase + defaultTagSize + (c - 0x11A8) - 1, 0, nonSpaces.size() - 1);
1093+
candidates.emplace_back(formBase + defaultTagSize + (c - 0x11A8) - 1, 0, (nonSpaces.size() - 1) * posMultiplier);
10941094
}
10951095
else if (!!(matchOptions & (Match::splitSaisiot | Match::mergeSaisiot)) && zSiotFollowable && c == 0x11BA && n + 1 < str.size() && isHangulSyllable(str[n + 1]))
10961096
{
1097-
candidates.emplace_back(formBase + defaultTagSize + (0x11BA - 0x11A8) - 1, 0, nonSpaces.size() - 1);
1097+
candidates.emplace_back(formBase + defaultTagSize + (0x11BA - 0x11A8) - 1, 0, (nonSpaces.size() - 1) * posMultiplier);
10981098
}
10991099
zCodaFollowable = false;
11001100
zSiotFollowable = false;

Diff for: src/Kiwi.cpp

+3
Original file line number | Diff line number | Diff line change
@@ -905,6 +905,7 @@ namespace kiwi
905905
morph.vowel = CondVowel::none;
906906
morph.polar = CondPolarity::none;
907907
morph.complex = 0;
908+
morph.saisiot = 0;
908909
morph.lmMorphemeId = getDefaultMorphemeId(s.tokenization[0].tag);
909910
form.candidate[0] = &morph;
910911
}
@@ -921,6 +922,7 @@ namespace kiwi
921922
morph.vowel = CondVowel::none;
922923
morph.polar = CondPolarity::none;
923924
morph.complex = 0;
925+
morph.saisiot = 0;
924926
morph.chunks = FixedPairVector<const Morpheme*, std::pair<uint8_t, uint8_t>>{ s.tokenization.size() };
925927
for (size_t i = 0; i < s.tokenization.size(); ++i)
926928
{
@@ -949,6 +951,7 @@ namespace kiwi
949951
cmorph.vowel = CondVowel::none;
950952
cmorph.polar = CondPolarity::none;
951953
cmorph.complex = 0;
954+
cmorph.saisiot = 0;
952955
cmorph.tag = t.tag;
953956
cmorph.lmMorphemeId = getDefaultMorphemeId(t.tag);
954957
foundMorph = &cmorph;

Diff for: src/PathEvaluator.hpp

+24 -5
Original file line number | Diff line number | Diff line change
@@ -872,7 +872,6 @@ namespace kiwi
872872
for (auto& curMorph : cands)
873873
{
874874
if (splitComplex && curMorph->getCombined()->complex) continue;
875-
if (splitSaisiot && curMorph->getCombined()->saisiot) continue;
876875
if (blocklist && blocklist->count(curMorph->getCombined())) continue;
877876

878877
// 덧붙은 받침(zCoda)을 위한 지름길
@@ -1007,7 +1006,8 @@ namespace kiwi
10071006
const Vector<U16StringView>& ownFormList,
10081007
float typoCostWeight,
10091008
const Morpheme* morphFirst,
1010-
size_t langVocabSize)
1009+
size_t langVocabSize,
1010+
bool splitSaisiot)
10111011
{
10121012
Vector<const WordLL<LmState>*> steps;
10131013
for (auto s = result->parent; s->parent; s = s->parent)
@@ -1029,13 +1029,32 @@ namespace kiwi
10291029
float scoreDiff = cur->accScore - prev->accScore;
10301030
float typoCostDiff = cur->accTypoCost - prev->accTypoCost;
10311031
auto morpheme = cur->morpheme;
1032-
const size_t numNewTokens = (morpheme->chunks.empty() || morpheme->complex || morpheme->saisiot) ? 1 : morpheme->chunks.size();
1032+
const size_t numNewTokens = (splitSaisiot && morpheme->saisiot) || !(morpheme->chunks.empty() || morpheme->complex || morpheme->saisiot)
1033+
? morpheme->chunks.size() : 1;
10331034
auto& gNode = graph[csearcher(cur)];
10341035
scoreDiff += typoCostDiff * typoCostWeight;
10351036
scoreDiff /= numNewTokens;
10361037
typoCostDiff /= numNewTokens;
10371038

1038-
if (morpheme->chunks.empty() || morpheme->complex || morpheme->saisiot)
1039+
if (splitSaisiot && morpheme->saisiot)
1040+
{
1041+
for (size_t ch = 0; ch < numNewTokens; ++ch)
1042+
{
1043+
auto& p = morpheme->chunks.getSecond(ch);
1044+
ret.emplace_back(
1045+
unifyMorpheme(morpheme->chunks[ch]),
1046+
KString{},
1047+
gNode.startPos + p.first,
1048+
gNode.startPos + p.second,
1049+
scoreDiff,
1050+
typoCostDiff,
1051+
typoCostDiff ? gNode.typoFormId : 0,
1052+
&gNode - graph
1053+
);
1054+
}
1055+
ret.back().end = gNode.endPos;
1056+
}
1057+
else if (morpheme->chunks.empty() || morpheme->complex || morpheme->saisiot)
10391058
{
10401059
ret.emplace_back(
10411060
unifyMorpheme(morpheme),
@@ -1274,7 +1293,7 @@ namespace kiwi
12741293
{
12751294
auto tokens = generateTokenList(
12761295
&cand[i], csearcher, graph, ownFormList, kw->typoCostWeight,
1277-
kw->morphemes.data(), langVocabSize
1296+
kw->morphemes.data(), langVocabSize, splitSaisiot
12781297
);
12791298
ret.emplace_back(move(tokens), cand[i].accScore, uniqStates[cand[i].rootId], cand[i].spState);
12801299
}

Diff for: src/TypoTransformer.cpp

+4
Original file line number | Diff line number | Diff line change
@@ -662,6 +662,8 @@ namespace kiwi
662662

663663
static const TypoTransformer lengtheningTypoSet = TypoTransformer::fromLengtheningTypoCost(0.25f);
664664

665+
static const TypoTransformer basicTypoSetWithContinualAndLengthening = basicTypoSetWithContinual | lengtheningTypoSet;
666+
665667
switch (set)
666668
{
667669
case kiwi::DefaultTypoSet::withoutTypo:
@@ -674,6 +676,8 @@ namespace kiwi
674676
return basicTypoSetWithContinual;
675677
case kiwi::DefaultTypoSet::lengtheningTypoSet:
676678
return lengtheningTypoSet;
679+
case kiwi::DefaultTypoSet::basicTypoSetWithContinualAndLengthening:
680+
return basicTypoSetWithContinualAndLengthening;
677681
default:
678682
throw invalid_argument{ "Invalid `DefaultTypoSet`" };
679683
}

0 commit comments

Comments
 (0)