Skip to content

Commit afad6b6

Browse files
committed
Add compatibleJamo option to kiwi::Match
1 parent f6a714f commit afad6b6

File tree

5 files changed

+51
-17
lines changed

5 files changed

+51
-17
lines changed

include/kiwi/PatternMatcher.h

+2-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
#pragma once
1+
#pragma once
22

33
#include <vector>
44
#include <string>
@@ -24,6 +24,7 @@ namespace kiwi
2424
joinAdvSuffix = 1 << 21, /**< 부사파생접미사(XSM)를 분리하지 않고 합쳐서 매칭한다 */
2525
splitComplex = 1 << 22, /**< 더 작은 단위로 분할될 수 있는 형태소는 더 분할하여 매칭한다 */
2626
zCoda = 1 << 23, /**< 어미 및 조사에 덧붙은 받침이 있는 경우 이를 분리하여 z_coda 태그로 매칭한다 */
27+
compatibleJamo = 1 << 24, /**< 출력시 한글 첫가끝 자모를 호환가능한 자모로 변환한다. */
2728
joinVSuffix = joinVerbSuffix | joinAdjSuffix,
2829
joinAffix = joinNounPrefix | joinNounSuffix | joinVerbSuffix | joinAdjSuffix | joinAdvSuffix,
2930
all = url | email | hashtag | mention | serial | emoji | zCoda,

include/kiwi/Utils.h

+23-1
Original file line numberDiff line numberDiff line change
@@ -58,9 +58,29 @@ namespace kiwi
5858
return within(chr, 0xAC00, 0xD7A4);
5959
}
6060

61+
inline bool isHangulOnset(char16_t chr)
62+
{
63+
return within(chr, 0x1100, 0x1100 + 19);
64+
}
65+
6166
inline bool isHangulCoda(char16_t chr)
6267
{
63-
return within(chr, 0x11A8, 0x11A7 + 28);
68+
return within(chr, 0x11A8, 0x11A8 + 27);
69+
}
70+
71+
inline bool isHangulVowel(char16_t chr)
72+
{
73+
return within(chr, 0x314F, 0x3164);
74+
}
75+
76+
/**
 * Composes the precomposed Hangul syllable (without a final consonant) for
 * the given onset and vowel indices.
 *
 * @param onset zero-based index of the leading consonant
 * @param vowel zero-based index of the vowel
 * @return the code unit U+AC00 + (onset * 21 + vowel) * 28, reduced to 16 bits
 *
 * The factor 21 is the number of vowels per onset and 28 is the number of
 * final-consonant slots per onset/vowel pair in the Unicode syllable layout.
 * No range validation is performed on the indices.
 */
inline constexpr char16_t joinOnsetVowel(size_t onset, size_t vowel)
{
	// One explicit narrowing conversion at the end; the value modulo 2^16 is
	// the same as casting the offset first and adding the base afterwards.
	return static_cast<char16_t>(0xAC00 + (onset * 21 + vowel) * 28);
}
80+
81+
/**
 * Computes the vowel index encoded in a precomposed Hangul syllable.
 *
 * @param chr a code unit, expected to be at or above U+AC00
 * @return ((chr - 0xAC00) / 28) % 21
 *
 * Dividing by 28 drops the final-consonant slot; the remainder modulo 21
 * selects the vowel within its onset group. The input is not validated.
 */
inline int extractVowel(char16_t chr)
{
	const int syllableOffset = chr - 0xAC00;
	const int onsetVowelIndex = syllableOffset / 28;
	return onsetVowelIndex % 21;
}
6585

6686
inline bool isOldHangulOnset(char16_t chr)
@@ -88,6 +108,8 @@ namespace kiwi
88108
return within(chr, 0x3131, 0x314E) || within(chr, 0x3165, 0x3186);
89109
}
90110

111+
char16_t toCompatibleHangulConsonant(char16_t chr);
112+
91113
struct ComparatorIgnoringSpace
92114
{
93115
static bool less(const KString& a, const KString& b, const kchar_t space = u' ');

src/Kiwi.cpp

+14
Original file line numberDiff line numberDiff line change
@@ -616,6 +616,14 @@ namespace kiwi
616616
}
617617
}
618618

619+
/**
 * Rewrites `str` in place, replacing every code unit with the result of
 * toCompatibleHangulConsonant() for that code unit.
 *
 * @param str the UTF-16 string to convert; its length is unchanged
 */
inline void toCompatibleJamo(u16string& str)
{
	for (auto it = str.begin(); it != str.end(); ++it)
	{
		*it = toCompatibleHangulConsonant(*it);
	}
}
626+
619627
inline void insertPathIntoResults(
620628
vector<TokenResult>& ret,
621629
Vector<SpecialState>& spStatesByRet,
@@ -726,6 +734,12 @@ namespace kiwi
726734
}
727735
joined = joinHangul(s.str.empty() ? *s.morph->kform : s.str);
728736
} while (0);
737+
738+
if (!!(matchOptions & Match::compatibleJamo))
739+
{
740+
toCompatibleJamo(joined);
741+
}
742+
729743
rarr.emplace_back(joined, s.morph->tag);
730744
auto& token = rarr.back();
731745
token.morph = within(s.morph, pretokenizedGroup.morphemes) ? nullptr : s.morph;

src/StrUtils.h

-15
Original file line numberDiff line numberDiff line change
@@ -728,21 +728,6 @@ namespace kiwi
728728
}
729729
}
730730

731-
/**
 * Tests whether `c` is one of the 19 code points U+1100..U+1112 (inclusive),
 * the leading consonants at the start of the Unicode Hangul Jamo block.
 *
 * The bounds are spelled as hexadecimal code points so that the check does
 * not depend on how the source file's encoding is interpreted.
 */
inline bool isHangulOnset(char16_t c)
{
	return 0x1100 <= c && c <= 0x1112;
}
735-
736-
/**
 * Tests whether `c` is one of the code points U+314F..U+3163 (inclusive),
 * the vowel letters of the Unicode Hangul Compatibility Jamo block.
 *
 * The bounds are spelled as hexadecimal code points so that the check does
 * not depend on how the source file's encoding is interpreted.
 */
inline bool isHangulVowel(char16_t c)
{
	return 0x314F <= c && c <= 0x3163;
}
740-
741-
/**
 * Composes the precomposed Hangul syllable (without a final consonant) for
 * the given onset and vowel indices.
 *
 * @param onset zero-based index of the leading consonant
 * @param vowel zero-based index of the vowel
 * @return the code unit U+AC00 + (onset * 21 + vowel) * 28, reduced to 16 bits
 *
 * The base is spelled as a hexadecimal code point so that the result does
 * not depend on how the source file's encoding is interpreted. The indices
 * are not range-checked.
 */
inline char16_t joinOnsetVowel(size_t onset, size_t vowel)
{
	return static_cast<char16_t>(0xAC00 + (onset * 21 + vowel) * 28);
}
745-
746731
inline bool isChineseChr(char32_t c)
747732
{
748733
return (0x4E00 <= c && c <= 0x9FFF)

src/Utils.cpp

+12
Original file line numberDiff line numberDiff line change
@@ -498,4 +498,16 @@ namespace kiwi
498498
return ret;
499499
}
500500

501+
char16_t toCompatibleHangulConsonant(char16_t chr)
502+
{
503+
if (isHangulOnset(chr))
504+
{
505+
return u"ㄱㄲㄴㄷㄸㄹㅁㅂㅃㅅㅆㅇㅈㅉㅊㅋㅌㅍㅎ"[chr - 0x1100];
506+
}
507+
else if (isHangulCoda(chr))
508+
{
509+
return u"ㄱㄲㄳㄴㄵㄶㄷㄹㄺㄻㄼㄽㄾㄿㅀㅁㅂㅄㅅㅆㅇㅈㅊㅋㅌㅍㅎ"[chr - 0x11A8];
510+
}
511+
return chr;
512+
}
501513
}

0 commit comments

Comments
 (0)