Skip to content

Commit 5713b10

Browse files
authored
Merge pull request #167 from bab2min/dev_emoji
emoji 태그 추가
2 parents 0db05c4 + 8ce3c21 commit 5713b10

File tree

15 files changed

+310
-215
lines changed

15 files changed

+310
-215
lines changed

Diff for: ModelGenerator/sj.knlm

+2-2
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
version https://git-lfs.github.com/spec/v1
2-
oid sha256:fe03f0a1fdd25186113e8ec192df4369975fdf174426d46411bc5ff5c4d6caf2
3-
size 35867382
2+
oid sha256:e36c3d16e1f305169f977840e7257b80de153e613ac5809fd7e25ee59cda4f6e
3+
size 35861136

Diff for: ModelGenerator/sj.morph

+2-2
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
version https://git-lfs.github.com/spec/v1
2-
oid sha256:31aea323cf870f614fb07681aa01c76d83d1fa27e325d626d4995b4e064d5b05
3-
size 3581056
2+
oid sha256:76f155f03b402b866af32ed5e759560fac72b6174982ac5835f9d56c3a997e34
3+
size 3581100

Diff for: ModelGenerator/skipbigram.mdl

+2-2
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
version https://git-lfs.github.com/spec/v1
2-
oid sha256:152de02fa696817564fad589460d8874ce37ef3e912442c0ed37fb2955471288
3-
size 3186824
2+
oid sha256:edce9fbe938b9f21eeb915c0695c4c160708ed5879a6ce2e4dd20117ff7f9ca5
3+
size 3186748

Diff for: bindings/java/kr/pe/bab2min/Kiwi.java

+9-7
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ public static class Match {
2121
hashtag = 1 << 2,
2222
mention = 1 << 3,
2323
serial = 1 << 4,
24+
emoji = 1 << 5,
2425
normalizeCoda = 1 << 16,
2526
joinNounPrefix = 1 << 17,
2627
joinNounSuffix = 1 << 18,
@@ -48,13 +49,13 @@ public static class POSTag {
4849
vcp = 19, vcn = 20,
4950
sf = 21, sp = 22, ss = 23, sso = 24, ssc = 25, se = 26, so = 27, sw = 28, sb = 29,
5051
sl = 30, sh = 31, sn = 32,
51-
w_url = 33, w_email = 34, w_mention = 35, w_hashtag = 36, w_serial = 37,
52-
jks = 38, jkc = 39, jkg = 40, jko = 41, jkb = 42, jkv = 43, jkq = 44, jx = 45, jc = 46,
53-
ep = 47, ef = 48, ec = 49, etn = 50, etm = 51,
54-
z_coda = 52,
55-
user0 = 53, user1 = 54, user2 = 55, user3 = 56, user4 = 57,
56-
p = 58,
57-
max = 59,
52+
w_url = 33, w_email = 34, w_mention = 35, w_hashtag = 36, w_serial = 37, w_emoji = 38,
53+
jks = 39, jkc = 40, jkg = 41, jko = 42, jkb = 43, jkv = 44, jkq = 45, jx = 46, jc = 47,
54+
ep = 48, ef = 49, ec = 50, etn = 51, etm = 52,
55+
z_coda = 53,
56+
user0 = 54, user1 = 55, user2 = 56, user3 = 57, user4 = 58,
57+
p = 59,
58+
max = 60,
5859
pv = p,
5960
pa = (byte)(p + 1),
6061
irregular = - 128,
@@ -106,6 +107,7 @@ static String toString(byte tag) {
106107
case w_mention: return "W_MENTION";
107108
case w_hashtag: return "W_HASHTAG";
108109
case w_serial: return "W_SERIAL";
110+
case w_emoji: return "W_EMOJI";
109111
case jks: return "JKS";
110112
case jkc: return "JKC";
111113
case jkg: return "JKG";

Diff for: bindings/java/kr/pe/bab2min/KiwiBuilder.java

+2-1
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,8 @@ public static class BuildOption {
88
integrateAllomorph = 1 << 0,
99
loadDefaultDict = 1 << 1,
1010
loadTypoDict = 1 << 2,
11-
default_ = integrateAllomorph | loadDefaultDict | loadTypoDict;
11+
loadMultiDict = 1 << 3,
12+
default_ = integrateAllomorph | loadDefaultDict | loadTypoDict | loadMultiDict;
1213
}
1314

1415
public static class AnalyzedMorph {

Diff for: include/kiwi/PatternMatcher.h

+2-1
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ namespace kiwi
1515
hashtag = 1 << 2, /**< 해시태그 형태의 텍스트(#해시)를 w_hashtag 태그에 매칭한다 */
1616
mention = 1 << 3, /**< 멘션 형태의 텍스트(@멘션)를 w_mention 태그에 매칭한다 */
1717
serial = 1 << 4, /**< 일련 번호 형태의 텍스트를 w_serial 태그에 매칭한다 */
18+
emoji = 1 << 5, /**< 이모지 문자를 w_emoji 태그에 매칭한다 */
1819
normalizeCoda = 1 << 16, /**< 초성체가 앞 어절의 받침에 따라붙은 경우를 정규화하여 매칭한다 */
1920
joinNounPrefix = 1 << 17, /**< 체언접두사(XPN)를 분리하지 않고 합쳐서 매칭한다 */
2021
joinNounSuffix = 1 << 18, /**< 명사파생접미사(XSN)를 분리하지 않고 합쳐서 매칭한다 */
@@ -25,7 +26,7 @@ namespace kiwi
2526
zCoda = 1 << 23, /**< 어미 및 조사에 덧붙은 받침이 있는 경우 이를 분리하여 z_coda 태그로 매칭한다 */
2627
joinVSuffix = joinVerbSuffix | joinAdjSuffix,
2728
joinAffix = joinNounPrefix | joinNounSuffix | joinVerbSuffix | joinAdjSuffix | joinAdvSuffix,
28-
all = url | email | hashtag | mention | serial | zCoda,
29+
all = url | email | hashtag | mention | serial | emoji | zCoda,
2930
allWithNormalizing = all | normalizeCoda,
3031
};
3132

Diff for: include/kiwi/ScriptType.h

+6-1
Original file line numberDiff line numberDiff line change
@@ -241,5 +241,10 @@ namespace kiwi
241241

242242
const char* getScriptName(ScriptType type);
243243

244-
bool isEmoji(char32_t c0, char32_t c1 = 0);
244+
/**
245+
* @brief Check if the character is an emoji
246+
*
247+
* @return 0 if the character is not an emoji, 1 if c0 is an emoji, 2 if c0 and c1 are combined to form an emoji.
248+
*/
249+
int isEmoji(char32_t c0, char32_t c1 = 0);
245250
}

Diff for: include/kiwi/Types.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -202,7 +202,7 @@ namespace kiwi
202202
vcp, vcn,
203203
sf, sp, ss, sso, ssc, se, so, sw, sb,
204204
sl, sh, sn,
205-
w_url, w_email, w_mention, w_hashtag, w_serial,
205+
w_url, w_email, w_mention, w_hashtag, w_serial, w_emoji,
206206
jks, jkc, jkg, jko, jkb, jkv, jkq, jx, jc,
207207
ep, ef, ec, etn, etm,
208208
z_coda,

Diff for: include/kiwi/Utils.h

+2-2
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
#pragma once
1+
#pragma once
22
#include <iostream>
33
#include <string>
44
#include <memory>
@@ -30,7 +30,7 @@ namespace kiwi
3030

3131
/**
 * @brief Tell whether a tag is one of the web-pattern tags.
 *
 * @param t tag to test
 * @return true when t lies in the closed range [POSTag::w_url, POSTag::w_emoji]
 */
inline bool isWebTag(POSTag t)
{
	const bool belowRange = t < POSTag::w_url;
	const bool aboveRange = POSTag::w_emoji < t;
	return !belowRange && !aboveRange;
}
3535

3636
POSTag toPOSTag(const std::u16string& tagStr);

Diff for: src/Kiwi.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -600,7 +600,7 @@ namespace kiwi
600600

601601
inline void updateTokenInfoScript(TokenInfo& info)
602602
{
603-
if (!(info.tag == POSTag::sl || info.tag == POSTag::sh || info.tag == POSTag::sw)) return;
603+
if (!(info.tag == POSTag::sl || info.tag == POSTag::sh || info.tag == POSTag::sw || info.tag == POSTag::w_emoji)) return;
604604
if ((info.morph && info.morph->kform && !info.morph->kform->empty())) return;
605605
if (info.str.empty()) return;
606606
char32_t c = info.str[0];

Diff for: src/PatternMatcher.cpp

+75
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
#include <kiwi/PatternMatcher.h>
22
#include <kiwi/Utils.h>
3+
#include <kiwi/ScriptType.h>
34
#include "pattern.hpp"
5+
#include "StrUtils.h"
46

57
using namespace std;
68
using namespace kiwi;
@@ -26,6 +28,7 @@ namespace kiwi
2628
size_t testNumeric(const char16_t left, const char16_t* first, const char16_t* last) const;
2729
size_t testSerial(const char16_t* first, const char16_t* last) const;
2830
size_t testAbbr(const char16_t* first, const char16_t* last) const;
31+
size_t testEmoji(const char16_t* first, const char16_t* last) const;
2932

3033
public:
3134
std::pair<size_t, POSTag> match(char16_t left, const char16_t* first, const char16_t* last, Match matchOptions) const;
@@ -290,6 +293,77 @@ size_t PatternMatcherImpl::testAbbr(const char16_t* first, const char16_t* last)
290293
return b - first;
291294
}
292295

296+
size_t PatternMatcherImpl::testEmoji(const char16_t* first, const char16_t* last) const
297+
{
298+
const char16_t* b = first;
299+
while (b + 1 < last)
300+
{
301+
char32_t c0 = 0, c1 = 0;
302+
const char16_t* b1 = b;
303+
if (isHighSurrogate(*b1))
304+
{
305+
c0 = mergeSurrogate(b1[0], b1[1]);
306+
b1 += 2;
307+
}
308+
else
309+
{
310+
c0 = *b1++;
311+
}
312+
313+
const char16_t* b2 = b1;
314+
if (b2 < last)
315+
{
316+
if (isHighSurrogate(*b2) && b2 + 1 < last)
317+
{
318+
c1 = mergeSurrogate(b2[0], b2[1]);
319+
b2 += 2;
320+
}
321+
else
322+
{
323+
c1 = *b2++;
324+
}
325+
}
326+
327+
auto r = isEmoji(c0, c1);
328+
if (r == 1)
329+
{
330+
b = b1;
331+
}
332+
else if (r == 2)
333+
{
334+
b = b2;
335+
}
336+
else
337+
{
338+
break;
339+
}
340+
341+
if (b == last) return b - first;
342+
if (0xfe00 <= *b && *b <= 0xfe0f) // variation selectors
343+
{
344+
++b;
345+
if (b == last) return b - first;
346+
}
347+
else if (b + 1 < last && isHighSurrogate(b[0]))
348+
{
349+
c1 = mergeSurrogate(b[0], b[1]);
350+
if (0x1f3fb <= c1 && c1 <= 0x1f3ff) // skin color modifier
351+
{
352+
b += 2;
353+
if (b == last) return b - first;
354+
}
355+
}
356+
357+
if (*b == 0x200d) // zero width joiner
358+
{
359+
++b;
360+
continue;
361+
}
362+
break;
363+
}
364+
return b - first;
365+
}
366+
293367
pair<size_t, POSTag> PatternMatcherImpl::match(char16_t left, const char16_t * first, const char16_t * last, Match matchOptions) const
294368
{
295369
size_t size;
@@ -299,6 +373,7 @@ pair<size_t, POSTag> PatternMatcherImpl::match(char16_t left, const char16_t * f
299373
if (!!(matchOptions & Match::email) && (size = testEmail(first, last))) return make_pair(size, POSTag::w_email);
300374
if (!!(matchOptions & Match::mention) && (size = testMention(first, last))) return make_pair(size, POSTag::w_mention);
301375
if (!!(matchOptions & Match::url) && (size = testUrl(first, last))) return make_pair(size, POSTag::w_url);
376+
if (!!(matchOptions & Match::emoji) && (size = testEmoji(first, last))) return make_pair(size, POSTag::w_emoji);
302377
if ((size = testAbbr(first, last))) return make_pair(size, POSTag::sl);
303378
return make_pair(0, POSTag::unknown);
304379
}

0 commit comments

Comments
 (0)