Skip to content

Commit 4c87dcd

Browse files
authored
Merge pull request #172 from bab2min/dev_18
Prepare v0.18.0
2 parents 30c9f7e + 0ab6c9d commit 4c87dcd

22 files changed

+265
-48
lines changed

Diff for: .github/workflows/macos.yml

+21-21
Original file line numberDiff line numberDiff line change
@@ -9,22 +9,16 @@ jobs:
99
strategy:
1010
matrix:
1111
include:
12-
- name: "macOS 11 + Xcode 11.7"
13-
os: macos-11
12+
- name: "macOS 13 + Xcode 15.0"
13+
os: macos-13
14+
arch: x86_64
1415
compiler: xcode
15-
version: "11.7"
16-
- name: "macOS 11 + Xcode 12.2"
17-
os: macos-11
16+
version: "15.0"
17+
- name: "macOS 14 Arm64 + Xcode 15.0"
18+
os: macos-14
19+
arch: arm64
1820
compiler: xcode
19-
version: "12.4"
20-
- name: "macOS 11 + gcc-10"
21-
os: macos-11
22-
compiler: gcc
23-
version: "10"
24-
- name: "macOS 11 + gcc-11"
25-
os: macos-11
26-
compiler: gcc
27-
version: "11"
21+
version: "15.0"
2822

2923
runs-on: ${{ matrix.os }}
3024
name: ${{ matrix.name }}
@@ -42,8 +36,8 @@ jobs:
4236
else
4337
ls -ls /Applications/
4438
sudo xcode-select -switch /Applications/Xcode_${{ matrix.version }}.app
45-
echo "CC=clang" >> $GITHUB_ENV
46-
echo "CXX=clang++" >> $GITHUB_ENV
39+
echo "CC=$(brew --prefix llvm@15)/bin/clang" >> $GITHUB_ENV
40+
echo "CXX=$(brew --prefix llvm@15)/bin/clang++" >> $GITHUB_ENV
4741
fi
4842
- name: Configure Build
4943
run: mkdir build && cd build && cmake -DCMAKE_BUILD_TYPE=Release -DKIWI_JAVA_BINDING=1 ..
@@ -74,11 +68,17 @@ jobs:
7468
./build/kiwi-cli-* -m ./ModelGenerator -e -o test.out kowiki1000.txt
7569
KIWI_ARCH_TYPE=none ./build/kiwi-cli-* -m ./ModelGenerator -e -o test.out kowiki1000.txt
7670
KIWI_ARCH_TYPE=balanced ./build/kiwi-cli-* -m ./ModelGenerator -e -o test.out kowiki1000.txt
77-
KIWI_ARCH_TYPE=sse2 ./build/kiwi-cli-* -m ./ModelGenerator -e -o test.out kowiki1000.txt
78-
KIWI_ARCH_TYPE=sse4_1 ./build/kiwi-cli-* -m ./ModelGenerator -e -o test.out kowiki1000.txt
79-
KIWI_ARCH_TYPE=avx2 ./build/kiwi-cli-* -m ./ModelGenerator -e -o test.out kowiki1000.txt
80-
KIWI_ARCH_TYPE=avx2 ./build/kiwi-cli-* -m ./ModelGenerator -e -o test.out --sbg kowiki1000.txt
81-
KIWI_ARCH_TYPE=avx2 ./build/kiwi-cli-* -m ./ModelGenerator -e -o test.out --typos 6 kowiki1000.txt
71+
if [ "${{ matrix.arch }}" = "x86_64" ]; then
72+
KIWI_ARCH_TYPE=sse2 ./build/kiwi-cli-* -m ./ModelGenerator -e -o test.out kowiki1000.txt
73+
KIWI_ARCH_TYPE=sse4_1 ./build/kiwi-cli-* -m ./ModelGenerator -e -o test.out kowiki1000.txt
74+
KIWI_ARCH_TYPE=avx2 ./build/kiwi-cli-* -m ./ModelGenerator -e -o test.out kowiki1000.txt
75+
KIWI_ARCH_TYPE=avx2 ./build/kiwi-cli-* -m ./ModelGenerator -e -o test.out --sbg kowiki1000.txt
76+
KIWI_ARCH_TYPE=avx2 ./build/kiwi-cli-* -m ./ModelGenerator -e -o test.out --typos 6 kowiki1000.txt
77+
else
78+
KIWI_ARCH_TYPE=neon ./build/kiwi-cli-* -m ./ModelGenerator -e -o test.out kowiki1000.txt
79+
KIWI_ARCH_TYPE=neon ./build/kiwi-cli-* -m ./ModelGenerator -e -o test.out --sbg kowiki1000.txt
80+
KIWI_ARCH_TYPE=neon ./build/kiwi-cli-* -m ./ModelGenerator -e -o test.out --typos 6 kowiki1000.txt
81+
fi
8282
- name: Archive binaries
8383
uses: actions/upload-artifact@v2
8484
with:

Diff for: .github/workflows/release.yml

+1-1
Original file line numberDiff line numberDiff line change
@@ -144,7 +144,7 @@ jobs:
144144
build-macos:
145145
strategy:
146146
matrix:
147-
os: [macos-11]
147+
os: [macos-13]
148148
arch: [x86_64, arm64]
149149

150150
runs-on: ${{ matrix.os }}

Diff for: CMakeLists.txt

+3-2
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
cmake_minimum_required(VERSION 3.12)
22

3-
project(kiwi VERSION 0.17.1 DESCRIPTION "Kiwi, Korean Intelligent Word Identifier")
3+
project(kiwi VERSION 0.18.0 DESCRIPTION "Kiwi, Korean Intelligent Word Identifier")
44

55
set ( CMAKE_CXX_STANDARD 14 )
66
set ( CMAKE_VERBOSE_MAKEFILE true )
@@ -48,14 +48,15 @@ set ( CORE_SRCS
4848
src/Form.cpp
4949
src/FeatureTestor.cpp
5050
src/FileUtils.cpp
51-
src/HSDataset.cpp
51+
src/Dataset.cpp
5252
src/Joiner.cpp
5353
src/Kiwi.cpp
5454
src/KiwiBuilder.cpp
5555
src/KTrie.cpp
5656
src/PatternMatcher.cpp
5757
src/search.cpp
5858
src/ScriptType.cpp
59+
src/SubstringExtractor.cpp
5960
src/SwTokenizer.cpp
6061
src/TagUtils.cpp
6162
src/TypoTransformer.cpp

Diff for: bindings/java/JniUtils.hpp

+2-1
Original file line numberDiff line numberDiff line change
@@ -924,7 +924,8 @@ namespace jni
924924
{
925925
return CppType{ env, v };
926926
}
927-
if (!env->IsInstanceOf(v, JIteratorBase::jClass)) throw std::runtime_error{ StringConcat_v<svNotInstanceOf, typeStr, svNullTerm>.data()};
927+
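// Building the detailed message with StringConcat_v (kept in the inline comment on the next line) crashes the clang compiler for an unknown reason. The message text is not essential, so the exception is still thrown, but with an empty message.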
928+
if (!env->IsInstanceOf(v, JIteratorBase::jClass)) throw std::runtime_error{ ""/*StringConcat_v<svNotInstanceOf, typeStr, svNullTerm>.data()*/};
928929
return CppType{ env, v };
929930
}
930931
};

Diff for: bindings/java/kr/pe/bab2min/Kiwi.java

+1-1
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212

1313
public class Kiwi implements AutoCloseable {
1414
private long _inst;
15-
final private static String _version = "0.17.1";
15+
final private static String _version = "0.18.0";
1616

1717
public static class Match {
1818
final static public int none = 0,
File renamed without changes.

Diff for: include/kiwi/Form.h

+2-2
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,8 @@
22
* @file Form.h
33
* @author bab2min ([email protected])
44
* @brief 형태 및 형태소에 관한 정보를 담는 구조체들이 선언된 헤더
5-
* @version 0.17.0
6-
* @date 2022-09-01
5+
* @version 0.18.0
6+
* @date 2024-07-01
77
*
88
*
99
*/

Diff for: include/kiwi/Kiwi.h

+3-3
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,8 @@
22
* @file Kiwi.h
33
* @author bab2min ([email protected])
44
* @brief Kiwi C++ API를 담고 있는 헤더 파일
5-
* @version 0.17.0
6-
* @date 2022-09-01
5+
* @version 0.18.0
6+
* @date 2024-07-01
77
*
88
*
99
*/
@@ -650,7 +650,7 @@ namespace kiwi
650650
* @param numThreads 모델 및 형태소 분석에 사용할 스레드 개수
651651
* @param options 생성 옵션. `kiwi::BuildOption`을 참조
652652
*/
653-
KiwiBuilder(const std::string& modelPath, size_t numThreads = 0, BuildOption options = BuildOption::integrateAllomorph | BuildOption::loadDefaultDict, bool useSBG = false);
653+
KiwiBuilder(const std::string& modelPath, size_t numThreads = 0, BuildOption options = BuildOption::default_, bool useSBG = false);
654654

655655
/**
656656
* @brief 현재 KiwiBuilder 객체가 유효한 분석 모델을 로딩한 상태인지 알려준다.

Diff for: include/kiwi/Macro.h

+2-2
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
#define KIWI_STR(x) KIWI_STR_HELPER(x)
55

66
#define KIWI_VERSION_MAJOR 0
7-
#define KIWI_VERSION_MINOR 17
8-
#define KIWI_VERSION_PATCH 1
7+
#define KIWI_VERSION_MINOR 18
8+
#define KIWI_VERSION_PATCH 0
99

1010
#define KIWI_VERSION_STRING KIWI_STR(KIWI_VERSION_MAJOR) "." KIWI_STR(KIWI_VERSION_MINOR) "." KIWI_STR(KIWI_VERSION_PATCH)

Diff for: include/kiwi/SubstringExtractor.h

+16
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
#pragma once
2+
3+
#include <vector>
4+
#include <string>
5+
6+
namespace kiwi
7+
{
8+
std::vector<std::pair<std::u16string, size_t>> extractSubstrings(
9+
const char16_t* first,
10+
const char16_t* last,
11+
size_t minCnt,
12+
size_t minLength = 2,
13+
size_t maxLength = 32,
14+
bool longestOnly = true,
15+
char16_t stopChr = 0);
16+
}

Diff for: include/kiwi/SwTokenizer.h

+2-2
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,8 @@
22
* @file SwTokenizer.h
33
* @author bab2min ([email protected])
44
* @brief Subword Tokenizer
5-
* @version 0.16.1
6-
* @date 2022-07-28
5+
* @version 0.18.0
6+
* @date 2024-07-01
77
*
88
*
99
*/

Diff for: include/kiwi/Types.h

+2-2
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,8 @@
22
* @file Types.h
33
* @author bab2min ([email protected])
44
* @brief Kiwi C++ API에 쓰이는 주요 타입들을 모아놓은 헤더 파일
5-
* @version 0.17.0
6-
* @date 2022-09-01
5+
* @version 0.18.0
6+
* @date 2024-07-01
77
*
88
*
99
*/

Diff for: include/kiwi/capi.h

+17-3
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,8 @@
22
* @file capi.h
33
* @author bab2min ([email protected])
44
* @brief Kiwi C API를 담고 있는 헤더 파일
5-
* @version 0.17.0
6-
* @date 2022-09-01
5+
* @version 0.18.0
6+
* @date 2024-07-01
77
*
88
*
99
*/
@@ -45,7 +45,11 @@ typedef struct {
4545
uint32_t line_number; /**< 줄 번호*/
4646
uint16_t length; /**< 길이(UTF16 문자 기준) */
4747
uint8_t tag; /**< 품사 태그 */
48-
uint8_t sense_id; /**< 의미 번호 */
48+
union
49+
{
50+
uint8_t sense_id; /**< 의미 번호 */
51+
uint8_t script; /**< 유니코드 영역에 기반한 문자 타입 */
52+
};
4953
float score; /**< 해당 형태소의 언어모델 점수 */
5054
float typo_cost; /**< 오타가 교정된 경우 오타 비용. 그렇지 않은 경우 0 */
5155
uint32_t typo_form_id; /**< 교정 전 오타의 형태에 대한 정보 (typoCost가 0인 경우 의미 없음) */
@@ -1008,6 +1012,16 @@ DECL_DLL int kiwi_pt_add_token_to_span_w(kiwi_pretokenized_h handle, int span_id
10081012
*/
10091013
DECL_DLL int kiwi_pt_close(kiwi_pretokenized_h handle);
10101014

1015+
/**
1016+
* @brief `kiwi_token_info_t`의 `script`가 가리키는 문자 영역의 유니코드 상 이름을 반환합니다.
1017+
*
1018+
* @param script `kiwi_token_info_t`의 `script` 필드 값
1019+
* @return 유니코드 영역의 이름을 반환합니다. 알 수 없을 경우 "Unknown"을 반환합니다.
1020+
*
1021+
* @note 이 함수가 반환하는 값은 string literal이므로 별도로 해제할 필요가 없습니다.
1022+
*/
1023+
DECL_DLL const char* kiwi_get_script_name(uint8_t script);
1024+
10111025
#ifdef __cplusplus
10121026
}
10131027
#endif

Diff for: src/HSDataset.cpp renamed to src/Dataset.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
#include <kiwi/HSDataset.h>
1+
#include <kiwi/Dataset.h>
22
#include "RaggedVector.hpp"
33

44
using namespace kiwi;

Diff for: src/KiwiBuilder.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33

44
#include <kiwi/Kiwi.h>
55
#include <kiwi/Utils.h>
6-
#include <kiwi/HSDataset.h>
6+
#include <kiwi/Dataset.h>
77
#include "ArchAvailable.h"
88
#include "KTrie.h"
99
#include "StrUtils.h"

Diff for: src/PatternMatcher.cpp

+1-2
Original file line numberDiff line numberDiff line change
@@ -272,7 +272,7 @@ size_t PatternMatcherImpl::testAbbr(const char16_t* first, const char16_t* last)
272272
if (b != last && *b == ' ')
273273
{
274274
if (l > (isUpperAlpha(*first) ? 5 : 3)) return 0; // reject too long patterns for abbreviation
275-
++b;
275+
return b - first;
276276
}
277277
else
278278
{
@@ -287,7 +287,6 @@ size_t PatternMatcherImpl::testAbbr(const char16_t* first, const char16_t* last)
287287

288288
if (b != last && *b == '.') ++b;
289289
else return b - first;
290-
if (b != last && *b == ' ') ++b;
291290
}
292291
if (b[-1] == ' ') --b;
293292
return b - first;

Diff for: src/ScriptType.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -563,7 +563,7 @@ namespace kiwi
563563
if (type == ScriptType::chess_symbols) return "Chess Symbols";
564564
if (type == ScriptType::symbols_for_legacy_computing) return "Symbols for Legacy Computing";
565565
if (type == ScriptType::tags) return "Tags";
566-
return "unknown";
566+
return "Unknown";
567567
}
568568

569569
int isEmoji(char32_t c0, char32_t c1)

0 commit comments

Comments
 (0)