Skip to content

Commit 5d88229

Browse files
authored
Merge pull request #216 from bab2min/dev/pclm
Refactoring for new models and new kernels
2 parents 1e9a784 + 3d586d3 commit 5d88229

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

96 files changed

+10853
-4586
lines changed

Diff for: .github/workflows/arm64_centos7.yml

+1-1
Original file line number | Diff line number | Diff line change
@@ -34,7 +34,7 @@ jobs:
3434
- name: Test
3535
run: |
3636
./build/test/kiwi-test
37-
mkdir eval_results && ./build/kiwi-evaluator -m ./models/base eval_data/*.txt -o eval_results/ && ./build/kiwi-evaluator -m ./models/base eval_data/*.txt --sbg -o eval_results/
37+
mkdir eval_results && ./build/kiwi-evaluator -m ./models/base --morph eval_data/*.txt -t knlm -o eval_results/ && ./build/kiwi-evaluator -m ./models/base --morph eval_data/*.txt -t sbg -o eval_results/
3838
cp -r build /artifacts/
3939
cp -r eval_results /artifacts/
4040
- name: Benchmark

Diff for: .github/workflows/centos7.yml

+2-2
Original file line number | Diff line number | Diff line change
@@ -40,8 +40,8 @@ jobs:
4040
- name: Run Evaluator
4141
run: |
4242
mkdir eval_results
43-
./build/kiwi-evaluator -m ./models/base eval_data/*.txt -o eval_results/
44-
./build/kiwi-evaluator -m ./models/base eval_data/*.txt --sbg -o eval_results/
43+
./build/kiwi-evaluator -m ./models/base --morph eval_data/*.txt -t knlm -o eval_results/
44+
./build/kiwi-evaluator -m ./models/base --morph eval_data/*.txt -t sbg -o eval_results/
4545
- run: tar -zcvf arts.tgz build/*kiwi* build/test/*kiwi* eval_results/*.txt build/bindings/java/*.jar
4646
- name: Archive binaries
4747
uses: actions/upload-artifact@v4

Diff for: .github/workflows/macos.yml

+2-2
Original file line number | Diff line number | Diff line change
@@ -60,8 +60,8 @@ jobs:
6060
- name: Run Evaluator
6161
run: |
6262
mkdir eval_results
63-
./build/kiwi-evaluator -m ./models/base eval_data/*.txt -o eval_results/
64-
./build/kiwi-evaluator -m ./models/base eval_data/*.txt --sbg -o eval_results/
63+
./build/kiwi-evaluator -m ./models/base --morph eval_data/*.txt -t knlm -o eval_results/
64+
./build/kiwi-evaluator -m ./models/base --morph eval_data/*.txt -t sbg -o eval_results/
6565
- name: Run Benchmark
6666
run: |
6767
curl -OL https://latina.bab2min.pe.kr/_data/kowiki1000.txt

Diff for: .github/workflows/ppc64le_centos7.yml

+1-1
Original file line number | Diff line number | Diff line change
@@ -28,7 +28,7 @@ jobs:
2828
mkdir build && pushd build && cmake -DCMAKE_BUILD_TYPE=Release -DKIWI_USE_MIMALLOC=0 -DKIWI_JAVA_BINDING=1 ..
2929
make -j2 && popd
3030
./build/test/kiwi-test
31-
mkdir eval_results && ./build/kiwi-evaluator -m ./models/base eval_data/*.txt -o eval_results/ && ./build/kiwi-evaluator -m ./models/base eval_data/*.txt --sbg -o eval_results/
31+
mkdir eval_results && ./build/kiwi-evaluator -m ./models/base --morph eval_data/*.txt -t knlm -o eval_results/ && ./build/kiwi-evaluator -m ./models/base --morph eval_data/*.txt -t sbg -o eval_results/
3232
cp -r build /artifacts/
3333
cp -r eval_results /artifacts/
3434
- name: Archive binaries

Diff for: .github/workflows/release.yml

+1-1
Original file line number | Diff line number | Diff line change
@@ -220,7 +220,7 @@ jobs:
220220
- name: Test
221221
run: |
222222
./build/test/kiwi-test
223-
mkdir eval_results && ./build/kiwi-evaluator -m ./models/base eval_data/*.txt -o eval_results/
223+
mkdir eval_results && ./build/kiwi-evaluator -m ./models/base --morph eval_data/*.txt -o eval_results/
224224
- name: Release
225225
run: |
226226
cd build

Diff for: .github/workflows/ubuntu.yml

+2-2
Original file line number | Diff line number | Diff line change
@@ -60,8 +60,8 @@ jobs:
6060
- name: Run Evaluator
6161
run: |
6262
mkdir eval_results
63-
./build/kiwi-evaluator -m ./models/base eval_data/*.txt -o eval_results/
64-
./build/kiwi-evaluator -m ./models/base eval_data/*.txt --sbg -o eval_results/
63+
./build/kiwi-evaluator -m ./models/base --morph eval_data/*.txt -t knlm -o eval_results/
64+
./build/kiwi-evaluator -m ./models/base --morph eval_data/*.txt -t sbg -o eval_results/
6565
- name: Run Benchmark
6666
run: |
6767
curl -OL https://latina.bab2min.pe.kr/_data/kowiki1000.txt

Diff for: .github/workflows/windows.yml

+2-2
Original file line number | Diff line number | Diff line change
@@ -35,8 +35,8 @@ jobs:
3535
- name: Run Evaluator
3636
run: |
3737
mkdir eval_results
38-
.\build\Release\kiwi-evaluator.exe -m .\models\base (Get-ChildItem eval_data\*.txt | Select-Object -Expand FullName) -o eval_results\
39-
.\build\Release\kiwi-evaluator.exe -m .\models\base --sbg (Get-ChildItem eval_data\*.txt | Select-Object -Expand FullName) -o eval_results\
38+
.\build\Release\kiwi-evaluator.exe -m .\models\base -t knlm --morph (Get-ChildItem eval_data\*.txt | Select-Object -Expand FullName) -o eval_results\
39+
.\build\Release\kiwi-evaluator.exe -m .\models\base -t sbg --morph (Get-ChildItem eval_data\*.txt | Select-Object -Expand FullName) -o eval_results\
4040
- name: Archive binaries
4141
uses: actions/upload-artifact@v4
4242
with:

Diff for: .gitmodules

+3-3
Original file line number | Diff line number | Diff line change
@@ -15,12 +15,12 @@
1515
[submodule "third_party/cpuinfo"]
1616
path = third_party/cpuinfo
1717
url = https://github.com/pytorch/cpuinfo
18-
[submodule "third_party/variant"]
19-
path = third_party/variant
20-
url = https://github.com/mapbox/variant
2118
[submodule "third_party/eigen"]
2219
path = third_party/eigen
2320
url = https://gitlab.com/libeigen/eigen
2421
[submodule "third_party/json"]
2522
path = third_party/json
2623
url = https://github.com/nlohmann/json
24+
[submodule "third_party/streamvbyte"]
25+
path = third_party/streamvbyte
26+
url = https://github.com/fast-pack/streamvbyte

Diff for: CMakeLists.txt

+53-10
Original file line number | Diff line number | Diff line change
@@ -1,8 +1,8 @@
11
cmake_minimum_required(VERSION 3.12)
22

3-
project(kiwi VERSION 0.20.4 DESCRIPTION "Kiwi, Korean Intelligent Word Identifier")
3+
project(kiwi VERSION 0.21.0 DESCRIPTION "Kiwi, Korean Intelligent Word Identifier")
44

5-
set ( CMAKE_CXX_STANDARD 14 )
5+
set ( CMAKE_CXX_STANDARD 17 )
66
set ( CMAKE_VERBOSE_MAKEFILE true )
77

88
option(KIWI_USE_MIMALLOC "Use mimalloc for faster memory allocation" ON)
@@ -38,17 +38,29 @@ if(NOT KIWI_CPU_ARCH)
3838
set(KIWI_CPU_ARCH "${KIWI_CPU_ARCH}" PARENT_SCOPE)
3939
endif()
4040

41+
42+
if (KIWI_USE_CPUINFO AND
43+
(MSVC OR
44+
((CMAKE_CXX_COMPILER_ID STREQUAL "GNU" OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang") AND CMAKE_CXX_COMPILER_VERSION GREATER_EQUAL 11)
45+
)
46+
)
47+
set ( AVX_VNNI_SUPPORTED ON )
48+
else()
49+
set ( AVX_VNNI_SUPPORTED OFF )
50+
endif()
51+
4152
if(APPLE)
4253
set(CMAKE_OSX_ARCHITECTURES "${KIWI_CPU_ARCH}")
4354
endif()
4455

4556
set ( CORE_SRCS
4657
src/ArchUtils.cpp
4758
src/Combiner.cpp
59+
src/CoNgramModel.cpp
60+
src/Dataset.cpp
4861
src/Form.cpp
4962
src/FeatureTestor.cpp
5063
src/FileUtils.cpp
51-
src/Dataset.cpp
5264
src/Joiner.cpp
5365
src/Kiwi.cpp
5466
src/KiwiBuilder.cpp
@@ -57,6 +69,7 @@ set ( CORE_SRCS
5769
src/PatternMatcher.cpp
5870
src/search.cpp
5971
src/ScriptType.cpp
72+
src/SkipBigramModel.cpp
6073
src/SubstringExtractor.cpp
6174
src/SwTokenizer.cpp
6275
src/TagUtils.cpp
@@ -81,9 +94,13 @@ endif()
8194
include_directories( include/ )
8295
include_directories( third_party/tclap/include )
8396
include_directories( third_party/cpp-btree )
84-
include_directories( third_party/variant/include )
8597
include_directories( third_party/eigen )
8698
include_directories( third_party/json/include )
99+
include_directories( third_party/streamvbyte/include )
100+
add_subdirectory( third_party/streamvbyte )
101+
set ( STREAMVBYTE_OBJECTS
102+
$<TARGET_OBJECTS:streamvbyte>
103+
)
87104
if(KIWI_USE_CPUINFO)
88105
message(STATUS "Use cpuinfo")
89106
include_directories( third_party/cpuinfo/include )
@@ -98,9 +115,6 @@ if(KIWI_USE_CPUINFO)
98115
set ( ADDITIONAL_FLAGS ${ADDITIONAL_FLAGS} "-DKIWI_USE_CPUINFO" )
99116

100117
if(MSVC)
101-
target_compile_options("clog" PUBLIC
102-
/MT
103-
)
104118
target_compile_options("cpuinfo" PUBLIC
105119
/MT
106120
)
@@ -110,15 +124,18 @@ if(KIWI_USE_CPUINFO)
110124
endif()
111125

112126
set ( CPUINFO_OBJECTS_STATIC
113-
$<TARGET_OBJECTS:clog>
114127
$<TARGET_OBJECTS:cpuinfo_internals>
115128
)
116129
set ( CPUINFO_OBJECTS_SHARED
117-
$<TARGET_OBJECTS:clog>
118130
$<TARGET_OBJECTS:cpuinfo>
119131
)
120132
endif()
121133

134+
if (AVX_VNNI_SUPPORTED)
135+
message(STATUS "AVX-VNNI is supported")
136+
set ( ADDITIONAL_FLAGS ${ADDITIONAL_FLAGS} "-DKIWI_AVX_VNNI_SUPPORTED" )
137+
endif()
138+
122139
if(MSVC)
123140
set ( CMAKE_C_FLAGS_DEBUG "-DDEBUG -DC_FLAGS -Zi -Od /utf-8 /bigobj" )
124141
set ( CMAKE_CXX_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG}" )
@@ -143,6 +160,12 @@ else()
143160
set ( CMAKE_C_FLAGS_RELWITHDEBINFO "${CMAKE_C_FLAGS_RELEASE} -g3")
144161
set ( CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_C_FLAGS_RELWITHDEBINFO}")
145162
set ( CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFO "${CMAKE_EXE_LINKER_FLAGS_RELEASE}" )
163+
164+
if (APPLE)
165+
set ( CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -Wno-unqualified-std-cast-call" )
166+
set ( CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -Wno-unqualified-std-cast-call" )
167+
set ( CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -Wno-unqualified-std-cast-call" )
168+
endif()
146169
endif()
147170

148171
if (KIWI_CPU_ARCH MATCHES "x86_64")
@@ -157,21 +180,36 @@ if (KIWI_CPU_ARCH MATCHES "x86_64")
157180
${CORE_SRCS}
158181
src/archImpl/avx2.cpp
159182
src/archImpl/avx512bw.cpp
183+
src/archImpl/avx512vnni.cpp
160184
)
185+
# If AVX-VNNI is supported (MSVC, GCC 11+ or Clang 11+)
186+
if (AVX_VNNI_SUPPORTED)
187+
set( CORE_SRCS
188+
${CORE_SRCS}
189+
src/archImpl/avx_vnni.cpp
190+
)
191+
endif()
161192
endif()
193+
162194
if(MSVC)
163195
set_source_files_properties(src/archImpl/sse2.cpp PROPERTIES COMPILE_FLAGS "/arch:SSE2")
164196
set_source_files_properties(src/archImpl/sse4_1.cpp PROPERTIES COMPILE_FLAGS "/arch:SSE2")
165197
if (KIWI_USE_CPUINFO)
166198
set_source_files_properties(src/archImpl/avx2.cpp PROPERTIES COMPILE_FLAGS "/arch:AVX2")
199+
set_source_files_properties(src/archImpl/avx_vnni.cpp PROPERTIES COMPILE_FLAGS "/arch:AVX2")
167200
set_source_files_properties(src/archImpl/avx512bw.cpp PROPERTIES COMPILE_FLAGS "/arch:AVX512")
201+
set_source_files_properties(src/archImpl/avx512vnni.cpp PROPERTIES COMPILE_FLAGS "/arch:AVX512")
168202
endif()
169203
else()
170204
set_source_files_properties(src/archImpl/sse2.cpp PROPERTIES COMPILE_FLAGS "-msse2")
171205
set_source_files_properties(src/archImpl/sse4_1.cpp PROPERTIES COMPILE_FLAGS "-msse2 -msse4.1")
172206
if (KIWI_USE_CPUINFO)
173207
set_source_files_properties(src/archImpl/avx2.cpp PROPERTIES COMPILE_FLAGS "-mavx -mavx2 -mfma")
174-
set_source_files_properties(src/archImpl/avx512bw.cpp PROPERTIES COMPILE_FLAGS "-mavx512f -mavx512bw")
208+
set_source_files_properties(src/archImpl/avx512bw.cpp PROPERTIES COMPILE_FLAGS "-mavx -mavx2 -mfma -mavx512f -mavx512vl -mavx512dq -mavx512bw")
209+
set_source_files_properties(src/archImpl/avx512vnni.cpp PROPERTIES COMPILE_FLAGS "-mavx -mavx2 -mfma -mavx512f -mavx512vl -mavx512dq -mavx512bw -mavx512vnni")
210+
if (AVX_VNNI_SUPPORTED)
211+
set_source_files_properties(src/archImpl/avx_vnni.cpp PROPERTIES COMPILE_FLAGS "-mavx -mavx2 -mfma -mavxvnni")
212+
endif()
175213
endif()
176214
endif()
177215
elseif (KIWI_CPU_ARCH MATCHES "arm64")
@@ -191,12 +229,14 @@ add_library( "${PROJECT_NAME}_static" STATIC
191229
${CORE_SRCS}
192230
src/capi/kiwi_c.cpp
193231
${CPUINFO_OBJECTS_STATIC}
232+
${STREAMVBYTE_OBJECTS}
194233
)
195234

196235
add_library( "${PROJECT_NAME}" SHARED
197236
${CORE_SRCS}
198237
src/capi/kiwi_c.cpp
199238
${CPUINFO_OBJECTS_SHARED}
239+
${STREAMVBYTE_OBJECTS}
200240
)
201241

202242
# Install the kiwi library as well as header files to (`include/kiwi` directory)
@@ -265,6 +305,9 @@ if(MSVC)
265305
target_compile_options("${PROJECT_NAME}_static" PUBLIC
266306
/MT
267307
)
308+
target_compile_options("streamvbyte" PUBLIC
309+
/MT
310+
)
268311
endif()
269312

270313
target_compile_options("${PROJECT_NAME}" PUBLIC

Diff for: bindings/java/CMakeLists.txt

+1-1
Original file line number | Diff line number | Diff line change
@@ -9,8 +9,8 @@ set(CMAKE_JAVA_COMPILE_FLAGS -source 8 -target 8 -encoding utf-8)
99
set(pkg_name "KiwiJava-${PROJECT_VERSION}")
1010
add_library (${pkg_name} SHARED kiwi_java.cpp
1111
$<TARGET_OBJECTS:${PROJECT_NAME}>
12-
$<TARGET_OBJECTS:clog>
1312
$<TARGET_OBJECTS:cpuinfo>
13+
$<TARGET_OBJECTS:streamvbyte>
1414
)
1515
if(UNIX AND NOT APPLE)
1616
target_link_libraries( ${pkg_name}

Diff for: bindings/java/JniUtils.hpp

+1
Original file line number | Diff line number | Diff line change
@@ -6,6 +6,7 @@
66
#include <vector>
77
#include <optional>
88
#include <iostream>
9+
#include <cstdint>
910

1011
#include <jni.h>
1112

Diff for: bindings/java/kiwi_java.cpp

+18-1
Original file line number | Diff line number | Diff line change
@@ -95,6 +95,23 @@ namespace jni
9595
}
9696
};
9797

98+
template<>
99+
struct ValueBuilder<kiwi::ModelType> : public ValueBuilder<uint32_t>
100+
{
101+
using CppType = kiwi::ModelType;
102+
using JniType = jint;
103+
104+
CppType fromJava(JNIEnv* env, JniType v)
105+
{
106+
return (CppType)v;
107+
}
108+
109+
JniType toJava(JNIEnv* env, CppType v)
110+
{
111+
return (JniType)v;
112+
}
113+
};
114+
98115
template<>
99116
struct ValueBuilder<kiwi::Match> : public ValueBuilder<uint32_t>
100117
{
@@ -564,7 +581,7 @@ JNIEXPORT jint JNICALL JNI_OnLoad(JavaVM* vm, void* reserved)
564581
.template method<&JTypoTransformer::scaleCost>("_scaleCost"),
565582

566583
jni::define<JKiwiBuilder>()
567-
.template ctor<std::string, size_t, kiwi::BuildOption, bool>()
584+
.template ctor<std::string, size_t, kiwi::BuildOption, kiwi::ModelType>()
568585
.template method<&JKiwiBuilder::addWord>("addWord")
569586
.template method<&JKiwiBuilder::addWord2>("addWord")
570587
.template method<&JKiwiBuilder::addPreAnalyzedWord>("addPreAnalyzedWord")

Diff for: bindings/java/kr/pe/bab2min/Kiwi.java

+3-3
Original file line number | Diff line number | Diff line change
@@ -12,7 +12,7 @@
1212

1313
public class Kiwi implements AutoCloseable {
1414
private long _inst;
15-
final private static String _version = "0.20.4";
15+
final private static String _version = "0.21.0";
1616

1717
public static class Match {
1818
final static public int none = 0,
@@ -345,8 +345,8 @@ public Kiwi(long _inst) {
345345
this._inst = _inst;
346346
}
347347

348-
public static Kiwi init(String modelPath, int numWorkers, int buildOptions, boolean useSBG) throws Exception {
349-
try(KiwiBuilder b = new KiwiBuilder(modelPath, numWorkers, buildOptions, useSBG)) {
348+
public static Kiwi init(String modelPath, int numWorkers, int buildOptions, int modelType) throws Exception {
349+
try(KiwiBuilder b = new KiwiBuilder(modelPath, numWorkers, buildOptions, modelType)) {
350350
return b.build();
351351
}
352352
}

Diff for: bindings/java/kr/pe/bab2min/KiwiBuilder.java

+14-6
Original file line number | Diff line number | Diff line change
@@ -12,6 +12,14 @@ public static class BuildOption {
1212
default_ = integrateAllomorph | loadDefaultDict | loadTypoDict | loadMultiDict;
1313
}
1414

15+
public static class ModelType {
16+
final static public int none = 0,
17+
knlm = 1,
18+
sbg = 2,
19+
cong = 3,
20+
congGlobal = 4;
21+
}
22+
1523
public static class AnalyzedMorph {
1624
public String form;
1725
public byte tag = Kiwi.POSTag.nng;
@@ -113,20 +121,20 @@ public KiwiBuilder(long _inst) {
113121
this._inst = _inst;
114122
}
115123

116-
public KiwiBuilder(String modelPath, int numWorkers, int buildOptions, boolean useSBG) {
117-
ctor(modelPath, numWorkers, buildOptions, useSBG);
124+
public KiwiBuilder(String modelPath, int numWorkers, int buildOptions, int modelType) {
125+
ctor(modelPath, numWorkers, buildOptions, modelType);
118126
}
119127

120128
public KiwiBuilder(String modelPath, int numWorkers, int buildOptions) {
121-
ctor(modelPath, numWorkers, buildOptions, false);
129+
ctor(modelPath, numWorkers, buildOptions, ModelType.none);
122130
}
123131

124132
public KiwiBuilder(String modelPath, int numWorkers) {
125-
ctor(modelPath, numWorkers, BuildOption.default_, false);
133+
ctor(modelPath, numWorkers, BuildOption.default_, ModelType.none);
126134
}
127135

128136
public KiwiBuilder(String modelPath) {
129-
ctor(modelPath, 1, BuildOption.default_, false);
137+
ctor(modelPath, 1, BuildOption.default_, ModelType.none);
130138
}
131139

132140
protected void finalize() throws Exception {
@@ -137,7 +145,7 @@ public boolean isAlive() {
137145
return _inst != 0;
138146
}
139147

140-
private native void ctor(String modelPath, int numWorkers, int buildOptions, boolean useSBG);
148+
private native void ctor(String modelPath, int numWorkers, int buildOptions, int modelType);
141149

142150
@Override
143151
public native void close() throws Exception;

0 commit comments

Comments
 (0)