Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[feature](analysis) add new chinese tokenizer IK #269

Open
wants to merge 2 commits into
base: clucene
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,8 @@ INCLUDE_DIRECTORIES( ${_CL_BOOST_INCLUDE_PATH} )

#include the projects
ADD_SUBDIRECTORY (src/ext)
# Expose the vendored parallel_hashmap headers (bundled under src/ext) to every
# target; the relative path is resolved against the current source directory.
set(PARALLEL_HASHMAP_INCLUDE_DIR "src/ext/parallel_hashmap")
include_directories(${PARALLEL_HASHMAP_INCLUDE_DIR})
include(cmake/FindRoaring.cmake)
find_package(Roaring REQUIRED)

Expand Down
41 changes: 36 additions & 5 deletions src/contribs-lib/CLucene/analysis/LanguageBasedAnalyzer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@
#include "CLucene/analysis/Analyzers.h"
#include "CLucene/analysis/cjk/CJKAnalyzer.h"
#include "CLucene/analysis/jieba/ChineseTokenizer.h"
#include "CLucene/analysis/ik/IKTokenizer.h"
#include "CLucene/analysis/ik/dic/Dictionary.h"
#include "CLucene/analysis/standard/StandardFilter.h"
#include "CLucene/analysis/standard/StandardTokenizer.h"
#include "CLucene/snowball/SnowballFilter.h"
Expand All @@ -20,6 +22,7 @@ CL_NS_USE2(analysis, cjk)
CL_NS_USE2(analysis, jieba)
CL_NS_USE2(analysis, standard)
CL_NS_USE2(analysis, snowball)
CL_NS_USE2(analysis, ik)

CL_NS_DEF(analysis)

Expand All @@ -33,6 +36,8 @@ LanguageBasedAnalyzer::LanguageBasedAnalyzer(const TCHAR *language, bool stem, A
this->stem = stem;
this->mode = mode;
Analyzer::_lowercase = false;
ikConfig = std::make_shared<CL_NS2(analysis,ik)::Configuration>();
ikConfig->setUseSmart(mode == AnalyzerMode::IK_Smart);
}

LanguageBasedAnalyzer::~LanguageBasedAnalyzer() {
Expand Down Expand Up @@ -77,6 +82,12 @@ void LanguageBasedAnalyzer::initDict(const std::string &dictPath) {
}

CL_NS2(analysis, jieba)::ChineseTokenizer::init(&chineseDict);
} else if (_tcscmp(lang, _T("ik")) == 0) {
if (!ikConfig) {
ikConfig = std::make_shared<CL_NS2(analysis,ik)::Configuration>();
}
ikConfig->setDictPath(dictPath);
Dictionary::initial(*ikConfig);
}
}

Expand All @@ -90,9 +101,11 @@ TokenStream *LanguageBasedAnalyzer::reusableTokenStream(const TCHAR * /*fieldNam
} else if (_tcscmp(lang, _T("chinese")) == 0) {
streams->tokenStream = _CLNEW CL_NS2(analysis, jieba)::ChineseTokenizer(reader, mode, Analyzer::_lowercase);
streams->filteredTokenStream = streams->tokenStream;
} else if (_tcscmp(lang, _T("ik")) == 0) {
streams->tokenStream = _CLNEW CL_NS2(analysis, ik)::IKTokenizer(reader, ikConfig, mode==AnalyzerMode::IK_Smart, Analyzer::_lowercase);
streams->filteredTokenStream = streams->tokenStream;
} else {
CL_NS(util)::BufferedReader* bufferedReader = reader->__asBufferedReader();

if (bufferedReader == nullptr) {
streams->tokenStream = _CLNEW StandardTokenizer(
_CLNEW CL_NS(util)::FilteredBufferedReader(reader, false), true);
Expand All @@ -116,13 +129,21 @@ TokenStream *LanguageBasedAnalyzer::reusableTokenStream(const TCHAR * /*fieldNam
return streams->filteredTokenStream;
}

TokenStream *LanguageBasedAnalyzer::tokenStream(const TCHAR *fieldName, Reader *reader) {
TokenStream *ret = nullptr;
TokenStream* LanguageBasedAnalyzer::tokenStream(const TCHAR* fieldName, Reader* reader) {
TokenStream* ret = nullptr;
if (_tcscmp(lang, _T("cjk")) == 0) {
ret = _CLNEW CL_NS2(analysis, cjk)::CJKTokenizer(reader);
} else if (_tcscmp(lang, _T("chinese")) == 0) {
ret = _CLNEW CL_NS2(analysis, jieba)::ChineseTokenizer(reader, mode, Analyzer::_lowercase, Analyzer::_ownReader);
} else {
ret = _CLNEW CL_NS2(analysis, jieba)::ChineseTokenizer(
reader, mode, Analyzer::_lowercase, Analyzer::_ownReader);
} else if (_tcscmp(lang, _T("ik")) == 0) {
if (ikConfig) {
ret = _CLNEW CL_NS2(analysis, ik)::IKTokenizer(
reader, ikConfig, mode!=AnalyzerMode::IK_Max_Word, Analyzer::_lowercase, Analyzer::_ownReader);
} else {
_CLTHROWA(CL_ERR_NullPointer, std::string("no ikConfig for ik tokenizer").c_str());
}
}else {
CL_NS(util)::BufferedReader* bufferedReader = reader->__asBufferedReader();

if (bufferedReader == nullptr) {
Expand All @@ -143,4 +164,14 @@ TokenStream *LanguageBasedAnalyzer::tokenStream(const TCHAR *fieldName, Reader *
return ret;
}



// Install IK tokenizer settings on this analyzer. The settings are copied:
// when a Configuration already exists it is overwritten in place, so any
// tokenizer still holding the same shared_ptr observes the new values;
// otherwise a fresh copy of `cfg` is allocated.
void LanguageBasedAnalyzer::setIKConfiguration(const CL_NS2(analysis,ik)::Configuration& cfg) {
    if (ikConfig) {
        *ikConfig = cfg;
    } else {
        ikConfig = std::make_shared<CL_NS2(analysis,ik)::Configuration>(cfg);
    }
}

CL_NS_END
8 changes: 7 additions & 1 deletion src/contribs-lib/CLucene/analysis/LanguageBasedAnalyzer.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,18 @@
#ifndef _lucene_analysis_languagebasedanalyzer_
#define _lucene_analysis_languagebasedanalyzer_

#include <memory>
#include "CLucene/analysis/AnalysisHeader.h"
#include "CLucene/analysis/ik/cfg/Configuration.h"

CL_NS_DEF(analysis)

// Segmentation modes understood by LanguageBasedAnalyzer. Default/All/Search
// are used by the jieba ("chinese") tokenizer; IK_Smart/IK_Max_Word select the
// IK tokenizer's coarse vs. exhaustive segmentation.
// TODO(review): consider separating the IK and jieba modes into distinct enums
// for better clarity.
enum class AnalyzerMode {
    // jieba modes
    Default,
    All,
    Search,
    // IK modes
    IK_Smart,
    IK_Max_Word
};

class CLUCENE_CONTRIBS_EXPORT LanguageBasedAnalyzer : public CL_NS(analysis)::Analyzer {
Expand All @@ -37,6 +41,7 @@ class CLUCENE_CONTRIBS_EXPORT LanguageBasedAnalyzer : public CL_NS(analysis)::An
TCHAR lang[100]{};
bool stem;
AnalyzerMode mode{};
std::shared_ptr<CL_NS2(analysis,ik)::Configuration> ikConfig;

public:
explicit LanguageBasedAnalyzer(const TCHAR *language = nullptr, bool stem = true, AnalyzerMode mode = AnalyzerMode::All);
Expand All @@ -47,6 +52,7 @@ class CLUCENE_CONTRIBS_EXPORT LanguageBasedAnalyzer : public CL_NS(analysis)::An
void setLanguage(const TCHAR *language);
void setStem(bool s);
void setMode(AnalyzerMode m);
void setIKConfiguration(const CL_NS2(analysis,ik)::Configuration& cfg);
void initDict(const std::string &dictPath) override;
TokenStream *tokenStream(const TCHAR *fieldName, CL_NS(util)::Reader *reader) override;
TokenStream *reusableTokenStream(const TCHAR * /*fieldName*/, CL_NS(util)::Reader *reader) override;
Expand Down
63 changes: 63 additions & 0 deletions src/contribs-lib/CLucene/analysis/ik/IKTokenizer.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
#include "IKTokenizer.h"

#include "CLucene/_ApiHeader.h"
#include "CLucene/analysis/ik/core/IKSegmenter.h"
#include "CLucene/util/CLStreams.h"

CL_NS_DEF2(analysis, ik)
CL_NS_USE(analysis)
CL_NS_USE(util)

// Build a tokenizer over `reader` using the supplied IK settings.
// The whole stream is segmented eagerly (see reset()); next() then replays
// the collected tokens. Lowercase folding is off and the reader is not owned.
IKTokenizer::IKTokenizer(Reader* reader, std::shared_ptr<Configuration> config)
    : Tokenizer(reader), config_(std::move(config)) {  // sink param: move, don't copy the shared_ptr
    reset(reader);
    Tokenizer::lowercase = false;
    Tokenizer::ownReader = false;
}

// Build a tokenizer over `reader`, additionally selecting smart/max-word mode,
// ASCII lowercase folding, and whether this tokenizer owns (deletes) `reader`.
// NOTE(review): the two setters below mutate the Configuration object itself,
// which may be shared with the caller/other tokenizers — confirm this aliasing
// is intended.
IKTokenizer::IKTokenizer(Reader* reader, std::shared_ptr<Configuration> config, bool isSmart,
                         bool lowercase, bool ownReader)
    : Tokenizer(reader), config_(std::move(config)) {  // sink param: move, don't copy the shared_ptr
    config_->setUseSmart(isSmart);
    config_->setEnableLowercase(lowercase);
    reset(reader);
    Tokenizer::lowercase = lowercase;
    Tokenizer::ownReader = ownReader;
}

// Emit the next pre-segmented token, or nullptr once all tokens are consumed.
// Token text longer than LUCENE_MAX_WORD_LEN is truncated to that length.
Token* IKTokenizer::next(Token* token) {
    if (buffer_index_ >= data_length_) {
        return nullptr;
    }

    std::string& text = tokens_text_[buffer_index_++];
    const size_t token_len = std::min(text.size(), static_cast<size_t>(LUCENE_MAX_WORD_LEN));

    // Lowercase in place, but only for tokens whose first byte is ASCII —
    // multi-byte (e.g. CJK) tokens are left untouched.
    if (Tokenizer::lowercase && !text.empty() && static_cast<uint8_t>(text[0]) < 0x80) {
        for (char& ch : text) {
            ch = to_lower(ch);
        }
    }

    // Hand the token a view into tokens_text_ (no copy); the buffer outlives
    // the token until the next reset().
    token->setNoCopy(text.data(), 0, token_len);
    return token;
}

// Re-point the tokenizer at `reader` and eagerly segment the entire stream:
// every token's text is materialized into tokens_text_, which next() replays.
void IKTokenizer::reset(Reader* reader) {
    this->input = reader;
    this->buffer_index_ = 0;
    this->data_length_ = 0;
    this->tokens_text_.clear();

    // Pre-size the scratch buffer to the stream length.
    // NOTE(review): buffer_ is reserved here but never written anywhere in this
    // file — confirm it is still needed.
    buffer_.reserve(input->size());

    // The segmenter is a process-wide singleton (see IKSegmentSingleton).
    // NOTE(review): concurrent tokenizers would share — and race on — its
    // context; confirm tokenization is single-threaded.
    IKSegmentSingleton::getInstance().setContext(reader, config_);

    // Drain the segmenter, collecting each lexeme's text.
    Lexeme lexeme;
    while (IKSegmentSingleton::getInstance().next(lexeme)) {
        tokens_text_.emplace_back(std::move(lexeme.getText()));
    }

    // Cache the token count (narrowed to int32_t) as the replay bound for next().
    data_length_ = tokens_text_.size();
}

CL_NS_END2
44 changes: 44 additions & 0 deletions src/contribs-lib/CLucene/analysis/ik/IKTokenizer.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
#ifndef CLUCENE_IKTOKENIZER_H
#define CLUCENE_IKTOKENIZER_H
#include <memory>
#include <string_view>

#include "CLucene.h"
#include "CLucene/analysis/AnalysisHeader.h"
#include "CLucene/analysis/LanguageBasedAnalyzer.h"
#include "CLucene/analysis/ik/cfg/Configuration.h"
#include "CLucene/analysis/ik/core/IKSegmenter.h"
CL_NS_DEF2(analysis, ik)
CL_NS_USE(analysis)

// Meyers-singleton wrapper providing the single process-wide IKSegmenter
// instance shared by all IKTokenizer objects.
class IKSegmentSingleton {
public:
    static IKSegmenter& getInstance() {
        static IKSegmenter instance;
        return instance;
    }

    // A singleton handle must never be copied or assigned.
    IKSegmentSingleton(const IKSegmentSingleton&) = delete;
    IKSegmentSingleton& operator=(const IKSegmentSingleton&) = delete;

private:
    IKSegmentSingleton() = default;
};

// CLucene Tokenizer backed by the IK Chinese word segmenter.
// The entire input is segmented eagerly in reset(); next() then replays the
// collected token strings one at a time.
class IKTokenizer : public lucene::analysis::Tokenizer {
private:
    int32_t buffer_index_ {0};              // index of the next token to emit
    int32_t data_length_ {0};               // total number of segmented tokens
    std::string buffer_;                    // scratch buffer sized to the input stream
    std::vector<std::string> tokens_text_;  // token texts produced by reset()
    std::shared_ptr<Configuration> config_; // segmentation mode / dictionary settings

public:
    // Tokenize `reader` with the given settings; lowercase off, reader not owned.
    explicit IKTokenizer(lucene::util::Reader* reader, std::shared_ptr<Configuration> config);
    // As above, but also selects smart/max-word mode, lowercase folding, and
    // whether this tokenizer owns (and will delete) `reader`.
    explicit IKTokenizer(lucene::util::Reader* reader, std::shared_ptr<Configuration> config,
                         bool is_smart, bool use_lowercase, bool own_reader = false);
    ~IKTokenizer() override = default;

    // Returns the next token, or nullptr once all tokens are consumed.
    lucene::analysis::Token* next(lucene::analysis::Token* token) override;
    // Re-binds the tokenizer to `reader` and re-runs segmentation.
    void reset(lucene::util::Reader* reader) override;
};

CL_NS_END2
#endif //CLUCENE_IKTOKENIZER_H
71 changes: 71 additions & 0 deletions src/contribs-lib/CLucene/analysis/ik/cfg/Configuration.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
#ifndef CLUCENE_CONFIGURATION_H
#define CLUCENE_CONFIGURATION_H

#include <string>
#include <vector>

CL_NS_DEF2(analysis, ik)

// TODO(whj): Optimize the design of the Configuration class, remove duplicate configurations (like mode and lowercase)
// Runtime configuration for the IK tokenizer: segmentation mode, lowercase
// handling, and the locations of the built-in and user-extensible dictionaries.
// TODO(whj): Optimize the design of the Configuration class, remove duplicate
// configurations (like mode and lowercase).
class Configuration {
private:
    bool use_smart_ {true};        // true => smart (coarse) segmentation, false => max-word
    bool enable_lowercase_ {true}; // lowercase ASCII tokens during tokenization
    std::string dict_path_;        // base directory holding the dictionary files below

    // File names (relative to dict_path_) of the three built-in dictionaries.
    struct DictFiles {
        std::string main {"main.dic"};
        std::string quantifier {"quantifier.dic"};
        std::string stopwords {"stopword.dic"};
    } dict_files_;

    // User-extensible dictionaries, pre-seeded with the stock "extra_*" files.
    std::vector<std::string> ext_dict_files_ {
            "extra_main.dic", "extra_single_word.dic", "extra_single_word_full.dic",
            "extra_single_word_low_freq.dic"};
    std::vector<std::string> ext_stop_word_dict_files_ {"extra_stopword.dic"};

public:
    // All defaults are supplied by the in-class member initializers above.
    Configuration() = default;

    bool isUseSmart() const { return use_smart_; }
    // Setters return *this so calls can be chained fluently.
    Configuration& setUseSmart(bool smart) {
        use_smart_ = smart;
        return *this;
    }

    bool isEnableLowercase() const { return enable_lowercase_; }
    Configuration& setEnableLowercase(bool enable) {
        enable_lowercase_ = enable;
        return *this;
    }

    // Return by const reference to avoid copying the path on every lookup.
    const std::string& getDictPath() const { return dict_path_; }
    Configuration& setDictPath(const std::string& path) {
        dict_path_ = path;
        return *this;
    }

    void setMainDictFile(const std::string& file) { dict_files_.main = file; }
    void setQuantifierDictFile(const std::string& file) { dict_files_.quantifier = file; }
    void setStopWordDictFile(const std::string& file) { dict_files_.stopwords = file; }

    const std::string& getMainDictFile() const { return dict_files_.main; }
    const std::string& getQuantifierDictFile() const { return dict_files_.quantifier; }
    const std::string& getStopWordDictFile() const { return dict_files_.stopwords; }

    void addExtDictFile(const std::string& filePath) { ext_dict_files_.push_back(filePath); }
    void addExtStopWordDictFile(const std::string& filePath) {
        ext_stop_word_dict_files_.push_back(filePath);
    }

    const std::vector<std::string>& getExtDictFiles() const { return ext_dict_files_; }
    const std::vector<std::string>& getExtStopWordDictFiles() const {
        return ext_stop_word_dict_files_;
    }
};

CL_NS_END2

#endif //CLUCENE_CONFIGURATION_H
Loading