-
Notifications
You must be signed in to change notification settings - Fork 40
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[feature](analysis) Add new Chinese tokenizer IK #269
Open
Ryan19929
wants to merge
2
commits into
apache:clucene
Choose a base branch
from
Ryan19929:clucene-ik-20250102
base: clucene
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
Open
Changes from all commits
Commits
Show all changes
2 commits
Select commit
Hold shift + click to select a range
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,63 @@ | ||
#include "IKTokenizer.h" | ||
|
||
#include "CLucene/_ApiHeader.h" | ||
#include "CLucene/analysis/ik/core/IKSegmenter.h" | ||
#include "CLucene/util/CLStreams.h" | ||
|
||
CL_NS_DEF2(analysis, ik) | ||
CL_NS_USE(analysis) | ||
CL_NS_USE(util) | ||
|
||
// Constructs an IK tokenizer with default flags (no lowercasing, reader not owned).
// @param reader  character stream to tokenize; NOT owned by this tokenizer.
// @param config  shared IK configuration; stored and reused on every reset().
IKTokenizer::IKTokenizer(Reader* reader, std::shared_ptr<Configuration> config)
        : Tokenizer(reader), config_(config) {
    // reset() eagerly segments the whole stream into tokens_text_. Called from
    // the ctor body the dynamic type is already IKTokenizer, so this resolves
    // to IKTokenizer::reset as intended.
    reset(reader);
    Tokenizer::lowercase = false;
    Tokenizer::ownReader = false;
}
|
||
// Constructs an IK tokenizer with explicit mode flags.
// @param reader     character stream to tokenize.
// @param config     shared IK configuration.
// @param isSmart    true = smart (coarse) segmentation; forwarded into config.
// @param lowercase  enable ASCII lowercasing of emitted tokens.
// @param ownReader  whether this tokenizer owns (and must free) the reader.
//
// NOTE(review): this mutates the *shared* Configuration (smart mode and
// lowercase), so every other holder of the same shared_ptr observes the
// change — confirm this aliasing is intended.
IKTokenizer::IKTokenizer(Reader* reader, std::shared_ptr<Configuration> config, bool isSmart,
                         bool lowercase, bool ownReader)
        : Tokenizer(reader), config_(config) {
    config_->setUseSmart(isSmart);
    config_->setEnableLowercase(lowercase);
    reset(reader);  // segment eagerly; must run after the config flags are set
    Tokenizer::lowercase = lowercase;
    Tokenizer::ownReader = ownReader;
}
|
||
// Emits the next pre-segmented token, or nullptr once all tokens produced by
// reset() have been consumed. The token points directly into tokens_text_
// (setNoCopy), so it stays valid only as long as this tokenizer does.
Token* IKTokenizer::next(Token* token) {
    if (buffer_index_ >= data_length_) {
        return nullptr;  // exhausted
    }

    std::string& text = tokens_text_[buffer_index_++];
    const size_t emit_len = std::min(text.size(), static_cast<size_t>(LUCENE_MAX_WORD_LEN));

    // Lowercase in place, but only for tokens whose first byte is ASCII —
    // multi-byte (CJK) tokens are left untouched.
    const bool ascii_leading = !text.empty() && static_cast<uint8_t>(text[0]) < 0x80;
    if (Tokenizer::lowercase && ascii_leading) {
        for (char& ch : text) {
            ch = to_lower(ch);
        }
    }

    token->setNoCopy(text.data(), 0, emit_len);
    return token;
}
|
||
// Re-initializes the tokenizer over a new reader and eagerly runs IK
// segmentation, collecting every lexeme's text into tokens_text_.
// @param reader  new input stream; stored in Tokenizer::input (freed per ownReader).
void IKTokenizer::reset(Reader* reader) {
    this->input = reader;
    this->buffer_index_ = 0;
    this->data_length_ = 0;
    this->tokens_text_.clear();

    // NOTE(review): buffer_ is reserved here but never visibly read or written
    // in this translation unit — confirm it is still needed or drop the member.
    buffer_.reserve(input->size());

    // NOTE(review): IKSegmentSingleton hands out one process-wide IKSegmenter;
    // setting its context here races if two tokenizers run concurrently —
    // verify single-threaded use or add external synchronization.
    IKSegmentSingleton::getInstance().setContext(reader, config_);

    Lexeme lexeme;
    while (IKSegmentSingleton::getInstance().next(lexeme)) {
        // NOTE(review): std::move(lexeme.getText()) only avoids a copy if
        // getText() returns a mutable (non-const) reference or value — confirm
        // its signature; otherwise this silently copies each token.
        tokens_text_.emplace_back(std::move(lexeme.getText()));
    }

    // size() narrows size_t -> int32_t; assumed fine for realistic token counts.
    data_length_ = tokens_text_.size();
}
|
||
CL_NS_END2 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
#ifndef CLUCENE_IKTOKENIZER_H | ||
#define CLUCENE_IKTOKENIZER_H | ||
#include <memory> | ||
#include <string_view> | ||
|
||
#include "CLucene.h" | ||
#include "CLucene/analysis/AnalysisHeader.h" | ||
#include "CLucene/analysis/LanguageBasedAnalyzer.h" | ||
#include "CLucene/analysis/ik/cfg/Configuration.h" | ||
#include "CLucene/analysis/ik/core/IKSegmenter.h" | ||
CL_NS_DEF2(analysis, ik) | ||
CL_NS_USE(analysis) | ||
|
||
class IKSegmentSingleton { | ||
public: | ||
static IKSegmenter& getInstance() { | ||
static IKSegmenter instance; | ||
return instance; | ||
} | ||
|
||
private: | ||
IKSegmentSingleton() = default; | ||
}; | ||
|
||
// Tokenizer wrapping the IK Chinese word segmenter.
//
// The entire input is segmented up front in reset(); next() then hands out
// the pre-computed token strings one by one without copying (setNoCopy), so
// emitted tokens are only valid while this tokenizer is alive.
class IKTokenizer : public lucene::analysis::Tokenizer {
private:
    int32_t buffer_index_ {0};  // index of the next token to emit
    int32_t data_length_ {0};   // total number of segmented tokens
    // NOTE(review): reserved in reset() but otherwise unused here — confirm needed.
    std::string buffer_;
    std::vector<std::string> tokens_text_;   // all token texts produced by reset()
    std::shared_ptr<Configuration> config_;  // shared IK config (5-arg ctor mutates it)

public:
    explicit IKTokenizer(lucene::util::Reader* reader, std::shared_ptr<Configuration> config);
    explicit IKTokenizer(lucene::util::Reader* reader, std::shared_ptr<Configuration> config,
                         bool is_smart, bool use_lowercase, bool own_reader = false);
    ~IKTokenizer() override = default;

    // Returns the next pre-segmented token, or nullptr when exhausted.
    lucene::analysis::Token* next(lucene::analysis::Token* token) override;
    // Re-reads and re-segments the given reader from scratch.
    void reset(lucene::util::Reader* reader) override;
};
|
||
CL_NS_END2 | ||
#endif //CLUCENE_IKTOKENIZER_H |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,71 @@ | ||
#ifndef CLUCENE_CONFIGURATION_H | ||
#define CLUCENE_CONFIGURATION_H | ||
|
||
#include <string>
#include <vector>
|
||
CL_NS_DEF2(analysis, ik) | ||
|
||
// TODO(whj): Optimize the design of the Configuration class, remove duplicate configurations (like mode and lowercase) | ||
// Runtime configuration for the IK analyzer: segmentation mode, lowercasing,
// the dictionary root path, and the dictionary file names to load.
//
// TODO(whj): Optimize the design of the Configuration class, remove duplicate
// configurations (like mode and lowercase)
class Configuration {
private:
    bool use_smart_;         // true = smart (coarse) segmentation; false = max-word
    bool enable_lowercase_;  // lowercase ASCII tokens during tokenization
    std::string dict_path_;  // root directory holding the dictionary files

    // Built-in dictionary file names (resolved relative to dict_path_).
    struct DictFiles {
        std::string main {"main.dic"};
        std::string quantifier {"quantifier.dic"};
        std::string stopwords {"stopword.dic"};
    } dict_files_;

    std::vector<std::string> ext_dict_files_;            // extra word dictionaries
    std::vector<std::string> ext_stop_word_dict_files_;  // extra stop-word dictionaries

public:
    // Defaults: smart segmentation on, lowercasing on, standard extra dictionaries.
    Configuration() : use_smart_(true), enable_lowercase_(true) {
        ext_dict_files_ = {"extra_main.dic", "extra_single_word.dic", "extra_single_word_full.dic",
                           "extra_single_word_low_freq.dic"};

        ext_stop_word_dict_files_ = {"extra_stopword.dic"};
    }

    bool isUseSmart() const { return use_smart_; }
    // Fluent setter: returns *this to allow chaining.
    Configuration& setUseSmart(bool smart) {
        use_smart_ = smart;
        return *this;
    }

    bool isEnableLowercase() const { return enable_lowercase_; }
    Configuration& setEnableLowercase(bool enable) {
        enable_lowercase_ = enable;
        return *this;
    }

    // Return by const reference: the original returned std::string by value,
    // copying the path on every call. Callers that copied the result keep
    // working unchanged.
    const std::string& getDictPath() const { return dict_path_; }
    Configuration& setDictPath(const std::string& path) {
        dict_path_ = path;
        return *this;
    }

    void setMainDictFile(const std::string& file) { dict_files_.main = file; }
    void setQuantifierDictFile(const std::string& file) { dict_files_.quantifier = file; }
    void setStopWordDictFile(const std::string& file) { dict_files_.stopwords = file; }

    const std::string& getMainDictFile() const { return dict_files_.main; }
    const std::string& getQuantifierDictFile() const { return dict_files_.quantifier; }
    const std::string& getStopWordDictFile() const { return dict_files_.stopwords; }

    void addExtDictFile(const std::string& filePath) { ext_dict_files_.push_back(filePath); }
    void addExtStopWordDictFile(const std::string& filePath) {
        ext_stop_word_dict_files_.push_back(filePath);
    }

    const std::vector<std::string>& getExtDictFiles() const { return ext_dict_files_; }
    const std::vector<std::string>& getExtStopWordDictFiles() const {
        return ext_stop_word_dict_files_;
    }
};
|
||
CL_NS_END2 | ||
|
||
#endif //CLUCENE_CONFIGURATION_H |
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Suggest separating the IK and Jieba enums for better clarity.