From e7a075845f621b31c4eaf1999d928e815c99f3bf Mon Sep 17 00:00:00 2001
From: zzzxl1993
Date: Wed, 5 Feb 2025 15:38:21 +0800
Subject: [PATCH 1/2] [opt](compile) optimize icu library compilation

---
 cmake/FindICU.cmake | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/cmake/FindICU.cmake b/cmake/FindICU.cmake
index 5d92a51777b..b3d7f1b871c 100644
--- a/cmake/FindICU.cmake
+++ b/cmake/FindICU.cmake
@@ -5,7 +5,7 @@ if(NOT TARGET icu)
     set(ICU_ARCHIVE "${CMAKE_CURRENT_SOURCE_DIR}/3rdparty/release-75-1.tar.gz")
     set(ICU_EXTRACT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/3rdparty")
     set(ICU_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/3rdparty/icu/icu4c/source")
-    set(ICU_INSTALL_DIR "${ICU_SOURCE_DIR}/install")
+    set(ICU_INSTALL_DIR "${CMAKE_BINARY_DIR}/icu-build")
     set(ICU_INCLUDE_DIR "${ICU_INSTALL_DIR}/include")
     set(ICU_LIBRARY_DIR "${ICU_INSTALL_DIR}/lib")
     set(ICU_UC_LIB "${ICU_LIBRARY_DIR}/libicuuc.a")
@@ -90,7 +90,6 @@ if(NOT TARGET icu)
             WORKING_DIRECTORY "${ICU_SOURCE_DIR}"
             RESULT_VARIABLE configure_result
             OUTPUT_QUIET
-            ERROR_QUIET
         )
 
         # Check if the configuration was successful
@@ -107,7 +106,6 @@ if(NOT TARGET icu)
             WORKING_DIRECTORY "${ICU_SOURCE_DIR}"
             RESULT_VARIABLE build_result
             OUTPUT_QUIET
-            ERROR_QUIET
         )
 
         # Check if the build was successful
@@ -124,7 +122,6 @@ if(NOT TARGET icu)
             WORKING_DIRECTORY "${ICU_SOURCE_DIR}"
             RESULT_VARIABLE install_result
             OUTPUT_QUIET
-            ERROR_QUIET
         )
 
         # Check if the install was successful

From 72acb7d82c03406ac0ceb7a923eb37e565ea1cb1 Mon Sep 17 00:00:00 2001
From: zzzxl1993
Date: Wed, 5 Feb 2025 17:02:11 +0800
Subject: [PATCH 2/2] [feature](analysis) Add a simple tokenizer

---
 .../CLucene/analysis/simple/SimpleAnalyzer.h  | 44 +++++++++
 .../analysis/simple/SimpleTokenizer.cpp       | 93 +++++++++++++++++++
 .../CLucene/analysis/simple/SimpleTokenizer.h | 39 ++++++++
 src/core/CMakeLists.txt                       |  1 +
 4 files changed, 177 insertions(+)
 create mode 100644 src/core/CLucene/analysis/simple/SimpleAnalyzer.h
 create mode 100644 src/core/CLucene/analysis/simple/SimpleTokenizer.cpp
 create mode 100644 src/core/CLucene/analysis/simple/SimpleTokenizer.h

diff --git a/src/core/CLucene/analysis/simple/SimpleAnalyzer.h b/src/core/CLucene/analysis/simple/SimpleAnalyzer.h
new file mode 100644
index 00000000000..202a60241a0
--- /dev/null
+++ b/src/core/CLucene/analysis/simple/SimpleAnalyzer.h
@@ -0,0 +1,44 @@
+#pragma once
+
+#include <memory>
+
+#include "SimpleTokenizer.h"
+
+namespace lucene::analysis_v2 {
+
+/// Analyzer that feeds each field through a single SimpleTokenizer.
+class SimpleAnalyzer : public Analyzer {
+public:
+    /// Lowercasing is on by default; the tokenizer is told not to own the reader.
+    SimpleAnalyzer() {
+        _lowercase = true;
+        _ownReader = false;
+    }
+
+    ~SimpleAnalyzer() override = default;
+
+    bool isSDocOpt() override { return true; }
+
+    /// Returns a freshly allocated tokenizer bound to `reader`; this analyzer keeps no handle to it.
+    TokenStream* tokenStream(const TCHAR* fieldName, lucene::util::Reader* reader) override {
+        auto* tokenizer = _CLNEW SimpleTokenizer(_lowercase, _ownReader);
+        tokenizer->reset(reader);
+        return tokenizer;
+    }
+
+    /// Rebinds the analyzer-owned tokenizer to `reader`, creating it on first use.
+    /// The returned pointer stays owned by this analyzer.
+    TokenStream* reusableTokenStream(const TCHAR* fieldName,
+                                     lucene::util::Reader* reader) override {
+        if (tokenizer_ == nullptr) {
+            tokenizer_ = std::make_unique<SimpleTokenizer>(_lowercase, _ownReader);
+        }
+        tokenizer_->reset(reader);
+        return tokenizer_.get();
+    }
+
+private:
+    std::unique_ptr<SimpleTokenizer> tokenizer_;
+};
+
+} // namespace lucene::analysis_v2
\ No newline at end of file
diff --git a/src/core/CLucene/analysis/simple/SimpleTokenizer.cpp b/src/core/CLucene/analysis/simple/SimpleTokenizer.cpp
new file mode 100644
index 00000000000..efa65a51ac7
--- /dev/null
+++ b/src/core/CLucene/analysis/simple/SimpleTokenizer.cpp
@@ -0,0 +1,93 @@
+#include "SimpleTokenizer.h"
+
+#include <unicode/uchar.h>
+#include <unicode/utf8.h>
+
+#include <algorithm>
+
+namespace lucene::analysis_v2 {
+
+SimpleTokenizer::SimpleTokenizer() {
+    Tokenizer::lowercase = false;
+    Tokenizer::ownReader = false;
+}
+
+SimpleTokenizer::SimpleTokenizer(bool lowercase, bool ownReader) : SimpleTokenizer() {
+    Tokenizer::lowercase = lowercase;
+    Tokenizer::ownReader = ownReader;
+}
+
+// Hands out the tokens collected by cut() one at a time; nullptr once exhausted.
+Token* SimpleTokenizer::next(Token* token) {
+    if (bufferIndex >= dataLen) {
+        return nullptr;
+    }
+
+    // Case folding is done once in cut(), so the view is passed through untouched.
+    const std::string_view& token_text = tokens_text[bufferIndex++];
+    // Over-long terms are truncated to LUCENE_MAX_WORD_LEN bytes rather than dropped.
+    const size_t size = std::min(token_text.size(), static_cast<size_t>(LUCENE_MAX_WORD_LEN));
+    token->setNoCopy(token_text.data(), 0, size);
+    return token;
+}
+
+// Copies the reader's content into buffer_ and tokenizes it eagerly via cut().
+void SimpleTokenizer::reset(lucene::util::Reader* input) {
+    bufferIndex = 0;
+    dataLen = 0;
+    tokens_text.clear();
+
+    buffer_.resize(input->size());
+    const int32_t numRead = input->readCopy(buffer_.data(), 0, buffer_.size());
+    // Keep only the bytes that were actually delivered, so a short read cannot
+    // leave zero-filled padding behind for cut() to scan.
+    // NOTE(review): assumes readCopy returns the byte count copied - confirm.
+    buffer_.resize(numRead > 0 ? static_cast<size_t>(numRead) : 0);
+
+    cut();
+
+    dataLen = static_cast<int32_t>(tokens_text.size());
+}
+
+// Splits buffer_ into tokens: each run of bytes accepted by is_alnum() is one
+// token, and every code point in category U_OTHER_LETTER is a token of its own.
+// Every other code point is skipped and therefore acts as a separator.
+void SimpleTokenizer::cut() {
+    auto* s = reinterpret_cast<uint8_t*>(buffer_.data());
+    const auto length = static_cast<int32_t>(buffer_.size());
+
+    for (int32_t i = 0; i < length;) {
+        if (is_alnum(s[i])) {
+            const int32_t start = i;
+            while (i < length) {
+                const uint8_t nextByte = s[i];
+                if (!is_alnum(nextByte)) {
+                    break;
+                }
+                // Fold case in place, but only when the tokenizer was asked to.
+                if (Tokenizer::lowercase) {
+                    s[i] = to_lower(nextByte);
+                }
+                i++;
+            }
+            tokens_text.emplace_back(reinterpret_cast<const char*>(s + start), i - start);
+        } else {
+            UChar32 c = U_SENTINEL;
+            const int32_t prev_i = i;
+
+            // U8_NEXT advances i past the sequence and reports ill-formed input
+            // by setting c to a negative value.
+            U8_NEXT(s, i, length, c);
+
+            if (c < 0) {
+                _CLTHROWT(CL_ERR_Runtime, "invalid UTF-8 sequence");
+            }
+
+            if (u_charType(c) == U_OTHER_LETTER) {
+                tokens_text.emplace_back(reinterpret_cast<const char*>(s + prev_i), i - prev_i);
+            }
+        }
+    }
+}
+
+} // namespace lucene::analysis_v2
\ No newline at end of file
diff --git a/src/core/CLucene/analysis/simple/SimpleTokenizer.h b/src/core/CLucene/analysis/simple/SimpleTokenizer.h
new file mode 100644
index 00000000000..6803580ec67
--- /dev/null
+++ b/src/core/CLucene/analysis/simple/SimpleTokenizer.h
@@ -0,0 +1,39 @@
+#pragma once
+
+#include <cstdint>
+#include <string>
+#include <string_view>
+#include <vector>
+
+#include "CLucene.h"
+#include "CLucene/analysis/AnalysisHeader.h"
+#include "CLucene/analysis/icu/ICUCommon.h"
+
+// NOTE(review): a using-directive at header scope leaks into every includer; kept
+// because code that includes this header may already rely on it.
+using namespace lucene::analysis;
+
+namespace lucene::analysis_v2 {
+
+/// Tokenizer that copies its whole input on reset() and splits it up front;
+/// next() then just walks the precomputed token list.
+class SimpleTokenizer : public Tokenizer {
+public:
+    SimpleTokenizer();
+    SimpleTokenizer(bool lowercase, bool ownReader);
+    ~SimpleTokenizer() override = default;
+
+    Token* next(Token* token) override;
+    void reset(lucene::util::Reader* input) override;
+
+    /// Appends the tokens found in buffer_ to tokens_text.
+    void cut();
+
+private:
+    int32_t bufferIndex = 0; // index of the next token next() will return
+    int32_t dataLen = 0;     // number of tokens produced by the last reset()
+    std::string buffer_;     // private copy of the reader's bytes
+    std::vector<std::string_view> tokens_text; // views into buffer_
+};
+
+} // namespace lucene::analysis_v2
\ No newline at end of file
diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt
index 7fd9cabc60b..d860f561961 100644
--- a/src/core/CMakeLists.txt
+++ b/src/core/CMakeLists.txt
@@ -76,6 +76,7 @@ SET(clucene_core_Files
     ./CLucene/analysis/icu/DefaultICUTokenizerConfig.cpp
     ./CLucene/analysis/icu/ICUTokenizer.cpp
     ./CLucene/analysis/icu/ScriptIterator.cpp
+    ./CLucene/analysis/simple/SimpleTokenizer.cpp
     ./CLucene/analysis/Analyzers.cpp
     ./CLucene/analysis/AnalysisHeader.cpp
     ./CLucene/store/MMapInput.cpp