From f00e4dfc38420523451c563aaaaecf065a9a78f7 Mon Sep 17 00:00:00 2001 From: amzxyz Date: Mon, 9 Mar 2026 13:55:11 +0800 Subject: [PATCH 1/2] =?UTF-8?q?feat:=20=E5=BC=95=E5=85=A5=20SuperFilter=20?= =?UTF-8?q?(=E8=B6=85=E7=BA=A7=E6=BB=A4=E9=95=9C)=E9=AB=98=E8=87=AA?= =?UTF-8?q?=E5=AE=9A=E4=B9=89=E3=80=81=E9=AB=98=E6=80=A7=E8=83=BD=E7=9A=84?= =?UTF-8?q?=E6=B5=81=E6=B0=B4=E7=BA=BF=E5=80=99=E9=80=89=E8=AF=8D=E5=A4=84?= =?UTF-8?q?=E7=90=86=E5=BC=95=E6=93=8E?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/rime/gear/gears_module.cc | 4 +- src/rime/gear/super_filter.cc | 751 ++++++++++++++++++++++++++++++++++ src/rime/gear/super_filter.h | 113 +++++ 3 files changed, 867 insertions(+), 1 deletion(-) create mode 100644 src/rime/gear/super_filter.cc create mode 100644 src/rime/gear/super_filter.h diff --git a/src/rime/gear/gears_module.cc b/src/rime/gear/gears_module.cc index 59679b07cf..28ef58011c 100644 --- a/src/rime/gear/gears_module.cc +++ b/src/rime/gear/gears_module.cc @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include @@ -85,7 +86,8 @@ static void rime_gears_initialize() { r.Register("cjk_minifier", new Component); // alias r.Register("reverse_lookup_filter", new Component); r.Register("single_char_filter", new Component); - + r.Register("super_filter", new Component); + // formatters r.Register("shape_formatter", new Component); } diff --git a/src/rime/gear/super_filter.cc b/src/rime/gear/super_filter.cc new file mode 100644 index 0000000000..df3ca8ae7c --- /dev/null +++ b/src/rime/gear/super_filter.cc @@ -0,0 +1,751 @@ +// librime/src/rime/gear/super_filter.cc +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace rime { + +// Process-level global cache for the LevelDb instance. +// Prevents high I/O latency and stuttering when Rime frequently recreates sessions (e.g., switching windows). +struct SuperDbCache { + an db; + std::string db_name; + std::string files_sig; +}; + +static SuperDbCache& GetGlobalDbCache() { + static SuperDbCache cache; + return cache; +} + +static std::vector GetUtf8Offsets(const std::string& text) { + std::vector offsets; + size_t i = 0; + while (i < text.length()) { + offsets.push_back(i); + unsigned char c = text[i]; + if (c < 0x80) i += 1; + else if (c < 0xE0) i += 2; + else if (c < 0xF0) i += 3; + else i += 4; + } + offsets.push_back(text.length()); + return offsets; +} + +static std::vector Split(const std::string& str, const std::string& delim) { + std::vector tokens; + if (str.empty()) return tokens; + if (delim.empty()) { + tokens.push_back(str); + return tokens; + } + size_t prev = 0, pos = 0; + do { + pos = str.find(delim, prev); + if (pos == std::string::npos) pos = str.length(); + std::string token = str.substr(prev, pos - prev); + if (!token.empty()) tokens.push_back(token); + prev = pos + delim.length(); + } while (pos <= str.length() && prev < str.length()); + return tokens; +} + +SuperFilterTranslation::SuperFilterTranslation( + an inner, + const std::vector& rules, + an db, + Context* ctx, + const std::string& delimiter, + const std::string& comment_format, + bool is_chain) + : inner_(inner), rules_(rules), db_(db), ctx_(ctx), + delimiter_(delimiter), comment_format_(comment_format), is_chain_(is_chain) { + + size_t start = 0; + size_t end = ctx_->input().length(); + if (inner_ && !inner_->exhausted()) { + auto first_cand = inner_->Peek(); + if (first_cand) { + start = first_cand->start(); + end = first_cand->end(); + } + } + + // Pre-calculate abbreviation matches upon input change. + std::string seg_input = ctx_->input().substr(start, end - start); + GenerateAbbrevCandidates(seg_input, start, end); + UpdateExhausted(); +} + +void SuperFilterTranslation::UpdateExhausted() { + // Lazy evaluation: fetch only the required amount of candidates from the inner translation. + while (pending_candidates_.empty() && !inner_->exhausted()) { + ProcessNextInner(); + } + set_exhausted(index_cands_.empty() && quality_cands_.empty() && + pending_candidates_.empty() && lazy_cands_.empty() && inner_->exhausted()); +} + +an SuperFilterTranslation::Peek() { + if (exhausted()) return nullptr; + + // Dispatch priority: + // 1. Exact index matches (forced insertion) + if (!index_cands_.empty() && (yield_count_ + 1) >= index_cands_.front().value) { + return index_cands_.front().cand; + } + + // 2. Quality threshold matches (dynamic insertion) + if (!quality_cands_.empty()) { + if (pending_candidates_.empty() || pending_candidates_.front()->quality() < quality_cands_.front().value) { + return quality_cands_.front().cand; + } + } + + // 3. Regular pipeline candidates + if (!pending_candidates_.empty()) { + return pending_candidates_.front(); + } + + // 4. Flush remaining priority queues if the main pipeline is exhausted + if (!index_cands_.empty()) return index_cands_.front().cand; + if (!quality_cands_.empty()) return quality_cands_.front().cand; + + // 5. Fallback candidates (quality=0) + if (!lazy_cands_.empty()) { + return lazy_cands_.front(); + } + + return nullptr; +} + +bool SuperFilterTranslation::Next() { + if (exhausted()) return false; + + an p = Peek(); + if (!index_cands_.empty() && p == index_cands_.front().cand) index_cands_.pop_front(); + else if (!quality_cands_.empty() && p == quality_cands_.front().cand) quality_cands_.pop_front(); + else if (!pending_candidates_.empty() && p == pending_candidates_.front()) pending_candidates_.pop_front(); + else if (!lazy_cands_.empty() && p == lazy_cands_.front()) lazy_cands_.pop_front(); + + yield_count_++; + UpdateExhausted(); + return !exhausted(); +} + +// Forward Maximum Matching algorithm for segmenting and replacing long phrases. +std::string SuperFilterTranslation::SegmentConvert(const std::string& text, const std::string& prefix, bool sentence) { + if (!db_) return text; + + if (!sentence) { + std::string val; + if (db_->Fetch(prefix + text, &val)) { + auto parts = Split(val, delimiter_); + return parts.empty() ? text : parts[0]; + } + return text; + } + + std::vector offsets = GetUtf8Offsets(text); + if (offsets.size() <= 1) return text; + size_t char_count = offsets.size() - 1; + std::string result; + size_t i = 0; + const size_t MAX_LOOKAHEAD = 6; + + while (i < char_count) { + bool matched = false; + size_t max_j = std::min(i + MAX_LOOKAHEAD, char_count); + for (size_t j = max_j; j > i; --j) { + std::string sub_text = text.substr(offsets[i], offsets[j] - offsets[i]); + std::string val; + if (db_->Fetch(prefix + sub_text, &val)) { + auto parts = Split(val, delimiter_); + result += parts.empty() ? sub_text : parts[0]; + i = j; + matched = true; + break; + } + } + if (!matched) { + std::string single = text.substr(offsets[i], offsets[i+1] - offsets[i]); + std::string val; + if (db_->Fetch(prefix + single, &val)) { + auto parts = Split(val, delimiter_); + result += parts.empty() ? single : parts[0]; + } else { + result += single; + } + i++; + } + } + return result; +} + +void SuperFilterTranslation::GenerateAbbrevCandidates(const std::string& input_code, size_t start, size_t end) { + if (!db_) return; + abbrev_yielded_.clear(); + for (const auto& r : rules_) { + if (r.mode == "abbrev") { + bool is_active = r.always_on; + if (!is_active) { + // Dynamically check Rime options state + for (const auto& opt : r.options) { + if (ctx_->get_option(opt)) { is_active = true; break; } + } + } + if (!is_active) continue; + + std::string val; + if (!db_->Fetch(r.prefix + input_code, &val)) { + std::string upper_code = input_code; + for (auto& c : upper_code) { + c = static_cast(std::toupper(static_cast(c))); + } + db_->Fetch(r.prefix + upper_code, &val); + } + + if (!val.empty()) { + auto parts = Split(val, delimiter_); + int count = 0; + for (const auto& p : parts) { + std::string item_text = p; + std::string item_preedit = input_code; + + if (abbrev_yielded_.count(item_text)) continue; + abbrev_yielded_.insert(item_text); + count++; + + auto cand = New(r.cand_type, start, end, item_text, ""); + cand->set_preedit(item_preedit); + + if (count <= r.always_qty) { + if (r.order_type == "index") { + cand->set_quality(999); + index_cands_.push_back({cand, r.order_value + (count - 1)}); + } else if (r.order_type == "quality") { + cand->set_quality(r.order_value); + quality_cands_.push_back({cand, r.order_value}); + } + } else { + // Fallback candidates sink to the bottom with quality 0 + cand->set_quality(0); + lazy_cands_.push_back(cand); + } + } + } + } + } + + std::sort(index_cands_.begin(), index_cands_.end(), [](const InjectCand& a, const InjectCand& b) { + return a.value < b.value; + }); + std::sort(quality_cands_.begin(), quality_cands_.end(), [](const InjectCand& a, const InjectCand& b) { + return a.value > b.value; + }); +} + +struct CandData { + std::string text; + std::string comment; + std::string cand_type; + bool is_original; +}; + +void SuperFilterTranslation::ProcessNextInner() { + if (inner_->exhausted()) return; + auto cand = inner_->Peek(); + inner_->Next(); + + if (!cand) return; + + std::vector current_items; + current_items.push_back({cand->text(), cand->comment(), cand->type(), true}); + + if (db_) { + for (const auto& r : rules_) { + if (r.mode == "abbrev") continue; + + bool is_active = r.always_on; + if (!is_active) { + for (const auto& opt : r.options) { + if (ctx_->get_option(opt)) { is_active = true; break; } + } + } + if (!is_active) continue; + + std::vector next_items; + + for (const auto& item : current_items) { + std::string val; + if (r.sentence) { + std::string fmm_res = SegmentConvert(item.text, r.prefix, true); + if (fmm_res != item.text) val = fmm_res; + } else { + db_->Fetch(r.prefix + item.text, &val); + } + + if (!val.empty()) { + auto parts = Split(val, delimiter_); + if (r.t9_mode) { + for (auto& p : parts) { + size_t delim_pos = p.find("=="); + if (delim_pos != std::string::npos) { + p = p.substr(0, delim_pos); + } + } + } + std::string rule_comment = ""; + if (r.comment_mode == "text" && !item.text.empty()) { + std::string cfmt = comment_format_; + size_t pos = cfmt.find("%s"); + if (pos != std::string::npos) { + cfmt.replace(pos, 2, item.text); + rule_comment = cfmt; + } else { + rule_comment = item.text; + } + } else if (r.comment_mode == "append") { + rule_comment = item.comment; + } + + if (r.mode == "replace") { + for (size_t i = 0; i < parts.size(); ++i) { + std::string final_comment = (i == 0 && r.comment_mode == "none") ? "" : rule_comment; + std::string ctype = (i == 0 && item.is_original) ? item.cand_type : r.cand_type; + next_items.push_back({parts[i], final_comment, ctype, false}); + } + } else if (r.mode == "append") { + next_items.push_back(item); + for (const auto& p : parts) { + std::string final_comment = (r.comment_mode == "none") ? "" : rule_comment; + next_items.push_back({p, final_comment, r.cand_type, false}); + } + } else if (r.mode == "comment") { + std::string joined; + for(size_t i=0; i parallel_merged; + for (const auto& og : current_items) parallel_merged.push_back(og); + for (const auto& nx : next_items) { + if (!nx.is_original) parallel_merged.push_back(nx); + } + current_items = std::move(parallel_merged); + } + } + } + + for (const auto& result : current_items) { + auto nc = New(result.cand_type, cand->start(), cand->end(), result.text, result.comment); + nc->set_quality(cand->quality()); + nc->set_preedit(cand->preedit()); + pending_candidates_.push_back(nc); + } +} + +SuperFilter::SuperFilter(const Ticket& ticket) : Filter(ticket) { + if (ticket.schema) { + LoadConfig(ticket.schema->config()); + InitializeDb(); + } +} + +SuperFilter::~SuperFilter() { + // The globally cached LevelDb instance remains open across session lifetimes. +} + +void SuperFilter::LoadConfig(Config* config) { + config->GetString("super_filter/db_name", &db_name_); + + if (!db_name_.empty()) { + db_name_ = std::filesystem::path(db_name_).filename().string(); + } + + if (db_name_.empty() || db_name_ == "." || db_name_ == "..") { + db_name_ = "super_filter"; + } + db_name_ = "data/" + db_name_; + + config->GetString("super_filter/delimiter", &delimiter_); + if (delimiter_.empty()) delimiter_ = "|"; + config->GetString("super_filter/comment_format", &comment_format_); + if (comment_format_.empty()) comment_format_ = "〔%s〕"; + config->GetBool("super_filter/chain", &chain_); + + auto root = config->GetItem("super_filter/rules"); + if (auto rule_list = As(root)) { + for (size_t i = 0; i < rule_list->size(); ++i) { + auto item = As(rule_list->GetAt(i)); + if (!item) continue; + + SuperRule rule; + + if (auto name_val = As(item->Get("name"))) { + rule.name = name_val->str(); + } else { + rule.name = "Rule_" + std::to_string(i + 1); + } + + auto opt_node = item->Get("option"); + if (auto opt_val = As(opt_node)) { + if (opt_val->str() == "true") { + rule.always_on = true; + } else if (opt_val->str() == "false") { + // Explicitly frozen rule, option vector remains empty. + } else { + rule.options.push_back(opt_val->str()); + } + } else if (auto opt_list = As(opt_node)) { + for (size_t j=0; jsize(); ++j) { + if (auto v = As(opt_list->GetAt(j))) rule.options.push_back(v->str()); + } + } + + // Discard disabled or misconfigured rules during the parse phase to save CPU cycles. + if (!rule.always_on && rule.options.empty()) { + LOG(INFO) << "super_filter: [" << rule.name << "] frozen or missing option, safely ignored."; + continue; + } + + if (auto mode_val = As(item->Get("mode"))) rule.mode = mode_val->str(); + else rule.mode = "append"; + + if (rule.mode != "append" && rule.mode != "replace" && rule.mode != "comment" && rule.mode != "abbrev") { + LOG(WARNING) << "super_filter: [" << rule.name << "] unsupported mode '" << rule.mode << "', skipping."; + continue; + } + + if (auto sent_val = As(item->Get("sentence"))) { + if (sent_val->str() == "true") rule.sentence = true; + } + + if (auto pre_val = As(item->Get("prefix"))) { + rule.prefix = pre_val->str(); + } else { + rule.prefix = ""; + } + + if (auto cmod_val = As(item->Get("comment_mode"))) rule.comment_mode = cmod_val->str(); + else rule.comment_mode = "none"; + + if (auto ctype_val = As(item->Get("cand_type"))) rule.cand_type = ctype_val->str(); + else rule.cand_type = "derived"; + + // 解析 T9 模式开关,并设立严格防火墙:仅允许 abbrev 模式使用 + if (auto t9_val = As(item->Get("t9_mode"))) { + rule.t9_mode = (t9_val->str() == "true"); + if (rule.t9_mode && rule.mode != "abbrev") { + LOG(WARNING) << "super_filter: [" << rule.name << "] t9_mode 仅支持在 abbrev 模式下开启,已自动忽略。"; + rule.t9_mode = false; + } + } + + if (rule.mode == "abbrev") { + auto ord_val = As(item->Get("order")); + if (!ord_val) { + LOG(WARNING) << "super_filter: [" << rule.name << "] missing 'order' parameter in abbrev mode."; + continue; + } + auto parts = Split(ord_val->str(), ","); + if (parts.size() < 2) { + LOG(WARNING) << "super_filter: [" << rule.name << "] malformed 'order' format."; + continue; + } + try { + rule.order_type = parts[0]; + rule.order_value = std::stoi(parts[1]); + if (parts.size() >= 3) rule.always_qty = std::stoi(parts[2]); + } catch (...) { + LOG(WARNING) << "super_filter: [" << rule.name << "] parse exception in 'order'."; + continue; + } + } + + auto files_node = item->Get("files"); + if (!files_node || (!As(files_node) && !As(files_node))) { + LOG(WARNING) << "super_filter: [" << rule.name << "] missing 'files' dependency."; + continue; + } + + if (auto files_list = As(item->Get("files"))) { + for(size_t j=0; jsize(); ++j) { + if (auto f = As(files_list->GetAt(j))) { + std::string filepath = f->str(); + if (!filepath.empty() && + filepath.front() != '/' && filepath.front() != '\\' && + filepath.find("..") == std::string::npos) { + rule.files.push_back(filepath); + } else { + LOG(WARNING) << "super_filter: [" << rule.name << "] 非法或不安全的词库路径被拦截: " << filepath; + } + } + } + } + if (rule.files.empty()) { + LOG(WARNING) << "super_filter: [" << rule.name << "] 过滤后无可用的合法 files,已跳过装载!"; + continue; + } + + rules_.push_back(rule); + } + } +} + +// Generates a stringent signature combining prefixes, file paths, and system attributes +// to accurately trigger database rebuilds only when necessary. +std::string SuperFilter::GenerateFilesSignature() { + std::string sig = "delim:" + delimiter_ + "||"; + std::string user_dir = string(rime_get_api()->get_user_data_dir()); + std::error_code ec_exist; + + for (const auto& rule : rules_) { + sig += "t9:" + std::to_string(rule.t9_mode) + "@"; + for (const auto& path : rule.files) { + sig += "prefix:" + rule.prefix + "@path:" + path + "="; + std::filesystem::path full_path = user_dir + "/" + path; + + if (std::filesystem::exists(full_path, ec_exist) && !ec_exist) { + std::error_code ec_time; + auto ftime = std::filesystem::last_write_time(full_path, ec_time); + + std::error_code ec_size; + auto fsize = std::filesystem::file_size(full_path, ec_size); + + if (!ec_time && !ec_size) { + auto time_sec = std::chrono::duration_cast(ftime.time_since_epoch()).count(); + sig += std::to_string(fsize) + "_" + std::to_string(time_sec) + "|"; + } + } + } + } + return sig; +} +static std::mutex g_db_cache_mutex; +void SuperFilter::InitializeDb() { + std::lock_guard lock(g_db_cache_mutex); + auto& cache = GetGlobalDbCache(); + std::string current_sig = GenerateFilesSignature(); + + // Cache Hit: Instantly mount the pre-opened LevelDb to eliminate I/O lag. + if (cache.db && cache.db_name == db_name_ && cache.files_sig == current_sig) { + db_ = cache.db; + return; + } + + std::string user_dir = string(rime_get_api()->get_user_data_dir()); + std::error_code ec_dir; + std::filesystem::create_directories(user_dir + "/data", ec_dir); + if (ec_dir) { + LOG(ERROR) << "super_filter: 无法创建 data 文件夹 '" << (user_dir + "/data") + << "': " << ec_dir.message(); + return; + } + + auto* db_component = Db::Require("userdb"); + if (!db_component) return; + + an new_db = an(db_component->Create(db_name_)); + if (!new_db) return; + + bool need_rebuild = false; + + if (new_db->OpenReadOnly()) { + std::string db_sig; + new_db->MetaFetch("_files_sig", &db_sig); + if (db_sig != current_sig) need_rebuild = true; + new_db->Close(); + } else { + need_rebuild = true; + } + + if (need_rebuild) { + if (new_db->Open()) { + LOG(INFO) << "super_filter: Database schema updated, initiating LevelDb rebuild..."; + db_ = new_db; + RebuildDb(); + new_db->MetaUpdate("_files_sig", current_sig); + LOG(INFO) << "super_filter: LevelDb rebuild complete."; + new_db->Close(); + db_.reset(); + } + } + + if (new_db->OpenReadOnly()) { + cache.db = new_db; + cache.db_name = db_name_; + cache.files_sig = current_sig; + db_ = new_db; + } +} + +// Data structure for in-memory sorting before writing to LevelDb +struct DictItem { + std::string value; + double weight; + int order; +}; + +void SuperFilter::RebuildDb() { + if (db_) { + auto accessor = db_->Query(""); + if (accessor) { + std::string key, value; + while (!accessor->exhausted()) { + if (accessor->GetNextRecord(&key, &value)) { + db_->Erase(key); + } + } + } + } + std::string user_dir = string(rime_get_api()->get_user_data_dir()); + for (const auto& rule : rules_) { + // Build a temporary in-memory map to aggregate keys across multiple lines/files + std::unordered_map> merged_data; + int line_counter = 0; + + for (const auto& path : rule.files) { + std::string full_path = user_dir + "/" + path; + std::ifstream file(full_path); + if (!file.is_open()) { + LOG(WARNING) << "super_filter: 无法打开词库文件 (被占用或不存在): " << full_path; + continue; + } + std::string line; + while (std::getline(file, line)) { + if (line.empty() || line[0] == '#') continue; + + size_t sep1 = line.find_first_of(" \t"); + if (sep1 != std::string::npos) { + std::string key = line.substr(0, sep1); + std::string orig_key = key; + + static const char t9_map[26] = { + '2','2','2', '3','3','3', '4','4','4', '5','5','5', '6','6','6', + '7','7','7','7', '8','8','8', '9','9','9','9' + }; + + if (rule.t9_mode) { + for (char& c : key) { + if (c >= 'a' && c <= 'z') c = t9_map[c - 'a']; + else if (c >= 'A' && c <= 'Z') c = t9_map[c - 'A']; + } + } + + size_t val_start = line.find_first_not_of(" \t", sep1); + if (val_start != std::string::npos) { + std::string rest = line.substr(val_start); + rest.erase(rest.find_last_not_of("\r\n \t") + 1); // trim right + + std::string val = rest; + + // 如果是 T9 模式,且值里还没有包含 ==,把原始拼音作为尾巴藏进去 + if (rule.t9_mode && val.find("==") == std::string::npos) { + val = val + "==" + orig_key; + } + double weight = 0.0; + + // Try to extract weight from a potential 3rd column + size_t last_delim = rest.find_last_of(" \t"); + if (last_delim != std::string::npos) { + size_t weight_start = rest.find_first_not_of(" \t", last_delim); + if (weight_start != std::string::npos) { + std::string weight_str = rest.substr(weight_start); + try { + size_t parsed_len; + weight = std::stod(weight_str, &parsed_len); + // Ensure the parsed number spans the entire rest of the string + if (parsed_len == weight_str.length()) { + val = rest.substr(0, last_delim); + val.erase(val.find_last_not_of(" \t") + 1); + } else { + weight = 0.0; + } + } catch (...) { + weight = 0.0; + } + } + } + + // Push into the map (grouped by prefix + key) + merged_data[rule.prefix + key].push_back({val, weight, line_counter++}); + } + } + } + } + + // Sort items by weight and merge them into a single string for DB insertion + for (auto& kv : merged_data) { + auto& items = kv.second; + + // Sort logic: Descending by weight, Ascending by original read order + std::sort(items.begin(), items.end(), [](const DictItem& a, const DictItem& b) { + if (a.weight != b.weight) return a.weight > b.weight; + return a.order < b.order; + }); + + std::string final_val; + for (size_t i = 0; i < items.size(); ++i) { + final_val += items[i].value; + if (i < items.size() - 1) final_val += delimiter_; + } + + db_->Update(kv.first, final_val); + } + } +} + +an SuperFilter::Apply( + an translation, + CandidateList* candidates) { + + if (!translation) return nullptr; + Context* ctx = engine_->context(); + + if (!ctx->IsComposing() || ctx->input().empty()) { + return translation; + } + + return New(translation, rules_, db_, ctx, delimiter_, comment_format_, chain_); +} + +} // namespace rime \ No newline at end of file diff --git a/src/rime/gear/super_filter.h b/src/rime/gear/super_filter.h new file mode 100644 index 0000000000..e41dc44d14 --- /dev/null +++ b/src/rime/gear/super_filter.h @@ -0,0 +1,113 @@ +// librime/src/rime/gear/super_filter.h +#ifndef RIME_SUPER_FILTER_H_ +#define RIME_SUPER_FILTER_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace rime { + +// Representation of a single filter rule configured in YAML. +struct SuperRule { + std::string name; + bool always_on = false; + std::vector options; + + std::string mode; // Supported modes: append, replace, comment, abbrev + bool sentence = false; // Enable FMM (Forward Maximum Matching) for long phrases + + std::string prefix; + std::vector files; + + // 开启九宫格(T9)模式:构建词库时自动将编码转数字,并保留原编码为 preedit + bool t9_mode = false; + + std::string cand_type = "derived"; + std::string comment_mode; // Supported modes: none, text, append + + // Injection parameters strictly for 'abbrev' mode + std::string order_type = "index"; // 'index' (absolute position) or 'quality' (score threshold) + int order_value = 1; + int always_qty = 1; +}; + +// Wrapper for candidates that require forced injection at specific positions or quality thresholds. +struct InjectCand { + an cand; + int value; +}; + +// The core translation class implementing lazy evaluation and stream processing. +class SuperFilterTranslation : public Translation { +public: + SuperFilterTranslation(an inner, + const std::vector& rules, + an db, + Context* ctx, + const std::string& delimiter, + const std::string& comment_format, + bool is_chain); + + an Peek() override; + bool Next() override; + +private: + void GenerateAbbrevCandidates(const std::string& input_code, size_t start, size_t end); + void ProcessNextInner(); + void UpdateExhausted(); + + std::string SegmentConvert(const std::string& text, const std::string& prefix, bool sentence); + + an inner_; + std::vector rules_; + an db_; + Context* ctx_; + std::string delimiter_; + std::string comment_format_; + bool is_chain_; + + int yield_count_ = 0; + + // Priority queues for candidate distribution + std::deque index_cands_; + std::deque quality_cands_; + std::deque> lazy_cands_; + std::deque> pending_candidates_; + std::unordered_set abbrev_yielded_; +}; + +// Filter component responsible for parsing configurations and managing the global LevelDb connection. +class SuperFilter : public Filter { +public: + explicit SuperFilter(const Ticket& ticket); + virtual ~SuperFilter(); + + an Apply(an translation, + CandidateList* candidates) override; + +private: + void LoadConfig(Config* config); + void InitializeDb(); + std::string GenerateFilesSignature(); + void RebuildDb(); + + std::vector rules_; + an db_; + std::string db_name_; + std::string delimiter_; + std::string comment_format_; + bool chain_ = false; +}; + +} // namespace rime + +#endif // RIME_SUPER_FILTER_H_ \ No newline at end of file From 85ed81e1dd25bf4f5aa42cfad0ccda5acb25d178 Mon Sep 17 00:00:00 2001 From: amzxyz Date: Tue, 10 Mar 2026 15:12:38 +0800 Subject: [PATCH 2/2] =?UTF-8?q?feat:=20=E5=BC=95=E5=85=A5=20SuperFilter=20?= =?UTF-8?q?(=E8=B6=85=E7=BA=A7=E6=BB=A4=E9=95=9C)=E9=AB=98=E8=87=AA?= =?UTF-8?q?=E5=AE=9A=E4=B9=89=E3=80=81=E9=AB=98=E6=80=A7=E8=83=BD=E7=9A=84?= =?UTF-8?q?=E6=B5=81=E6=B0=B4=E7=BA=BF=E5=80=99=E9=80=89=E8=AF=8D=E5=A4=84?= =?UTF-8?q?=E7=90=86=E5=BC=95=E6=93=8E?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/rime/gear/super_filter.cc | 970 ++++++++++++++++------------------ src/rime/gear/super_filter.h | 93 ++-- 2 files changed, 525 insertions(+), 538 deletions(-) diff --git a/src/rime/gear/super_filter.cc b/src/rime/gear/super_filter.cc index df3ca8ae7c..63f4796044 100644 --- a/src/rime/gear/super_filter.cc +++ b/src/rime/gear/super_filter.cc @@ -1,33 +1,160 @@ // librime/src/rime/gear/super_filter.cc #include -#include #include #include #include -#include -#include #include #include #include #include +#include #include #include #include #include +#include namespace rime { -// Process-level global cache for the LevelDb instance. -// Prevents high I/O latency and stuttering when Rime frequently recreates sessions (e.g., switching windows). -struct SuperDbCache { - an db; - std::string db_name; +// SuperBinaryDb +SuperBinaryDb::SuperBinaryDb(const std::string& bin_path) : bin_path_(bin_path) {} + +SuperBinaryDb::~SuperBinaryDb() { Close(); } + +void SuperBinaryDb::Close() { + region_.reset(); + mapping_.reset(); + data_ptr_ = nullptr; + record_count_ = 0; + index_ptr_ = nullptr; +} + +bool SuperBinaryDb::Open() { + try { + mapping_ = std::make_unique(bin_path_.c_str(), boost::interprocess::read_only); + region_ = std::make_unique(*mapping_, boost::interprocess::read_only); + data_ptr_ = static_cast(region_->get_address()); + data_size_ = region_->get_size(); + + if (data_size_ < 12) return false; + if (std::strncmp(data_ptr_, "SUPR", 4) != 0) return false; + + uint32_t sig_len; + std::memcpy(&sig_len, data_ptr_ + 8, 4); + + uint32_t padded_sig_len = (sig_len + 3) & ~3; + size_t header_offset = 12 + padded_sig_len; + + if (data_size_ < header_offset + 4) return false; + std::memcpy(&record_count_, data_ptr_ + header_offset, 4); + + size_t index_offset = header_offset + 4; + index_ptr_ = reinterpret_cast(data_ptr_ + index_offset); + + return true; + } catch (...) { + return false; + } +} + +bool SuperBinaryDb::Fetch(const std::string& key, std::string* value) const { + if (!data_ptr_ || record_count_ == 0) return false; + + int left = 0; + int right = static_cast(record_count_) - 1; + + while (left <= right) { + int mid = left + (right - left) / 2; + uint32_t offset = index_ptr_[mid]; + if (offset >= data_size_) return false; + + const char* mid_key = data_ptr_ + offset; + + int cmp = std::strcmp(mid_key, key.c_str()); + if (cmp == 0) { + size_t key_len = std::strlen(mid_key); + uint32_t val_offset = offset + key_len + 1; + if (val_offset >= data_size_) return false; + + *value = std::string(data_ptr_ + val_offset); + return true; + } else if (cmp < 0) { + left = mid + 1; + } else { + right = mid - 1; + } + } + return false; +} + +bool SuperBinaryDb::CheckSignature(const std::string& expected_sig) const { + std::ifstream in(bin_path_, std::ios::binary); + if (!in) return false; + char magic[4]; + if (!in.read(magic, 4) || std::strncmp(magic, "SUPR", 4) != 0) return false; + uint32_t version; + in.read(reinterpret_cast(&version), 4); + uint32_t sig_len; + in.read(reinterpret_cast(&sig_len), 4); + + std::string sig(sig_len, '\0'); + if (!in.read(&sig[0], sig_len)) return false; + + return sig == expected_sig; +} + +bool SuperBinaryDb::Build(const std::string& bin_path, const std::string& signature, + const std::map& data) { + std::ofstream out(bin_path, std::ios::binary | std::ios::trunc); + if (!out) return false; + + out.write("SUPR", 4); + uint32_t version = 1; + out.write(reinterpret_cast(&version), 4); + + uint32_t sig_len = signature.length(); + out.write(reinterpret_cast(&sig_len), 4); + out.write(signature.data(), sig_len); + + uint32_t padded_sig_len = (sig_len + 3) & ~3; + if (padded_sig_len > sig_len) { + std::vector padding(padded_sig_len - sig_len, '\0'); + out.write(padding.data(), padding.size()); + } + + uint32_t count = data.size(); + out.write(reinterpret_cast(&count), 4); + + size_t index_start_pos = out.tellp(); + std::vector index_buffer(count, 0); + out.write(reinterpret_cast(index_buffer.data()), index_buffer.size() * sizeof(uint32_t)); + + uint32_t current_offset = out.tellp(); + int i = 0; + + for (const auto& kv : data) { + index_buffer[i] = current_offset; + out.write(kv.first.c_str(), kv.first.length() + 1); + current_offset += kv.first.length() + 1; + + out.write(kv.second.c_str(), kv.second.length() + 1); + current_offset += kv.second.length() + 1; + i++; + } + + out.seekp(index_start_pos); + out.write(reinterpret_cast(index_buffer.data()), index_buffer.size() * sizeof(uint32_t)); + return true; +} + +struct SuperDbCacheItem { + std::shared_ptr db; std::string files_sig; }; -static SuperDbCache& GetGlobalDbCache() { - static SuperDbCache cache; - return cache; +static std::unordered_map& GetGlobalDbCache() { + static std::unordered_map cache_map; + return cache_map; } static std::vector GetUtf8Offsets(const std::string& text) { @@ -40,8 +167,13 @@ static std::vector GetUtf8Offsets(const std::string& text) { else if (c < 0xE0) i += 2; else if (c < 0xF0) i += 3; else i += 4; + if (i > text.length()) { + i = text.length(); + } + } + if (offsets.empty() || offsets.back() != text.length()) { + offsets.push_back(text.length()); } - offsets.push_back(text.length()); return offsets; } @@ -63,16 +195,15 @@ static std::vector Split(const std::string& str, const std::string& return tokens; } +static std::mutex g_db_cache_mutex; + +// SuperFilterTranslation SuperFilterTranslation::SuperFilterTranslation( an inner, - const std::vector& rules, - an db, - Context* ctx, - const std::string& delimiter, - const std::string& comment_format, - bool is_chain) - : inner_(inner), rules_(rules), db_(db), ctx_(ctx), - delimiter_(delimiter), comment_format_(comment_format), is_chain_(is_chain) { + const SuperConfig& config, + std::shared_ptr db, + Context* ctx) + : inner_(inner), cfg_(config), db_(db), ctx_(ctx) { size_t start = 0; size_t end = ctx_->input().length(); @@ -84,14 +215,12 @@ SuperFilterTranslation::SuperFilterTranslation( } } - // Pre-calculate abbreviation matches upon input change. std::string seg_input = ctx_->input().substr(start, end - start); GenerateAbbrevCandidates(seg_input, start, end); UpdateExhausted(); } void SuperFilterTranslation::UpdateExhausted() { - // Lazy evaluation: fetch only the required amount of candidates from the inner translation. while (pending_candidates_.empty() && !inner_->exhausted()) { ProcessNextInner(); } @@ -102,32 +231,18 @@ void SuperFilterTranslation::UpdateExhausted() { an SuperFilterTranslation::Peek() { if (exhausted()) return nullptr; - // Dispatch priority: - // 1. Exact index matches (forced insertion) if (!index_cands_.empty() && (yield_count_ + 1) >= index_cands_.front().value) { return index_cands_.front().cand; } - - // 2. Quality threshold matches (dynamic insertion) if (!quality_cands_.empty()) { if (pending_candidates_.empty() || pending_candidates_.front()->quality() < quality_cands_.front().value) { return quality_cands_.front().cand; } } - - // 3. Regular pipeline candidates - if (!pending_candidates_.empty()) { - return pending_candidates_.front(); - } - - // 4. Flush remaining priority queues if the main pipeline is exhausted + if (!pending_candidates_.empty()) return pending_candidates_.front(); if (!index_cands_.empty()) return index_cands_.front().cand; if (!quality_cands_.empty()) return quality_cands_.front().cand; - - // 5. Fallback candidates (quality=0) - if (!lazy_cands_.empty()) { - return lazy_cands_.front(); - } + if (!lazy_cands_.empty()) return lazy_cands_.front(); return nullptr; } @@ -146,14 +261,13 @@ bool SuperFilterTranslation::Next() { return !exhausted(); } -// Forward Maximum Matching algorithm for segmenting and replacing long phrases. -std::string SuperFilterTranslation::SegmentConvert(const std::string& text, const std::string& prefix, bool sentence) { +std::string SuperFilterTranslation::SegmentConvert(const std::string& text, bool sentence) { if (!db_) return text; if (!sentence) { std::string val; - if (db_->Fetch(prefix + text, &val)) { - auto parts = Split(val, delimiter_); + if (db_->Fetch(text, &val)) { + auto parts = Split(val, cfg_.delimiter); return parts.empty() ? text : parts[0]; } return text; @@ -172,8 +286,8 @@ std::string SuperFilterTranslation::SegmentConvert(const std::string& text, cons for (size_t j = max_j; j > i; --j) { std::string sub_text = text.substr(offsets[i], offsets[j] - offsets[i]); std::string val; - if (db_->Fetch(prefix + sub_text, &val)) { - auto parts = Split(val, delimiter_); + if (db_->Fetch(sub_text, &val)) { + auto parts = Split(val, cfg_.delimiter); result += parts.empty() ? sub_text : parts[0]; i = j; matched = true; @@ -183,8 +297,8 @@ std::string SuperFilterTranslation::SegmentConvert(const std::string& text, cons if (!matched) { std::string single = text.substr(offsets[i], offsets[i+1] - offsets[i]); std::string val; - if (db_->Fetch(prefix + single, &val)) { - auto parts = Split(val, delimiter_); + if (db_->Fetch(single, &val)) { + auto parts = Split(val, cfg_.delimiter); result += parts.empty() ? single : parts[0]; } else { result += single; @@ -196,75 +310,73 @@ std::string SuperFilterTranslation::SegmentConvert(const std::string& text, cons } void SuperFilterTranslation::GenerateAbbrevCandidates(const std::string& input_code, size_t start, size_t end) { - if (!db_) return; + if (!db_ || cfg_.mode != "abbrev") return; abbrev_yielded_.clear(); - for (const auto& r : rules_) { - if (r.mode == "abbrev") { - bool is_active = r.always_on; - if (!is_active) { - // Dynamically check Rime options state - for (const auto& opt : r.options) { - if (ctx_->get_option(opt)) { is_active = true; break; } - } - } - if (!is_active) continue; - std::string val; - if (!db_->Fetch(r.prefix + input_code, &val)) { - std::string upper_code = input_code; - for (auto& c : upper_code) { - c = static_cast(std::toupper(static_cast(c))); - } - db_->Fetch(r.prefix + upper_code, &val); + if (!cfg_.tags.empty()) { + bool is_tag_match = false; + size_t seg_start = 0; + if (!ctx_->composition().empty()) { + const auto& seg = ctx_->composition().back(); + seg_start = seg.start; + for (const auto& req_tag : cfg_.tags) { + if (seg.HasTag(req_tag)) { is_tag_match = true; break; } } + } + + bool is_pure_chars = std::all_of(input_code.begin(), input_code.end(), + [](unsigned char c) { return c < 128 && std::isalnum(c); }); - if (!val.empty()) { - auto parts = Split(val, delimiter_); - int count = 0; - for (const auto& p : parts) { - std::string item_text = p; - std::string item_preedit = input_code; - - if (abbrev_yielded_.count(item_text)) continue; - abbrev_yielded_.insert(item_text); - count++; - - auto cand = New(r.cand_type, start, end, item_text, ""); - cand->set_preedit(item_preedit); - - if (count <= r.always_qty) { - if (r.order_type == "index") { - cand->set_quality(999); - index_cands_.push_back({cand, r.order_value + (count - 1)}); - } else if (r.order_type == "quality") { - cand->set_quality(r.order_value); - quality_cands_.push_back({cand, r.order_value}); - } - } else { - // Fallback candidates sink to the bottom with quality 0 - cand->set_quality(0); - lazy_cands_.push_back(cand); - } + if (!is_tag_match || seg_start != 0 || !is_pure_chars) return; + } + + bool is_active = cfg_.always_on; + if (!is_active) { + for (const auto& opt : cfg_.options) { + if (ctx_->get_option(opt)) { is_active = true; break; } + } + } + if (!is_active) return; + + std::string val; + if (!db_->Fetch(input_code, &val)) { + std::string upper_code = input_code; + for (auto& c : upper_code) { + c = static_cast(std::toupper(static_cast(c))); + } + db_->Fetch(upper_code, &val); + } + + if (!val.empty()) { + auto parts = Split(val, cfg_.delimiter); + int count = 0; + for (const auto& p : parts) { + std::string item_text = p; + std::string item_preedit = input_code; + + if (abbrev_yielded_.count(item_text)) continue; + abbrev_yielded_.insert(item_text); + count++; + + auto cand = New(cfg_.cand_type, start, end, item_text, ""); + cand->set_preedit(item_preedit); + + if (count <= cfg_.always_qty) { + if (cfg_.order_type == "index") { + cand->set_quality(999); + index_cands_.push_back({cand, cfg_.order_value + (count - 1)}); + } else if (cfg_.order_type == "quality") { + cand->set_quality(cfg_.order_value); + quality_cands_.push_back({cand, cfg_.order_value}); } + } else { + cand->set_quality(0); + lazy_cands_.push_back(cand); } } } - - std::sort(index_cands_.begin(), index_cands_.end(), [](const InjectCand& a, const InjectCand& b) { - return a.value < b.value; - }); - std::sort(quality_cands_.begin(), quality_cands_.end(), [](const InjectCand& a, const InjectCand& b) { - return a.value > b.value; - }); } -struct CandData { - std::string text; - std::string comment; - std::string cand_type; - bool is_original; -}; - void SuperFilterTranslation::ProcessNextInner() { if (inner_->exhausted()) return; auto cand = inner_->Peek(); @@ -272,353 +384,241 @@ void SuperFilterTranslation::ProcessNextInner() { if (!cand) return; - std::vector current_items; - current_items.push_back({cand->text(), cand->comment(), cand->type(), true}); - - if (db_) { - for (const auto& r : rules_) { - if (r.mode == "abbrev") continue; + if (!db_ || cfg_.mode == "abbrev") { + pending_candidates_.push_back(cand); + return; + } - bool is_active = r.always_on; - if (!is_active) { - for (const auto& opt : r.options) { - if (ctx_->get_option(opt)) { is_active = true; break; } - } + if (!cfg_.tags.empty()) { + bool is_tag_match = false; + if (!ctx_->composition().empty()) { + const auto& seg = ctx_->composition().back(); + for (const auto& req_tag : cfg_.tags) { + if (seg.HasTag(req_tag)) { is_tag_match = true; break; } } - if (!is_active) continue; - - std::vector next_items; + } + if (!is_tag_match) { + pending_candidates_.push_back(cand); + return; + } + } - for (const auto& item : current_items) { - std::string val; - if (r.sentence) { - std::string fmm_res = SegmentConvert(item.text, r.prefix, true); - if (fmm_res != item.text) val = fmm_res; - } else { - db_->Fetch(r.prefix + item.text, &val); - } + bool is_active = cfg_.always_on; + if (!is_active) { + for (const auto& opt : cfg_.options) { + if (ctx_->get_option(opt)) { is_active = true; break; } + } + } + + if (!is_active) { + pending_candidates_.push_back(cand); + return; + } - if (!val.empty()) { - auto parts = Split(val, delimiter_); - if (r.t9_mode) { - for (auto& p : parts) { - size_t delim_pos = p.find("=="); - if (delim_pos != std::string::npos) { - p = p.substr(0, delim_pos); - } - } - } - std::string rule_comment = ""; - if (r.comment_mode == "text" && !item.text.empty()) { - std::string cfmt = comment_format_; - size_t pos = cfmt.find("%s"); - if (pos != std::string::npos) { - cfmt.replace(pos, 2, item.text); - rule_comment = cfmt; - } else { - rule_comment = item.text; - } - } else if (r.comment_mode == "append") { - rule_comment = item.comment; - } + std::string val; + if (cfg_.sentence) { + std::string fmm_res = SegmentConvert(cand->text(), true); + if (fmm_res != cand->text()) val = fmm_res; + } else { + db_->Fetch(cand->text(), &val); + } - if (r.mode == "replace") { - for (size_t i = 0; i < parts.size(); ++i) { - std::string final_comment = (i == 0 && r.comment_mode == "none") ? "" : rule_comment; - std::string ctype = (i == 0 && item.is_original) ? item.cand_type : r.cand_type; - next_items.push_back({parts[i], final_comment, ctype, false}); - } - } else if (r.mode == "append") { - next_items.push_back(item); - for (const auto& p : parts) { - std::string final_comment = (r.comment_mode == "none") ? "" : rule_comment; - next_items.push_back({p, final_comment, r.cand_type, false}); - } - } else if (r.mode == "comment") { - std::string joined; - for(size_t i=0; itext().empty()) { + std::string cfmt = cfg_.comment_format; + size_t pos = cfmt.find("%s"); + if (pos != std::string::npos) { + cfmt.replace(pos, 2, cand->text()); + rule_comment = cfmt; + } else rule_comment = cand->text(); + } else if (cfg_.comment_mode == "append") { + rule_comment = cand->comment(); + } - // Pipeline flow control: Hand over the payload to the next rule if chain mode is enabled. - if (is_chain_) { - current_items = std::move(next_items); - } else { - std::vector parallel_merged; - for (const auto& og : current_items) parallel_merged.push_back(og); - for (const auto& nx : next_items) { - if (!nx.is_original) parallel_merged.push_back(nx); - } - current_items = std::move(parallel_merged); + if (cfg_.mode == "replace") { + for (size_t i = 0; i < parts.size(); ++i) { + std::string final_comment = (i == 0 && cfg_.comment_mode == "none") ? "" : rule_comment; + std::string ctype = (i == 0) ? cand->type() : cfg_.cand_type; + auto nc = New(ctype, cand->start(), cand->end(), parts[i], final_comment); + nc->set_quality(cand->quality()); + nc->set_preedit(cand->preedit()); + pending_candidates_.push_back(nc); + } + } else if (cfg_.mode == "append") { + pending_candidates_.push_back(cand); + for (const auto& p : parts) { + std::string final_comment = (cfg_.comment_mode == "none") ? "" : rule_comment; + auto nc = New(cfg_.cand_type, cand->start(), cand->end(), p, final_comment); + nc->set_quality(cand->quality()); + nc->set_preedit(cand->preedit()); + pending_candidates_.push_back(nc); } + } else if (cfg_.mode == "comment") { + std::string joined; + for(size_t i = 0; i < parts.size(); ++i) joined += parts[i] + (i < parts.size() - 1 ? " " : ""); + std::string cfmt = cfg_.comment_format; + size_t pos = cfmt.find("%s"); + if (pos != std::string::npos) cfmt.replace(pos, 2, joined); + else cfmt = joined; + + std::string new_comment; + if (cfg_.comment_mode == "none") new_comment = ""; + else if (cfg_.comment_mode == "text") new_comment = cfmt; + else new_comment = cand->comment() + cfmt; + + auto nc = New(cand->type(), cand->start(), cand->end(), cand->text(), new_comment); + nc->set_quality(cand->quality()); + nc->set_preedit(cand->preedit()); + pending_candidates_.push_back(nc); } - } - - for (const auto& result : current_items) { - auto nc = New(result.cand_type, cand->start(), cand->end(), result.text, result.comment); - nc->set_quality(cand->quality()); - nc->set_preedit(cand->preedit()); - pending_candidates_.push_back(nc); + } else { + pending_candidates_.push_back(cand); } } -SuperFilter::SuperFilter(const Ticket& ticket) : Filter(ticket) { +// SuperFilter +SuperFilter::SuperFilter(const Ticket& ticket) + : Filter(ticket), name_space_(ticket.name_space.empty() ? "super_filter" : ticket.name_space) { if (ticket.schema) { LoadConfig(ticket.schema->config()); - InitializeDb(); + InitializeBinaryDb(); } } -SuperFilter::~SuperFilter() { - // The globally cached LevelDb instance remains open across session lifetimes. -} +SuperFilter::~SuperFilter() {} void SuperFilter::LoadConfig(Config* config) { - config->GetString("super_filter/db_name", &db_name_); + config->GetString(name_space_ + "/db_name", &db_name_); + if (!db_name_.empty()) db_name_ = std::filesystem::path(db_name_).filename().string(); + if (db_name_.empty() || db_name_ == "." || db_name_ == "..") db_name_ = name_space_; - if (!db_name_.empty()) { - db_name_ = std::filesystem::path(db_name_).filename().string(); + std::string user_dir = string(rime_get_api()->get_user_data_dir()); + db_name_ = user_dir + "/build/super_filter_" + db_name_ + ".bin"; + + config->GetString(name_space_ + "/delimiter", &cfg_.delimiter); + config->GetString(name_space_ + "/comment_format", &cfg_.comment_format); + + auto opt_node = config->GetItem(name_space_ + "/option"); + if (auto opt_val = As(opt_node)) { + if (opt_val->str() == "true") cfg_.always_on = true; + else if (opt_val->str() != "false") cfg_.options.push_back(opt_val->str()); + } else if (auto opt_list = As(opt_node)) { + for (size_t j = 0; j < opt_list->size(); ++j) { + if (auto v = As(opt_list->GetAt(j))) cfg_.options.push_back(v->str()); + } } - if (db_name_.empty() || db_name_ == "." || db_name_ == "..") { - db_name_ = "super_filter"; + auto tag_node = config->GetItem(name_space_ + "/tags"); + if (!tag_node) tag_node = config->GetItem(name_space_ + "/tag"); + if (auto tag_val = As(tag_node)) { + cfg_.tags.push_back(tag_val->str()); + } else if (auto tag_list = As(tag_node)) { + for (size_t j = 0; j < tag_list->size(); ++j) { + if (auto v = As(tag_list->GetAt(j))) cfg_.tags.push_back(v->str()); + } + } + + config->GetString(name_space_ + "/mode", &cfg_.mode); + config->GetBool(name_space_ + "/sentence", &cfg_.sentence); + config->GetString(name_space_ + "/comment_mode", &cfg_.comment_mode); + config->GetString(name_space_ + "/cand_type", &cfg_.cand_type); + + if (auto t9_val = As(config->GetItem(name_space_ + "/t9_mode"))) { + cfg_.t9_mode = (t9_val->str() == "true"); + if (cfg_.t9_mode && cfg_.mode != "abbrev") cfg_.t9_mode = false; } - db_name_ = "data/" + db_name_; - - config->GetString("super_filter/delimiter", &delimiter_); - if (delimiter_.empty()) delimiter_ = "|"; - config->GetString("super_filter/comment_format", &comment_format_); - if (comment_format_.empty()) comment_format_ = "〔%s〕"; - config->GetBool("super_filter/chain", &chain_); - - auto root = config->GetItem("super_filter/rules"); - if (auto rule_list = As(root)) { - for (size_t i = 0; i < rule_list->size(); ++i) { - auto item = As(rule_list->GetAt(i)); - if (!item) continue; - - SuperRule rule; - - if (auto name_val = As(item->Get("name"))) { - rule.name = name_val->str(); - } else { - rule.name = "Rule_" + std::to_string(i + 1); - } - - auto opt_node = item->Get("option"); - if (auto opt_val = As(opt_node)) { - if (opt_val->str() == "true") { - rule.always_on = true; - } else if (opt_val->str() == "false") { - // Explicitly frozen rule, option vector remains empty. - } else { - rule.options.push_back(opt_val->str()); - } - } else if (auto opt_list = As(opt_node)) { - for (size_t j=0; jsize(); ++j) { - if (auto v = As(opt_list->GetAt(j))) rule.options.push_back(v->str()); - } - } - - // Discard disabled or misconfigured rules during the parse phase to save CPU cycles. - if (!rule.always_on && rule.options.empty()) { - LOG(INFO) << "super_filter: [" << rule.name << "] frozen or missing option, safely ignored."; - continue; - } - - if (auto mode_val = As(item->Get("mode"))) rule.mode = mode_val->str(); - else rule.mode = "append"; - - if (rule.mode != "append" && rule.mode != "replace" && rule.mode != "comment" && rule.mode != "abbrev") { - LOG(WARNING) << "super_filter: [" << rule.name << "] unsupported mode '" << rule.mode << "', skipping."; - continue; - } - - if (auto sent_val = As(item->Get("sentence"))) { - if (sent_val->str() == "true") rule.sentence = true; - } - - if (auto pre_val = As(item->Get("prefix"))) { - rule.prefix = pre_val->str(); - } else { - rule.prefix = ""; - } - - if (auto cmod_val = As(item->Get("comment_mode"))) rule.comment_mode = cmod_val->str(); - else rule.comment_mode = "none"; - - if (auto ctype_val = As(item->Get("cand_type"))) rule.cand_type = ctype_val->str(); - else rule.cand_type = "derived"; - - // 解析 T9 模式开关,并设立严格防火墙:仅允许 abbrev 模式使用 - if (auto t9_val = As(item->Get("t9_mode"))) { - rule.t9_mode = (t9_val->str() == "true"); - if (rule.t9_mode && rule.mode != "abbrev") { - LOG(WARNING) << "super_filter: [" << rule.name << "] t9_mode 仅支持在 abbrev 模式下开启,已自动忽略。"; - rule.t9_mode = false; - } - } - if (rule.mode == "abbrev") { - auto ord_val = As(item->Get("order")); - if (!ord_val) { - LOG(WARNING) << "super_filter: [" << rule.name << "] missing 'order' parameter in abbrev mode."; - continue; - } - auto parts = Split(ord_val->str(), ","); - if (parts.size() < 2) { - LOG(WARNING) << "super_filter: [" << rule.name << "] malformed 'order' format."; - continue; - } + if (cfg_.mode == "abbrev") { + std::string order_str; + config->GetString(name_space_ + "/order", &order_str); + if (!order_str.empty()) { + auto parts = Split(order_str, ","); + if (parts.size() >= 2) { try { - rule.order_type = parts[0]; - rule.order_value = std::stoi(parts[1]); - if (parts.size() >= 3) rule.always_qty = std::stoi(parts[2]); - } catch (...) { - LOG(WARNING) << "super_filter: [" << rule.name << "] parse exception in 'order'."; - continue; - } - } - - auto files_node = item->Get("files"); - if (!files_node || (!As(files_node) && !As(files_node))) { - LOG(WARNING) << "super_filter: [" << rule.name << "] missing 'files' dependency."; - continue; + cfg_.order_type = parts[0]; + cfg_.order_value = std::stoi(parts[1]); + if (parts.size() >= 3) cfg_.always_qty = std::stoi(parts[2]); + } catch (...) {} } - - if (auto files_list = As(item->Get("files"))) { - for(size_t j=0; jsize(); ++j) { - if (auto f = As(files_list->GetAt(j))) { - std::string filepath = f->str(); - if (!filepath.empty() && - filepath.front() != '/' && filepath.front() != '\\' && - filepath.find("..") == std::string::npos) { - rule.files.push_back(filepath); - } else { - LOG(WARNING) << "super_filter: [" << rule.name << "] 非法或不安全的词库路径被拦截: " << filepath; - } - } + } + } + + auto files_node = config->GetItem(name_space_ + "/files"); + if (auto files_list = As(files_node)) { + for(size_t j = 0; j < files_list->size(); ++j) { + if (auto f = As(files_list->GetAt(j))) { + std::string filepath = f->str(); + if (!filepath.empty() && filepath.front() != '/' && filepath.front() != '\\' && filepath.find("..") == std::string::npos) { + cfg_.files.push_back(filepath); } } - if (rule.files.empty()) { - LOG(WARNING) << "super_filter: [" << rule.name << "] 过滤后无可用的合法 files,已跳过装载!"; - continue; - } - - rules_.push_back(rule); } } } -// Generates a stringent signature combining prefixes, file paths, and system attributes -// to accurately trigger database rebuilds only when necessary. std::string SuperFilter::GenerateFilesSignature() { - std::string sig = "delim:" + delimiter_ + "||"; + std::string sig = "delim:" + cfg_.delimiter + "||"; std::string user_dir = string(rime_get_api()->get_user_data_dir()); std::error_code ec_exist; - for (const auto& rule : rules_) { - sig += "t9:" + std::to_string(rule.t9_mode) + "@"; - for (const auto& path : rule.files) { - sig += "prefix:" + rule.prefix + "@path:" + path + "="; - std::filesystem::path full_path = user_dir + "/" + path; + sig += "t9:" + std::to_string(cfg_.t9_mode) + "@"; + for (const auto& path : cfg_.files) { + sig += "path:" + path + "="; + std::filesystem::path full_path = user_dir + "/" + path; + + if (std::filesystem::exists(full_path, ec_exist) && !ec_exist) { + std::error_code ec_time, ec_size; + auto ftime = std::filesystem::last_write_time(full_path, ec_time); + auto fsize = std::filesystem::file_size(full_path, ec_size); - if (std::filesystem::exists(full_path, ec_exist) && !ec_exist) { - std::error_code ec_time; - auto ftime = std::filesystem::last_write_time(full_path, ec_time); - - std::error_code ec_size; - auto fsize = std::filesystem::file_size(full_path, ec_size); - - if (!ec_time && !ec_size) { - auto time_sec = std::chrono::duration_cast(ftime.time_since_epoch()).count(); - sig += std::to_string(fsize) + "_" + std::to_string(time_sec) + "|"; - } + if (!ec_time && !ec_size) { + auto time_sec = std::chrono::duration_cast(ftime.time_since_epoch()).count(); + sig += std::to_string(fsize) + "_" + std::to_string(time_sec) + "|"; } } } return sig; } -static std::mutex g_db_cache_mutex; -void SuperFilter::InitializeDb() { + +void SuperFilter::InitializeBinaryDb() { + if (cfg_.files.empty()) return; + std::lock_guard lock(g_db_cache_mutex); - auto& cache = GetGlobalDbCache(); + auto& cache_map = GetGlobalDbCache(); std::string current_sig = GenerateFilesSignature(); - // Cache Hit: Instantly mount the pre-opened LevelDb to eliminate I/O lag. - if (cache.db && cache.db_name == db_name_ && cache.files_sig == current_sig) { - db_ = cache.db; + auto it = cache_map.find(db_name_); + if (it != cache_map.end() && it->second.db && it->second.files_sig == current_sig) { + db_ = it->second.db; return; } - std::string user_dir = string(rime_get_api()->get_user_data_dir()); - std::error_code ec_dir; - std::filesystem::create_directories(user_dir + "/data", ec_dir); - if (ec_dir) { - LOG(ERROR) << "super_filter: 无法创建 data 文件夹 '" << (user_dir + "/data") - << "': " << ec_dir.message(); - return; - } - - auto* db_component = Db::Require("userdb"); - if (!db_component) return; - - an new_db = an(db_component->Create(db_name_)); - if (!new_db) return; - - bool need_rebuild = false; - - if (new_db->OpenReadOnly()) { - std::string db_sig; - new_db->MetaFetch("_files_sig", &db_sig); - if (db_sig != current_sig) need_rebuild = true; - new_db->Close(); - } else { - need_rebuild = true; - } + std::shared_ptr new_db = std::make_shared(db_name_); + bool need_rebuild = !new_db->CheckSignature(current_sig); if (need_rebuild) { - if (new_db->Open()) { - LOG(INFO) << "super_filter: Database schema updated, initiating LevelDb rebuild..."; - db_ = new_db; - RebuildDb(); - new_db->MetaUpdate("_files_sig", current_sig); - LOG(INFO) << "super_filter: LevelDb rebuild complete."; - new_db->Close(); - db_.reset(); - } + LOG(INFO) << "super_filter [" << name_space_ << "]: Rebuilding binary dictionary..."; + RebuildDb(); } - if (new_db->OpenReadOnly()) { - cache.db = new_db; - cache.db_name = db_name_; - cache.files_sig = current_sig; + if (new_db->Open()) { + SuperDbCacheItem item; + item.db = new_db; + item.files_sig = current_sig; + cache_map[db_name_] = item; db_ = new_db; } } -// Data structure for in-memory sorting before writing to LevelDb struct DictItem { std::string value; double weight; @@ -626,126 +626,96 @@ struct DictItem { }; void SuperFilter::RebuildDb() { - if (db_) { - auto accessor = db_->Query(""); - if (accessor) { - std::string key, value; - while (!accessor->exhausted()) { - if (accessor->GetNextRecord(&key, &value)) { - db_->Erase(key); - } - } - } - } std::string user_dir = string(rime_get_api()->get_user_data_dir()); - for (const auto& rule : rules_) { - // Build a temporary in-memory map to aggregate keys across multiple lines/files - std::unordered_map> merged_data; - int line_counter = 0; - - for (const auto& path : rule.files) { - std::string full_path = user_dir + "/" + path; - std::ifstream file(full_path); - if (!file.is_open()) { - LOG(WARNING) << "super_filter: 无法打开词库文件 (被占用或不存在): " << full_path; - continue; - } - std::string line; - while (std::getline(file, line)) { - if (line.empty() || line[0] == '#') continue; - - size_t sep1 = line.find_first_of(" \t"); - if (sep1 != std::string::npos) { - std::string key = line.substr(0, sep1); - std::string orig_key = key; - - static const char t9_map[26] = { - '2','2','2', '3','3','3', '4','4','4', '5','5','5', '6','6','6', - '7','7','7','7', '8','8','8', '9','9','9','9' - }; - - if (rule.t9_mode) { - for (char& c : key) { - if (c >= 'a' && c <= 'z') c = t9_map[c - 'a']; - else if (c >= 'A' && c <= 'Z') c = t9_map[c - 'A']; - } + std::error_code ec_dir; + std::filesystem::create_directories(user_dir + "/build", ec_dir); + + std::map final_binary_data; + std::map> merged_data; + int line_counter = 0; + + for (const auto& path : cfg_.files) { + std::string full_path = user_dir + "/" + path; + std::ifstream file(full_path); + if (!file.is_open()) continue; + + std::string line; + while (std::getline(file, line)) { + if (line.empty() || line[0] == '#') continue; + + size_t sep1 = line.find_first_of(" \t"); + if (sep1 != std::string::npos) { + std::string key = line.substr(0, sep1); + std::string orig_key = key; + + static const char t9_map[26] = { + '2','2','2', '3','3','3', '4','4','4', '5','5','5', '6','6','6', + '7','7','7','7', '8','8','8', '9','9','9','9' + }; + + if (cfg_.t9_mode) { + for (char& c : key) { + if (c >= 'a' && c <= 'z') c = t9_map[c - 'a']; + else if (c >= 'A' && c <= 'Z') c = t9_map[c - 'A']; } + } - size_t val_start = line.find_first_not_of(" \t", sep1); - if (val_start != std::string::npos) { - std::string rest = line.substr(val_start); - rest.erase(rest.find_last_not_of("\r\n \t") + 1); // trim right + size_t val_start = line.find_first_not_of(" \t", sep1); + if (val_start != std::string::npos) { + std::string rest = line.substr(val_start); + rest.erase(rest.find_last_not_of("\r\n \t") + 1); - std::string val = rest; - - // 如果是 T9 模式,且值里还没有包含 ==,把原始拼音作为尾巴藏进去 - if (rule.t9_mode && val.find("==") == std::string::npos) { - val = val + "==" + orig_key; - } - double weight = 0.0; - - // Try to extract weight from a potential 3rd column - size_t last_delim = rest.find_last_of(" \t"); - if (last_delim != std::string::npos) { - size_t weight_start = rest.find_first_not_of(" \t", last_delim); - if (weight_start != std::string::npos) { - std::string weight_str = rest.substr(weight_start); - try { - size_t parsed_len; - weight = std::stod(weight_str, &parsed_len); - // Ensure the parsed number spans the entire rest of the string - if (parsed_len == weight_str.length()) { - val = rest.substr(0, last_delim); - val.erase(val.find_last_not_of(" \t") + 1); - } else { - weight = 0.0; - } - } catch (...) { - weight = 0.0; - } - } + std::string val = rest; + if (cfg_.t9_mode && val.find("==") == std::string::npos) { + val = val + "==" + orig_key; + } + + double weight = 0.0; + size_t last_delim = rest.find_last_of(" \t"); + if (last_delim != std::string::npos) { + size_t weight_start = rest.find_first_not_of(" \t", last_delim); + if (weight_start != std::string::npos) { + std::string weight_str = rest.substr(weight_start); + try { + size_t parsed_len; + weight = std::stod(weight_str, &parsed_len); + if (parsed_len == weight_str.length()) { + val = rest.substr(0, last_delim); + val.erase(val.find_last_not_of(" \t") + 1); + } else weight = 0.0; + } catch (...) { weight = 0.0; } } - - // Push into the map (grouped by prefix + key) - merged_data[rule.prefix + key].push_back({val, weight, line_counter++}); } + merged_data[key].push_back({val, weight, line_counter++}); } } } + } - // Sort items by weight and merge them into a single string for DB insertion - for (auto& kv : merged_data) { - auto& items = kv.second; - - // Sort logic: Descending by weight, Ascending by original read order - std::sort(items.begin(), items.end(), [](const DictItem& a, const DictItem& b) { - if (a.weight != b.weight) return a.weight > b.weight; - return a.order < b.order; - }); - - std::string final_val; - for (size_t i = 0; i < items.size(); ++i) { - final_val += items[i].value; - if (i < items.size() - 1) final_val += delimiter_; - } - - db_->Update(kv.first, final_val); + for (auto& kv : merged_data) { + auto& items = kv.second; + std::sort(items.begin(), items.end(), [](const DictItem& a, const DictItem& b) { + if (a.weight != b.weight) return a.weight > b.weight; + return a.order < b.order; + }); + + std::string final_val; + for (size_t i = 0; i < items.size(); ++i) { + final_val += items[i].value; + if (i < items.size() - 1) final_val += cfg_.delimiter; } + final_binary_data[kv.first] = final_val; } -} -an SuperFilter::Apply( - an translation, - CandidateList* candidates) { + std::string current_sig = GenerateFilesSignature(); + SuperBinaryDb::Build(db_name_, current_sig, final_binary_data); +} +an SuperFilter::Apply(an translation, CandidateList* candidates) { if (!translation) return nullptr; Context* ctx = engine_->context(); - - if (!ctx->IsComposing() || ctx->input().empty()) { - return translation; - } - - return New(translation, rules_, db_, ctx, delimiter_, comment_format_, chain_); + if (!ctx->IsComposing() || ctx->input().empty()) return translation; + return New(translation, cfg_, db_, ctx); } } // namespace rime \ No newline at end of file diff --git a/src/rime/gear/super_filter.h b/src/rime/gear/super_filter.h index e41dc44d14..430d3067f7 100644 --- a/src/rime/gear/super_filter.h +++ b/src/rime/gear/super_filter.h @@ -6,56 +6,74 @@ #include #include #include +#include +#include #include #include #include #include -#include #include #include +#include +#include + namespace rime { -// Representation of a single filter rule configured in YAML. -struct SuperRule { - std::string name; +struct SuperConfig { bool always_on = false; std::vector options; + std::vector tags; - std::string mode; // Supported modes: append, replace, comment, abbrev - bool sentence = false; // Enable FMM (Forward Maximum Matching) for long phrases - - std::string prefix; + std::string mode = "append"; + bool sentence = false; std::vector files; - - // 开启九宫格(T9)模式:构建词库时自动将编码转数字,并保留原编码为 preedit bool t9_mode = false; std::string cand_type = "derived"; - std::string comment_mode; // Supported modes: none, text, append - - // Injection parameters strictly for 'abbrev' mode - std::string order_type = "index"; // 'index' (absolute position) or 'quality' (score threshold) + std::string comment_mode = "none"; + std::string order_type = "index"; int order_value = 1; int always_qty = 1; + + std::string delimiter = "|"; + std::string comment_format = "〔%s〕"; }; -// Wrapper for candidates that require forced injection at specific positions or quality thresholds. struct InjectCand { an cand; int value; }; -// The core translation class implementing lazy evaluation and stream processing. +class SuperBinaryDb { +public: + SuperBinaryDb(const std::string& bin_path); + ~SuperBinaryDb(); + + bool Open(); + void Close(); + bool Fetch(const std::string& key, std::string* value) const; + bool CheckSignature(const std::string& expected_sig) const; + + static bool Build(const std::string& bin_path, const std::string& signature, + const std::map& data); + +private: + std::string bin_path_; + std::unique_ptr mapping_; + std::unique_ptr region_; + const char* data_ptr_ = nullptr; + size_t data_size_ = 0; + uint32_t record_count_ = 0; + const uint32_t* index_ptr_ = nullptr; +}; + class SuperFilterTranslation : public Translation { public: SuperFilterTranslation(an inner, - const std::vector& rules, - an db, - Context* ctx, - const std::string& delimiter, - const std::string& comment_format, - bool is_chain); + const SuperConfig& config, + std::shared_ptr db, + Context* ctx); an Peek() override; bool Next() override; @@ -65,19 +83,15 @@ class SuperFilterTranslation : public Translation { void ProcessNextInner(); void UpdateExhausted(); - std::string SegmentConvert(const std::string& text, const std::string& prefix, bool sentence); + std::string SegmentConvert(const std::string& text, bool sentence); an inner_; - std::vector rules_; - an db_; + SuperConfig cfg_; + std::shared_ptr db_; Context* ctx_; - std::string delimiter_; - std::string comment_format_; - bool is_chain_; int yield_count_ = 0; - // Priority queues for candidate distribution std::deque index_cands_; std::deque quality_cands_; std::deque> lazy_cands_; @@ -85,27 +99,30 @@ class SuperFilterTranslation : public Translation { std::unordered_set abbrev_yielded_; }; -// Filter component responsible for parsing configurations and managing the global LevelDb connection. class SuperFilter : public Filter { public: explicit SuperFilter(const Ticket& ticket); virtual ~SuperFilter(); - an Apply(an translation, - CandidateList* candidates) override; + an Apply(an translation, CandidateList* candidates) override; private: void LoadConfig(Config* config); - void InitializeDb(); + void InitializeBinaryDb(); std::string GenerateFilesSignature(); void RebuildDb(); - std::vector rules_; - an db_; + SuperConfig cfg_; + std::shared_ptr db_; std::string db_name_; - std::string delimiter_; - std::string comment_format_; - bool chain_ = false; + std::string name_space_; +}; + +class SuperFilterComponent : public SuperFilter::Component { +public: + SuperFilter* Create(const Ticket& ticket) override { + return new SuperFilter(ticket); + } }; } // namespace rime