fix(search): soften BM25 for Chinese question queries

simonsysun · simonsysun · commit c72196bfc2bb · 2026-05-02T21:28:08.000-07:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -15,6 +15,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Chinese question-style queries now strip common question particles before
   FTS5 matching, so terms like `卵生动物有哪些？` can use the BM25 channel
   instead of falling back to vector-only retrieval.
+- Chinese question-style queries that use the normalized BM25 path now apply
+  that channel as a lighter ranking signal, so adjacent keyword-heavy passages
+  are less likely to be over-promoted when no reranker runs.
 - Long paragraphs that follow a buffered heading now split at sentence
   boundaries instead of becoming one oversized chunk, reducing pathological
   chunks in generated/list-heavy Markdown while preserving fenced-code
diff --git a/seeklink/db.py b/seeklink/db.py
@@ -7,6 +7,7 @@
 import sys
 import threading
 from contextlib import contextmanager
+from dataclasses import dataclass
 from pathlib import Path
 
 import sqlite_vec
@@ -44,23 +45,32 @@ class CapabilityError(Exception):
     """Raised when the runtime environment doesn't meet requirements."""
 
 
+@dataclass(frozen=True, slots=True)
+class FTSQuery:
+    query: str  # text handed to the FTS5 MATCH expression
+    stripped_cjk_question_terms: bool = False  # True if stopword tokens were dropped
+
+
 def _contains_cjk(text: str) -> bool:
     return any("\u3400" <= ch <= "\u9fff" for ch in text)
 
 
-def _normalize_fts_query(query: str) -> str:
+def _prepare_jieba_fts_query(query: str) -> FTSQuery:
     """Strip common Chinese question particles from FTS5 MATCH queries.
 
     FTS5 treats space-separated query tokens as mandatory terms. For Chinese
     questions, jieba can tokenize function words like "有哪些" or "多少"; leaving
     them in the MATCH expression often makes the BM25 channel return no rows.
     """
     if not _contains_cjk(query):
-        return query
+        return FTSQuery(query)
 
     tokens = [token for token, _start, _end in JiebaTokenizer().tokenize(query)]
     kept = [token for token in tokens if token not in _CJK_QUERY_STOPWORDS]
-    return " ".join(kept) if kept else query
+    # Report stripping only when the filtered query is actually used.
+    if kept and len(kept) != len(tokens):
+        return FTSQuery(" ".join(kept), stripped_cjk_question_terms=True)
+    return FTSQuery(" ".join(kept) if kept else query)
 
 
 class Database:
@@ -106,6 +116,12 @@ def _open(self) -> sqlite3.Connection:
     def conn(self) -> sqlite3.Connection:
         return self._conn
 
+    def prepare_fts_query(self, query: str) -> FTSQuery:
+        """Build the FTS5 MATCH query; only the jieba tokenizer rewrites it."""
+        if self._fts_tokenizer == "jieba":
+            return _prepare_jieba_fts_query(query)
+        return FTSQuery(query)
+
     def close(self) -> None:
         self._conn.close()
 
@@ -853,12 +869,7 @@ def search_fts_sources(
         if source_ids is not None and not source_ids:
             return []
         try:
-            fts_query = (
-                _normalize_fts_query(query)
-                if self._fts_tokenizer == "jieba"
-                else query
-            )
-            params: list[str | int] = [fts_query]
+            params: list[str | int] = [self.prepare_fts_query(query).query]
             source_filter = ""
             if source_ids is not None:
                 ordered_ids = sorted(source_ids)
@@ -950,12 +961,7 @@ def search_fts(
         """Full-text search via FTS5. Returns (Chunk, bm25_rank) pairs."""
         if source_ids is not None and not source_ids:
             return []
-        fts_query = (
-            _normalize_fts_query(query)
-            if self._fts_tokenizer == "jieba"
-            else query
-        )
-        params: list[str | int] = [fts_query]
+        params: list[str | int] = [self.prepare_fts_query(query).query]
         source_filter = ""
         if source_ids is not None:
             ordered_ids = sorted(source_ids)
diff --git a/seeklink/search.py b/seeklink/search.py
@@ -26,6 +26,7 @@
 RerankK = int | Literal["auto"]
 AUTO_RERANK_FAST_K = 5
 AUTO_RERANK_DEEP_K = 20
+_CJK_QUESTION_BM25_WEIGHT = 0.5
 _CJK_TECHNICAL_RERANK_TERMS = (
     "向量",
     "嵌入",
@@ -168,6 +169,8 @@ class SearchDiagnostics:
     indegree_ranks: dict[int, int] = field(default_factory=dict)
     first_stage_ranked_source_ids: list[int] = field(default_factory=list)
     rerank_candidate_source_ids: list[int] = field(default_factory=list)
+    effective_bm25_weight: float = 1.0
+    cjk_question_terms_stripped: bool = False
 
 
 def _contains_cjk(text: str) -> bool:
@@ -305,6 +308,12 @@ def search(
         if not allowed_source_ids and not metadata_expansion:
             return []
     filtered_source_ids = allowed_source_ids if has_filter else None
+    fts_query = db.prepare_fts_query(query)
+    effective_bm25_weight = (
+        _CJK_QUESTION_BM25_WEIGHT
+        if fts_query.stripped_cjk_question_terms and bm25_weight == 1.0
+        else bm25_weight
+    )
 
     # Channel 1: BM25 (chunk-level)
     fts_limit = 200 if has_filter else 50
@@ -387,7 +396,12 @@ def search(
     indeg_ranks = {sid: i + 1 for i, sid in enumerate(indeg_ranked)}
     base_scores = _rrf_fuse(
         [bm25_ranks, vec_ranks, indeg_ranks, title_ranks],
-        weights=[bm25_weight, vec_weight, indegree_weight, title_weight],
+        weights=[
+            effective_bm25_weight,
+            vec_weight,
+            indegree_weight,
+            title_weight,
+        ],
     )
     base_rank_by_source_id = {
         sid: rank
@@ -431,7 +445,12 @@ def search(
 
     # RRF fusion (4 channels)
     channel_ranks = [bm25_ranks, vec_ranks, indeg_ranks, title_ranks]
-    channel_weights = [bm25_weight, vec_weight, indegree_weight, title_weight]
+    channel_weights = [
+        effective_bm25_weight,
+        vec_weight,
+        indegree_weight,
+        title_weight,
+    ]
     if metadata_ranks:
         channel_ranks.append(metadata_ranks)
         channel_weights.append(metadata_weight)
@@ -460,6 +479,10 @@ def search(
         diagnostics.indegree_ranks = dict(indeg_ranks)
         diagnostics.first_stage_ranked_source_ids = list(first_stage_ranked)
         diagnostics.rerank_candidate_source_ids = list(ranked)
+        diagnostics.effective_bm25_weight = effective_bm25_weight
+        diagnostics.cjk_question_terms_stripped = (
+            fts_query.stripped_cjk_question_terms
+        )
 
     # Pick best chunk for each source (prefer BM25 chunk, fall back to vec)
     best_chunks: dict[int, Chunk] = {}
diff --git a/tests/test_db.py b/tests/test_db.py
@@ -274,6 +274,14 @@ def test_fts_chinese_question_terms_do_not_block_keyword_match(self, db: Databas
 
         assert [chunk.source_id for chunk, _ in results] == [src.id]
 
+    def test_prepare_fts_query_marks_chinese_question_terms(self, db: Database):
+        """Question particles set the flag and vanish from the MATCH text."""
+        stripped = db.prepare_fts_query("卵生动物有哪些？")
+        assert stripped.stripped_cjk_question_terms is True
+        assert "有哪些" not in stripped.query
+        untouched = db.prepare_fts_query("知识管理")
+        assert untouched.stripped_cjk_question_terms is False
+
     def test_cascade_delete_cleans_fts(self, db: Database):
         src = _make_source(db)
         db.add_chunk(src.id, "Unique searchable content xyzzy", 0)
diff --git a/tests/test_search.py b/tests/test_search.py
@@ -472,6 +472,46 @@ def test_vec_zero_weight(self, db: Database, embedder: Embedder, vault: Path):
         for r in results:
             assert r.score > 0
 
+    def test_cjk_question_terms_use_lighter_default_bm25_weight(self, db: Database):
+        """A stripped CJK question gets BM25 weight 0.5 when none is passed."""
+        note_path = "notes/oviparity.md"
+        note = db.add_source(uid="cjk-question", path=note_path, title="卵生动物")
+        db.add_chunk(note.id, "卵生动物通过产卵进行繁殖。", 0)
+
+        diag = SearchDiagnostics()
+        hits = search(
+            db,
+            FtsOnlyEmbedder(),  # type: ignore[arg-type]
+            "卵生动物有哪些？",
+            diagnostics=diag,
+        )
+
+        # The single indexed note is returned; diagnostics expose the weight used.
+        paths = [hit.path for hit in hits]
+        assert paths == [note_path]
+        assert diag.cjk_question_terms_stripped is True
+        assert diag.effective_bm25_weight == 0.5
+
+    def test_explicit_bm25_weight_is_respected_for_cjk_question(self, db: Database):
+        """An explicit non-default bm25_weight survives CJK question stripping."""
+        note = db.add_source(
+            uid="cjk-question-explicit-weight",
+            path="notes/oviparity.md",
+            title="卵生动物",
+        )
+        db.add_chunk(note.id, "卵生动物通过产卵进行繁殖。", 0)
+
+        diag = SearchDiagnostics()
+        search(
+            db,
+            FtsOnlyEmbedder(),  # type: ignore[arg-type]
+            "卵生动物有哪些？",
+            bm25_weight=0.0,
+            diagnostics=diag,
+        )
+        assert diag.cjk_question_terms_stripped is True
+        assert diag.effective_bm25_weight == 0.0
+
 
 # ── v2: Tag/Folder filtering ─────────────────────────────────────