Skip to content

Commit c72196b

Browse files
committed
fix(search): soften BM25 for Chinese question queries
1 parent ef8e07d commit c72196b

5 files changed

Lines changed: 97 additions & 17 deletions

File tree

CHANGELOG.md

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -15,6 +15,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
1515
- Chinese question-style queries now strip common question particles before
1616
FTS5 matching, so terms like `卵生动物有哪些?` can use the BM25 channel
1717
instead of falling back to vector-only retrieval.
18+
- Chinese question-style queries that use the normalized BM25 path now apply
19+
that channel as a lighter ranking signal, reducing no-reranker over-promotion
20+
of adjacent keyword-heavy passages.
1821
- Long paragraphs that follow a buffered heading now split at sentence
1922
boundaries instead of becoming one oversized chunk, reducing pathological
2023
chunks in generated/list-heavy Markdown while preserving fenced-code

seeklink/db.py

Lines changed: 21 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -7,6 +7,7 @@
77
import sys
88
import threading
99
from contextlib import contextmanager
10+
from dataclasses import dataclass
1011
from pathlib import Path
1112

1213
import sqlite_vec
@@ -44,23 +45,32 @@ class CapabilityError(Exception):
4445
"""Raised when the runtime environment doesn't meet requirements."""
4546

4647

48+
@dataclass(frozen=True, slots=True)
49+
class FTSQuery:
50+
query: str
51+
stripped_cjk_question_terms: bool = False
52+
53+
4754
def _contains_cjk(text: str) -> bool:
4855
return any("\u3400" <= ch <= "\u9fff" for ch in text)
4956

5057

51-
def _normalize_fts_query(query: str) -> str:
58+
def _prepare_jieba_fts_query(query: str) -> FTSQuery:
5259
"""Strip common Chinese question particles from FTS5 MATCH queries.
5360
5461
FTS5 treats space-separated query tokens as mandatory terms. For Chinese
5562
questions, jieba can tokenize function words like "有哪些" or "多少"; leaving
5663
them in the MATCH expression often makes the BM25 channel return no rows.
5764
"""
5865
if not _contains_cjk(query):
59-
return query
66+
return FTSQuery(query)
6067

6168
tokens = [token for token, _start, _end in JiebaTokenizer().tokenize(query)]
6269
kept = [token for token in tokens if token not in _CJK_QUERY_STOPWORDS]
63-
return " ".join(kept) if kept else query
70+
return FTSQuery(
71+
" ".join(kept) if kept else query,
72+
stripped_cjk_question_terms=len(kept) != len(tokens),
73+
)
6474

6575

6676
class Database:
@@ -106,6 +116,12 @@ def _open(self) -> sqlite3.Connection:
106116
def conn(self) -> sqlite3.Connection:
107117
return self._conn
108118

119+
def prepare_fts_query(self, query: str) -> FTSQuery:
120+
"""Return the FTS5 MATCH query used by this database connection."""
121+
if self._fts_tokenizer != "jieba":
122+
return FTSQuery(query)
123+
return _prepare_jieba_fts_query(query)
124+
109125
def close(self) -> None:
110126
self._conn.close()
111127

@@ -853,12 +869,7 @@ def search_fts_sources(
853869
if source_ids is not None and not source_ids:
854870
return []
855871
try:
856-
fts_query = (
857-
_normalize_fts_query(query)
858-
if self._fts_tokenizer == "jieba"
859-
else query
860-
)
861-
params: list[str | int] = [fts_query]
872+
params: list[str | int] = [self.prepare_fts_query(query).query]
862873
source_filter = ""
863874
if source_ids is not None:
864875
ordered_ids = sorted(source_ids)
@@ -950,12 +961,7 @@ def search_fts(
950961
"""Full-text search via FTS5. Returns (Chunk, bm25_rank) pairs."""
951962
if source_ids is not None and not source_ids:
952963
return []
953-
fts_query = (
954-
_normalize_fts_query(query)
955-
if self._fts_tokenizer == "jieba"
956-
else query
957-
)
958-
params: list[str | int] = [fts_query]
964+
params: list[str | int] = [self.prepare_fts_query(query).query]
959965
source_filter = ""
960966
if source_ids is not None:
961967
ordered_ids = sorted(source_ids)

seeklink/search.py

Lines changed: 25 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -26,6 +26,7 @@
2626
RerankK = int | Literal["auto"]
2727
AUTO_RERANK_FAST_K = 5
2828
AUTO_RERANK_DEEP_K = 20
29+
_CJK_QUESTION_BM25_WEIGHT = 0.5
2930
_CJK_TECHNICAL_RERANK_TERMS = (
3031
"向量",
3132
"嵌入",
@@ -168,6 +169,8 @@ class SearchDiagnostics:
168169
indegree_ranks: dict[int, int] = field(default_factory=dict)
169170
first_stage_ranked_source_ids: list[int] = field(default_factory=list)
170171
rerank_candidate_source_ids: list[int] = field(default_factory=list)
172+
effective_bm25_weight: float = 1.0
173+
cjk_question_terms_stripped: bool = False
171174

172175

173176
def _contains_cjk(text: str) -> bool:
@@ -305,6 +308,12 @@ def search(
305308
if not allowed_source_ids and not metadata_expansion:
306309
return []
307310
filtered_source_ids = allowed_source_ids if has_filter else None
311+
fts_query = db.prepare_fts_query(query)
312+
effective_bm25_weight = (
313+
_CJK_QUESTION_BM25_WEIGHT
314+
if fts_query.stripped_cjk_question_terms and bm25_weight == 1.0
315+
else bm25_weight
316+
)
308317

309318
# Channel 1: BM25 (chunk-level)
310319
fts_limit = 200 if has_filter else 50
@@ -387,7 +396,12 @@ def search(
387396
indeg_ranks = {sid: i + 1 for i, sid in enumerate(indeg_ranked)}
388397
base_scores = _rrf_fuse(
389398
[bm25_ranks, vec_ranks, indeg_ranks, title_ranks],
390-
weights=[bm25_weight, vec_weight, indegree_weight, title_weight],
399+
weights=[
400+
effective_bm25_weight,
401+
vec_weight,
402+
indegree_weight,
403+
title_weight,
404+
],
391405
)
392406
base_rank_by_source_id = {
393407
sid: rank
@@ -431,7 +445,12 @@ def search(
431445

432446
# RRF fusion (4 channels)
433447
channel_ranks = [bm25_ranks, vec_ranks, indeg_ranks, title_ranks]
434-
channel_weights = [bm25_weight, vec_weight, indegree_weight, title_weight]
448+
channel_weights = [
449+
effective_bm25_weight,
450+
vec_weight,
451+
indegree_weight,
452+
title_weight,
453+
]
435454
if metadata_ranks:
436455
channel_ranks.append(metadata_ranks)
437456
channel_weights.append(metadata_weight)
@@ -460,6 +479,10 @@ def search(
460479
diagnostics.indegree_ranks = dict(indeg_ranks)
461480
diagnostics.first_stage_ranked_source_ids = list(first_stage_ranked)
462481
diagnostics.rerank_candidate_source_ids = list(ranked)
482+
diagnostics.effective_bm25_weight = effective_bm25_weight
483+
diagnostics.cjk_question_terms_stripped = (
484+
fts_query.stripped_cjk_question_terms
485+
)
463486

464487
# Pick best chunk for each source (prefer BM25 chunk, fall back to vec)
465488
best_chunks: dict[int, Chunk] = {}

tests/test_db.py

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -274,6 +274,14 @@ def test_fts_chinese_question_terms_do_not_block_keyword_match(self, db: Databas
274274

275275
assert [chunk.source_id for chunk, _ in results] == [src.id]
276276

277+
def test_prepare_fts_query_marks_chinese_question_terms(self, db: Database):
278+
question = db.prepare_fts_query("卵生动物有哪些?")
279+
ordinary = db.prepare_fts_query("知识管理")
280+
281+
assert question.stripped_cjk_question_terms is True
282+
assert "有哪些" not in question.query
283+
assert ordinary.stripped_cjk_question_terms is False
284+
277285
def test_cascade_delete_cleans_fts(self, db: Database):
278286
src = _make_source(db)
279287
db.add_chunk(src.id, "Unique searchable content xyzzy", 0)

tests/test_search.py

Lines changed: 40 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -472,6 +472,46 @@ def test_vec_zero_weight(self, db: Database, embedder: Embedder, vault: Path):
472472
for r in results:
473473
assert r.score > 0
474474

475+
def test_cjk_question_terms_use_lighter_default_bm25_weight(self, db: Database):
476+
source = db.add_source(
477+
uid="cjk-question",
478+
path="notes/oviparity.md",
479+
title="卵生动物",
480+
)
481+
db.add_chunk(source.id, "卵生动物通过产卵进行繁殖。", 0)
482+
diagnostics = SearchDiagnostics()
483+
484+
results = search(
485+
db,
486+
FtsOnlyEmbedder(), # type: ignore[arg-type]
487+
"卵生动物有哪些?",
488+
diagnostics=diagnostics,
489+
)
490+
491+
assert [result.path for result in results] == ["notes/oviparity.md"]
492+
assert diagnostics.cjk_question_terms_stripped is True
493+
assert diagnostics.effective_bm25_weight == 0.5
494+
495+
def test_explicit_bm25_weight_is_respected_for_cjk_question(self, db: Database):
496+
source = db.add_source(
497+
uid="cjk-question-explicit-weight",
498+
path="notes/oviparity.md",
499+
title="卵生动物",
500+
)
501+
db.add_chunk(source.id, "卵生动物通过产卵进行繁殖。", 0)
502+
diagnostics = SearchDiagnostics()
503+
504+
search(
505+
db,
506+
FtsOnlyEmbedder(), # type: ignore[arg-type]
507+
"卵生动物有哪些?",
508+
bm25_weight=0.0,
509+
diagnostics=diagnostics,
510+
)
511+
512+
assert diagnostics.cjk_question_terms_stripped is True
513+
assert diagnostics.effective_bm25_weight == 0.0
514+
475515

476516
# ── v2: Tag/Folder filtering ─────────────────────────────────────
477517

0 commit comments

Comments
 (0)