|
26 | 26 | RerankK = int | Literal["auto"] |
27 | 27 | AUTO_RERANK_FAST_K = 5 |
28 | 28 | AUTO_RERANK_DEEP_K = 20 |
| 29 | +_CJK_QUESTION_BM25_WEIGHT = 0.5 |
29 | 30 | _CJK_TECHNICAL_RERANK_TERMS = ( |
30 | 31 | "向量", |
31 | 32 | "嵌入", |
@@ -168,6 +169,8 @@ class SearchDiagnostics: |
168 | 169 | indegree_ranks: dict[int, int] = field(default_factory=dict) |
169 | 170 | first_stage_ranked_source_ids: list[int] = field(default_factory=list) |
170 | 171 | rerank_candidate_source_ids: list[int] = field(default_factory=list) |
| 172 | + effective_bm25_weight: float = 1.0 |
| 173 | + cjk_question_terms_stripped: bool = False |
171 | 174 |
|
172 | 175 |
|
173 | 176 | def _contains_cjk(text: str) -> bool: |
@@ -305,6 +308,12 @@ def search( |
305 | 308 | if not allowed_source_ids and not metadata_expansion: |
306 | 309 | return [] |
307 | 310 | filtered_source_ids = allowed_source_ids if has_filter else None |
| 311 | + fts_query = db.prepare_fts_query(query) |
| 312 | + effective_bm25_weight = ( |
| 313 | + _CJK_QUESTION_BM25_WEIGHT |
| 314 | + if fts_query.stripped_cjk_question_terms and bm25_weight == 1.0 |
| 315 | + else bm25_weight |
| 316 | + ) |
308 | 317 |
|
309 | 318 | # Channel 1: BM25 (chunk-level) |
310 | 319 | fts_limit = 200 if has_filter else 50 |
@@ -387,7 +396,12 @@ def search( |
387 | 396 | indeg_ranks = {sid: i + 1 for i, sid in enumerate(indeg_ranked)} |
388 | 397 | base_scores = _rrf_fuse( |
389 | 398 | [bm25_ranks, vec_ranks, indeg_ranks, title_ranks], |
390 | | - weights=[bm25_weight, vec_weight, indegree_weight, title_weight], |
| 399 | + weights=[ |
| 400 | + effective_bm25_weight, |
| 401 | + vec_weight, |
| 402 | + indegree_weight, |
| 403 | + title_weight, |
| 404 | + ], |
391 | 405 | ) |
392 | 406 | base_rank_by_source_id = { |
393 | 407 | sid: rank |
@@ -431,7 +445,12 @@ def search( |
431 | 445 |
|
432 | 446 | # RRF fusion (4 channels) |
433 | 447 | channel_ranks = [bm25_ranks, vec_ranks, indeg_ranks, title_ranks] |
434 | | - channel_weights = [bm25_weight, vec_weight, indegree_weight, title_weight] |
| 448 | + channel_weights = [ |
| 449 | + effective_bm25_weight, |
| 450 | + vec_weight, |
| 451 | + indegree_weight, |
| 452 | + title_weight, |
| 453 | + ] |
435 | 454 | if metadata_ranks: |
436 | 455 | channel_ranks.append(metadata_ranks) |
437 | 456 | channel_weights.append(metadata_weight) |
@@ -460,6 +479,10 @@ def search( |
460 | 479 | diagnostics.indegree_ranks = dict(indeg_ranks) |
461 | 480 | diagnostics.first_stage_ranked_source_ids = list(first_stage_ranked) |
462 | 481 | diagnostics.rerank_candidate_source_ids = list(ranked) |
| 482 | + diagnostics.effective_bm25_weight = effective_bm25_weight |
| 483 | + diagnostics.cjk_question_terms_stripped = ( |
| 484 | + fts_query.stripped_cjk_question_terms |
| 485 | + ) |
463 | 486 |
|
464 | 487 | # Pick best chunk for each source (prefer BM25 chunk, fall back to vec) |
465 | 488 | best_chunks: dict[int, Chunk] = {} |
|
0 commit comments