Skip to content

Commit baa137c

Browse files
committed
test(blind): support graded relevance labels
1 parent a711e83 commit baa137c

5 files changed

Lines changed: 123 additions & 5 deletions

File tree

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
1313
- `seeklink search --rerank-k N` and `seeklink search --no-rerank` let callers trade precision for latency per query without changing the global reranker configuration.
1414
- `seeklink search --rerank-k auto` chooses a 5- or 20-candidate reranker budget from query shape, keeping exact title / alias, English, and ordinary CJK queries fast while giving filtered and CJK technical queries deeper reranking.
1515
- The blind-test runner now accepts `--rerank-k N`, `--rerank-k auto`, and `--no-rerank`, and records requested plus resolved reranking metadata in result JSON for latency / quality sweeps.
16+
- The blind-test runner now accepts optional graded `relevance:` labels in `queries.yaml`, using them for nDCG@10 while keeping `expected_paths` as hard Recall/MRR targets.
1617

1718
### Fixed
1819
- `seeklink search --rerank-k N` now limits the number of candidates passed to the cross-encoder even when `N` is lower than `--top-k`; the remaining results keep first-stage RRF order.

docs/blind-test.md

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,11 @@ invocation.
4848
- "notes/fsrs-algorithm.md"
4949
- "notes/spaced-repetition.md"
5050
- "logs/2026-W15.md"
51+
relevance:
52+
"notes/fsrs-algorithm.md": 3
53+
"notes/spaced-repetition.md": 3
54+
"logs/2026-W15.md": 2
55+
"notes/forgetting-curve.md": 1
5156
tags: [cjk, common]
5257
expansion:
5358
- "间隔重复 遗忘曲线 FSRS"
@@ -73,13 +78,17 @@ averages.
7378
memory. No synthetic queries.
7479
2. For each, list 2-5 `expected_paths` you'd be annoyed if not in top 10.
7580
Hard must-hit semantics — not "would be nice".
76-
3. **Skip queries where a substring of the query exactly matches a note
81+
3. Optionally add `relevance:` grades for nDCG: `3` = direct answer,
82+
`2` = strong supporting context, `1` = related but insufficient.
83+
`expected_paths` default to grade `3` unless `relevance:` gives them an explicit grade; extra relevance labels do not
84+
affect Recall@10 or MRR.
85+
4. **Skip queries where a substring of the query exactly matches a note
7786
title.** Those hit the title channel trivially and test nothing about
7887
expansion. Prefer queries where notes use different vocabulary than the
7988
query itself.
80-
4. Fill in `expansion:` with 2-3 hand-crafted alternates: lexical form,
89+
5. Fill in `expansion:` with 2-3 hand-crafted alternates: lexical form,
8190
semantic paraphrase, hypothetical answer sentence (HyDE style).
82-
5. Tag each query for slicing: `cjk`, `english`, `cjk-en-mixed`, `long`,
91+
6. Tag each query for slicing: `cjk`, `english`, `cjk-en-mixed`, `long`,
8392
`short`, `ambiguous`, `technical`, `common`.
8493

8594
**Ground-truth stability**: commit `queries.yaml` alongside a vault-state
@@ -94,6 +103,8 @@ For each `(query, config)` pair (recorded by the runner):
94103
- `titles` — top-10 titles (for the human blind scorer)
95104
- `snippets` — top-10 content previews (for the human blind scorer)
96105
- `scores` — fused scores (not directly compared across configs)
106+
- `relevance` — graded labels used for nDCG (`expected_paths` default to
107+
grade `3`)
97108
- `latency_ms` — wall-clock for the full query call chain (model load
98109
excluded — runner initializes once and warms up)
99110
- `rerank_k` — first-stage candidate count passed to the reranker (`0`
@@ -104,6 +115,8 @@ For each `(query, config)` pair (recorded by the runner):
104115
`cjk_technical`, `filter`, `default`, or `fixed`)
105116
- `recall_at_10` — fraction of `expected_paths` in top-10
106117
- `mrr` — reciprocal rank of first expected hit in top-10 (0 if none)
118+
- `ndcg_at_10` — graded nDCG when `relevance:` labels exist, otherwise
119+
binary nDCG over `expected_paths`
107120

108121
Aggregates:
109122

tests/blind/queries.example.yaml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,10 @@
66
expected_paths:
77
- "notes/fsrs-algorithm.md"
88
- "notes/spaced-repetition.md"
9+
relevance:
10+
"notes/fsrs-algorithm.md": 3
11+
"notes/spaced-repetition.md": 3
12+
"notes/forgetting-curve.md": 2
913
tags: [cjk, core-concept]
1014
expansion:
1115
- "间隔重复 遗忘曲线 FSRS"

tests/blind/run.py

Lines changed: 31 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -60,10 +60,33 @@ class QuerySpec:
6060
query: str
6161
intent: str | None
6262
expected_paths: list[str]
63+
relevance: dict[str, float]
6364
tags: list[str]
6465
expansion: list[str] | None
6566

6667

68+
def _parse_relevance(raw: object, expected_paths: list[str]) -> dict[str, float]:
    """Build the path -> grade mapping used for graded nDCG.

    Every path in ``expected_paths`` starts at grade ``3.0``; entries in
    ``raw`` then override that default or add further labelled paths.

    Args:
        raw: The ``relevance`` value of a query entry, or ``None`` when the
            key is absent.
        expected_paths: Paths that receive the default grade.

    Returns:
        A new dict mapping each labelled path to its grade as a float.

    Raises:
        ValueError: If ``raw`` is neither ``None`` nor a dict, a path is not
            a non-empty string, or a grade is a boolean, non-numeric,
            non-finite, or negative.
    """
    import math  # local import: the module's import block is not visible here

    relevance = {path: 3.0 for path in expected_paths}
    if raw is None:
        return relevance
    if not isinstance(raw, dict):
        raise ValueError("relevance must be a mapping of path -> grade")

    for path, grade in raw.items():
        if not isinstance(path, str) or not path:
            raise ValueError("relevance paths must be non-empty strings")
        # bool is an int subclass, so float(True) would silently become 1.0.
        if isinstance(grade, bool):
            raise ValueError(f"relevance grade for {path!r} must be numeric")
        try:
            numeric_grade = float(grade)
        except (TypeError, ValueError, OverflowError) as e:
            # OverflowError covers integers too large to represent as a float.
            raise ValueError(
                f"relevance grade for {path!r} must be numeric"
            ) from e
        # NaN compares False against everything, so it would slip past the
        # `< 0` check below; reject NaN and +/-inf explicitly.
        if not math.isfinite(numeric_grade):
            raise ValueError(f"relevance grade for {path!r} must be finite")
        if numeric_grade < 0:
            raise ValueError(f"relevance grade for {path!r} must be >= 0")
        relevance[path] = numeric_grade
    return relevance
88+
89+
6790
def _parse_rerank_k(raw: str) -> RerankK:
6891
if raw == "auto":
6992
return raw
@@ -94,6 +117,7 @@ class ResultRow:
94117
titles: list[str | None]
95118
snippets: list[str]
96119
scores: list[float]
120+
relevance: dict[str, float]
97121
latency_ms: float
98122
reranker_active: bool
99123
recall_at_10: float
@@ -118,11 +142,13 @@ def load_queries(path: Path) -> list[QuerySpec]:
118142
f"queries.yaml entry {i}: missing required field "
119143
f"('query' and 'expected_paths' are mandatory)"
120144
)
145+
expected_paths = list(r["expected_paths"])
121146
specs.append(
122147
QuerySpec(
123148
query=r["query"],
124149
intent=r.get("intent"),
125-
expected_paths=list(r["expected_paths"]),
150+
expected_paths=expected_paths,
151+
relevance=_parse_relevance(r.get("relevance"), expected_paths),
126152
tags=list(r.get("tags", [])),
127153
expansion=list(r["expansion"]) if r.get("expansion") else None,
128154
)
@@ -165,6 +191,7 @@ def _result_row(
165191
titles=titles,
166192
snippets=snippets,
167193
scores=scores,
194+
relevance=dict(spec.relevance),
168195
latency_ms=latency_ms,
169196
reranker_active=reranker_active,
170197
rerank_k=rerank_k if reranker_active else 0,
@@ -174,7 +201,9 @@ def _result_row(
174201
average_precision_at_10=average_precision_at_k(
175202
hits, spec.expected_paths, k=10
176203
),
177-
ndcg_at_10=ndcg_at_k(hits, spec.expected_paths, k=10),
204+
ndcg_at_10=ndcg_at_k(
205+
hits, spec.expected_paths, k=10, relevance=spec.relevance
206+
),
178207
last_expected_rank=last_expected_rank(hits, spec.expected_paths, k=10),
179208
resolved_rerank_k=resolved_rerank_k if reranker_active else 0,
180209
rerank_k_reason=rerank_k_reason if reranker_active else None,

tests/test_blind_runner_aggregates.py

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,76 @@
1616
)
1717

1818

19+
class TestLoadQueries:
# Covers blind_run.load_queries parsing of the optional `relevance:` key and
# the use of graded relevance by blind_run._result_row.
20+
# With no `relevance:` key, each expected path gets the default grade 3.0.
def test_load_queries_defaults_expected_paths_to_grade_three(self, tmp_path: Path):
21+
path = tmp_path / "queries.yaml"
22+
path.write_text(
23+
"""
24+
- query: memory
25+
expected_paths:
26+
- notes/answer.md
27+
tags: [english]
28+
""",
29+
encoding="utf-8",
30+
)
31+
32+
specs = blind_run.load_queries(path)
33+
34+
assert specs[0].relevance == {"notes/answer.md": 3.0}
35+
36+
# Explicit `relevance:` grades are stored as floats and may label paths that
# are not listed in `expected_paths`.
def test_load_queries_accepts_graded_relevance_labels(self, tmp_path: Path):
37+
path = tmp_path / "queries.yaml"
38+
path.write_text(
39+
"""
40+
- query: transformer retrieval
41+
expected_paths:
42+
- notes/attention.md
43+
relevance:
44+
notes/attention.md: 3
45+
notes/agent-memory.md: 2
46+
notes/rrf.md: 1
47+
tags: [english]
48+
""",
49+
encoding="utf-8",
50+
)
51+
52+
specs = blind_run.load_queries(path)
53+
54+
assert specs[0].relevance == {
55+
"notes/attention.md": 3.0,
56+
"notes/agent-memory.md": 2.0,
57+
"notes/rrf.md": 1.0,
58+
}
59+
60+
# The only expected path is ranked second (MRR 0.5); the hit ranked first has
# a grade-2 relevance label, and the test requires nDCG@10 to exceed MRR.
def test_result_row_uses_graded_relevance_for_ndcg(self):
61+
spec = blind_run.QuerySpec(
62+
query="transformer retrieval",
63+
intent=None,
64+
expected_paths=["notes/answer.md"],
65+
relevance={
66+
"notes/answer.md": 3.0,
67+
"notes/support.md": 2.0,
68+
},
69+
tags=["english"],
70+
expansion=None,
71+
)
72+
73+
row = blind_run._result_row(
74+
spec=spec,
75+
config="A",
76+
hits=["notes/support.md", "notes/answer.md"],
77+
titles=[None, None],
78+
snippets=["", ""],
79+
scores=[1.0, 0.5],
80+
latency_ms=10.0,
81+
reranker_active=True,
82+
rerank_k=5,
83+
)
84+
85+
assert row.mrr == pytest.approx(0.5)
86+
assert row.ndcg_at_10 > row.mrr
87+
88+
1989
def _row(
2090
*,
2191
query: str,
@@ -35,6 +105,7 @@ def _row(
35105
titles=[],
36106
snippets=[],
37107
scores=[],
108+
relevance={},
38109
latency_ms=latency,
39110
reranker_active=True,
40111
recall_at_10=recall,

0 commit comments

Comments
 (0)