test(blind): support graded relevance labels

simonsysun · simonsysun · commit baa137cf5a80 · 2026-04-26T15:38:40.000-07:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -13,6 +13,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - `seeklink search --rerank-k N` and `seeklink search --no-rerank` let callers trade precision for latency per query without changing the global reranker configuration.
 - `seeklink search --rerank-k auto` chooses a 5- or 20-candidate reranker budget from query shape, keeping exact title / alias, English, and ordinary CJK queries fast while giving filtered and CJK technical queries deeper reranking.
 - The blind-test runner now accepts `--rerank-k N`, `--rerank-k auto`, and `--no-rerank`, and records requested plus resolved reranking metadata in result JSON for latency / quality sweeps.
+- The blind-test runner now accepts optional graded `relevance:` labels in `queries.yaml`, using them for nDCG@10 while keeping `expected_paths` as hard Recall/MRR targets.
 
 ### Fixed
 - `seeklink search --rerank-k N` now limits the number of candidates passed to the cross-encoder even when `N` is lower than `--top-k`; the remaining results keep first-stage RRF order.
diff --git a/docs/blind-test.md b/docs/blind-test.md
@@ -48,6 +48,11 @@ invocation.
     - "notes/fsrs-algorithm.md"
     - "notes/spaced-repetition.md"
     - "logs/2026-W15.md"
+  relevance:
+    "notes/fsrs-algorithm.md": 3
+    "notes/spaced-repetition.md": 3
+    "logs/2026-W15.md": 2
+    "notes/forgetting-curve.md": 1
   tags: [cjk, common]
   expansion:
     - "间隔重复 遗忘曲线 FSRS"
@@ -73,13 +78,17 @@ averages.
    memory. No synthetic queries.
 2. For each, list 2-5 `expected_paths` you'd be annoyed if not in top 10.
    Hard must-hit semantics — not "would be nice".
-3. **Skip queries where a substring of the query exactly matches a note
+3. Optionally add `relevance:` grades for nDCG: `3` = direct answer,
+   `2` = strong supporting context, `1` = related but insufficient.
+   `expected_paths` default to grade `3` unless `relevance:` lists them with
+   another grade; extra relevance labels do not affect Recall@10 or MRR.
+4. **Skip queries where a substring of the query exactly matches a note
    title.** Those hit the title channel trivially and test nothing about
    expansion. Prefer queries where notes use different vocabulary than the
    query itself.
-4. Fill in `expansion:` with 2-3 hand-crafted alternates: lexical form,
+5. Fill in `expansion:` with 2-3 hand-crafted alternates: lexical form,
    semantic paraphrase, hypothetical answer sentence (HyDE style).
-5. Tag each query for slicing: `cjk`, `english`, `cjk-en-mixed`, `long`,
+6. Tag each query for slicing: `cjk`, `english`, `cjk-en-mixed`, `long`,
    `short`, `ambiguous`, `technical`, `common`.
 
 **Ground-truth stability**: commit `queries.yaml` alongside a vault-state
@@ -94,6 +103,8 @@ For each `(query, config)` pair (recorded by the runner):
 - `titles` — top-10 titles (for the human blind scorer)
 - `snippets` — top-10 content previews (for the human blind scorer)
 - `scores` — fused scores (not directly compared across configs)
+- `relevance` — graded labels used for nDCG (`expected_paths` default to
+  grade `3`)
 - `latency_ms` — wall-clock for the full query call chain (model load
   excluded — runner initializes once and warms up)
 - `rerank_k` — first-stage candidate count passed to the reranker (`0`
@@ -104,6 +115,8 @@ For each `(query, config)` pair (recorded by the runner):
   `cjk_technical`, `filter`, `default`, or `fixed`)
 - `recall_at_10` — fraction of `expected_paths` in top-10
 - `mrr` — reciprocal rank of first expected hit in top-10 (0 if none)
+- `ndcg_at_10` — graded nDCG when `relevance:` labels exist, otherwise
+  binary nDCG over `expected_paths`
 
 Aggregates:
 
diff --git a/tests/blind/queries.example.yaml b/tests/blind/queries.example.yaml
@@ -6,6 +6,10 @@
   expected_paths:
     - "notes/fsrs-algorithm.md"
     - "notes/spaced-repetition.md"
+  relevance:
+    "notes/fsrs-algorithm.md": 3
+    "notes/spaced-repetition.md": 3
+    "notes/forgetting-curve.md": 2
   tags: [cjk, core-concept]
   expansion:
     - "间隔重复 遗忘曲线 FSRS"
diff --git a/tests/blind/run.py b/tests/blind/run.py
@@ -60,10 +60,33 @@ class QuerySpec:
     query: str
     intent: str | None
     expected_paths: list[str]
+    relevance: dict[str, float]
     tags: list[str]
     expansion: list[str] | None
 
 
+def _parse_relevance(raw: object, expected_paths: list[str]) -> dict[str, float]:
+    relevance = {path: 3.0 for path in expected_paths}
+    if raw is None:
+        return relevance
+    if not isinstance(raw, dict):
+        raise ValueError("relevance must be a mapping of path -> grade")
+
+    for path, grade in raw.items():
+        if not isinstance(path, str) or not path:
+            raise ValueError("relevance paths must be non-empty strings")
+        try:
+            numeric_grade = float(grade)
+        except (TypeError, ValueError) as e:
+            raise ValueError(
+                f"relevance grade for {path!r} must be numeric"
+            ) from e
+        if numeric_grade < 0:
+            raise ValueError(f"relevance grade for {path!r} must be >= 0")
+        relevance[path] = numeric_grade
+    return relevance
+
+
 def _parse_rerank_k(raw: str) -> RerankK:
     if raw == "auto":
         return raw
@@ -94,6 +117,7 @@ class ResultRow:
     titles: list[str | None]
     snippets: list[str]
     scores: list[float]
+    relevance: dict[str, float]
     latency_ms: float
     reranker_active: bool
     recall_at_10: float
@@ -118,11 +142,13 @@ def load_queries(path: Path) -> list[QuerySpec]:
                 f"queries.yaml entry {i}: missing required field "
                 f"('query' and 'expected_paths' are mandatory)"
             )
+        expected_paths = list(r["expected_paths"])
         specs.append(
             QuerySpec(
                 query=r["query"],
                 intent=r.get("intent"),
-                expected_paths=list(r["expected_paths"]),
+                expected_paths=expected_paths,
+                relevance=_parse_relevance(r.get("relevance"), expected_paths),
                 tags=list(r.get("tags", [])),
                 expansion=list(r["expansion"]) if r.get("expansion") else None,
             )
@@ -165,6 +191,7 @@ def _result_row(
         titles=titles,
         snippets=snippets,
         scores=scores,
+        relevance=dict(spec.relevance),
         latency_ms=latency_ms,
         reranker_active=reranker_active,
         rerank_k=rerank_k if reranker_active else 0,
@@ -174,7 +201,9 @@ def _result_row(
         average_precision_at_10=average_precision_at_k(
             hits, spec.expected_paths, k=10
         ),
-        ndcg_at_10=ndcg_at_k(hits, spec.expected_paths, k=10),
+        ndcg_at_10=ndcg_at_k(
+            hits, spec.expected_paths, k=10, relevance=spec.relevance
+        ),
         last_expected_rank=last_expected_rank(hits, spec.expected_paths, k=10),
         resolved_rerank_k=resolved_rerank_k if reranker_active else 0,
         rerank_k_reason=rerank_k_reason if reranker_active else None,
diff --git a/tests/test_blind_runner_aggregates.py b/tests/test_blind_runner_aggregates.py
@@ -16,6 +16,76 @@
 )
 
 
+class TestLoadQueries:
+    def test_load_queries_defaults_expected_paths_to_grade_three(self, tmp_path: Path):
+        path = tmp_path / "queries.yaml"
+        path.write_text(
+            """
+- query: memory
+  expected_paths:
+    - notes/answer.md
+  tags: [english]
+""",
+            encoding="utf-8",
+        )
+
+        specs = blind_run.load_queries(path)
+
+        assert specs[0].relevance == {"notes/answer.md": 3.0}
+
+    def test_load_queries_accepts_graded_relevance_labels(self, tmp_path: Path):
+        path = tmp_path / "queries.yaml"
+        path.write_text(
+            """
+- query: transformer retrieval
+  expected_paths:
+    - notes/attention.md
+  relevance:
+    notes/attention.md: 3
+    notes/agent-memory.md: 2
+    notes/rrf.md: 1
+  tags: [english]
+""",
+            encoding="utf-8",
+        )
+
+        specs = blind_run.load_queries(path)
+
+        assert specs[0].relevance == {
+            "notes/attention.md": 3.0,
+            "notes/agent-memory.md": 2.0,
+            "notes/rrf.md": 1.0,
+        }
+
+    def test_result_row_uses_graded_relevance_for_ndcg(self):
+        spec = blind_run.QuerySpec(
+            query="transformer retrieval",
+            intent=None,
+            expected_paths=["notes/answer.md"],
+            relevance={
+                "notes/answer.md": 3.0,
+                "notes/support.md": 2.0,
+            },
+            tags=["english"],
+            expansion=None,
+        )
+
+        row = blind_run._result_row(
+            spec=spec,
+            config="A",
+            hits=["notes/support.md", "notes/answer.md"],
+            titles=[None, None],
+            snippets=["", ""],
+            scores=[1.0, 0.5],
+            latency_ms=10.0,
+            reranker_active=True,
+            rerank_k=5,
+        )
+
+        assert row.mrr == pytest.approx(0.5)
+        assert row.ndcg_at_10 > row.mrr
+
+
 def _row(
     *,
     query: str,
@@ -35,6 +105,7 @@ def _row(
         titles=[],
         snippets=[],
         scores=[],
+        relevance={},
         latency_ms=latency,
         reranker_active=True,
         recall_at_10=recall,