perf(reranker): cap passages before MLX scoring

simonsysun · simonsysun · commit ce6744b34293 · 2026-04-28T12:24:31.000-07:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -19,6 +19,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Changed
 - Full-vault indexing now embeds chunks in length-sorted batches instead of one file at a time, improving first-run indexing throughput on real Markdown vaults while preserving single-file indexing behavior and the existing SQLite schema.
+- The MLX reranker now caps each passage to the first 200 tokens before scoring, reducing warm-query latency on long chunks while preserving the full result preview and `seeklink get` output.
 
 ### Fixed
 - `seeklink search --rerank-k N` now limits the number of candidates passed to the cross-encoder even when `N` is lower than `--top-k`; the remaining results keep first-stage RRF order.
diff --git a/TODOS.md b/TODOS.md
@@ -6,14 +6,14 @@ ship if and when they become worth the cost.
 ## Search quality and features
 
 ### Cross-encoder performance optimization
-The MLX reranker (`Qwen3-Reranker-0.6B`) runs at ~60 ms per pair on M3-class
-Apple Silicon, totaling ~1.2–2.7 s for 20 candidates on realistic vault
-chunks. Possible reductions:
-
-- Batch inference (process all pairs in one forward pass instead of
-  sequentially).
-- Passage truncation (cap at ~200 tokens for reranking, use full text only
-  for final display).
+The MLX reranker (`Qwen3-Reranker-0.6B`) is still the main warm-query
+latency cost on realistic vault chunks. Passage text is now capped before
+reranking; remaining possible reductions:
+
+- Hardware-specific batching or sequence-classification reranker probes.
+  Gate on real blind-test latency because MLX batch throughput depends on
+  prompt length and padding.
+- Better query routing so only ambiguous queries pay the full rerank budget.
 
 ### Additional CLI subcommands
 Helpers exist inside `seeklink/app.py` but are not exposed on the CLI:
diff --git a/seeklink/reranker.py b/seeklink/reranker.py
@@ -8,8 +8,10 @@
 passage independently), at the cost of ~60ms per pair.
 
 Implementation uses MLX (Apple's native ML framework) which runs on
-Metal GPU, achieving ~1.2s for 20 pairs on M3 Air. The model uses a
-yes/no prompt format per Qwen3-Reranker's official usage guide:
+Metal GPU. Candidate passages are capped before scoring so the reranker
+does not spend most of its time on long chunks whose opening text is
+already enough for relevance judgments. The model uses a yes/no prompt
+format per Qwen3-Reranker's official usage guide:
 the model outputs logits for 'yes' and 'no' tokens, and we convert
 the yes-probability to a relevance score.
 
@@ -31,6 +33,7 @@
 _DEFAULT_INSTRUCTION = (
     "Given a web search query, retrieve relevant passages that answer the query."
 )
+_MAX_PASSAGE_TOKENS = 200
 
 
 class Reranker:
@@ -75,10 +78,39 @@ def _ensure_model(self) -> None:
                 )
                 self._disabled = True
 
+    def _token_list(self, text: str) -> list[int]:
+        """Encode ``text`` and return its token ids as one flat Python list."""
+        encoded = self._tokenizer.encode(text, return_tensors=None)
+        # Anything that is not already a list is converted through its tolist() method.
+        ids = encoded if isinstance(encoded, list) else encoded.tolist()
+        # A nested [[...]] result is unwrapped to its first row.
+        nested = bool(ids) and isinstance(ids[0], list)
+        return list(ids[0] if nested else ids)
+
+    def _truncate_passage(self, passage: str) -> str:
+        """Cap the passage text fed to the reranker; the caller's original string is untouched."""
+        tokens = self._token_list(passage)
+        if len(tokens) <= _MAX_PASSAGE_TOKENS:
+            return passage  # already within budget: keep the exact original text
+
+        head = tokens[:_MAX_PASSAGE_TOKENS]
+        decode = getattr(self._tokenizer, "decode", None)
+        if decode is not None:
+            try:
+                try:
+                    return decode(head, skip_special_tokens=True)
+                except TypeError:  # retry for decode() variants that reject the keyword
+                    return decode(head)
+            except Exception:  # best effort: a failure in either attempt uses the fallback
+                logger.debug("Reranker passage decode failed; using char fallback")
+        # Conservative character cap when decode() is missing or both attempts failed.
+        return passage[:1200]
+
     def _score_one(self, query: str, passage: str) -> float:
         """Score a single (query, passage) pair. Returns 0-1 probability."""
         import mlx.core as mx
 
+        passage = self._truncate_passage(passage)
         prompt = (
             f"Instruct: {_DEFAULT_INSTRUCTION}\n"
             f"Query: {query}\n"
@@ -90,14 +122,8 @@ def _score_one(self, query: str, passage: str) -> float:
         )
         text += "<think>\n"
 
-        tokens = self._tokenizer.encode(text, return_tensors=None)
-        if isinstance(tokens, list):
-            input_ids = mx.array([tokens])
-        else:
-            input_ids = mx.array(tokens)
-            if input_ids.ndim == 1:
-                input_ids = input_ids[None]
-
+        tokens = self._token_list(text)
+        input_ids = mx.array([tokens])
         logits = self._model(input_ids)
         last_logits = logits[0, -1, :]
         mx.eval(last_logits)
diff --git a/tests/test_reranker.py b/tests/test_reranker.py
@@ -0,0 +1,114 @@
+"""Tests for the MLX reranker wrapper without loading real MLX models."""
+
+from __future__ import annotations
+
+import math
+import sys
+import types
+
+import numpy as np
+import pytest
+
+import seeklink.reranker as reranker_mod
+from seeklink.reranker import Reranker
+
+
+@pytest.fixture
+def fake_mlx(monkeypatch):
+    """Register stub ``mlx`` and ``mlx.core`` modules in ``sys.modules`` via monkeypatch."""
+    stubs = {"mlx": types.ModuleType("mlx"), "mlx.core": types.ModuleType("mlx.core")}
+    stubs["mlx"].core = stubs["mlx.core"]
+    stubs["mlx.core"].array = lambda value: np.array(value, dtype=np.int64)
+    stubs["mlx.core"].eval = lambda *args, **kwargs: None
+    for name, module in stubs.items():
+        monkeypatch.setitem(sys.modules, name, module)
+
+
+class FakeTokenizer:  # Deterministic tokenizer stub; token counts follow text length.
+    pad_token_id = 0
+    eos_token_id = 0
+
+    def convert_tokens_to_ids(self, token: str) -> int:
+        return {"yes": 1, "no": 2}[token]  # any other token raises KeyError
+
+    def apply_chat_template(self, messages, tokenize: bool, add_generation_prompt: bool):
+        assert tokenize is False
+        assert add_generation_prompt is True
+        return messages[0]["content"]  # no template markup: first message's text is returned
+
+    def encode(self, text: str, return_tensors=None) -> list[int]:
+        assert return_tensors is None
+        if "Document: " not in text:
+            return [1] * len(text)  # text without a "Document: " field: one id per character
+        passage = text.split("Document: ", 1)[1].split("<think>", 1)[0]
+        marker = max(1, len(passage))  # never 0, which is pad_token_id
+        return [1] * (3 + len(passage)) + [marker]  # final id carries the passage length
+
+    def decode(self, tokens: list[int], skip_special_tokens: bool = True) -> str:
+        assert skip_special_tokens is True
+        return "x" * len(tokens)  # one character per id
+
+
+class RecordingModel:  # Callable model stub: records input shapes, returns synthetic logits.
+    def __init__(self, *, fail_all: bool = False):
+        self.fail_all = fail_all  # when True, every call raises RuntimeError
+        self.shapes: list[tuple[int, int]] = []  # shape of each input batch, in call order
+
+    def __call__(self, input_ids):
+        arr = np.asarray(input_ids)
+        self.shapes.append(tuple(arr.shape))  # recorded before the failure check
+        if self.fail_all:
+            raise RuntimeError("fake model failure")
+
+        logits = np.zeros((arr.shape[0], arr.shape[1], 3), dtype=np.float32)  # 3-entry vocab axis
+        for row_index, row in enumerate(arr):
+            non_padding = np.flatnonzero(row != 0)  # id 0 is treated as padding
+            last_real = int(non_padding[-1])  # IndexError if the row is all zeros
+            marker = float(row[last_real])
+            logits[row_index, last_real, 1] = marker  # index-1 logit echoes the last real id
+            logits[row_index, last_real, 2] = 0.0
+            if last_real != arr.shape[1] - 1:
+                logits[row_index, -1, 1] = -100.0  # padded rows: the final slot gets extreme
+                logits[row_index, -1, 2] = 100.0  # values, distinct from the last real position
+        return logits
+
+
+def _ready_reranker(model: RecordingModel) -> Reranker:
+    """Build a Reranker whose model, tokenizer and yes/no token ids are preset fakes."""
+    instance = Reranker()
+    stubs = {"_model": model, "_tokenizer": FakeTokenizer(), "_token_yes": 1, "_token_no": 2}
+    for attr, value in stubs.items():
+        setattr(instance, attr, value)
+    return instance
+
+
+def _sigmoid(value: float) -> float:
+    return 0.5 * (1.0 + math.tanh(0.5 * value))  # logistic via tanh: no exp() overflow
+
+
+def test_rerank_caps_long_passages_before_scoring(fake_mlx, monkeypatch):
+    """With the cap patched to 2 tokens, a 6-char passage is scored as a 2-char one."""
+    monkeypatch.setattr(reranker_mod, "_MAX_PASSAGE_TOKENS", 2)
+    recorder = RecordingModel()
+
+    result = _ready_reranker(recorder).rerank("query", ["abcdef"])
+
+    assert result == pytest.approx([_sigmoid(2)])
+    assert recorder.shapes == [(1, 6)]
+
+
+def test_rerank_keeps_short_passages_intact(fake_mlx, monkeypatch):
+    """With the cap patched to 10 tokens, a 3-char passage is scored at full length."""
+    monkeypatch.setattr(reranker_mod, "_MAX_PASSAGE_TOKENS", 10)
+    recorder = RecordingModel()
+
+    result = _ready_reranker(recorder).rerank("query", ["abc"])
+
+    assert result == pytest.approx([_sigmoid(3)])
+    assert recorder.shapes == [(1, 7)]
+
+
+def test_rerank_returns_none_when_inference_fails(fake_mlx):
+    """rerank() returns None when the model raises on every call."""
+    outcome = _ready_reranker(RecordingModel(fail_all=True)).rerank("query", ["passage"])
+    assert outcome is None