Make score fast

luciaquirke · luciaquirke · commit c85e9ea48c9b · 2026-03-02T23:34:01.000Z
diff --git a/bergson/__init__.py b/bergson/__init__.py
@@ -2,8 +2,6 @@
 
 from .builders import (
     Builder,
-    InMemorySequenceBuilder,
-    InMemoryTokenBuilder,
     create_builder,
 )
 from .collection import collect_gradients
@@ -40,8 +38,6 @@
     "load_token_gradients",
     "TokenGradients",
     "Builder",
-    "InMemorySequenceBuilder",
-    "InMemoryTokenBuilder",
     "create_builder",
     "fit_normalizers",
     "Attributor",
diff --git a/bergson/__main__.py b/bergson/__main__.py
@@ -1,7 +1,4 @@
-import shutil
-from copy import deepcopy
 from dataclasses import dataclass
-from pathlib import Path
 from typing import Optional, Union
 
 from simple_parsing import ArgumentParser, ConflictResolution
@@ -17,27 +14,11 @@
     TrackstarConfig,
 )
 from .hessians.hessian_approximations import approximate_hessians
-from .process_grads import mix_preconditioners
 from .query.query_index import query
 from .reduce import reduce
 from .score.score import score_dataset
-
-
-def validate_run_path(index_cfg: IndexConfig):
-    """Validate the run path."""
-    if index_cfg.distributed.rank != 0:
-        return
-
-    for path in [Path(index_cfg.run_path), Path(index_cfg.partial_run_path)]:
-        if not path.exists():
-            continue
-
-        if index_cfg.overwrite:
-            shutil.rmtree(path)
-        else:
-            raise FileExistsError(
-                f"Run path {path} already exists. Use --overwrite to overwrite it."
-            )
+from .trackstar import trackstar
+from .utils.worker_utils import validate_run_path
 
 
 @dataclass
@@ -150,70 +131,17 @@ class Trackstar:
 
     index_cfg: IndexConfig
 
-    trackstar_cfg: TrackstarConfig
-
     score_cfg: ScoreConfig
 
     preprocess_cfg: PreprocessConfig
 
+    trackstar_cfg: TrackstarConfig
+
     def execute(self):
-        """Run the full trackstar pipeline: preconditioners -> mix -> build -> score."""
-        run_path = self.index_cfg.run_path
-        value_precond_path = f"{run_path}/value_preconditioner"
-        query_precond_path = f"{run_path}/query_preconditioner"
-        mixed_precond_path = f"{run_path}/mixed_preconditioner"
-        query_path = f"{run_path}/query"
-        scores_path = f"{run_path}/scores"
-
-        # Step 1: Compute normalizers and preconditioners on value dataset
-        print("Step 1/5: Computing normalizers and preconditioners on value dataset...")
-        value_precond_cfg = deepcopy(self.index_cfg)
-        value_precond_cfg.run_path = value_precond_path
-        value_precond_cfg.skip_index = True
-        value_precond_cfg.skip_preconditioners = False
-        validate_run_path(value_precond_cfg)
-        build(value_precond_cfg, self.preprocess_cfg)
-
-        # Step 2: Compute normalizers and preconditioners on query dataset
-        print("Step 2/5: Computing normalizers and preconditioners on query dataset...")
-        query_precond_cfg = deepcopy(self.index_cfg)
-        query_precond_cfg.run_path = query_precond_path
-        query_precond_cfg.data = self.trackstar_cfg.query
-        query_precond_cfg.skip_index = True
-        query_precond_cfg.skip_preconditioners = False
-        validate_run_path(query_precond_cfg)
-        build(query_precond_cfg, self.preprocess_cfg)
-
-        # Step 3: Mix query and value preconditioners
-        print("Step 3/5: Mixing preconditioners...")
-        mix_preconditioners(
-            query_path=query_precond_path,
-            index_path=value_precond_path,
-            output_path=mixed_precond_path,
-            mixing_coefficient=self.trackstar_cfg.mixing_coefficient,
+        trackstar(
+            self.index_cfg, self.score_cfg, self.preprocess_cfg, self.trackstar_cfg
         )
 
-        # Step 4: Build per-item query gradient index
-        print("Step 4/5: Building query gradient index...")
-        query_cfg = deepcopy(self.index_cfg)
-        query_cfg.run_path = query_path
-        query_cfg.data = self.trackstar_cfg.query
-        query_cfg.processor_path = query_precond_path
-        query_cfg.skip_preconditioners = True
-        validate_run_path(query_cfg)
-        build(query_cfg, self.preprocess_cfg)
-
-        # Step 5: Score value dataset against query using mixed preconditioner
-        print("Step 5/5: Scoring value dataset...")
-        score_index_cfg = deepcopy(self.index_cfg)
-        score_index_cfg.run_path = scores_path
-        score_index_cfg.processor_path = value_precond_path
-        score_index_cfg.skip_preconditioners = True
-        self.score_cfg.query_path = query_path
-        self.preprocess_cfg.preconditioner_path = mixed_precond_path
-        validate_run_path(score_index_cfg)
-        score_dataset(score_index_cfg, self.score_cfg, self.preprocess_cfg)
-
 
 @dataclass
 class Main:
diff --git a/bergson/builders.py b/bergson/builders.py
@@ -80,12 +80,8 @@ def __init__(
         grad_sizes: dict[str, int],
         dtype: torch.dtype,
         *,
-        attribute_tokens: bool = False,
-        path: Path | None = None,
-        reduce_cfg: ReduceConfig | None = None,
-        preprocess_cfg: PreprocessConfig | None = None,
+        path: Path,
     ):
-        assert path is not None
         self.grad_sizes = grad_sizes
         self.num_items = len(data)
         np_dtype = convert_dtype_to_np(dtype)
@@ -157,8 +153,6 @@ def __init__(
         grad_sizes: dict[str, int],
         dtype: torch.dtype,
         *,
-        attribute_tokens: bool = False,
-        path: Path | None = None,
         reduce_cfg: ReduceConfig | None = None,
         preprocess_cfg: PreprocessConfig | None = None,
     ):
@@ -293,11 +287,6 @@ def __init__(
         data: Dataset,
         grad_sizes: dict[str, int],
         dtype: torch.dtype,
-        *,
-        attribute_tokens: bool = False,
-        path: Path | None = None,
-        reduce_cfg: ReduceConfig | None = None,
-        preprocess_cfg: PreprocessConfig | None = None,
     ):
         self.grad_sizes = grad_sizes
         self.num_items = len(data)
@@ -356,12 +345,10 @@ def __init__(
         grad_sizes: dict[str, int],
         dtype: torch.dtype,
         *,
-        attribute_tokens: bool = False,
-        path: Path | None = None,
+        path: Path,
         reduce_cfg: ReduceConfig | None = None,
         preprocess_cfg: PreprocessConfig | None = None,
     ):
-        assert path is not None
         self.grad_sizes = grad_sizes
         self.num_items = len(data)
         self.reduce_cfg = reduce_cfg
@@ -484,16 +471,22 @@ def create_builder(
     * no ``path``                           → :class:`InMemorySequenceBuilder`
     """
     if path is not None:
-        cls = TokenBuilder if attribute_tokens else SequenceBuilder
-    else:
-        cls = InMemoryTokenBuilder if attribute_tokens else InMemorySequenceBuilder
-
-    return cls(
+        if attribute_tokens:
+            return TokenBuilder(data, grad_sizes, dtype, path=path)
+        return SequenceBuilder(
+            data,
+            grad_sizes,
+            dtype,
+            path=path,
+            reduce_cfg=reduce_cfg,
+            preprocess_cfg=preprocess_cfg,
+        )
+    if attribute_tokens:
+        return InMemoryTokenBuilder(data, grad_sizes, dtype)
+    return InMemorySequenceBuilder(
         data,
         grad_sizes,
         dtype,
-        attribute_tokens=attribute_tokens,
-        path=path,
         reduce_cfg=reduce_cfg,
         preprocess_cfg=preprocess_cfg,
     )
diff --git a/bergson/score/scorer.py b/bergson/score/scorer.py
@@ -4,18 +4,6 @@
 from bergson.score.score_writer import ScoreWriter
 
 
-@torch.compile(fullgraph=True)
-def _cosine_score(
-    index_grads: torch.Tensor,
-    query_grads_t: torch.Tensor,
-) -> torch.Tensor:
-    """Matmul + unit normalization."""
-    scores = index_grads @ query_grads_t
-    i_norm = index_grads.pow(2).sum(dim=1).sqrt().clamp_min_(1e-12).unsqueeze(1)
-    scores.div_(i_norm)
-    return scores
-
-
 class Scorer:
     """
     Scores training gradients against query gradients.
@@ -80,17 +68,25 @@ def __init__(
         self.writer = writer
 
         # Load preconditioner: H^(-1/2) for split, H^(-1) for one-sided
-        self.preconditioners = get_trackstar_preconditioner(
+        preconditioners = get_trackstar_preconditioner(
             preconditioner_path,
             device=device,
             power=-0.5 if unit_normalize else -1,
             return_dtype=dtype,
         )
+
+        # Stack preconditioners for batched matmul in score().
+        # Shape: [n_modules, dim_per_mod, dim_per_mod]
+        if preconditioners and unit_normalize:
+            self.precond_stack = torch.stack([preconditioners[m] for m in modules])
+        else:
+            self.precond_stack = None
+
         # Precondition query grads per module, then cat into a single tensor
-        if self.preconditioners:
+        if preconditioners:
             q_list = [
                 query_grads[m].to(device=self.device, dtype=self.dtype)
-                @ self.preconditioners[m]
+                @ preconditioners[m]
                 for m in modules
             ]
         else:
@@ -112,27 +108,34 @@ def __call__(
     @torch.inference_mode()
     def score(self, index_grads: dict[str, torch.Tensor]) -> torch.Tensor:
         """Compute scores for a batch of gradients."""
-        # Device transfer and (optionally split) preconditioning of index grads.
-        # One-sided mode (unit_normalize=False) only preconditions the query.
-        i_list = []
-        for m in self.modules:
-            g = index_grads[m].to(self.device, self.dtype, non_blocking=True)
-            if (
-                self.unit_normalize
-                and self.preconditioners
-                and m in self.preconditioners
-            ):
-                g = g @ self.preconditioners[m]
-            i_list.append(g)
-
-        all_index = torch.cat(i_list, dim=-1)
+        if self.precond_stack is not None:
+            # Batched preconditioning: [batch, n_modules, dim] @ [n_modules, dim, dim]
+            g = torch.stack(
+                [
+                    index_grads[m].to(self.device, self.dtype, non_blocking=True)
+                    for m in self.modules
+                ],
+                dim=1,
+            )
+            all_index = (
+                torch.bmm(g.permute(1, 0, 2), self.precond_stack)
+                .permute(1, 0, 2)
+                .reshape(g.shape[0], -1)
+            )
+        else:
+            all_index = torch.cat(
+                [
+                    index_grads[m].to(self.device, self.dtype, non_blocking=True)
+                    for m in self.modules
+                ],
+                dim=-1,
+            )
+
+        scores = all_index @ self.query_grads_t
 
         if self.unit_normalize:
-            scores = _cosine_score(all_index, self.query_grads_t)
-        else:
-            # Compiled score adds overhead for dot-product-only
-            # where the single matmul is already fast.
-            scores = all_index @ self.query_grads_t
+            i_norm = all_index.pow(2).sum(dim=1).sqrt().clamp_min_(1e-12).unsqueeze(1)
+            scores.div_(i_norm)
 
         if self.score_mode == "nearest":
             return scores.max(dim=-1).values
diff --git a/bergson/trackstar.py b/bergson/trackstar.py
@@ -0,0 +1,97 @@
+from copy import deepcopy
+
+from .build import build
+from .config import (
+    IndexConfig,
+    PreprocessConfig,
+    ScoreConfig,
+    TrackstarConfig,
+)
+from .process_grads import mix_preconditioners
+from .score.score import score_dataset
+from .utils.worker_utils import validate_run_path
+
+
+def trackstar(
+    index_cfg: IndexConfig,
+    score_cfg: ScoreConfig,
+    preprocess_cfg: PreprocessConfig,
+    trackstar_cfg: TrackstarConfig,
+):
+    """Run the full trackstar pipeline: preconditioners -> mix -> build -> score.
+
+    Each stage is given its own subdirectory of ``index_cfg.run_path``
+    (``value_preconditioner``, ``query_preconditioner``,
+    ``mixed_preconditioner``, ``query`` and ``scores``) as its run or output
+    path. The config objects passed in are not modified; every stage that
+    needs overrides applies them to a deep copy.
+
+    Args:
+        index_cfg: Base index configuration, copied and overridden per stage.
+        score_cfg: Scoring configuration; the final stage uses a copy whose
+            ``query_path`` is the query index directory from step 4.
+        preprocess_cfg: Preprocessing configuration, passed as-is to the build
+            stages; the final stage uses a copy whose ``preconditioner_path``
+            is the mixed preconditioner directory from step 3.
+        trackstar_cfg: Provides the query dataset (``query``) and the
+            ``mixing_coefficient`` for step 3.
+    """
+    run_path = index_cfg.run_path
+    value_precond_path = f"{run_path}/value_preconditioner"
+    query_precond_path = f"{run_path}/query_preconditioner"
+    mixed_precond_path = f"{run_path}/mixed_preconditioner"
+    query_path = f"{run_path}/query"
+    scores_path = f"{run_path}/scores"
+
+    # Step 1: Compute normalizers and preconditioners on value dataset
+    print("Step 1/5: Computing normalizers and preconditioners on value dataset...")
+    value_precond_cfg = deepcopy(index_cfg)
+    value_precond_cfg.run_path = value_precond_path
+    value_precond_cfg.skip_index = True
+    value_precond_cfg.skip_preconditioners = False
+    validate_run_path(value_precond_cfg)
+    build(value_precond_cfg, preprocess_cfg)
+
+    # Step 2: Compute normalizers and preconditioners on query dataset
+    print("Step 2/5: Computing normalizers and preconditioners on query dataset...")
+    query_precond_cfg = deepcopy(index_cfg)
+    query_precond_cfg.run_path = query_precond_path
+    query_precond_cfg.data = trackstar_cfg.query
+    query_precond_cfg.skip_index = True
+    query_precond_cfg.skip_preconditioners = False
+    validate_run_path(query_precond_cfg)
+    build(query_precond_cfg, preprocess_cfg)
+
+    # Step 3: Mix query and value preconditioners
+    print("Step 3/5: Mixing preconditioners...")
+    mix_preconditioners(
+        query_path=query_precond_path,
+        index_path=value_precond_path,
+        output_path=mixed_precond_path,
+        mixing_coefficient=trackstar_cfg.mixing_coefficient,
+    )
+
+    # Step 4: Build per-item query gradient index
+    print("Step 4/5: Building query gradient index...")
+    query_cfg = deepcopy(index_cfg)
+    query_cfg.run_path = query_path
+    query_cfg.data = trackstar_cfg.query
+    query_cfg.processor_path = query_precond_path
+    query_cfg.skip_preconditioners = True
+    validate_run_path(query_cfg)
+    build(query_cfg, preprocess_cfg)
+
+    # Step 5: Score value dataset against query using mixed preconditioner
+    print("Step 5/5: Scoring value dataset...")
+    score_index_cfg = deepcopy(index_cfg)
+    score_index_cfg.run_path = scores_path
+    score_index_cfg.processor_path = value_precond_path
+    score_index_cfg.skip_preconditioners = True
+    # Copy before overriding, as with index_cfg, so the caller's configs are
+    # left untouched and a repeated call starts from the same inputs.
+    final_score_cfg = deepcopy(score_cfg)
+    final_score_cfg.query_path = query_path
+    final_preprocess_cfg = deepcopy(preprocess_cfg)
+    final_preprocess_cfg.preconditioner_path = mixed_precond_path
+    validate_run_path(score_index_cfg)
+    score_dataset(score_index_cfg, final_score_cfg, final_preprocess_cfg)
diff --git a/bergson/utils/worker_utils.py b/bergson/utils/worker_utils.py