EleutherAI
diff --git a/bergson/__init__.py b/bergson/__init__.py
Lines changed: 2 additions & 0 deletions
diff --git a/bergson/__main__.py b/bergson/__main__.py
Lines changed: 21 additions & 10 deletions
diff --git a/bergson/build.py b/bergson/build.py
Lines changed: 9 additions & 3 deletions
diff --git a/bergson/collection.py b/bergson/collection.py
Lines changed: 3 additions & 1 deletion
diff --git a/bergson/collector/gradient_collectors.py b/bergson/collector/gradient_collectors.py
Lines changed: 5 additions & 1 deletion
diff --git a/bergson/collector/in_memory_collector.py b/bergson/collector/in_memory_collector.py
Lines changed: 5 additions & 1 deletion
diff --git a/bergson/config.py b/bergson/config.py
Lines changed: 30 additions & 22 deletions
@@ -8,6 +8,7 @@
     AttentionConfig,
     DataConfig,
     IndexConfig,
+    PreprocessConfig,
     QueryConfig,
     ReduceConfig,
     ScoreConfig,
@@ -50,6 +51,7 @@
     "IndexConfig",
     "DataConfig",
     "AttentionConfig",
+    "PreprocessConfig",
     "Scorer",
     "ScoreConfig",
     "ReduceConfig",
 
@@ -10,6 +10,7 @@
 from .config import (
     HessianConfig,
     IndexConfig,
+    PreprocessConfig,
     QueryConfig,
     ReduceConfig,
     ScoreConfig,
@@ -44,14 +45,16 @@ class Build:
 
     index_cfg: IndexConfig
 
+    preprocess_cfg: PreprocessConfig
+
     def execute(self):
         """Build the gradient index."""
         if self.index_cfg.skip_index and self.index_cfg.skip_preconditioners:
             raise ValueError("Either skip_index or skip_preconditioners must be False")
 
         validate_run_path(self.index_cfg)
 
-        build(self.index_cfg)
+        build(self.index_cfg, self.preprocess_cfg)
 
 
 @dataclass
@@ -60,12 +63,14 @@ class Preconditioners:
 
     index_cfg: IndexConfig
 
+    preprocess_cfg: PreprocessConfig
+
     def execute(self):
         """Compute normalizers and preconditioners."""
         self.index_cfg.skip_index = True
         self.index_cfg.skip_preconditioners = False
         validate_run_path(self.index_cfg)
-        build(self.index_cfg)
+        build(self.index_cfg, self.preprocess_cfg)
 
 
 @dataclass
@@ -76,6 +81,8 @@ class Reduce:
 
     reduce_cfg: ReduceConfig
 
+    preprocess_cfg: PreprocessConfig
+
     def execute(self):
         """Reduce a gradient index."""
         if self.index_cfg.projection_dim != 0:
@@ -85,7 +92,7 @@ def execute(self):
 
         validate_run_path(self.index_cfg)
 
-        reduce(self.index_cfg, self.reduce_cfg)
+        reduce(self.index_cfg, self.reduce_cfg, self.preprocess_cfg)
 
 
 @dataclass
@@ -96,6 +103,8 @@ class Score:
 
     index_cfg: IndexConfig
 
+    preprocess_cfg: PreprocessConfig
+
     def execute(self):
         """Score a dataset against an existing gradient index."""
         assert self.score_cfg.query_path
@@ -107,7 +116,7 @@ def execute(self):
 
         validate_run_path(self.index_cfg)
 
-        score_dataset(self.index_cfg, self.score_cfg)
+        score_dataset(self.index_cfg, self.score_cfg, self.preprocess_cfg)
 
 
 @dataclass
@@ -144,6 +153,8 @@ class Trackstar:
 
     score_cfg: ScoreConfig
 
+    preprocess_cfg: PreprocessConfig
+
     def execute(self):
         """Run the full trackstar pipeline: preconditioners -> build -> score."""
         run_path = self.index_cfg.run_path
@@ -159,7 +170,7 @@ def execute(self):
         value_precond_cfg.skip_index = True
         value_precond_cfg.skip_preconditioners = False
         validate_run_path(value_precond_cfg)
-        build(value_precond_cfg)
+        build(value_precond_cfg, self.preprocess_cfg)
 
         # Step 2: Compute normalizers and preconditioners on query dataset
         print("Step 2/4: Computing normalizers and preconditioners on query dataset...")
@@ -169,7 +180,7 @@ def execute(self):
         query_precond_cfg.skip_index = True
         query_precond_cfg.skip_preconditioners = False
         validate_run_path(query_precond_cfg)
-        build(query_precond_cfg)
+        build(query_precond_cfg, self.preprocess_cfg)
 
         # Step 3: Build per-item query gradient index
         print("Step 3/4: Building query gradient index...")
@@ -179,7 +190,7 @@ def execute(self):
         query_cfg.processor_path = query_precond_path
         query_cfg.skip_preconditioners = True
         validate_run_path(query_cfg)
-        build(query_cfg)
+        build(query_cfg, self.preprocess_cfg)
 
         # Step 4: Score value dataset against query using both preconditioners
         print("Step 4/4: Scoring value dataset...")
@@ -188,10 +199,10 @@ def execute(self):
         score_index_cfg.processor_path = value_precond_path
         score_index_cfg.skip_preconditioners = True
         self.score_cfg.query_path = query_path
-        self.score_cfg.query_preconditioner_path = query_precond_path
-        self.score_cfg.index_preconditioner_path = value_precond_path
+        self.preprocess_cfg.query_preconditioner_path = query_precond_path
+        self.preprocess_cfg.index_preconditioner_path = value_precond_path
         validate_run_path(score_index_cfg)
-        score_dataset(score_index_cfg, self.score_cfg)
+        score_dataset(score_index_cfg, self.score_cfg, self.preprocess_cfg)
 
 
 @dataclass
 
@@ -10,7 +10,7 @@
 from tqdm.auto import tqdm
 
 from bergson.collection import collect_gradients
-from bergson.config import IndexConfig
+from bergson.config import IndexConfig, PreprocessConfig
 from bergson.data import allocate_batches
 from bergson.distributed import launch_distributed_run
 from bergson.utils.auto_batch_size import maybe_auto_batch_size
@@ -27,6 +27,7 @@ def build_worker(
     local_rank: int,
     world_size: int,
     cfg: IndexConfig,
+    preprocess_cfg: PreprocessConfig,
     ds: Dataset | IterableDataset,
 ):
     """
@@ -108,7 +109,7 @@ def flush(kwargs):
             processor.save(cfg.partial_run_path)
 
 
-def build(index_cfg: IndexConfig):
+def build(index_cfg: IndexConfig, preprocess_cfg: PreprocessConfig):
     """
     Build a gradient index by distributing work across all available GPUs.
 
@@ -117,6 +118,8 @@ def build(index_cfg: IndexConfig):
     index_cfg : IndexConfig
         Specifies the run path, dataset, model, tokenizer, PEFT adapters,
         and many other gradient collection settings.
+    preprocess_cfg : PreprocessConfig
+        Preprocessing configuration for gradient normalization/preconditioning.
     """
     if index_cfg.debug:
         setup_reproducibility()
@@ -128,7 +131,10 @@ def build(index_cfg: IndexConfig):
     ds = setup_data_pipeline(index_cfg)
 
     launch_distributed_run(
-        "build", build_worker, [index_cfg, ds], index_cfg.distributed
+        "build",
+        build_worker,
+        [index_cfg, preprocess_cfg, ds],
+        index_cfg.distributed,
     )
 
     rank = index_cfg.distributed.rank
 
@@ -3,7 +3,7 @@
 
 from bergson.collector.collector import CollectorComputer
 from bergson.collector.gradient_collectors import GradientCollector
-from bergson.config import AttentionConfig, IndexConfig, ReduceConfig
+from bergson.config import AttentionConfig, IndexConfig, PreprocessConfig, ReduceConfig
 from bergson.gradients import GradientProcessor
 from bergson.score.scorer import Scorer
 
@@ -19,6 +19,7 @@ def collect_gradients(
     attention_cfgs: dict[str, AttentionConfig] | None = None,
     scorer: Scorer | None = None,
     reduce_cfg: ReduceConfig | None = None,
+    preprocess_cfg: PreprocessConfig | None = None,
 ):
     """
     Compute gradients using the hooks specified in the GradientCollector.
@@ -31,6 +32,7 @@ def collect_gradients(
         data=data,
         scorer=scorer,
         reduce_cfg=reduce_cfg,
+        preprocess_cfg=preprocess_cfg,
         attention_cfgs=attention_cfgs or {},
         filter_modules=cfg.filter_modules,
     )
 
@@ -10,7 +10,7 @@
 from torch import Tensor
 
 from bergson.collector.collector import HookCollectorBase
-from bergson.config import IndexConfig, ReduceConfig
+from bergson.config import IndexConfig, PreprocessConfig, ReduceConfig
 from bergson.data import Builder, create_builder
 from bergson.gradients import (
     AdafactorNormalizer,
@@ -46,6 +46,9 @@ class GradientCollector(HookCollectorBase):
     reduce_cfg: ReduceConfig | None = None
     """Configuration for in-run gradient reduction."""
 
+    preprocess_cfg: PreprocessConfig | None = None
+    """Configuration for gradient preprocessing."""
+
     builder: Builder | None = None
     """Handles writing gradients to disk. Created in setup() if save_index is True."""
 
@@ -95,6 +98,7 @@ def setup(self) -> None:
                 attribute_tokens=self.cfg.attribute_tokens,
                 path=self.cfg.partial_run_path,
                 reduce_cfg=self.reduce_cfg,
+                preprocess_cfg=self.preprocess_cfg,
             )
         else:
             self.builder = None
 
@@ -11,7 +11,7 @@
 from torch import Tensor, nn
 
 from bergson.collector.collector import HookCollectorBase
-from bergson.config import IndexConfig, ReduceConfig
+from bergson.config import IndexConfig, PreprocessConfig, ReduceConfig
 from bergson.data import Builder, create_builder
 from bergson.gradients import (
     AdafactorNormalizer,
@@ -52,6 +52,9 @@ class InMemoryCollector(HookCollectorBase):
     reduce_cfg: ReduceConfig | None = None
     """Configuration for in-run gradient reduction."""
 
+    preprocess_cfg: PreprocessConfig | None = None
+    """Configuration for gradient preprocessing."""
+
     builder: Builder | None = None
     """Handles writing gradients. Created in setup()."""
 
@@ -109,6 +112,7 @@ def setup(self) -> None:
                 self.save_dtype,
                 attribute_tokens=self.cfg.attribute_tokens,
                 reduce_cfg=self.reduce_cfg,
+                preprocess_cfg=self.preprocess_cfg,
             )
 
     def teardown(self) -> None:
 
@@ -265,6 +265,23 @@ class QueryConfig:
     its top results as rows with columns: query, result, result_index, score."""
 
 
+@dataclass
+class PreprocessConfig:
+    """Config for gradient preprocessing, shared across build, reduce, and score."""
+
+    unit_normalize: bool = False
+    """Whether to unit normalize the gradients."""
+
+    query_preconditioner_path: str | None = None
+    """Path to a precomputed preconditioner for query gradients."""
+
+    index_preconditioner_path: str | None = None
+    """Path to a precomputed preconditioner for index gradients."""
+
+    mixing_coefficient: float = 0.99
+    """Preconditioner mixing weight: 0.0 = index only, 1.0 = query only."""
+
+
 @dataclass
 class ScoreConfig:
     """Config for querying an index on the fly."""
@@ -280,25 +297,8 @@ class ScoreConfig:
             similar query gradient (the maximum score).
         `individual`: compute a separate score for each query gradient."""
 
-    query_preconditioner_path: str | None = None
-    """Path to a precomputed preconditioner to be applied to
-    the query dataset gradients."""
-
-    index_preconditioner_path: str | None = None
-    """Path to a precomputed preconditioner to be applied to
-    the query dataset gradients. This does not affect the
-    ability to compute a new preconditioner during the query."""
-
-    mixing_coefficient: float = 0.99
-    """Coefficient to weight the application of the query preconditioner
-    and the pre-computed index preconditioner. 0.0 means only use the
-    index preconditioner and 1.0 means only use the query preconditioner."""
-
-    modules: list[str] = field(default_factory=list)
-    """Modules to use for the query. If empty, all modules will be used."""
-
-    unit_normalize: bool = False
-    """Whether to unit normalize the gradients before computing the scores."""
+    skip_query_preprocess: bool = False
+    """Skip query preprocessing if already applied during reduce."""
 
     batch_size: int = 1024
     """Batch size for processing the query dataset."""
@@ -307,16 +307,24 @@ class ScoreConfig:
     """Precision (dtype) to convert the query and index gradients to before
     computing the scores. If "auto", the model's gradient dtype is used."""
 
+    modules: list[str] = field(default_factory=list)
+    """Modules to use for the query. If empty, all modules will be used."""
+
 
 @dataclass
 class ReduceConfig:
-    """Config for reducing the gradients."""
+    """Config for reducing a dataset into a standalone query."""
 
     method: Literal["mean", "sum"] = "mean"
     """Method for reducing the gradients."""
 
-    unit_normalize: bool = False
-    """Whether to unit normalize the gradients before reducing them."""
+    modules: list[str] = field(default_factory=list)
+    """Modules to use for the query. If empty, all modules will be used."""
+
+    normalize_reduced_grad: bool = False
+    """Whether to unit normalize the reduced query gradient. This has
+    no effect on future score rankings but does affect the magnitude of
+    the scores."""
 
 
 @dataclass