
Commit 7e1bead

simplify autobatchsize
1 parent eddac3a commit 7e1bead

File tree

10 files changed: +99 −396 lines changed


CLAUDE.md

Lines changed: 14 additions & 2 deletions

@@ -1,4 +1,4 @@
-Always test your changes by running the appropriate script or CLI command.
+Always test your changes by running the appropriate script or CLI command. Never complete a task without testing your changes until the script or CLI command runs without issues for 3 minutes+ (at minimum). If you find an error unrelated to your task, at minimum quote the exact error back to me when you have completed your task and offer to investigate and fix it.
 
 ## Project Structure and Conventions
 
@@ -12,12 +12,22 @@ Use dataclasses for config, and use simple_parsing to parse the CLI configs data
 
 Never save logs, scripts, and other random development into the root of a project. Create an appropriate directory such as runs/ or scripts/ and add it to the .gitignore.
 
+torch.cuda.empty_cache() doesn't do what you hope it will do - don't use it.
+
+Put imports at the top of the file unless you have a good reason to do otherwise.
+
 # Development
 
 You can call CLI commands without prefixing `python -m`, like `bergson build`.
 
 Use `pre-commit run --all-files` if you forget to install pre-commit and it doesn't run in the hook.
 
+Run bash commands in the dedicated tmux pane named "claude" if it is available.
+
+Don't keep default run path values inside low level code - if a module calls another module, the higher level module should always pass through inject a base path.
+
+Don't save data to a directory that is not in the gitignore - especially the data/ directory.
+
 Don't remove large datasets from the HF cache without asking.
 
 ### Tests
@@ -26,4 +36,6 @@ Mark tests requiring GPUs with `@pytest.mark.skipif(not torch.cuda.is_available(
 
 ### Environment Setup
 
-If you use need to use a venv, create and/or activate it with `python3 -m venv .venv && source .venv/bin/activate && pip install pytest`.
+If you use need to use a venv, create and/or activate it with `python3 -m venv .venv && source .venv/bin/activate`.
+
+You can pull secrets from .env.

README.md

Lines changed: 2 additions & 1 deletion

@@ -6,8 +6,9 @@ We view attribution as a counterfactual question: **_If we "unlearned" this trai
 ## Core features
 
 - Gradient store for serial queries. We provide collection-time gradient compression for efficient storage, and integrate with FAISS for fast KNN search over large stores.
-- On-the-fly queries. Query gradients without compression or disk I/O overhead via a single pass over a dataset with a set of precomputed query gradients.
+- On-the-fly queries. Query gradients without disk I/O overhead via a single pass over a dataset with a set of precomputed query gradients.
 - Experiment with multiple query strategies based on [LESS](https://arxiv.org/pdf/2402.04333).
+- Ideal for compression-free gradients.
 - Train‑time gradient collection. Capture gradients produced during training with a ~17% performance overhead.
 - Scalable. We use [FSDP2](https://docs.pytorch.org/tutorials/intermediate/FSDP_tutorial.html), BitsAndBytes, and other performance optimizations to support large models, datasets, and clusters.
 - Integrated with HuggingFace Transformers and Datasets. We also support on-disk datasets in a variety of formats.

bergson/__main__.py

Lines changed: 1 addition & 2 deletions

@@ -6,7 +6,6 @@
 from simple_parsing import ArgumentParser, ConflictResolution
 
 from .build import build
-from .cli.auto_batch_size import AutoBatchSize
 from .config import IndexConfig, QueryConfig, ReduceConfig, ScoreConfig
 from .query.query_index import query
 from .reduce import reduce
@@ -103,7 +102,7 @@ def execute(self):
 class Main:
     """Routes to the subcommands."""
 
-    command: Union[Build, Query, Reduce, Score, AutoBatchSize]
+    command: Union[Build, Query, Reduce, Score]
 
     def execute(self):
         """Run the script."""

bergson/cli/auto_batch_size.py

Lines changed: 0 additions & 127 deletions
This file was deleted.

bergson/config.py

Lines changed: 2 additions & 1 deletion

@@ -201,7 +201,8 @@ class IndexConfig:
     """Configuration for multi-node distributed preconditioner computation."""
 
     max_tokens: int | None = None
-    """The maximum number of tokens to process. If None, all tokens will be processed. Only available for Dataset."""
+    """Max tokens to process. If None, all tokens processed. Dataset only.
+    This experimental feature may be removed in the future."""
 
     @property
     def partial_run_path(self) -> Path:

bergson/distributed.py

Lines changed: 1 addition & 1 deletion

@@ -82,7 +82,7 @@ def launch_distributed_run(
         newline = "\n"
         raise RuntimeError(
             f"{process_name} failed with {len(result.failures)} process "
-            f"failure(s): {newline.join(result.failures)}"
+            f"failure(s): {newline.join([str(f) for f in result.failures])}"
         )
     finally:
         if ctx is not None:
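This one-line change matters because `str.join` requires every element to be a `str`; joining exception objects (or any non-string failure records) directly raises a `TypeError`, masking the real error report. A standalone sketch of the failure mode and the fix (the failure values are made up):

```python
# Hypothetical failure records, standing in for result.failures.
failures = [RuntimeError("rank 0 OOM"), RuntimeError("rank 1 timeout")]
newline = "\n"

# The old code path: str.join rejects non-string items with a TypeError.
try:
    newline.join(failures)
except TypeError as e:
    print(f"old code fails: {e}")

# The fix: convert each failure to its string form before joining.
message = newline.join([str(f) for f in failures])
print(message)
```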

bergson/query/attributor.py

Lines changed: 13 additions & 2 deletions

@@ -78,7 +78,7 @@ def __init__(
         # Load the gradients into memory
         mmap = load_gradients(index_path)
         assert mmap.dtype.names is not None
-        # Copy gradients into device memory (handles bfloat16/V2 void types)
+        # Copy gradients into device memory
         self.grads = {
             name: numpy_to_tensor(mmap[name]).to(device=device, dtype=dtype)
             for name in mmap.dtype.names
@@ -91,8 +91,19 @@ def __init__(
         norm = torch.cat(
             [self.grads[name] for name in self.ordered_modules], dim=1
         ).norm(dim=1, keepdim=True)
+
         for name in self.grads:
-            self.grads[name] /= norm
+            # Divide by norm (may create NaN/inf if norm is zero)
+            normalized = self.grads[name] / norm
+            # Convert NaN/inf to 0 and warn if any were found
+            if not torch.isfinite(normalized).all():
+                print(
+                    f"Warning: NaN/inf values detected after normalization in "
+                    f"{name}, converting to 0"
+                )
+            self.grads[name] = torch.nan_to_num(
+                normalized, nan=0.0, posinf=0.0, neginf=0.0
+            )
 
     def search(
         self,
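The new guard handles rows whose concatenated gradient norm is zero: dividing by a zero norm yields NaN (0/0) or inf, which would silently poison later similarity scores. A minimal sketch of the same pattern using NumPy for illustration (the real code uses `torch.nan_to_num`, which takes the same `nan`/`posinf`/`neginf` keywords; the array values here are made up):

```python
import numpy as np

# Two gradient rows; the second has zero norm, the problem case.
grads = np.array([[3.0, 4.0], [0.0, 0.0]])
norm = np.linalg.norm(grads, axis=1, keepdims=True)

# Dividing by a zero norm produces NaN (0/0) rather than raising.
with np.errstate(divide="ignore", invalid="ignore"):
    normalized = grads / norm

# Warn when the division produced non-finite values ...
if not np.isfinite(normalized).all():
    print("Warning: NaN/inf values detected after normalization, converting to 0")

# ... then replace NaN and +/-inf with 0, as the commit does with torch.nan_to_num.
cleaned = np.nan_to_num(normalized, nan=0.0, posinf=0.0, neginf=0.0)
print(cleaned)  # rows are [0.6, 0.8] and [0.0, 0.0]
```

Zeroing a zero-norm row is a deliberate choice: a sample with no gradient signal simply contributes nothing to any attribution score.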
