Skip to content

Commit 38ff4d9 — "simplify autobatchsize"
(1 parent: eddac3a)

File tree

8 files changed: +83 additions, −393 deletions

bergson/__main__.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
```diff
@@ -6,7 +6,6 @@
 from simple_parsing import ArgumentParser, ConflictResolution

 from .build import build
-from .cli.auto_batch_size import AutoBatchSize
 from .config import IndexConfig, QueryConfig, ReduceConfig, ScoreConfig
 from .query.query_index import query
 from .reduce import reduce
```
```diff
@@ -103,7 +102,7 @@ def execute(self):
 class Main:
     """Routes to the subcommands."""

-    command: Union[Build, Query, Reduce, Score, AutoBatchSize]
+    command: Union[Build, Query, Reduce, Score]

     def execute(self):
         """Run the script."""
```

bergson/cli/auto_batch_size.py

Lines changed: 0 additions & 127 deletions
This file was deleted.

bergson/config.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
```diff
@@ -201,7 +201,8 @@ class IndexConfig:
     """Configuration for multi-node distributed preconditioner computation."""

     max_tokens: int | None = None
-    """The maximum number of tokens to process. If None, all tokens will be processed. Only available for Dataset."""
+    """Max tokens to process. If None, all tokens processed. Dataset only.
+    This experimental feature may be removed in the future."""

     @property
     def partial_run_path(self) -> Path:
```

bergson/distributed.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
```diff
@@ -82,7 +82,7 @@ def launch_distributed_run(
         newline = "\n"
         raise RuntimeError(
             f"{process_name} failed with {len(result.failures)} process "
-            f"failure(s): {newline.join(result.failures)}"
+            f"failure(s): {newline.join([str(f) for f in result.failures])}"
         )
     finally:
         if ctx is not None:
```

bergson/query/attributor.py

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
```diff
@@ -78,7 +78,7 @@ def __init__(
         # Load the gradients into memory
         mmap = load_gradients(index_path)
         assert mmap.dtype.names is not None
-        # Copy gradients into device memory (handles bfloat16/V2 void types)
+        # Copy gradients into device memory
         self.grads = {
             name: numpy_to_tensor(mmap[name]).to(device=device, dtype=dtype)
             for name in mmap.dtype.names
@@ -91,8 +91,19 @@ def __init__(
         norm = torch.cat(
             [self.grads[name] for name in self.ordered_modules], dim=1
         ).norm(dim=1, keepdim=True)
+
         for name in self.grads:
-            self.grads[name] /= norm
+            # Divide by norm (may create NaN/inf if norm is zero)
+            normalized = self.grads[name] / norm
+            # Convert NaN/inf to 0 and warn if any were found
+            if not torch.isfinite(normalized).all():
+                print(
+                    f"Warning: NaN/inf values detected after normalization in "
+                    f"{name}, converting to 0"
+                )
+            self.grads[name] = torch.nan_to_num(
+                normalized, nan=0.0, posinf=0.0, neginf=0.0
+            )

     def search(
         self,
```

0 commit comments

Comments (0)