Skip to content

Commit 0a23ffe

Browse files
committed
simplify autobatchsize
1 parent eddac3a commit 0a23ffe

File tree

6 files changed

+57
-337
lines changed

6 files changed

+57
-337
lines changed

bergson/__main__.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@
66
from simple_parsing import ArgumentParser, ConflictResolution
77

88
from .build import build
9-
from .cli.auto_batch_size import AutoBatchSize
109
from .config import IndexConfig, QueryConfig, ReduceConfig, ScoreConfig
1110
from .query.query_index import query
1211
from .reduce import reduce
@@ -103,7 +102,7 @@ def execute(self):
103102
class Main:
104103
"""Routes to the subcommands."""
105104

106-
command: Union[Build, Query, Reduce, Score, AutoBatchSize]
105+
command: Union[Build, Query, Reduce, Score]
107106

108107
def execute(self):
109108
"""Run the script."""

bergson/cli/auto_batch_size.py

Lines changed: 0 additions & 127 deletions
This file was deleted.

bergson/config.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -201,7 +201,7 @@ class IndexConfig:
201201
"""Configuration for multi-node distributed preconditioner computation."""
202202

203203
max_tokens: int | None = None
204-
"""The maximum number of tokens to process. If None, all tokens will be processed. Only available for Dataset."""
204+
"""Max tokens to process. If None, all tokens processed. Dataset only."""
205205

206206
@property
207207
def partial_run_path(self) -> Path:

bergson/query/attributor.py

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -91,8 +91,19 @@ def __init__(
9191
norm = torch.cat(
9292
[self.grads[name] for name in self.ordered_modules], dim=1
9393
).norm(dim=1, keepdim=True)
94+
9495
for name in self.grads:
95-
self.grads[name] /= norm
96+
# Divide by norm (may create NaN/inf if norm is zero)
97+
normalized = self.grads[name] / norm
98+
# Convert NaN/inf to 0 and warn if any were found
99+
if not torch.isfinite(normalized).all():
100+
print(
101+
f"Warning: NaN/inf values detected after normalization in "
102+
f"{name}, converting to 0"
103+
)
104+
self.grads[name] = torch.nan_to_num(
105+
normalized, nan=0.0, posinf=0.0, neginf=0.0
106+
)
96107

97108
def search(
98109
self,

0 commit comments

Comments (0)