EleutherAI
diff --git a/‎CLAUDE.md‎
Lines changed: 1 addition & 1 deletion b/‎CLAUDE.md‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎benchmarks/benchmark_bergson.py‎
Lines changed: 33 additions & 22 deletions b/‎benchmarks/benchmark_bergson.py‎
Lines changed: 33 additions & 22 deletions
diff --git a/‎benchmarks/benchmark_bergson_cli.py‎
Lines changed: 23 additions & 12 deletions b/‎benchmarks/benchmark_bergson_cli.py‎
Lines changed: 23 additions & 12 deletions
diff --git a/‎benchmarks/benchmark_dattri.py‎
Lines changed: 21 additions & 11 deletions b/‎benchmarks/benchmark_dattri.py‎
Lines changed: 21 additions & 11 deletions
diff --git a/‎benchmarks/benchmark_utils.py‎
Lines changed: 15 additions & 6 deletions b/‎benchmarks/benchmark_utils.py‎
Lines changed: 15 additions & 6 deletions
@@ -24,4 +24,4 @@ Mark tests requiring GPUs with `@pytest.mark.skipif(not torch.cuda.is_available(
 
 ### Environment Setup
 
-If you use need to use a venv, create and/or activate it with `python3 -m venv .venv && source .venv/bin/activate && pip install pytest`.
+If you need to use a venv, create and/or activate it with `python3 -m venv .venv && source .venv/bin/activate && pip install pytest`.
@@ -5,8 +5,8 @@
 import json
 import os
 import sys
-import traceback
 import time
+import traceback
 from dataclasses import asdict, dataclass
 from pathlib import Path
 from typing import Optional, Union
@@ -18,6 +18,13 @@
 from torch.distributed.fsdp import fully_shard
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
+from benchmarks.benchmark_utils import (
+    DEFAULT_DATASET,
+    MODEL_SPECS,
+    get_run_path,
+    parse_tokens,
+    timestamp,
+)
 from bergson.build import build
 from bergson.collector.collector import CollectorComputer
 from bergson.collector.in_memory_collector import InMemoryCollector
@@ -32,13 +39,6 @@
     get_optimal_batch_size,
 )
 from bergson.utils.utils import assert_type, get_layer_list
-from benchmarks.benchmark_utils import (
-    DEFAULT_DATASET,
-    MODEL_SPECS,
-    parse_tokens,
-    timestamp,
-    get_run_path,
-)
 
 SCHEMA_VERSION = 1
 DEFAULT_TRAIN_SPLIT = "train"
@@ -70,7 +70,9 @@ class RunRecord:
     notes: str | None
     error: str | None
     num_gpus: int = 1  # Default for backwards compatibility
-    token_batch_size: int | None = None  # Auto-determined or configured token batch size
+    token_batch_size: int | None = (
+        None  # Auto-determined or configured token batch size
+    )
 
 
 @dataclass
@@ -181,7 +183,9 @@ def execute(self) -> None:
         run_path = (
             Path(self.cfg.run_path).resolve()
             if self.cfg.run_path
-            else get_run_path(run_root, spec, train_tokens, eval_tokens, self.cfg.tag, num_gpus)
+            else get_run_path(
+                run_root, spec, train_tokens, eval_tokens, self.cfg.tag, num_gpus
+            )
         )
 
         start_wall = timestamp()
@@ -218,7 +222,7 @@ def execute(self) -> None:
                 spec.hf_id, torch_dtype=torch.bfloat16, device_map=device_map
             )
 
-            model = model.cuda() # type: ignore
+            model = model.cuda()  # type: ignore
 
             # Wrap model with FSDP
             embed = model.get_input_embeddings()
@@ -275,10 +279,12 @@ def tokenize(batch):
             eval_dataset = eval_dataset.map(tokenize, batched=True)
 
             train_dataset.set_format(
-                type="torch", columns=["input_ids", "attention_mask", "labels", "length"]
+                type="torch",
+                columns=["input_ids", "attention_mask", "labels", "length"],
             )
             eval_dataset.set_format(
-                type="torch", columns=["input_ids", "attention_mask", "labels", "length"]
+                type="torch",
+                columns=["input_ids", "attention_mask", "labels", "length"],
             )
 
             # Determine optimal token_batch_size if requested
@@ -331,8 +337,7 @@ def tokenize(batch):
 
             # Create batches for CollectorComputer
             batches = allocate_batches(
-                train_dataset["length"],  # type: ignore
-                optimal_token_batch_size
+                train_dataset["length"], optimal_token_batch_size  # type: ignore
             )
 
             # Use CollectorComputer to process training data
@@ -347,7 +352,8 @@ def tokenize(batch):
 
             # Concatenate all training gradients
             train_grads_flat = {
-                name: torch.cat(grads, dim=0) for name, grads in train_collector.gradients.items()
+                name: torch.cat(grads, dim=0)
+                for name, grads in train_collector.gradients.items()
             }
 
             reduce_time = time.perf_counter() - reduce_start
@@ -363,7 +369,9 @@ def tokenize(batch):
             all_scores = []
 
             # Limit eval examples
-            eval_subset = eval_dataset.select(range(min(self.cfg.max_eval_examples, len(eval_dataset))))
+            eval_subset = eval_dataset.select(
+                range(min(self.cfg.max_eval_examples, len(eval_dataset)))
+            )
 
             for i in range(len(eval_subset)):
                 # Create single-example dataset
@@ -387,16 +395,17 @@ def tokenize(batch):
 
                 # Concatenate test gradients
                 test_grads = {
-                    name: torch.cat(grads, dim=0) for name, grads in test_collector.gradients.items()
+                    name: torch.cat(grads, dim=0)
+                    for name, grads in test_collector.gradients.items()
                 }
 
                 # Compute inner products (no normalization, no preconditioning)
                 scores = torch.zeros(len(train_dataset), device="cpu")
                 for name in test_grads:
                     if name in train_grads_flat:
-                        scores += (
-                            test_grads[name] @ train_grads_flat[name].T
-                        ).squeeze(0)
+                        scores += (test_grads[name] @ train_grads_flat[name].T).squeeze(
+                            0
+                        )
 
                 all_scores.append(scores)
 
@@ -474,7 +483,9 @@ def execute(self) -> None:
         run_path = (
             Path(self.cfg.run_path).resolve()
             if self.cfg.run_path
-            else get_run_path(run_root, spec, train_tokens, eval_tokens, self.cfg.tag, 1)
+            else get_run_path(
+                run_root, spec, train_tokens, eval_tokens, self.cfg.tag, 1
+            )
         )
 
         start_wall = timestamp()
 
@@ -2,8 +2,9 @@
 
 from __future__ import annotations
 
-import shutil
 import json
+import platform
+import shutil
 import subprocess
 import sys
 import time
@@ -12,15 +13,17 @@
 from typing import Optional
 
 from simple_parsing import ArgumentParser, ConflictResolution, field
-import platform
 
-from bergson.utils.auto_batch_size import determine_batch_size_cli, get_optimal_batch_size
 from benchmarks.benchmark_utils import (
     DEFAULT_DATASET,
     MODEL_SPECS,
+    get_run_path,
     parse_tokens,
     timestamp,
-    get_run_path,
+)
+from bergson.utils.auto_batch_size import (
+    determine_batch_size_cli,
+    get_optimal_batch_size,
 )
 
 SCHEMA_VERSION = 1
@@ -102,6 +105,7 @@ def get_hardware_info() -> str:
     """Get hardware information string."""
     try:
         import torch
+
         gpu_name = torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU"
         gpu_count = torch.cuda.device_count() if torch.cuda.is_available() else 0
         return f"{platform.node()} ({gpu_count}x {gpu_name})"
@@ -128,7 +132,11 @@ def run_cli_command(cmd: list[str], description: str) -> tuple[bool, float, str]
         )
         elapsed = time.perf_counter() - start
         if result.returncode != 0:
-            return False, elapsed, f"{description} failed with return code {result.returncode}"
+            return (
+                False,
+                elapsed,
+                f"{description} failed with return code {result.returncode}",
+            )
         print(f"{description} completed in {elapsed:.2f}s")
         return True, elapsed, ""
     except Exception as e:
@@ -219,14 +227,19 @@ def execute(self) -> None:
                     f"   Completed at {existing_run.end_time} "
                     f"(runtime: {existing_run.total_runtime_seconds:.1f}s)"
                 )
-                print(
-                    f"   Use --skip_existing=False to force re-run"
-                )
+                print("   Use --skip_existing=False to force re-run")
                 return
         benchmark_path = (
             Path(self.run_cfg.run_path).resolve()
             if self.run_cfg.run_path
-            else get_run_path(run_root, spec, train_tokens, eval_seqs, self.run_cfg.tag, self.run_cfg.num_gpus)
+            else get_run_path(
+                run_root,
+                spec,
+                train_tokens,
+                eval_seqs,
+                self.run_cfg.tag,
+                self.run_cfg.num_gpus,
+            )
         )
 
         # Create directories for bergson artifacts
@@ -351,9 +364,7 @@ def execute(self) -> None:
         end_wall = timestamp()
 
         token_batch_size = (
-            optimal_token_batch_size 
-            if self.run_cfg.auto_batch_size 
-            else None
+            optimal_token_batch_size if self.run_cfg.auto_batch_size else None
         )
 
         record = CLIRunRecord(
 
@@ -2,9 +2,9 @@
 
 from __future__ import annotations
 
-import os
 import argparse
 import json
+import os
 import sys
 import textwrap
 import time
@@ -18,12 +18,15 @@
 from dattri.task import AttributionTask
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
-from bergson.utils.utils import assert_type
-
 # Import from same directory
 from benchmarks.benchmark_utils import (
-    MODEL_SPECS, DEFAULT_DATASET, parse_tokens, timestamp, get_run_path
+    DEFAULT_DATASET,
+    MODEL_SPECS,
+    get_run_path,
+    parse_tokens,
+    timestamp,
 )
+from bergson.utils.utils import assert_type
 
 SCHEMA_VERSION = 1
 DEFAULT_TRAIN_SPLIT = "train"
@@ -123,7 +126,9 @@ def tokenize(batch):
 
         # Select enough examples
         total_needed = train_examples_needed + eval_examples_needed
-        train_dataset = train_dataset.select(range(min(total_needed, len(train_dataset))))
+        train_dataset = train_dataset.select(
+            range(min(total_needed, len(train_dataset)))
+        )
 
         eval_dataset = train_dataset.select(
             range(train_examples_needed, train_examples_needed + eval_examples_needed)
@@ -140,7 +145,9 @@ def collate_fn(batch):
             # Dattri expects tuples of (input_ids, labels) where labels = input_ids for language modeling
             # Keep on CPU - dattri will handle device placement
             input_ids = torch.stack([item["input_ids"] for item in batch])
-            labels = input_ids.clone()  # For language modeling, labels are the same as input_ids
+            labels = (
+                input_ids.clone()
+            )  # For language modeling, labels are the same as input_ids
             return (input_ids, labels)
 
         train_loader = torch.utils.data.DataLoader(
@@ -156,7 +163,7 @@ def collate_fn(batch):
 
         # Get model device
         model_device = next(model.parameters()).device
-        
+
         def loss_func(params, data_target_pair):
             x, y = data_target_pair
             # Ensure data is on the same device as model
@@ -169,7 +176,7 @@ def loss_func(params, data_target_pair):
             if isinstance(output, tuple):
                 logits = output[0]  # First element is logits
             else:
-                logits = output.logits if hasattr(output, 'logits') else output
+                logits = output.logits if hasattr(output, "logits") else output
             shift_logits = logits[:, :-1].contiguous()
             shift_labels = y[:, 1:].contiguous()
             loss = nn.CrossEntropyLoss()(
@@ -180,8 +187,8 @@ def loss_func(params, data_target_pair):
         # Create task
         task = AttributionTask(
             loss_func=loss_func,
-                       model=model,
-                       checkpoints=model.state_dict(),
+            model=model,
+            checkpoints=model.state_dict(),
         )
 
         # Create attributor and cache
@@ -203,6 +210,7 @@ def loss_func(params, data_target_pair):
         status = "error"
         error_message = repr(exc)
         import traceback
+
         traceback.print_exc()
 
     runtime = time.perf_counter() - start
@@ -273,7 +281,9 @@ def main(argv: list[str] | None = None) -> None:
     )
     run_parser.add_argument("--batch-size", type=int, default=4)
     run_parser.add_argument("--max-length", type=int, default=512)
-    run_parser.add_argument("--num-gpus", type=int, default=1, help="Number of GPUs to use")
+    run_parser.add_argument(
+        "--num-gpus", type=int, default=1, help="Number of GPUs to use"
+    )
     run_parser.add_argument("--dataset", default=DEFAULT_DATASET)
     run_parser.add_argument("--train-split", default=DEFAULT_TRAIN_SPLIT)
     run_parser.add_argument("--eval-split", default=DEFAULT_EVAL_SPLIT)
 
@@ -1,6 +1,5 @@
 from dataclasses import dataclass
-from datetime import datetime
-from datetime import timezone
+from datetime import datetime, timezone
 from pathlib import Path
 
 from datasets import Dataset, load_from_disk
@@ -9,6 +8,7 @@
 TOKENIZED_DATASET_PATH = "data/EleutherAI/SmolLM2-135M-10B-tokenized"
 MAX_BENCHMARK_LENGTH = 1024
 
+
 @dataclass(frozen=True)
 class ModelSpec:
     key: str
@@ -44,7 +44,12 @@ def get_run_path(
 
 
 def timestamp() -> str:
-    return datetime.now(timezone.utc).replace(microsecond=0).isoformat().replace("+00:00", "Z")
+    return (
+        datetime.now(timezone.utc)
+        .replace(microsecond=0)
+        .isoformat()
+        .replace("+00:00", "Z")
+    )
 
 
 def format_tokens(tokens: int) -> str:
@@ -111,7 +116,9 @@ def load_benchmark_dataset(
     total_tokens_before = sum(len(tokens) for tokens in ds["input_ids"])
     num_examples_before = len(ds)
 
-    print(f"Dataset loaded: {num_examples_before:,} examples, {total_tokens_before:,} tokens")
+    print(
+        f"Dataset loaded: {num_examples_before:,} examples, {total_tokens_before:,} tokens"
+    )
 
     # Filter to only sequences >= min_length
     print(f"Filtering sequences to length >= {min_length}...")
@@ -124,9 +131,11 @@ def load_benchmark_dataset(
     num_examples_removed = num_examples_before - num_examples_after
     tokens_removed = total_tokens_before - total_tokens_after
 
-    print(f"\nFiltered dataset:")
+    print("\nFiltered dataset:")
     print(f"  Examples: {num_examples_after:,} (removed {num_examples_removed:,})")
     print(f"  Tokens: {total_tokens_after:,} (removed {tokens_removed:,})")
-    print(f"  Average length: {total_tokens_after / num_examples_after:.1f} tokens/example")
+    print(
+        f"  Average length: {total_tokens_after / num_examples_after:.1f} tokens/example"
+    )
 
     return ds
Original file line number	Diff line number	Diff line change
@@ -24,4 +24,4 @@ Mark tests requiring GPUs with `@pytest.mark.skipif(not torch.cuda.is_available(
`24`	`24`
`25`	`25`	`### Environment Setup`
`26`	`26`
`27`		-If you use need to use a venv, create and/or activate it with `python3 -m venv .venv && source .venv/bin/activate && pip install pytest`.
	`27`	+If you need to use a venv, create and/or activate it with `python3 -m venv .venv && source .venv/bin/activate && pip install pytest`.