Skip to content

Commit 9210d50

Browse files
committed
fix claude issues
1 parent 39386aa commit 9210d50

File tree

3 files changed

+27
-20
lines changed

3 files changed

+27
-20
lines changed

bergson/collector/collector.py

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -610,13 +610,18 @@ def fwd_bwd(model, x: Tensor, y: Tensor, batch: dict):
610610
return fwd_bwd
611611

612612

613-
def fwd_bwd_hessian_factory(cfg: HessianConfig) -> Callable:
613+
def fwd_bwd_hessian_factory(
614+
index_cfg: IndexConfig, hessian_cfg: HessianConfig
615+
) -> Callable:
614616
def fwd_bwd_hessian(model, x: Tensor, y: Tensor, batch: dict):
615617
logits = model(x).logits[:, :-1]
616618
masks = y[:, 1:] != -100
617-
denoms = masks.sum(dim=1, dtype=model.dtype)
618-
619-
if not cfg.use_dataset_labels:
619+
denoms = (
620+
masks.sum(dim=1, dtype=model.dtype)
621+
if index_cfg.loss_reduction == "mean"
622+
else 1.0
623+
)
624+
if not hessian_cfg.use_dataset_labels:
620625
losses = F.cross_entropy(
621626
logits.reshape(-1, logits.size(-1)),
622627
y[:, 1:].flatten(),
@@ -636,6 +641,7 @@ def fwd_bwd_hessian(model, x: Tensor, y: Tensor, batch: dict):
636641
sampled_tokens.flatten(),
637642
reduction="none",
638643
).reshape_as(y[:, 1:])
644+
losses = losses.sum(1) / denoms
639645

640646
losses.sum().backward()
641647
model.zero_grad()

bergson/hessians/eigenvectors.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -307,7 +307,6 @@ def compute_eigendecomposition(
307307
)
308308

309309
gc.collect()
310-
torch.cuda.empty_cache()
311310

312311

313312
def _merge_and_shard_eigenvectors(
@@ -365,6 +364,5 @@ def _merge_and_shard_eigenvectors(
365364

366365
del tensor
367366
gc.collect()
368-
torch.cuda.empty_cache()
369367

370368
return result_dict

bergson/hessians/hessian_approximations.py

Lines changed: 17 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
from bergson.hessians.kfac import CovarianceCollector
2121
from bergson.hessians.tkfac import TraceCovarianceCollector
2222
from bergson.utils.utils import (
23+
convert_precision_to_torch,
2324
setup_reproducibility,
2425
validate_batch_size,
2526
)
@@ -81,7 +82,7 @@ def hessian_worker(
8182
rank: int,
8283
local_rank: int,
8384
world_size: int,
84-
cfg: IndexConfig,
85+
index_cfg: IndexConfig,
8586
hessian_cfg: HessianConfig,
8687
ds: Dataset,
8788
):
@@ -135,35 +136,37 @@ def hessian_worker(
135136
world_size=world_size,
136137
)
137138

138-
model, target_modules = setup_model_and_peft(cfg)
139+
model, target_modules = setup_model_and_peft(index_cfg)
139140

140-
attention_cfgs = {module: cfg.attention for module in cfg.split_attention_modules}
141+
attention_cfgs = {
142+
module: index_cfg.attention for module in index_cfg.split_attention_modules
143+
}
141144

142145
kwargs = {
143146
"model": model,
144147
"data": ds,
145-
"cfg": cfg,
148+
"cfg": index_cfg,
146149
"hessian_cfg": hessian_cfg,
147150
"target_modules": target_modules,
148151
"attention_cfgs": attention_cfgs,
149152
}
150153

151-
batches = allocate_batches(ds["length"], cfg.token_batch_size)
154+
batches = allocate_batches(ds["length"], index_cfg.token_batch_size)
152155
kwargs["batches"] = batches
153156
collect_hessians(**kwargs)
154157

155158
total_processed = torch.load(
156-
f"{cfg.partial_run_path}/total_processed.pt",
159+
f"{index_cfg.partial_run_path}/total_processed.pt",
157160
map_location="cpu",
158161
weights_only=False,
159162
)
160163

161164
compute_eigendecomposition(
162-
os.path.join(cfg.partial_run_path, "activation_sharded"),
165+
os.path.join(index_cfg.partial_run_path, "activation_sharded"),
163166
total_processed=total_processed,
164167
)
165168
compute_eigendecomposition(
166-
os.path.join(cfg.partial_run_path, "gradient_sharded"),
169+
os.path.join(index_cfg.partial_run_path, "gradient_sharded"),
167170
total_processed=total_processed,
168171
)
169172

@@ -174,7 +177,7 @@ def hessian_worker(
174177
def collect_hessians(
175178
model: PreTrainedModel,
176179
data: Dataset,
177-
cfg: IndexConfig,
180+
index_cfg: IndexConfig,
178181
*,
179182
batches: list[list[int]] | None = None,
180183
target_modules: set[str] | None = None,
@@ -190,14 +193,14 @@ def collect_hessians(
190193
hessian_dtype = (
191194
model.dtype
192195
if hessian_cfg.hessian_dtype == "auto"
193-
else hessian_cfg.hessian_dtype
196+
else convert_precision_to_torch(hessian_cfg.hessian_dtype)
194197
)
195198

196199
collector_args = {
197200
"model": model.base_model, # type: ignore
198201
"target_modules": target_modules,
199202
"attention_cfgs": attention_cfgs or {},
200-
"path": str(cfg.partial_run_path),
203+
"path": str(index_cfg.partial_run_path),
201204
}
202205
desc = f"Approximating Hessians with {hessian_cfg.method}"
203206
if ev_correction:
@@ -207,16 +210,16 @@ def collect_hessians(
207210
collector_args["dtype"] = hessian_dtype
208211
collector = HESSIAN_APPROXIMATIONS[hessian_cfg.method](**collector_args)
209212

210-
validate_batch_size(model, cfg.token_batch_size, collector)
213+
validate_batch_size(model, index_cfg.token_batch_size, collector)
211214

212215
computer = CollectorComputer(
213216
model=model, # type: ignore
214217
data=data,
215218
collector=collector,
216219
batches=batches,
217-
cfg=cfg,
220+
cfg=index_cfg,
218221
)
219222

220-
computer.forward_backward = fwd_bwd_hessian_factory(hessian_cfg)
223+
computer.forward_backward = fwd_bwd_hessian_factory(index_cfg, hessian_cfg)
221224

222225
computer.run_with_collector_hooks(desc=desc)

0 commit comments

Comments (0)