Change default for loss normalization

mseeger · mseeger · commit f19556c7efd9 · 2026-04-29T21:47:50.000+02:00
diff --git a/keys_values/finetune/args.py b/keys_values/finetune/args.py
@@ -429,6 +429,7 @@ class TrainArgs:
-            batch. Otherwise (`False`, the default), we average the sum of loss
+            batch. Otherwise (`False`), we average the sum of loss
             values per data case (by the number of non-masked target tokens),
             then use the uniform average over the batch.
+            Defaults to `True`.
     """
 
     save_interval: Optional[int] = 1000
@@ -459,7 +460,7 @@ class TrainArgs:
     intermed_save_interval: Optional[int] = None
     intermed_save_num: Optional[int] = None
     max_grad_norm: Optional[float] = 1.0
-    average_loss_per_batch: Optional[bool] = False
+    average_loss_per_batch: Optional[bool] = True
 
     def __post_init__(self) -> None:
         if self.lr_warmup_fraction and self.lr_warmup_steps:
diff --git a/keys_values/finetune/longcontext_full.py b/keys_values/finetune/longcontext_full.py
@@ -156,7 +156,7 @@ def setup(
         intermed_save_interval=None,
         intermed_save_num=None,
         max_grad_norm=1.0,
-        average_loss_per_batch=False,
+        average_loss_per_batch=True,
     ),
     eval: EvalArgs = EvalArgs(
         interval=600,
diff --git a/keys_values/kvcache/gradient/main.py b/keys_values/kvcache/gradient/main.py
@@ -240,7 +240,7 @@ def __init__(
         offload_device: Optional[torch.device] = None,
         offload_grad_accum: Optional[CPUOffloadAccumulateGradients] = None,
         track_unmatched_annotations: Optional[Callable[[int, int], bool]] = None,
-        average_loss_per_batch: bool = False,
+        average_loss_per_batch: bool = True,
         debug_gpt_model: Optional[GPT] = None,
         debug_intermediates: Optional[DebugIntermediates] = None,
         debug_profile_forward: bool = False,
@@ -318,7 +318,7 @@ def __init__(
                 `track_unmatched_annotations(layer_idx, chunk_idx)` is `True`,
                 where `chunk_idx` is the first chunk in the cell.
             average_loss_per_batch: See :meth:`LongContextInferenceModel.forward`.
-                Defaults to `False`.
+                Defaults to `True`.
 
         """
         if head_model is None:
diff --git a/keys_values/long_context.py b/keys_values/long_context.py
@@ -593,7 +593,7 @@ def forward(
             as the model is to be used for token generations.
 
-        Some luss functions are defined over target tokens. For these:
-        If `average_loss_per_batch == False` (default), each loss value
+        Some loss functions are defined over target tokens. For these:
+        If `average_loss_per_batch == False`, each loss value
         `l[b]` is normalized by the number `nz[b]` of (not ignored)
         target tokens: `l[b] = s[b] / nz[b]`, if `s[b]` is the sum of loss
         values over target tokens.