Skip to content

Commit 763f18d

Browse files
authored
fix scaling of per token loss (#987)
1 parent 4d837ff commit 763f18d

File tree

1 file changed

+7
-3
lines changed
  • slime/backends/megatron_utils

1 file changed

+7
-3
lines changed

slime/backends/megatron_utils/loss.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -702,9 +702,13 @@ def loss_function(
702702
raise ValueError(f"Unknown loss type: {args.loss_type}")
703703

704704
# Here we need to divide by cp_size in order to cancel the multiplication performed in Megatron.
705-
loss = (
706-
loss * num_microbatches / args.global_batch_size * mpu.get_data_parallel_world_size(with_context_parallel=True)
707-
)
705+
if not args.calculate_per_token_loss:
706+
loss = (
707+
loss
708+
* num_microbatches
709+
/ args.global_batch_size
710+
* mpu.get_data_parallel_world_size(with_context_parallel=True)
711+
)
708712

709713
return (
710714
loss,

0 commit comments

Comments
 (0)