Support true on policy (THUDM#566)

fzyzcjy · zhuzilin · web-flow · commit 1669ca3b3c6b · 2025-10-24T14:54:03.000+08:00
Co-authored-by: Zilin Zhu <zhuzilinallen@gmail.com>
diff --git a/slime/backends/fsdp_utils/actor.py b/slime/backends/fsdp_utils/actor.py
@@ -51,6 +51,12 @@ class FSDPTrainRayActor(TrainRayActor):
     def init(self, args: Namespace, role: str, wandb_run_id: str, with_ref: bool = False) -> int:  # type: ignore[override]
         super().init(args, role, wandb_run_id, with_ref)
 
+        if args.true_on_policy_mode:
+            from sglang.srt.batch_invariant_ops import enable_batch_invariant_mode
+
+            print("FSDPTrainRayActor call enable_batch_invariant_mode for true-on-policy")
+            enable_batch_invariant_mode()
+
         # Update rank and world_size for wandb secondary initialization (using actual distributed values)
         args.rank = dist.get_rank()
         args.world_size = dist.get_world_size()
@@ -454,6 +460,11 @@ def train(self, rollout_id: int, rollout_data_ref: Box) -> None:
             pg_clipfrac = sum_of_sample_mean(pg_clipfrac, response_lengths, loss_masks)
             ppo_kl = sum_of_sample_mean(ppo_kl.abs(), response_lengths, loss_masks)
 
+            train_rollout_logprob_diff = old_log_probs - rollout_log_probs
+            train_rollout_logprob_diff = sum_of_sample_mean(
+                train_rollout_logprob_diff, response_lengths, loss_masks
+            ).detach()
+
             loss = pg_loss
 
             if self.args.entropy_coef != 0:
@@ -477,6 +488,7 @@ def train(self, rollout_id: int, rollout_data_ref: Box) -> None:
                 "pg_loss": pg_loss.detach(),
                 "pg_clipfrac": pg_clipfrac.detach(),
                 "ppo_kl": ppo_kl.detach(),
+                "train_rollout_logprob_diff": train_rollout_logprob_diff,
             }
 
             if self.args.use_kl_loss:
diff --git a/slime/backends/fsdp_utils/arguments.py b/slime/backends/fsdp_utils/arguments.py
@@ -32,6 +32,7 @@ class FSDPArgs:
     # FSDP configuration
     fsdp_full_params: bool = False  # If True, use full_tensor; if False, use shard_tensor
 
+    deterministic_mode: bool = False  # This name must be the same as Megatron's
     # Profile
     record_memory_history: bool = False
     memory_snapshot_path: str = "snapshot.pickle"
diff --git a/slime/utils/arguments.py b/slime/utils/arguments.py
@@ -98,6 +98,12 @@ def add_train_arguments(parser):
                 default="megatron",
                 help="The backend for training.",
             )
+            parser.add_argument(
+                "--true-on-policy-mode",
+                action="store_true",
+                default=False,
+                help="Whether to enable true-on-policy mode.",
+            )
 
             return parser