Skip to content

Commit c59d8f0

Browse files
authored
[on-policy distillation] add support for on-policy distillation and the related data handling (THUDM#673)
1 parent 3aabe8d commit c59d8f0

File tree

5 files changed

+29
-3
lines changed

5 files changed

+29
-3
lines changed

slime/backends/fsdp_utils/actor.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -484,7 +484,7 @@ def _train_step(self, packed_batch, world_size, reported_accum, mbs_id, grad_acc
484484
temperature=self.args.rollout_temperature,
485485
)
486486
packed_batch["cur_log_probs"] = log_probs
487-
487+
488488
shifted_logits = logits.squeeze(0)[:-1]
489489
log_probs_full = torch.log_softmax(shifted_logits, dim=-1)
490490
probs = torch.softmax(shifted_logits, dim=-1)
@@ -554,7 +554,7 @@ def _train_step(self, packed_batch, world_size, reported_accum, mbs_id, grad_acc
554554

555555
entropy = torch.cat([batch["entropy"] for batch in unpacked_batches], dim=0)
556556
entropy_loss = sum_of_sample_mean(entropy, response_lengths, loss_masks)
557-
557+
558558
loss = pg_loss - self.args.entropy_coef * entropy_loss
559559

560560
if self.args.use_kl_loss:

slime/backends/megatron_utils/loss.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -286,6 +286,21 @@ def compute_advantages_and_returns(args: Namespace, rollout_data: RolloutBatch)
286286
)
287287
returns = advantages
288288

289+
elif args.advantage_estimator == "on_policy_distillation":
290+
student_log_probs = log_probs
291+
teacher_log_probs = rollout_data.get("teacher_log_probs")
292+
response_lengths = rollout_data.get("response_lengths")
293+
device = student_log_probs[0].device
294+
teacher_log_probs = [t_log_prob.to(device=device) for t_log_prob in teacher_log_probs]
295+
teacher_log_probs = [
296+
t_log_prob[-response_length:] for t_log_prob, response_length in zip(teacher_log_probs, response_lengths)
297+
]
298+
advantages = [
299+
teacher_log_prob - student_log_prob
300+
for teacher_log_prob, student_log_prob in zip(teacher_log_probs, student_log_probs)
301+
]
302+
returns = advantages
303+
289304
else:
290305
raise NotImplementedError(f"advantage_estimator {args.advantage_estimator} is not supported. ")
291306

slime/ray/rollout.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -249,6 +249,9 @@ def _convert_samples_to_train_data(self, samples: Union[list[Sample], list[list[
249249
if samples[0].train_metadata is not None:
250250
train_data["metadata"] = [sample.train_metadata for sample in samples]
251251

252+
if "teacher_log_probs" in samples[0].__dict__:
253+
train_data["teacher_log_probs"] = [sample.teacher_log_probs for sample in samples]
254+
252255
return train_data
253256

254257

slime/utils/arguments.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -672,7 +672,14 @@ def add_algo_arguments(parser):
672672
parser.add_argument(
673673
"--advantage-estimator",
674674
type=str,
675-
choices=["grpo", "gspo", "reinforce_plus_plus", "reinforce_plus_plus_baseline", "ppo"],
675+
choices=[
676+
"grpo",
677+
"gspo",
678+
"reinforce_plus_plus",
679+
"reinforce_plus_plus_baseline",
680+
"ppo",
681+
"on_policy_distillation",
682+
],
676683
default="grpo",
677684
)
678685
parser.add_argument(

slime/utils/data.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -211,6 +211,7 @@ def get_partition(val):
211211
"sample_indices",
212212
"rollout_log_probs",
213213
"prompt",
214+
"teacher_log_probs",
214215
]:
215216
if key not in data:
216217
continue

0 commit comments

Comments (0)