Skip to content

Commit 12dd6b2

Browse files
authored
[on-policy distillation] support and related data handling (#673)
1 parent 1b2fa31 commit 12dd6b2

File tree

7 files changed

+260
-3
lines changed

7 files changed

+260
-3
lines changed
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
import aiohttp
2+
import torch
3+
4+
from slime.utils.types import Sample
5+
6+
7+
async def reward_func(args, sample, **kwargs):
    """Ask the teacher server for token log-probs of ``prompt + response``.

    Posts a zero-generation request (``max_new_tokens=0`` with
    ``return_logprob=True``) to ``args.rm_url`` so the server only scores the
    input tokens, and returns the raw JSON reply.  The per-token log-probs
    are expected under ``meta_info.input_token_logprobs`` in that reply
    (consumed by ``post_process_rewards``).
    """
    request_body = {
        "text": sample.prompt + sample.response,
        "sampling_params": {
            "temperature": 0,
            # Generate nothing: we only want the prompt/response logprobs.
            "max_new_tokens": 0,
            "skip_special_tokens": False,
        },
        "return_logprob": True,
        # Score every input token from position 0.
        "logprob_start_len": 0,
    }
    session_kwargs = {}
    async with aiohttp.ClientSession(**session_kwargs) as http_session:
        async with http_session.post(args.rm_url, json=request_body) as response:
            response.raise_for_status()
            return await response.json()
23+
24+
25+
def post_process_rewards(args, samples: "list[Sample]", **kwargs):
    """Convert raw teacher-server replies into per-sample teacher log-probs.

    Each sample's reward value is expected to be the JSON reply produced by
    ``reward_func`` (a generate response with
    ``meta_info.input_token_logprobs``, where each item's first element is a
    log-prob).  The leading entry is skipped — presumably the first input
    token carries no log-prob; verify against the server's reply format —
    and only the trailing ``response_length`` entries are kept, i.e. the
    response portion.

    Side effect: the resulting tensor is attached to every sample as
    ``sample.teacher_log_probs``.

    Returns:
        The list of teacher log-prob tensors, twice (matching the two-value
        convention of reward post-process hooks in this codebase).
    """
    rewards = [sample.get_reward_value(args) for sample in samples]
    response_lengths = [sample.response_length for sample in samples]

    teacher_log_probs = [
        torch.tensor(
            [item[0] for item in reward["meta_info"]["input_token_logprobs"][1:]],
            dtype=torch.float32,
        )
        for reward in rewards
    ]
    # Keep only the response tokens.  A plain t[-response_length:] would
    # return the WHOLE tensor when response_length == 0 (slicing with -0 is
    # slicing with 0), so compute an explicit non-negative start index; this
    # also matches the original behavior when response_length exceeds the
    # tensor length (whole tensor).
    teacher_log_probs = [
        t_log_prob[max(t_log_prob.numel() - response_length, 0):]
        for t_log_prob, response_length in zip(teacher_log_probs, response_lengths)
    ]

    for sample, t_log_probs in zip(samples, teacher_log_probs):
        sample.teacher_log_probs = t_log_probs

    return teacher_log_probs, teacher_log_probs
Lines changed: 192 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,192 @@
1+
#!/bin/bash
2+
3+
# usage: bash examples/on_policy_distillation/run-qwen3-8B-opd.sh
4+
5+
set -ex
6+
7+
8+
# Start the teacher model server
9+
TEACHER_IP="127.0.0.1" # Use localhost here, you can change it to your IP
10+
TEACHER_PORT=13141
11+
LOG_FILE="/tmp/sglang_$(head /dev/urandom | tr -dc A-Za-z0-9 | head -c 6).log"
12+
13+
## Launch the teacher model server in the background
14+
CUDA_VISIBLE_DEVICES=7 python3 -m sglang.launch_server \
15+
--model-path /root/Qwen3-32B \
16+
--host 0.0.0.0 \
17+
--port $TEACHER_PORT \
18+
--tp 1 \
19+
--chunked-prefill-size 4096 \
20+
--mem-fraction-static 0.6 \
21+
> "$LOG_FILE" 2>&1 &
22+
23+
echo "Starting teacher model server..."
24+
25+
## Wait for the teacher model server to be ready
26+
until curl -sf http://$TEACHER_IP:$TEACHER_PORT/health_generate > /dev/null; do
27+
echo "Waiting for the teacher model server to start..."
28+
tail -n 10 "$LOG_FILE"
29+
sleep 5
30+
done
31+
32+
echo "Teacher model server is up and running at $TEACHER_IP:$TEACHER_PORT."
33+
sleep 10
34+
35+
36+
export PYTHONBUFFERED=16
37+
38+
NVLINK_COUNT=$(nvidia-smi topo -m 2>/dev/null | grep -o 'NV[0-9][0-9]*' | wc -l)
39+
if [ "$NVLINK_COUNT" -gt 0 ]; then
40+
HAS_NVLINK=1
41+
else
42+
HAS_NVLINK=0
43+
fi
44+
echo "HAS_NVLINK: $HAS_NVLINK (detected $NVLINK_COUNT NVLink references)"
45+
46+
source "/root/slime/scripts/models/qwen3-8B.sh"
47+
48+
49+
CKPT_ARGS=(
50+
--hf-checkpoint /root/Qwen3-8B
51+
--ref-load /root/Qwen3-8B_torch_dist
52+
--load /root/Qwen3-8B_slime/
53+
--save /root/Qwen3-8B_slime/
54+
--save-interval 20
55+
)
56+
57+
ROLLOUT_ARGS=(
58+
--prompt-data /root/dapo-math-17k/dapo-math-17k.jsonl
59+
--input-key prompt
60+
--apply-chat-template
61+
--rollout-shuffle
62+
--num-rollout 300
63+
--rollout-batch-size 16
64+
--n-samples-per-prompt 4
65+
--rollout-max-response-len 16384
66+
--rollout-temperature 0.8
67+
68+
--global-batch-size 64
69+
--balance-data
70+
)
71+
72+
RM_ARGS=(
73+
--custom-rm-path examples.on_policy_distillation.on_policy_distillation.reward_func
74+
--custom-reward-post-process-path examples.on_policy_distillation.on_policy_distillation.post_process_rewards
75+
--rm-url http://$TEACHER_IP:$TEACHER_PORT/generate
76+
)
77+
78+
EVAL_ARGS=(
79+
# --eval-interval 20
80+
# --eval-prompt-data aime ${DATA_DIR}/aime-2024/aime-2024.jsonl
81+
# --n-samples-per-eval-prompt 16
82+
# --eval-max-response-len 16384
83+
# --eval-top-p 0.7
84+
)
85+
86+
PERF_ARGS=(
87+
--tensor-model-parallel-size 2
88+
--sequence-parallel
89+
--pipeline-model-parallel-size 1
90+
--context-parallel-size 1
91+
--expert-model-parallel-size 1
92+
--expert-tensor-parallel-size 1
93+
94+
--recompute-granularity full
95+
--recompute-method uniform
96+
--recompute-num-layers 1
97+
98+
# --micro-batch-size 1
99+
--use-dynamic-batch-size
100+
--max-tokens-per-gpu 16384
101+
)
102+
103+
GRPO_ARGS=(
104+
--advantage-estimator on_policy_distillation
105+
--use-kl-loss
106+
--kl-loss-coef 0.00
107+
--kl-loss-type low_var_kl
108+
--entropy-coef 0.00
109+
)
110+
111+
OPTIMIZER_ARGS=(
112+
--optimizer adam
113+
--lr 1e-6
114+
--lr-decay-style constant
115+
--weight-decay 0.1
116+
--adam-beta1 0.9
117+
--adam-beta2 0.98
118+
)
119+
120+
WANDB_ARGS=(
121+
#--use-wandb
122+
# --wandb-project slime-dev
123+
# --wandb-group qwen3-8B-test
124+
# --wandb-key ${WANDB_KEY}
125+
)
126+
127+
SGLANG_ARGS=(
128+
--rollout-num-gpus-per-engine 1
129+
--sglang-mem-fraction-static 0.4
130+
)
131+
132+
133+
MISC_ARGS=(
134+
--attention-dropout 0.0
135+
--hidden-dropout 0.0
136+
--accumulate-allreduce-grads-in-fp32
137+
--attention-softmax-in-fp32
138+
--attention-backend flash
139+
)
140+
141+
142+
143+
144+
# launch the master node of ray in container
145+
export MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"}
146+
ray start --head --node-ip-address ${MASTER_ADDR} --num-gpus 8 --disable-usage-stats --dashboard-host=0.0.0.0 --dashboard-port=8265
147+
148+
149+
ray job submit --address="http://127.0.0.1:8265" \
150+
--runtime-env-json='{
151+
"env_vars": {
152+
"PYTHONPATH": "/root/Megatron-LM/",
153+
"CUDA_DEVICE_MAX_CONNECTIONS": "1"
154+
}
155+
}' \
156+
-- python3 train.py \
157+
--actor-num-nodes 1 \
158+
--actor-num-gpus-per-node 2 \
159+
--rollout-num-gpus 4 \
160+
${MODEL_ARGS[@]} \
161+
${CKPT_ARGS[@]} \
162+
${ROLLOUT_ARGS[@]} \
163+
${OPTIMIZER_ARGS[@]} \
164+
${GRPO_ARGS[@]} \
165+
${WANDB_ARGS[@]} \
166+
${PERF_ARGS[@]} \
167+
${EVAL_ARGS[@]} \
168+
${SGLANG_ARGS[@]} \
169+
${MISC_ARGS[@]} \
170+
${RM_ARGS[@]}
171+
172+
173+
174+
####clear after training
175+
pkill -9 sglang
176+
sleep 3
177+
ray stop --force
178+
pkill -9 ray
179+
pkill -9 python
180+
sleep 3
181+
pkill -9 ray
182+
pkill -9 python
183+
184+
185+
186+
187+
188+
189+
190+
191+
192+

slime/backends/fsdp_utils/actor.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -484,7 +484,7 @@ def _train_step(self, packed_batch, world_size, reported_accum, mbs_id, grad_acc
484484
temperature=self.args.rollout_temperature,
485485
)
486486
packed_batch["cur_log_probs"] = log_probs
487-
487+
488488
shifted_logits = logits.squeeze(0)[:-1]
489489
log_probs_full = torch.log_softmax(shifted_logits, dim=-1)
490490
probs = torch.softmax(shifted_logits, dim=-1)
@@ -554,7 +554,7 @@ def _train_step(self, packed_batch, world_size, reported_accum, mbs_id, grad_acc
554554

555555
entropy = torch.cat([batch["entropy"] for batch in unpacked_batches], dim=0)
556556
entropy_loss = sum_of_sample_mean(entropy, response_lengths, loss_masks)
557-
557+
558558
loss = pg_loss - self.args.entropy_coef * entropy_loss
559559

560560
if self.args.use_kl_loss:

slime/backends/megatron_utils/loss.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -286,6 +286,21 @@ def compute_advantages_and_returns(args: Namespace, rollout_data: RolloutBatch)
286286
)
287287
returns = advantages
288288

289+
elif args.advantage_estimator == "on_policy_distillation":
290+
student_log_probs = log_probs
291+
teacher_log_probs = rollout_data.get("teacher_log_probs")
292+
response_lengths = rollout_data.get("response_lengths")
293+
device = student_log_probs[0].device
294+
teacher_log_probs = [t_log_prob.to(device=device) for t_log_prob in teacher_log_probs]
295+
teacher_log_probs = [
296+
t_log_prob[-response_length:] for t_log_prob, response_length in zip(teacher_log_probs, response_lengths)
297+
]
298+
advantages = [
299+
teacher_log_prob - student_log_prob
300+
for teacher_log_prob, student_log_prob in zip(teacher_log_probs, student_log_probs)
301+
]
302+
returns = advantages
303+
289304
else:
290305
raise NotImplementedError(f"advantage_estimator {args.advantage_estimator} is not supported. ")
291306

slime/ray/rollout.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -249,6 +249,9 @@ def _convert_samples_to_train_data(self, samples: Union[list[Sample], list[list[
249249
if samples[0].train_metadata is not None:
250250
train_data["metadata"] = [sample.train_metadata for sample in samples]
251251

252+
if "teacher_log_probs" in samples[0].__dict__:
253+
train_data["teacher_log_probs"] = [sample.teacher_log_probs for sample in samples]
254+
252255
return train_data
253256

254257

slime/utils/arguments.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -672,7 +672,14 @@ def add_algo_arguments(parser):
672672
parser.add_argument(
673673
"--advantage-estimator",
674674
type=str,
675-
choices=["grpo", "gspo", "reinforce_plus_plus", "reinforce_plus_plus_baseline", "ppo"],
675+
choices=[
676+
"grpo",
677+
"gspo",
678+
"reinforce_plus_plus",
679+
"reinforce_plus_plus_baseline",
680+
"ppo",
681+
"on_policy_distillation",
682+
],
676683
default="grpo",
677684
)
678685
parser.add_argument(

slime/utils/data.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -211,6 +211,7 @@ def get_partition(val):
211211
"sample_indices",
212212
"rollout_log_probs",
213213
"prompt",
214+
"teacher_log_probs",
214215
]:
215216
if key not in data:
216217
continue

0 commit comments

Comments
 (0)