On-policy distillation (OPD) enables a student model to learn from a larger teacher model by training on its own rollouts while matching the teacher's token-level log-probabilities. OPD is orthogonal to advantage estimators — it works as an additive KL penalty on top of any estimator (GRPO, PPO, REINFORCE++, etc.).
## Key Arguments

| Argument | Description |
|----------|-------------|
| `--use-opd` | Enable on-policy distillation. Required flag to use OPD. |
| `--opd-type` | Type of OPD: `sglang` or `megatron`. Required when `--use-opd` is set. |
| `--opd-kl-coef` | OPD KL penalty coefficient (default: 1.0). Controls the weight of the distillation signal relative to the RL advantage. |
| `--opd-teacher-load` | Path to teacher Megatron checkpoint. **Required** when `--opd-type=megatron`, **must not be set** when `--opd-type=sglang`. |
| `--opd-teacher-ckpt-step` | Optional checkpoint step for teacher model. |
## How It Works
OPD modifies the advantage computation by subtracting a KL penalty term that encourages the student to match the teacher's output distribution:

$$A_t^{\text{OPD}} = A_t - \lambda_{\text{opd}} \, D_{\text{KL}}\left(\pi_{\theta} \,\|\, \pi_{\text{teacher}}\right)$$

where $A_t$ is the original advantage from the base estimator (e.g., GRPO), $\lambda_{\text{opd}}$ is `--opd-kl-coef`, and $D_{\text{KL}}$ is the token-level reverse KL divergence between the student policy $\pi_{\theta}$ and the teacher.
This means OPD can be combined with any advantage estimator, including GRPO, PPO, REINFORCE++, and GSPO.
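As a concrete illustration, the penalty can be applied to any estimator's advantages with a few lines of tensor code. This is a minimal sketch (the function name and signature are illustrative, not slime's actual API), using the single-sample estimator $\log \pi_{\theta} - \log \pi_{\text{teacher}}$ at each sampled token as the per-token reverse KL:

```python
import torch

def apply_opd_penalty(advantages, student_log_probs, teacher_log_probs, opd_kl_coef=1.0):
    """Subtract a token-level reverse-KL penalty from the base advantages.

    Uses the per-token single-sample KL estimate
    log pi_student(token) - log pi_teacher(token), evaluated at the tokens
    actually sampled in the rollout. `opd_kl_coef` plays the role of
    --opd-kl-coef; `advantages` come from any base estimator (GRPO, PPO, ...).
    """
    kl = student_log_probs - teacher_log_probs  # reverse KL estimate per token
    return advantages - opd_kl_coef * kl
```

Because the penalty is a simple per-token subtraction, it composes with whatever advantage values the base estimator produced.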
## Two Teacher Modes
### SGLang Mode (`--opd-type sglang`)

The teacher runs on an external SGLang server. Teacher log-probs are obtained during the rollout phase.

**When to use**: The teacher has a different architecture from the student, or the teacher is too large to load alongside the training model.
**How it works**:

1. An external SGLang server runs the teacher model.
2. During rollout, the custom reward function (`slime.rollout.on_policy_distillation.reward_func`) sends each sample to the teacher server to obtain token-level log-probs.
3. The custom post-processing function (`slime.rollout.on_policy_distillation.post_process_rewards`) trims the teacher log-probs to the response span and stores them in `sample.teacher_log_probs`.
4. During training, the KL penalty is computed from the stored teacher log-probs and applied to advantages.
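The trimming in step 3 amounts to slicing the log-probs for the full prompt+response sequence down to just the response tokens. A minimal sketch of that operation (the helper name and arguments are illustrative, not slime's actual implementation):

```python
def trim_teacher_log_probs(teacher_log_probs, prompt_len, response_len):
    """Keep only the log-probs for the response span of the sequence.

    `teacher_log_probs` covers the whole prompt+response sequence returned
    by the teacher server; the KL penalty only needs the response tokens,
    so everything before `prompt_len` (and any padding after the response)
    is dropped.
    """
    return teacher_log_probs[prompt_len : prompt_len + response_len]
```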
### Megatron Mode (`--opd-type megatron`)

The teacher model is loaded directly into Megatron via `--opd-teacher-load`. Teacher log-probs are computed during the training forward pass.
**When to use**: The teacher has the same architecture as the student/reference model and fits in GPU memory.

**How it works**:

1. The teacher model is loaded as an additional Megatron model during initialization.
2. During the training forward pass, the teacher model computes log-probs for each sample.
3. The KL penalty is computed inline and applied to advantages.
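Step 2 reduces to gathering the log-probabilities of the student's sampled tokens from the teacher's output logits. A hedged sketch of that computation (illustrative code, not the actual Megatron forward path):

```python
import torch
import torch.nn.functional as F

def teacher_token_log_probs(teacher_logits, token_ids):
    """Per-token log-probs of the sampled tokens under the teacher.

    teacher_logits: [batch, seq, vocab] raw logits from the teacher forward pass
    token_ids:      [batch, seq] tokens sampled by the student during rollout
    """
    log_probs = F.log_softmax(teacher_logits, dim=-1)
    # Pick out the log-prob of each sampled token along the vocab dimension.
    return log_probs.gather(-1, token_ids.unsqueeze(-1)).squeeze(-1)
```

These per-token teacher log-probs are then compared against the student's own log-probs to form the inline KL penalty.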
**Configuration**:

```bash
--use-opd
--opd-type megatron
--opd-kl-coef 1.0
--opd-teacher-load /path/to/teacher_torch_dist
```
> **Note**: The teacher checkpoint must be in Megatron format (`torch_dist` or `torch`). You can convert from HuggingFace format using `tools/convert_hf_to_torch_dist.py`.
## Running the Examples
Complete example scripts are provided in `examples/on_policy_distillation/`:
Using a Qwen3-8B-Base model SFT-ed on part of the [OpenThoughts3-1.2M](https://huggingface.co/datasets/open-thoughts/OpenThoughts3-1.2M) dataset, on-policy distillation with a Qwen3-32B teacher on the remaining data yields: