Skip to content

Commit 16c2e01

Browse files
authored
feat: move OPD to slime/rollout, add CI test and docs (#1610)
1 parent 029bbed commit 16c2e01

File tree

10 files changed

+635
-8
lines changed

10 files changed

+635
-8
lines changed

.github/workflows/pr-test.yml

Lines changed: 100 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -142,7 +142,7 @@ jobs:
142142
strategy:
143143
fail-fast: false
144144
matrix:
145-
info: [{"num_gpus": 8, "test_file": "test_quick_start_glm4_9B.py"}, {"num_gpus": 8, "test_file": "test_qwen3_30B_A3B.py", "use_deepep": "1", "use_fp8_rollout": "1"}, {"enable_eval": "0", "num_gpus": 8, "test_file": "test_qwen3_30B_A3B_r3.py", "use_deepep": "1", "use_fp8_rollout": "1"}, {"enable_eval": "0", "num_gpus": 8, "test_file": "test_qwen3_30B_A3B_r3.py"}, {"num_gpus": 8, "test_file": "test_qwen3_4B_ppo.py"}, {"num_gpus": 8, "test_file": "test_moonlight_16B_A3B.py"}, {"enable_eval": "0", "num_gpus": 8, "test_file": "test_moonlight_16B_A3B_r3.py"}, {"num_gpus": 8, "test_file": "test_mimo_7B_mtp_only_grad.py"}, {"num_gpus": 8, "test_file": "test_qwen2.5_0.5B_debug_rollout_then_train.py"}]
145+
info: [{"num_gpus": 8, "test_file": "test_quick_start_glm4_9B.py"}, {"num_gpus": 8, "test_file": "test_qwen3_30B_A3B.py", "use_deepep": "1", "use_fp8_rollout": "1"}, {"enable_eval": "0", "num_gpus": 8, "test_file": "test_qwen3_30B_A3B_r3.py", "use_deepep": "1", "use_fp8_rollout": "1"}, {"enable_eval": "0", "num_gpus": 8, "test_file": "test_qwen3_30B_A3B_r3.py"}, {"num_gpus": 8, "test_file": "test_qwen3_4B_ppo.py"}, {"num_gpus": 8, "test_file": "test_moonlight_16B_A3B.py"}, {"enable_eval": "0", "num_gpus": 8, "test_file": "test_moonlight_16B_A3B_r3.py"}, {"num_gpus": 8, "test_file": "test_mimo_7B_mtp_only_grad.py"}, {"num_gpus": 8, "test_file": "test_qwen2.5_0.5B_debug_rollout_then_train.py"}, {"num_gpus": 8, "test_file": "test_qwen2.5_0.5B_opd_sglang.py"}]
146146
defaults:
147147
run:
148148
working-directory: ${{ github.workspace }}
@@ -283,7 +283,7 @@ jobs:
283283
strategy:
284284
fail-fast: false
285285
matrix:
286-
info: [{"num_gpus": 4, "test_file": "test_qwen2.5_0.5B_gsm8k_async_short.py"}, {"num_gpus": 4, "test_file": "test_qwen2.5_0.5B_gsm8k_short.py"}, {"num_gpus": 2, "test_file": "test_qwen3_0.6B_fsdp_colocated_2xGPU.py"}, {"num_gpus": 2, "test_file": "test_qwen3_4B_fsdp_true_on_policy.py"}, {"num_gpus": 8, "test_file": "test_qwen3_vl_4B_fsdp.py"}, {"num_gpus": 2, "test_file": "test_qwen3_0.6B_fsdp_distributed.py"}, {"num_gpus": 8, "test_file": "test_quick_start_glm4_9B.py"}, {"num_gpus": 8, "test_file": "test_qwen3_30B_A3B.py"}, {"num_gpus": 8, "test_file": "test_qwen3_4B_ppo.py"}, {"num_gpus": 8, "test_file": "test_moonlight_16B_A3B.py"}, {"num_gpus": 8, "test_file": "test_mimo_7B_mtp_only_grad.py"}, {"num_gpus": 8, "test_file": "test_qwen3_0.6B_parallel_check.py"}, {"num_gpus": 4, "test_file": "test_qwen3_0.6B_megatron_fsdp_align.py"}, {"num_gpus": 8, "test_file": "test_qwen3_4B_ckpt.py"}, {"num_gpus": 8, "test_file": "test_qwen3_4B_ckpt.py --async-save"}, {"num_gpus": 2, "test_file": "test_qwen2.5_0.5B_gsm8k.py"}, {"num_gpus": 2, "test_file": "test_qwen2.5_0.5B_gsm8k_async.py"}, {"num_gpus": 8, "test_file": "test_qwen2.5_0.5B_debug_rollout_then_train.py"}]
286+
info: [{"num_gpus": 4, "test_file": "test_qwen2.5_0.5B_gsm8k_async_short.py"}, {"num_gpus": 4, "test_file": "test_qwen2.5_0.5B_gsm8k_short.py"}, {"num_gpus": 2, "test_file": "test_qwen3_4B_fsdp_true_on_policy.py"}, {"num_gpus": 8, "test_file": "test_qwen3_vl_4B_fsdp.py"}, {"num_gpus": 8, "test_file": "test_quick_start_glm4_9B.py"}, {"num_gpus": 8, "test_file": "test_qwen3_30B_A3B.py"}, {"num_gpus": 8, "test_file": "test_qwen3_4B_ppo.py"}, {"num_gpus": 8, "test_file": "test_moonlight_16B_A3B.py"}, {"num_gpus": 8, "test_file": "test_mimo_7B_mtp_only_grad.py"}, {"num_gpus": 8, "test_file": "test_qwen3_0.6B_parallel_check.py"}, {"num_gpus": 4, "test_file": "test_qwen3_0.6B_megatron_fsdp_align.py"}, {"num_gpus": 8, "test_file": "test_qwen3_4B_ckpt.py"}, {"num_gpus": 8, "test_file": "test_qwen3_4B_ckpt.py --async-save"}, {"num_gpus": 2, "test_file": "test_qwen2.5_0.5B_gsm8k.py"}, {"num_gpus": 2, "test_file": "test_qwen2.5_0.5B_gsm8k_async.py"}, {"num_gpus": 8, "test_file": "test_qwen2.5_0.5B_debug_rollout_then_train.py"}, {"num_gpus": 8, "test_file": "test_qwen2.5_0.5B_opd_sglang.py"}]
287287
defaults:
288288
run:
289289
working-directory: ${{ github.workspace }}
@@ -306,3 +306,101 @@ jobs:
306306
- name: Execute
307307
shell: bash
308308
run: python tests/ci/gpu_lock_exec.py --count ${{ matrix.info.num_gpus }} -- python tests/${{ matrix.info.test_file }}
309+
310+
311+
e2e-test-changed-detect:
312+
if: (github.event_name == 'workflow_dispatch') || (github.event.pull_request && contains(github.event.pull_request.labels.*.name, 'run-ci-changed'))
313+
runs-on: self-hosted
314+
container:
315+
image: slimerl/slime:latest
316+
options: >
317+
--gpus all
318+
--ipc=host
319+
--shm-size=16g
320+
--ulimit memlock=-1
321+
--ulimit stack=67108864
322+
--memory=0
323+
--memory-swap=0
324+
outputs:
325+
matrix: ${{ steps.detect.outputs.matrix }}
326+
has_tests: ${{ steps.detect.outputs.has_tests }}
327+
steps:
328+
- name: Checkout repository
329+
uses: actions/checkout@v4
330+
with:
331+
fetch-depth: 0
332+
333+
- name: Detect changed tests
334+
id: detect
335+
shell: bash
336+
run: |
337+
CHANGED=$(git diff --name-only --diff-filter=AM origin/main...HEAD -- 'tests/test_*.py' || true)
338+
if [ -z "$CHANGED" ]; then
339+
echo "No new or modified test files found."
340+
echo "has_tests=false" >> $GITHUB_OUTPUT
341+
echo 'matrix={"info":[]}' >> $GITHUB_OUTPUT
342+
else
343+
echo "Changed test files:"
344+
echo "$CHANGED"
345+
MATRIX="["
346+
FIRST=true
347+
for filepath in $CHANGED; do
348+
filename=$(basename "$filepath")
349+
# Extract NUM_GPUS from the test file, default to 8
350+
NGPU=$(grep -oP '^NUM_GPUS\s*=\s*\K\d+' "$filepath" | head -1)
351+
NGPU=${NGPU:-8}
352+
if [ "$FIRST" = true ]; then FIRST=false; else MATRIX+=","; fi
353+
MATRIX+="{\"test_file\":\"$filename\",\"num_gpus\":$NGPU}"
354+
done
355+
MATRIX+="]"
356+
echo "has_tests=true" >> $GITHUB_OUTPUT
357+
echo "matrix={\"info\":$MATRIX}" >> $GITHUB_OUTPUT
358+
echo "Generated matrix: $MATRIX"
359+
fi
360+
361+
e2e-test-changed:
362+
needs: e2e-test-changed-detect
363+
if: needs.e2e-test-changed-detect.outputs.has_tests == 'true'
364+
runs-on: self-hosted
365+
container:
366+
image: slimerl/slime:latest
367+
options: >
368+
--gpus all
369+
--ipc=host
370+
--shm-size=16g
371+
--ulimit memlock=-1
372+
--ulimit stack=67108864
373+
--memory=0
374+
--memory-swap=0
375+
-e http_proxy=$http_proxy
376+
-e https_proxy=$https_proxy
377+
-e HTTP_PROXY=$HTTP_PROXY
378+
-e HTTPS_PROXY=$HTTPS_PROXY
379+
-v /mnt/nvme0n1/slime_ci:/data/slime_ci
380+
-v /mnt/nvme0n1/slime_ci/models:/root/models
381+
-v /mnt/nvme0n1/slime_ci/datasets:/root/datasets
382+
strategy:
383+
fail-fast: false
384+
matrix: ${{ fromJson(needs.e2e-test-changed-detect.outputs.matrix) }}
385+
defaults:
386+
run:
387+
working-directory: ${{ github.workspace }}
388+
env:
389+
GITHUB_COMMIT_NAME: ${{ github.sha }}_${{ github.event.pull_request.number || 'non-pr' }}
390+
WANDB_API_KEY: ${{ secrets.WANDB_API_KEY }}
391+
SLIME_TEST_ENABLE_INFINITE_RUN: ${{ (github.event_name == 'workflow_dispatch' && github.event.inputs.infinite_run) || 'false' }}
392+
SLIME_TEST_USE_DEEPEP: ${{ matrix.info.use_deepep || '0' }}
393+
SLIME_TEST_USE_FP8_ROLLOUT: ${{ matrix.info.use_fp8_rollout || '0' }}
394+
SLIME_TEST_ENABLE_EVAL: ${{ matrix.info.enable_eval || '1' }}
395+
396+
steps:
397+
- name: Checkout repository
398+
uses: actions/checkout@v4
399+
400+
- name: Install
401+
shell: bash
402+
run: cd $GITHUB_WORKSPACE && pip install -e . --no-deps --break-system-packages
403+
404+
- name: Execute
405+
shell: bash
406+
run: python tests/ci/gpu_lock_exec.py --count ${{ matrix.info.num_gpus }} -- python tests/${{ matrix.info.test_file }}

.github/workflows/pr-test.yml.j2

Lines changed: 100 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
{'test_file': 'test_moonlight_16B_A3B_r3.py', 'num_gpus': 8, 'enable_eval': '0'},
2727
{'test_file': 'test_mimo_7B_mtp_only_grad.py', 'num_gpus': 8},
2828
{'test_file': 'test_qwen2.5_0.5B_debug_rollout_then_train.py', 'num_gpus': 8},
29+
{'test_file': 'test_qwen2.5_0.5B_opd_sglang.py', 'num_gpus': 8},
2930
],
3031
},
3132
'e2e-test-precision': {
@@ -63,6 +64,7 @@
6364
{'test_file': 'test_qwen2.5_0.5B_gsm8k.py', 'num_gpus': 2},
6465
{'test_file': 'test_qwen2.5_0.5B_gsm8k_async.py', 'num_gpus': 2},
6566
{'test_file': 'test_qwen2.5_0.5B_debug_rollout_then_train.py', 'num_gpus': 8},
67+
{'test_file': 'test_qwen2.5_0.5B_opd_sglang.py', 'num_gpus': 8},
6668
],
6769
},
6870
} %>
@@ -135,4 +137,101 @@ jobs:
135137
- name: Execute
136138
shell: bash
137139
run: python tests/ci/gpu_lock_exec.py --count ${{ matrix.info.num_gpus }} -- python tests/${{ matrix.info.test_file }}
138-
<% endfor %>
140+
<% endfor %>
141+
142+
e2e-test-changed-detect:
143+
if: (github.event_name == 'workflow_dispatch') || (github.event.pull_request && contains(github.event.pull_request.labels.*.name, 'run-ci-changed'))
144+
runs-on: self-hosted
145+
container:
146+
image: slimerl/slime:latest
147+
options: >
148+
--gpus all
149+
--ipc=host
150+
--shm-size=16g
151+
--ulimit memlock=-1
152+
--ulimit stack=67108864
153+
--memory=0
154+
--memory-swap=0
155+
outputs:
156+
matrix: ${{ steps.detect.outputs.matrix }}
157+
has_tests: ${{ steps.detect.outputs.has_tests }}
158+
steps:
159+
- name: Checkout repository
160+
uses: actions/checkout@v4
161+
with:
162+
fetch-depth: 0
163+
164+
- name: Detect changed tests
165+
id: detect
166+
shell: bash
167+
run: |
168+
CHANGED=$(git diff --name-only --diff-filter=AM origin/main...HEAD -- 'tests/test_*.py' || true)
169+
if [ -z "$CHANGED" ]; then
170+
echo "No new or modified test files found."
171+
echo "has_tests=false" >> $GITHUB_OUTPUT
172+
echo 'matrix={"info":[]}' >> $GITHUB_OUTPUT
173+
else
174+
echo "Changed test files:"
175+
echo "$CHANGED"
176+
MATRIX="["
177+
FIRST=true
178+
for filepath in $CHANGED; do
179+
filename=$(basename "$filepath")
180+
# Extract NUM_GPUS from the test file, default to 8
181+
NGPU=$(grep -oP '^NUM_GPUS\s*=\s*\K\d+' "$filepath" | head -1)
182+
NGPU=${NGPU:-8}
183+
if [ "$FIRST" = true ]; then FIRST=false; else MATRIX+=","; fi
184+
MATRIX+="{\"test_file\":\"$filename\",\"num_gpus\":$NGPU}"
185+
done
186+
MATRIX+="]"
187+
echo "has_tests=true" >> $GITHUB_OUTPUT
188+
echo "matrix={\"info\":$MATRIX}" >> $GITHUB_OUTPUT
189+
echo "Generated matrix: $MATRIX"
190+
fi
191+
192+
e2e-test-changed:
193+
needs: e2e-test-changed-detect
194+
if: needs.e2e-test-changed-detect.outputs.has_tests == 'true'
195+
runs-on: self-hosted
196+
container:
197+
image: slimerl/slime:latest
198+
options: >
199+
--gpus all
200+
--ipc=host
201+
--shm-size=16g
202+
--ulimit memlock=-1
203+
--ulimit stack=67108864
204+
--memory=0
205+
--memory-swap=0
206+
-e http_proxy=$http_proxy
207+
-e https_proxy=$https_proxy
208+
-e HTTP_PROXY=$HTTP_PROXY
209+
-e HTTPS_PROXY=$HTTPS_PROXY
210+
-v /mnt/nvme0n1/slime_ci:/data/slime_ci
211+
-v /mnt/nvme0n1/slime_ci/models:/root/models
212+
-v /mnt/nvme0n1/slime_ci/datasets:/root/datasets
213+
strategy:
214+
fail-fast: false
215+
matrix: ${{ fromJson(needs.e2e-test-changed-detect.outputs.matrix) }}
216+
defaults:
217+
run:
218+
working-directory: ${{ github.workspace }}
219+
env:
220+
GITHUB_COMMIT_NAME: ${{ github.sha }}_${{ github.event.pull_request.number || 'non-pr' }}
221+
WANDB_API_KEY: ${{ secrets.WANDB_API_KEY }}
222+
SLIME_TEST_ENABLE_INFINITE_RUN: ${{ (github.event_name == 'workflow_dispatch' && github.event.inputs.infinite_run) || 'false' }}
223+
SLIME_TEST_USE_DEEPEP: ${{ matrix.info.use_deepep || '0' }}
224+
SLIME_TEST_USE_FP8_ROLLOUT: ${{ matrix.info.use_fp8_rollout || '0' }}
225+
SLIME_TEST_ENABLE_EVAL: ${{ matrix.info.enable_eval || '1' }}
226+
227+
steps:
228+
- name: Checkout repository
229+
uses: actions/checkout@v4
230+
231+
- name: Install
232+
shell: bash
233+
run: cd $GITHUB_WORKSPACE && pip install -e . --no-deps --break-system-packages
234+
235+
- name: Execute
236+
shell: bash
237+
run: python tests/ci/gpu_lock_exec.py --count ${{ matrix.info.num_gpus }} -- python tests/${{ matrix.info.test_file }}
Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,111 @@
1+
# On-Policy Distillation
2+
3+
On-policy distillation (OPD) enables a student model to learn from a larger teacher model by training on the student's own rollouts while matching the teacher's token-level log-probabilities. OPD is orthogonal to advantage estimators — it subtracts a KL-based penalty from the advantages produced by any estimator (GRPO, PPO, REINFORCE++, etc.).
4+
5+
## Key Arguments
6+
7+
| Argument | Description |
8+
|----------|-------------|
9+
| `--use-opd` | Enable on-policy distillation. Required flag to use OPD. |
10+
| `--opd-type` | Type of OPD: `sglang` or `megatron`. Required when `--use-opd` is set. |
11+
| `--opd-kl-coef` | OPD KL penalty coefficient (default: 1.0). Controls the weight of the distillation signal relative to the RL advantage. |
12+
| `--opd-teacher-load` | Path to teacher Megatron checkpoint. **Required** when `--opd-type=megatron`, **must not be set** when `--opd-type=sglang`. |
13+
| `--opd-teacher-ckpt-step` | Optional checkpoint step for teacher model. |
14+
15+
## How It Works
16+
17+
OPD modifies the advantage computation by subtracting a KL penalty term that encourages the student to match the teacher's output distribution:
18+
19+
$$
20+
\hat{A}_t = A_t - \lambda_{\text{opd}} \cdot D_{\text{KL}}(P_{\text{student}} \| P_{\text{teacher}})_t
21+
$$
22+
23+
Where $A_t$ is the original advantage from the base estimator (e.g., GRPO), $\lambda_{\text{opd}}$ is `--opd-kl-coef`, and $D_{\text{KL}}$ is the token-level reverse KL divergence (student relative to teacher, since tokens are sampled from the student's own rollouts).
24+
25+
This means OPD can be combined with any advantage estimator, including GRPO, PPO, REINFORCE++, and GSPO.
26+
27+
## Two Teacher Modes
28+
29+
### SGLang Mode (`--opd-type sglang`)
30+
31+
The teacher runs on an external SGLang server. Teacher log-probs are obtained during the rollout phase.
32+
33+
**When to use**: The teacher has a different architecture from the student, or the teacher is too large to load alongside the training model.
34+
35+
**How it works**:
36+
1. An external SGLang server runs the teacher model.
37+
2. During rollout, the custom reward function (`slime.rollout.on_policy_distillation.reward_func`) sends each sample to the teacher server to obtain token-level log-probs.
38+
3. The custom post-processing function (`slime.rollout.on_policy_distillation.post_process_rewards`) trims the teacher log-probs to the response span and stores them in `sample.teacher_log_probs`.
39+
4. During training, the KL penalty is computed from the stored teacher log-probs and applied to advantages.
40+
41+
**Configuration**:
42+
```bash
43+
--use-opd
44+
--opd-type sglang
45+
--opd-kl-coef 1.0
46+
--custom-rm-path slime.rollout.on_policy_distillation.reward_func
47+
--custom-reward-post-process-path slime.rollout.on_policy_distillation.post_process_rewards
48+
--rm-url http://<TEACHER_IP>:<TEACHER_PORT>/generate
49+
```
50+
51+
### Megatron Mode (`--opd-type megatron`)
52+
53+
The teacher model is loaded directly into Megatron via `--opd-teacher-load`. Teacher log-probs are computed during the training forward pass.
54+
55+
**When to use**: The teacher has the same architecture as the student/reference model and fits in GPU memory.
56+
57+
**How it works**:
58+
1. The teacher model is loaded as an additional Megatron model during initialization.
59+
2. During the training forward pass, the teacher model computes log-probs for each sample.
60+
3. The KL penalty is computed inline and applied to advantages.
61+
62+
**Configuration**:
63+
```bash
64+
--use-opd
65+
--opd-type megatron
66+
--opd-kl-coef 1.0
67+
--opd-teacher-load /path/to/teacher_torch_dist
68+
```
69+
70+
> **Note**: The teacher checkpoint must be in Megatron format (`torch_dist` or `torch`). You can convert from HuggingFace format using `tools/convert_hf_to_torch_dist.py`.
71+
72+
## Running the Examples
73+
74+
Complete example scripts are provided in `examples/on_policy_distillation/`:
75+
76+
### SGLang Teacher
77+
78+
```bash
79+
# 1. Download models and data
80+
hf download Qwen/Qwen3-32B --local-dir /root/Qwen3-32B
81+
hf download Qwen/Qwen3-8B --local-dir /root/Qwen3-8B
82+
hf download --repo-type dataset zhuzilin/dapo-math-17k --local-dir /root/dapo-math-17k
83+
84+
# 2. Convert student model
85+
cd /root/slime
86+
source scripts/models/qwen3-8B.sh
87+
PYTHONPATH=/root/Megatron-LM python tools/convert_hf_to_torch_dist.py \
88+
${MODEL_ARGS[@]} \
89+
--hf-checkpoint /root/Qwen3-8B \
90+
--save /root/Qwen3-8B_torch_dist
91+
92+
# 3. Run
93+
bash examples/on_policy_distillation/run-qwen3-8B-opd.sh
94+
```
95+
96+
### Megatron Teacher
97+
98+
```bash
99+
# 1. Convert both student and teacher models to Megatron format
100+
# 2. Run
101+
bash examples/on_policy_distillation/run-qwen3-8B-opd-megatron.sh
102+
```
103+
104+
## Preliminary Results
105+
106+
Using Qwen3-8B-Base model SFT-ed on part of the [OpenThoughts3-1.2M](https://huggingface.co/datasets/open-thoughts/OpenThoughts3-1.2M) dataset, on-policy distillation with a Qwen3-32B teacher on the remaining data yields:
107+
108+
| | Pass@1 |
109+
|-----------------------------------------------|--------|
110+
| Qwen3-8B-Base + SFT | 76% |
111+
| Qwen3-8B-Base + SFT + On-Policy Distillation | 94% |

docs/en/index.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ slime is the RL-framework behind GLM-4.7, GLM-4.6 and GLM-4.5. Apart from models
4141
:caption: Advanced Features
4242

4343
advanced/slime-router.md
44+
advanced/on-policy-distillation.md
4445
advanced/speculative-decoding.md
4546
advanced/low-precision.md
4647
advanced/reproducibility.md
@@ -57,7 +58,6 @@ slime is the RL-framework behind GLM-4.7, GLM-4.6 and GLM-4.5. Apart from models
5758
_examples_synced/fully_async/README.md
5859
_examples_synced/retool/README.md
5960
_examples_synced/multi_agent/README.md
60-
_examples_synced/on_policy_distillation/README.md
6161

6262
.. toctree::
6363
:maxdepth: 1

0 commit comments

Comments
 (0)