Skip to content

Commit 13ecee0

Browse files
Copilot authored and committed
Add Qwen3.5 model support (27B dense and 35B-A3B MoE)
- New model plugin: slime_plugins/models/qwen3_5.py - Qwen3_5GatedDeltaNet with separate QKV/Z projections, conv1d, and flat QKV split - get_qwen3_5_spec replacing standard attention with linear attention per layer_types - New weight bridge: slime_plugins/mbridge/qwen3_5.py - Handles VLM weight prefix (model.language_model.layers) - Fused expert weight format for MoE (3D tensors -> per-expert slices) - MTP layer support with individual expert format - New HF converter: slime/backends/megatron_utils/megatron_to_hf/qwen3_5.py - TEGroupedMLP per-expert weight{i} -> HF fused expert format - Proper gate/up split for swiglu experts - Fix sglang_rollout.py: skip processor for text-only VLM models - Model configs and run scripts for both 27B and 35B-A3B Tested: Both models verified end-to-end with training. - 27B: TP=1 SGLang (8 engines), TP=2/PP=2/CP=2 Megatron, logprob_diff=0.017 - 35B-A3B: TP=2 SGLang (4 engines), EP=8 Megatron, logprob_diff=0.012 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
1 parent bd70add commit 13ecee0

File tree

10 files changed

+1157
-2
lines changed

10 files changed

+1157
-2
lines changed

scripts/models/qwen3.5-27B.sh

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
# Megatron model arguments for Qwen3.5-27B (dense).
# Sourced by the run script; consumed as "${MODEL_ARGS[@]}" on the train.py
# command line. The custom spec swaps standard attention for the Qwen3.5
# gated linear-attention layers per layer_types.
MODEL_ARGS=(
    --spec "slime_plugins.models.qwen3_5" "get_qwen3_5_spec"

    --disable-bias-linear
    --qk-layernorm
    --group-query-attention
    --num-attention-heads 24
    --num-query-groups 4
    --kv-channels 256
    --num-layers 64
    --hidden-size 5120
    --ffn-hidden-size 17408
    --use-gated-attention

    --normalization RMSNorm
    --apply-layernorm-1p
    --position-embedding-type rope
    --norm-epsilon 1e-6
    --rotary-percent 0.25
    --swiglu
    --untie-embeddings-and-output-weights
    --vocab-size 248320

    --rotary-base 10000000

    # qwen3.5 specific
    --attention-output-gate
)

scripts/models/qwen3.5-35B-A3B.sh

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
# Megatron model arguments for Qwen3.5-35B-A3B (MoE).
# Builds a per-layer MoE frequency string ("[1,1,...,1]") and the MODEL_ARGS
# array consumed as "${MODEL_ARGS[@]}" by the run script.
NLAYERS=40
FIRST_K_DENSE_REPLACE=0   # first K layers stay dense (0 = every layer is MoE)

# One flag per layer: 0 = dense, 1 = MoE.
arr=()
for ((i = 0; i < NLAYERS; i++)); do
    if (( i < FIRST_K_DENSE_REPLACE )); then
        arr+=(0)
    else
        arr+=(1)
    fi
done

# Join the flags with commas and wrap in brackets, e.g. "[1,1,...,1]".
# "${arr[*]}" joins with the first character of IFS, so a single ',' suffices.
printf -v MOE_LAYER_FREQ "[%s]" "$(IFS=','; echo "${arr[*]}")"

MODEL_ARGS=(
    --spec "slime_plugins.models.qwen3_5" "get_qwen3_5_spec"

    --disable-bias-linear
    --qk-layernorm
    --group-query-attention
    --num-attention-heads 16
    --num-query-groups 2
    --kv-channels 256
    --num-layers 40
    --hidden-size 2048
    --ffn-hidden-size 512
    --use-gated-attention

    --normalization RMSNorm
    --apply-layernorm-1p
    --position-embedding-type rope
    --norm-epsilon 1e-6
    --rotary-percent 0.25
    --swiglu
    --untie-embeddings-and-output-weights
    --vocab-size 248320

    --rotary-base 10000000

    # moe
    --moe-ffn-hidden-size 512
    --moe-shared-expert-intermediate-size 512
    --moe-router-score-function softmax
    --moe-token-dispatcher-type alltoall
    --moe-router-topk 8
    --moe-layer-freq "$MOE_LAYER_FREQ"
    --num-experts 256
    --moe-grouped-gemm
    --moe-token-drop-policy probs
    --moe-router-dtype fp32
    --moe-permute-fusion
    --moe-aux-loss-coeff 0

    # qwen3.5 specific
    --attention-output-gate
    --moe-shared-expert-gate
)

scripts/run-qwen3.5-27B.sh

Lines changed: 159 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,159 @@
#!/bin/bash
# Launch GRPO training of Qwen3.5-27B with slime: kills any previous run,
# starts a local Ray head node, then submits train.py with the argument
# groups assembled below.

# for rerun the task
pkill -9 sglang
sleep 3
ray stop --force
pkill -9 ray
pkill -9 python
sleep 3
pkill -9 ray
pkill -9 python

set -ex

# Prevent python (and thus ray workers) from buffering stdout/stderr.
# Fix: the variable is PYTHONUNBUFFERED; the previous PYTHONBUFFERED was a
# typo and had no effect.
export PYTHONUNBUFFERED=1

# unset proxy to avoid issues
unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY

# Detect NVLink by counting NV* references in the GPU topology matrix;
# used to decide whether to enable NCCL NVLS collectives.
NVLINK_COUNT=$(nvidia-smi topo -m 2>/dev/null | grep -o 'NV[0-9][0-9]*' | wc -l)
if [ "$NVLINK_COUNT" -gt 0 ]; then
    HAS_NVLINK=1
else
    HAS_NVLINK=0
fi
echo "HAS_NVLINK: $HAS_NVLINK (detected $NVLINK_COUNT NVLink references)"

SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"
source "${SCRIPT_DIR}/models/qwen3.5-27B.sh"

CKPT_ARGS=(
    --hf-checkpoint /root/Qwen3.5-27B
    --ref-load /root/Qwen3.5-27B_torch_dist/
    --load /root/Qwen3.5-27B_slime
    --save /root/Qwen3.5-27B_slime
    --save-interval 20
)

ROLLOUT_ARGS=(
    --prompt-data /root/dapo-math-17k/dapo-math-17k.jsonl
    --input-key prompt
    --label-key label
    --apply-chat-template
    --rollout-shuffle
    --rm-type deepscaler
    --num-rollout 3000
    --rollout-batch-size 32
    --n-samples-per-prompt 8
    --rollout-max-response-len 8192
    --rollout-temperature 1

    --global-batch-size 256
    --balance-data
)

EVAL_ARGS=(
    --eval-interval 20
    --eval-prompt-data aime /root/aime-2024/aime-2024.jsonl
    --n-samples-per-eval-prompt 16
    --eval-max-response-len 16384
    --eval-top-p 1
)

PERF_ARGS=(
    --tensor-model-parallel-size 4
    --sequence-parallel
    --pipeline-model-parallel-size 1
    --context-parallel-size 1
    --expert-model-parallel-size 1
    --expert-tensor-parallel-size 1

    --recompute-granularity full
    --recompute-method uniform
    --recompute-num-layers 1

    # --micro-batch-size 1
    --use-dynamic-batch-size
    --max-tokens-per-gpu 20480
)

GRPO_ARGS=(
    --advantage-estimator grpo
    --use-kl-loss
    --kl-loss-coef 0.00
    --kl-loss-type low_var_kl
    --entropy-coef 0.00
    --eps-clip 0.2
    --eps-clip-high 0.28
)

OPTIMIZER_ARGS=(
    --optimizer adam
    --lr 1e-6
    --lr-decay-style constant
    --weight-decay 0.1
    --adam-beta1 0.9
    --adam-beta2 0.98

    --optimizer-cpu-offload
    --overlap-cpu-optimizer-d2h-h2d
    --use-precision-aware-optimizer
)

WANDB_ARGS=(
    #--use-wandb
    # --wandb-project slime-dev
    # --wandb-group qwen3.5-27B-test
    # --wandb-key ${WANDB_KEY}
)

SGLANG_ARGS=(
    --rollout-num-gpus-per-engine 8
    --sglang-mem-fraction-static 0.7
    --sglang-cuda-graph-bs 1 2 4 8 $(seq 16 8 256)
)

MISC_ARGS=(
    # default dropout in megatron is 0.1
    --attention-dropout 0.0
    --hidden-dropout 0.0
    # should be good for model performance
    --accumulate-allreduce-grads-in-fp32
    --attention-softmax-in-fp32
    # need to comment this when using model with MLA
    --attention-backend flash
)

# launch the master node of ray in container
export MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"}
export no_proxy="127.0.0.1,${MASTER_ADDR}"
ray start --head --node-ip-address "${MASTER_ADDR}" --num-gpus 8 --disable-usage-stats --dashboard-host=0.0.0.0 --dashboard-port=8265

# Build the runtime environment JSON with proper variable substitution
RUNTIME_ENV_JSON="{
  \"env_vars\": {
    \"PYTHONPATH\": \"/root/Megatron-LM/\",
    \"CUDA_DEVICE_MAX_CONNECTIONS\": \"1\",
    \"NCCL_NVLS_ENABLE\": \"${HAS_NVLINK}\",
    \"no_proxy\": \"${no_proxy}\"
  }
}"

# Quote the array expansions so arguments with spaces survive intact.
ray job submit --address="http://127.0.0.1:8265" \
    --runtime-env-json="${RUNTIME_ENV_JSON}" \
    -- python3 train.py \
    --actor-num-nodes 1 \
    --actor-num-gpus-per-node 8 \
    --colocate \
    "${MODEL_ARGS[@]}" \
    "${CKPT_ARGS[@]}" \
    "${ROLLOUT_ARGS[@]}" \
    "${OPTIMIZER_ARGS[@]}" \
    "${GRPO_ARGS[@]}" \
    "${WANDB_ARGS[@]}" \
    "${PERF_ARGS[@]}" \
    "${EVAL_ARGS[@]}" \
    "${SGLANG_ARGS[@]}" \
    "${MISC_ARGS[@]}"

0 commit comments

Comments
 (0)