Skip to content

Commit 1c536cf

Browse files
authored
[script] Add example script for GLM4.7 Flash (#1467)
1 parent fe0cc35 commit 1c536cf

File tree

5 files changed

+217
-13
lines changed

5 files changed

+217
-13
lines changed

docs/en/advanced/speculative-decoding.md

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@ For models with MTP layers (e.g., GLM-4.7, DeepSeek-V3/R1), simply add:
1111
--sglang-speculative-num-steps 3
1212
--sglang-speculative-eagle-topk 1
1313
--sglang-speculative-num-draft-tokens 4
14-
--sglang-enable-draft-weights-cpu-backup
1514
```
1615

1716
If you want to use a separately trained draft model (e.g., one trained with [SpecForge](https://docs.sglang.ai/SpecForge/)), also set:

scripts/models/glm4.7-30B-A3B.sh

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,20 @@
1+
MOE_ROUTED_EXPERTS=64
2+
MOE_ACTIVE_ROUTED_EXPERTS=4
13
MOE_SHARED_EXPERTS=1
24

5+
NHIDDEN=2048
36
MOE_FFN_HIDDEN=1536
47
MOE_SHARED_EXPERT_INTERMEDIATE_SIZE=$((MOE_FFN_HIDDEN * MOE_SHARED_EXPERTS))
8+
FFN_HIDDEN=10240
59
N_DENSE_LAYERS=1
610
N_MOE_LAYERS=46
11+
NHEADS=20
712

813
MODEL_ARGS=(
914
--moe-layer-freq [0]*$N_DENSE_LAYERS+[1]*$N_MOE_LAYERS
10-
--num-experts 64
15+
--num-experts $MOE_ROUTED_EXPERTS
1116
--moe-shared-expert-intermediate-size $MOE_SHARED_EXPERT_INTERMEDIATE_SIZE
12-
--moe-router-topk 4
17+
--moe-router-topk $MOE_ACTIVE_ROUTED_EXPERTS
1318
--moe-grouped-gemm
1419
--moe-permute-fusion
1520
--moe-ffn-hidden-size $MOE_FFN_HIDDEN
@@ -21,10 +26,11 @@ MODEL_ARGS=(
2126
--moe-router-topk-scaling-factor 1.8
2227
--moe-aux-loss-coeff 0
2328
--moe-router-dtype fp32
29+
--make-vocab-size-divisible-by 64
2430
--num-layers $((N_DENSE_LAYERS + N_MOE_LAYERS))
25-
--hidden-size 2048
26-
--ffn-hidden-size 10240
27-
--num-attention-heads 20
31+
--hidden-size $NHIDDEN
32+
--ffn-hidden-size $FFN_HIDDEN
33+
--num-attention-heads $NHEADS
2834
--disable-bias-linear
2935
--add-qkv-bias
3036
--swiglu
@@ -42,5 +48,5 @@ MODEL_ARGS=(
4248
--qk-pos-emb-head-dim 64
4349
--vocab-size 154880
4450
--rotary-base 1000000
45-
--enable-experimental
46-
)
51+
--no-rope-fusion
52+
)

scripts/run-glm4.5-355B-A32B.sh

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -119,10 +119,9 @@ SGLANG_ARGS=(
119119

120120
# mtp
121121
--sglang-speculative-algorithm EAGLE
122-
--sglang-speculative-num-steps 1
122+
--sglang-speculative-num-steps 2
123123
--sglang-speculative-eagle-topk 1
124-
--sglang-speculative-num-draft-tokens 2
125-
--sglang-enable-draft-weights-cpu-backup
124+
--sglang-speculative-num-draft-tokens 3
126125

127126
)
128127

@@ -189,7 +188,7 @@ ray job submit --address="http://127.0.0.1:8265" \
189188
--actor-num-nodes 8 \
190189
--actor-num-gpus-per-node 8 \
191190
--colocate \
192-
--save-debug-rollout-data /mnt/zhuzilin/github-slime/data.pt \
191+
--save-debug-rollout-data data.pt \
193192
${MODEL_ARGS[@]} \
194193
${CKPT_ARGS[@]} \
195194
${ROLLOUT_ARGS[@]} \

scripts/run-glm4.7-30B-A3B.sh

Lines changed: 201 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,201 @@
1+
#!/bin/bash
2+
3+
# kill leftover sglang/ray/python processes so the task can be rerun cleanly
4+
pkill -9 sglang
5+
sleep 3
6+
ray stop --force
7+
pkill -9 ray
8+
pkill -9 python
9+
sleep 3
10+
pkill -9 ray
11+
pkill -9 python
12+
13+
set -ex
14+
15+
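# intended to prevent ray from buffering stdout/stderr
# NOTE(review): the variable Python reads is PYTHONUNBUFFERED; confirm PYTHONBUFFERED has any effect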
16+
export PYTHONBUFFERED=16
17+
18+
NVLINK_COUNT=$(nvidia-smi topo -m 2>/dev/null | grep -o 'NV[0-9][0-9]*' | wc -l)
19+
if [ "$NVLINK_COUNT" -gt 0 ]; then
20+
HAS_NVLINK=1
21+
else
22+
HAS_NVLINK=0
23+
fi
24+
echo "HAS_NVLINK: $HAS_NVLINK (detected $NVLINK_COUNT NVLink references)"
25+
26+
SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"
27+
source "${SCRIPT_DIR}/../scripts/models/glm4.7-30B-A3B.sh"
28+
29+
CKPT_ARGS=(
30+
--hf-checkpoint $BASE_DIR/GLM-4.7-Flash
31+
--ref-load $BASE_DIR/GLM-4.7-Flash_torch_dist/
32+
)
33+
34+
ROLLOUT_ARGS=(
35+
--prompt-data $BASE_DIR/dapo-math-17k/dapo-math-17k.jsonl
36+
--input-key prompt
37+
--label-key label
38+
--apply-chat-template
39+
--rollout-shuffle
40+
41+
--rm-type deepscaler
42+
43+
--num-rollout 3000
44+
--rollout-batch-size 128
45+
#--over-sampling-batch-size 256
46+
--n-samples-per-prompt 8
47+
--rollout-max-response-len 32768
48+
--rollout-temperature 1.0
49+
50+
--global-batch-size 1024
51+
#--balance-data
52+
)
53+
54+
EVAL_ARGS=(
55+
--eval-interval 20
56+
--eval-prompt-data aime24 $BASE_DIR/rl_data/aime-2024.jsonl
57+
--n-samples-per-eval-prompt 2
58+
--eval-max-response-len 16384
59+
--eval-temperature 0.6
60+
--eval-top-p 0.95
61+
)
62+
63+
PERF_ARGS=(
64+
--tensor-model-parallel-size 4
65+
--sequence-parallel
66+
--pipeline-model-parallel-size 2
67+
--context-parallel-size 2
68+
--expert-model-parallel-size 8
69+
--expert-tensor-parallel-size 1
70+
--decoder-last-pipeline-num-layers 23
71+
72+
--recompute-granularity full
73+
--recompute-method uniform
74+
--recompute-num-layers 1
75+
76+
--use-dynamic-batch-size
77+
--max-tokens-per-gpu 32768
78+
)
79+
80+
GRPO_ARGS=(
81+
--advantage-estimator grpo
82+
--use-kl-loss
83+
--kl-loss-coef 0.00
84+
--kl-loss-type low_var_kl
85+
--kl-coef 0.00
86+
--entropy-coef 0.00
87+
)
88+
89+
OPTIMIZER_ARGS=(
90+
--optimizer adam
91+
--lr 1e-6
92+
--lr-decay-style constant
93+
--weight-decay 0.1
94+
--adam-beta1 0.9
95+
--adam-beta2 0.98
96+
97+
--optimizer-cpu-offload
98+
--overlap-cpu-optimizer-d2h-h2d
99+
--use-precision-aware-optimizer
100+
)
101+
102+
WANDB_ARGS=(
103+
# --use-wandb
104+
# --wandb-project slime-dev
105+
# --wandb-group glm4.7-flash
106+
)
107+
108+
SGLANG_ARGS=(
109+
--rollout-num-gpus-per-engine 8
110+
--sglang-mem-fraction-static 0.8
111+
--sglang-enable-dp-attention
112+
--sglang-dp-size 8
113+
--sglang-enable-dp-lm-head
114+
--sglang-moe-dense-tp-size 1
115+
116+
# mtp
117+
--sglang-speculative-algorithm EAGLE
118+
--sglang-speculative-num-steps 2
119+
--sglang-speculative-eagle-topk 1
120+
--sglang-speculative-num-draft-tokens 3
121+
122+
--sglang-cuda-graph-max-bs 64
123+
124+
--sglang-max-running-requests 512
125+
)
126+
127+
MISC_ARGS=(
128+
# default dropout in megatron is 0.1
129+
--attention-dropout 0.0
130+
--hidden-dropout 0.0
131+
# should be good for model performance
132+
--accumulate-allreduce-grads-in-fp32
133+
--attention-softmax-in-fp32
134+
# comment this out when using a model with MLA
135+
--attention-backend flash
136+
137+
--moe-token-dispatcher-type flex
138+
--moe-enable-deepep
139+
)
140+
141+
# launch the Ray head node inside the container
142+
export MASTER_ADDR=${MLP_WORKER_0_HOST}
143+
export no_proxy="127.0.0.1,${MASTER_ADDR}"
144+
ray start --head --node-ip-address ${MASTER_ADDR} --num-gpus 8 --disable-usage-stats
145+
146+
for WORKER_IP in $(awk '{print $1}' /root/mpi_rack_hostfile); do
147+
if [[ "$WORKER_IP" == "$MLP_WORKER_0_HOST" ]]; then
148+
continue
149+
fi
150+
echo "Starting Ray worker on ${WORKER_IP}"
151+
ssh root@"${WORKER_IP}" \
152+
"pkill -9 sglang ; ray stop --force ; pkill -9 python ; ray start --address=${MASTER_ADDR}:6379 --num-gpus 8 --node-ip-address ${WORKER_IP} --disable-usage-stats" &
153+
done
154+
wait
155+
156+
ray job submit --address="http://127.0.0.1:8265" \
157+
--runtime-env-json='{
158+
"env_vars": {
159+
"no_proxy": "localhost,127.0.0.1,0.0.0.0,${MASTER_ADDR}",
160+
"GLOO_SOCKET_IFNAME": "${MLP_SOCKET_IFNAME}",
161+
"TP_SOCKET_IFNAME": "${MLP_SOCKET_IFNAME}",
162+
"MASTER_ADDR": "${MLP_WORKER_0_HOST}",
163+
"PYTHONPATH": "/root/Megatron-LM/",
164+
"NCCL_CUMEM_ENABLE": "0",
165+
"CUDA_DEVICE_MAX_CONNECTIONS": "1",
166+
"NVTE_BWD_LAYERNORM_SM_MARGIN": "20",
167+
"NCCL_IB_TC": "160",
168+
"NCCL_PXN_DISABLE": "0",
169+
"NCCL_IB_GID_INDEX": "3",
170+
"NCCL_NET_GDR_LEVEL": "4",
171+
"NCCL_IB_RETRY_CNT": "7",
172+
"NCCL_IB_TIMEOUT": "32",
173+
"NCCL_IB_QPS_PER_CONNECTION": "8",
174+
"NCCL_P2P_LEVEL": "NVL",
175+
"TORCH_NCCL_AVOID_RECORD_STREAMS": "1",
176+
"NCCL_NVLS_ENABLE": "0",
177+
"NCCL_MIN_CTAS": "4",
178+
"OMPI_MCA_pml": "ob1",
179+
"OMPI_MCA_btl": "^openib",
180+
"OMPI_MCA_routed": "direct",
181+
"OMPI_MCA_routed_radix": "1024",
182+
"OMPI_MCA_plm_rsh_no_tree_spawn": "1",
183+
"OMPI_MCA_oob_tcp_if_include": "${MLP_SOCKET_IFNAME}",
184+
"OMPI_MCA_btl_tcp_if_include": "${MLP_SOCKET_IFNAME}"
185+
}
186+
}' \
187+
-- python3 train.py \
188+
--actor-num-nodes 2 \
189+
--actor-num-gpus-per-node 8 \
190+
--colocate \
191+
--save-debug-rollout-data "data.pt" \
192+
${MODEL_ARGS[@]} \
193+
${CKPT_ARGS[@]} \
194+
${ROLLOUT_ARGS[@]} \
195+
${OPTIMIZER_ARGS[@]} \
196+
${GRPO_ARGS[@]} \
197+
${WANDB_ARGS[@]} \
198+
${PERF_ARGS[@]} \
199+
${EVAL_ARGS[@]} \
200+
${SGLANG_ARGS[@]} \
201+
${MISC_ARGS[@]}

scripts/run-qwen3-next-80B-A3B.sh

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -129,7 +129,6 @@ SGLANG_ARGS=(
129129
--sglang-speculative-num-steps 2
130130
--sglang-speculative-eagle-topk 1
131131
--sglang-speculative-num-draft-tokens 3
132-
--sglang-enable-draft-weights-cpu-backup
133132

134133
--sglang-max-running-requests 512
135134
)

0 commit comments

Comments
 (0)