
Commit 5b706fb

Authored by yefei12, chenyefei.cyf, GeLee-Q, Gao016, and yzlnew

add FP8 training and inference script for Qwen3-30B-A3B model (#845)

Co-authored-by: chenyefei.cyf <chenyefei.cyf@U-9V5T77LW-2356.local>
Co-authored-by: GeLee-Q <865038696@qq.com>
Co-authored-by: Gao016 <yngao016@163.com>
Co-authored-by: yzlnew <yzlnew@gmail.com>

Parent: cf33f0e

File tree

2 files changed: +189 additions, -1 deletion

examples/low_precision/README.md

Lines changed: 9 additions & 1 deletion
@@ -6,6 +6,8 @@ This is an example of FP8 training and FP8 inference. Under FP8 training and inf
 
 * `run-qwen3-4b-fp8.sh`: example launch script with Qwen3‑4B in FP8.
 
+* `run-qwen3-30b-a3b-fp8-two-nodes.sh`: example launch script for running Qwen3‑30B‑A3B in FP8 across two nodes.
+
 ### Quick Start
 
 1. [optional] Convert your HuggingFace weights to FP8 format. You can use `tools/convert_hf_to_fp8`, or directly write an FP8 format model config.
@@ -14,8 +16,14 @@ This is an example of FP8 training and FP8 inference. Under FP8 training and inf
 
 ```
 cd slime
-bash examples/fp8/run-qwen3-4b-fp8.sh
+
+# Qwen3‑4B FP8 training (single node)
+bash examples/low_precision/run-qwen3-4b-fp8.sh
+
+# Qwen3‑30B‑A3B FP8 training (two nodes)
+bash examples/low_precision/run-qwen3-30b-a3b-fp8-two-nodes.sh
 ```
+
 Following the above command will launch FP8 training. According to slime's design, if the model under `--hf-checkpoint` is FP8, it will automatically use FP8 quantization in weight updates.
 
 3. Use the saved checkpoint for evaluation
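The conversion in step 1 produces the blockwise-scaled FP8 (E4M3) weights that the training script later selects with `--fp8-format e4m3 --fp8-recipe blockwise`: each weight block stores one FP32 scale so that the block fits the E4M3 range (largest finite value 448). A minimal NumPy sketch of the scaling idea only, with hypothetical helper names; the real `tools/convert_hf_to_fp8` also rounds values onto the 8-bit e4m3 grid and writes HuggingFace-format tensors:

```python
import numpy as np

E4M3_MAX = 448.0  # largest finite value representable in FP8 E4M3


def quantize_blockwise(w: np.ndarray, block: int = 128):
    """One FP32 scale per `block` consecutive values (illustrative only).

    Returns (scaled values clipped to the E4M3 range, per-block scales).
    Real FP8 kernels additionally round onto the 8-bit e4m3 grid.
    """
    pad = (-len(w)) % block
    wp = np.pad(w, (0, pad)).reshape(-1, block)
    scales = np.abs(wp).max(axis=1, keepdims=True) / E4M3_MAX
    scales = np.where(scales == 0, 1.0, scales)  # avoid div-by-zero blocks
    q = np.clip(wp / scales, -E4M3_MAX, E4M3_MAX)
    return q, scales.ravel()


def dequantize_blockwise(q: np.ndarray, scales: np.ndarray, n=None):
    """Undo the per-block scaling; `n` trims the padding added above."""
    w = (q * scales[:, None]).ravel()
    return w if n is None else w[:n]


w = np.linspace(-1000.0, 1000.0, 999).astype(np.float32)
q, scales = quantize_blockwise(w)
print("max |q|:", float(np.abs(q).max()))  # stays within the E4M3 range
```

Because this sketch only rescales (it skips the 8-bit rounding), dequantization recovers the input up to float rounding; the precision loss in real FP8 training comes from the e4m3 grid, not from the scales.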
examples/low_precision/run-qwen3-30b-a3b-fp8-two-nodes.sh

Lines changed: 180 additions & 0 deletions
@@ -0,0 +1,180 @@
#!/bin/bash

# for rerunning the task
pkill -9 sglang
sleep 3
ray stop --force
pkill -9 ray
pkill -9 python
sleep 3
pkill -9 ray
pkill -9 python
pkill -9 redis

set -ex

# prevent Ray from buffering stdout/stderr
export PYTHONUNBUFFERED=1

NVLINK_COUNT=$(nvidia-smi topo -m 2>/dev/null | grep -o 'NV[0-9][0-9]*' | wc -l)
if [ "$NVLINK_COUNT" -gt 0 ]; then
    HAS_NVLINK=1
else
    HAS_NVLINK=0
fi
echo "HAS_NVLINK: $HAS_NVLINK (detected $NVLINK_COUNT NVLink references)"

SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"
source "${SCRIPT_DIR}/../../scripts/models/qwen3-30B-A3B.sh"

# Base directory for checkpoints and related files (adjust if necessary)
BASE_DIR="/root"

CKPT_ARGS=(
    --hf-checkpoint "${BASE_DIR}/Qwen3-30B-A3B-FP8/"
    --ref-load "${BASE_DIR}/Qwen3-30B-A3B_torch_dist/"
    --load "${BASE_DIR}/Qwen3-30B-A3B_slime/"
    --save "${BASE_DIR}/Qwen3-30B-A3B_slime/"
    --save-interval 20
)

ROLLOUT_ARGS=(
    --prompt-data "${BASE_DIR}/dapo-math-17k.jsonl"
    --input-key prompt
    --label-key label
    --apply-chat-template
    --rollout-shuffle
    --rm-type math
    --num-rollout 200
    --rollout-batch-size 16
    --n-samples-per-prompt 8
    --rollout-max-response-len 8192
    --rollout-temperature 0.8

    --global-batch-size 128
    --balance-data
)

EVAL_ARGS=(
    --eval-interval 20
    --eval-prompt-data aime "${BASE_DIR}/aime-2024.jsonl"
    --n-samples-per-eval-prompt 16
    --eval-max-response-len 16384
    --eval-top-p 0.7
)

PERF_ARGS=(
    --tensor-model-parallel-size 1
    --sequence-parallel
    --pipeline-model-parallel-size 4
    --context-parallel-size 1
    --expert-model-parallel-size 4
    --expert-tensor-parallel-size 1

    --recompute-granularity full
    --recompute-method uniform
    --recompute-num-layers 1

    # --micro-batch-size 1
    --use-dynamic-batch-size
    --max-tokens-per-gpu 20480

    # use DeepEP for Megatron
    --moe-enable-deepep
    --moe-token-dispatcher-type flex

    # fp8
    --transformer-impl transformer_engine
    --bf16
    --fp8-format e4m3
    --fp8-recipe blockwise
    # --fp8-param-gather
)

GRPO_ARGS=(
    --advantage-estimator grpo
    --use-kl-loss
    --kl-loss-coef 0.00
    --kl-loss-type low_var_kl
    --entropy-coef 0.00
    --eps-clip 0.2
    --eps-clip-high 0.28
    --use-tis
)

OPTIMIZER_ARGS=(
    --optimizer adam
    --lr 1e-6
    --lr-decay-style constant
    --weight-decay 0.1
    --adam-beta1 0.9
    --adam-beta2 0.98

    --optimizer-cpu-offload
    --overlap-cpu-optimizer-d2h-h2d
    --use-precision-aware-optimizer
)

WANDB_ARGS=(
    # --use-wandb
    # --wandb-project slime-dev
    # --wandb-group qwen3-30B-A3B-test
    # --wandb-key ${WANDB_KEY}
)

SGLANG_ARGS=(
    --rollout-num-gpus-per-engine 8
    --sglang-mem-fraction-static 0.6
    --sglang-cuda-graph-bs 1 2 4 8 $(seq 16 8 256)
    --sglang-expert-parallel-size 8
    --use-slime-router
    # --use-rollout-routing-replay
)

MISC_ARGS=(
    # default dropout in Megatron is 0.1
    --attention-dropout 0.0
    --hidden-dropout 0.0
    # should be good for model performance
    --accumulate-allreduce-grads-in-fp32
    --attention-softmax-in-fp32
    # need to comment this out when using a model with MLA
    --attention-backend flash
)

# Get Ray head node info automatically
ip=$(ps aux | grep dashboard | grep -oP '(?<=--node-ip-address=)[0-9\.]+' | head -1)
port=$(ps aux | grep dashboard | grep -oP '(?<=dashboard-port=)\d+' | head -1)
export HEAD_NODE_ADDRESS="$ip"
export DASHBOARD_PORT="$port"
echo "Detected Ray Head IP: $HEAD_NODE_ADDRESS, Port: $DASHBOARD_PORT"

export RAY_ADDRESS="http://${HEAD_NODE_ADDRESS}:${DASHBOARD_PORT}"

# Enable NVTE_FP8_BLOCK_SCALING_FP32_SCALES to use FP32 scales in FP8 training
RUNTIME_ENV_JSON="{
  \"env_vars\": {
    \"PYTHONPATH\": \"/root/Megatron-LM/\",
    \"CUDA_DEVICE_MAX_CONNECTIONS\": \"1\",
    \"NCCL_NVLS_ENABLE\": \"${HAS_NVLINK}\",
    \"NVTE_FP8_BLOCK_SCALING_FP32_SCALES\": \"1\",
    \"NCCL_TIMEOUT_MS\": \"36000000\"
  }
}"

ray job submit --address="${RAY_ADDRESS}" \
  --runtime-env-json="${RUNTIME_ENV_JSON}" \
  -- python3 train.py \
  --actor-num-nodes 2 \
  --actor-num-gpus-per-node 8 \
  --colocate \
  ${MODEL_ARGS[@]} \
  ${CKPT_ARGS[@]} \
  ${ROLLOUT_ARGS[@]} \
  ${OPTIMIZER_ARGS[@]} \
  ${GRPO_ARGS[@]} \
  ${WANDB_ARGS[@]} \
  ${PERF_ARGS[@]} \
  ${EVAL_ARGS[@]} \
  ${SGLANG_ARGS[@]} \
  ${MISC_ARGS[@]}
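The README notes that slime automatically switches to FP8 quantization in weight updates when the model under `--hf-checkpoint` (here `${BASE_DIR}/Qwen3-30B-A3B-FP8/`) is FP8. Whether a HuggingFace checkpoint is FP8 is typically declared in its `config.json`. A hedged Python sketch of such a check; the `quantization_config`/`quant_method` field names follow common FP8 checkpoints, and this is an illustration, not slime's actual detection code:

```python
import json
import pathlib
import tempfile


def is_fp8_checkpoint(ckpt_dir: str) -> bool:
    """Heuristic: does config.json declare an FP8 quantization_config?

    Field names follow common FP8 HuggingFace checkpoints; this is an
    illustrative check, not slime's actual detection logic.
    """
    cfg = json.loads((pathlib.Path(ckpt_dir) / "config.json").read_text())
    qc = cfg.get("quantization_config") or {}
    return qc.get("quant_method") == "fp8"


# Demo against a throwaway directory standing in for a real checkpoint.
with tempfile.TemporaryDirectory() as d:
    (pathlib.Path(d) / "config.json").write_text(json.dumps({
        "model_type": "qwen3_moe",
        "quantization_config": {"quant_method": "fp8",
                                "weight_block_size": [128, 128]},
    }))
    print(is_fp8_checkpoint(d))  # prints True
```

If the flag is absent (a plain BF16 checkpoint), the check returns False, which matches the README's behavior of FP8 weight updates being conditional on the `--hf-checkpoint` model format.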
