
Commit d1346d8

khazic and claude committed
feat: add Qwen3.5 SFT megatron recipes for 2B and 27B models
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent 4240ea7 commit d1346d8

File tree

2 files changed: +256 −0 lines changed

Lines changed: 128 additions & 0 deletions
@@ -0,0 +1,128 @@
#!/usr/bin/env bash
set -xeuo pipefail


NUM_GPUS=${NUM_GPUS:-8}
NNODES=${WORLD_SIZE:-${NNODES:-4}}
NODE_RANK=${RANK:-${NODE_RANK:-0}}
MASTER_PORT=${MASTER_PORT:-8888}

# Resolve the master address to an IPv4 address; fall back to the raw value if lookup fails.
RAW_MASTER_ADDR=${MASTER_ADDR:-127.0.0.1}
MASTER_ADDR=$(python3 -c "import socket; print(socket.getaddrinfo('${RAW_MASTER_ADDR}', None, socket.AF_INET)[0][4][0])" 2>/dev/null || echo "${RAW_MASTER_ADDR}")

TRAIN_FILES=${TRAIN_FILES:-"[/llm-align/liuchonghan/ins_dataset/ins_dataset/verl_parquet/Gemini3_translate_110w_1096687.parquet,/llm-align/liuchonghan/ins_dataset/ins_dataset/verl_parquet/Gemini_QA_mm_92w_920623.parquet]"}

MODEL_PATH=${MODEL_PATH:-/llm-align/liuchonghan/qwen3_5_27b_sft_global_step_8000}

TP_SIZE=${TP_SIZE:-8}
PP_SIZE=${PP_SIZE:-1}
VPP_SIZE=${VPP_SIZE:-null}
CP_SIZE=${CP_SIZE:-1}
EP_SIZE=${EP_SIZE:-1}
ETP_SIZE=${ETP_SIZE:-1}

TRAIN_BATCH_SIZE=${TRAIN_BATCH_SIZE:-256}
MICRO_BATCH_SIZE=${MICRO_BATCH_SIZE:-8}
MAX_LENGTH=${MAX_LENGTH:-6144}
MAX_TOKEN_LEN_PER_GPU=${MAX_TOKEN_LEN_PER_GPU:-${MAX_LENGTH}}
PAD_MODE=${PAD_MODE:-no_padding}
TRUNCATION=${TRUNCATION:-right}
NUM_WORKERS=${NUM_WORKERS:-1} # the ~6.7M-sample dataset is large; many workers easily cause CPU OOM, so default to 1
LR=${LR:-5e-6}
MIN_LR=${MIN_LR:-5e-7}
DTYPE=${DTYPE:-bfloat16}
TOTAL_EPOCHS=${TOTAL_EPOCHS:-2}

echo ">>> Data files: ${TRAIN_FILES}, total_epochs=${TOTAL_EPOCHS}"

BACKEND=megatron
RESUME_MODE=${RESUME_MODE:-disable}

project_name=verl_sft_qwen3_5_27b_translate
exp_name=qwen3_5_27b-${BACKEND}-tp${TP_SIZE}-pp${PP_SIZE}-cp${CP_SIZE}
ckpts_home=${ckpts_home:-/llm-align/liuchonghan/ckpt_verl/sft/${project_name}/${exp_name}}

echo ">>> Node info: RANK ${NODE_RANK} / WORLD_SIZE ${NNODES}"
echo ">>> Comm info: MASTER ${MASTER_ADDR} : ${MASTER_PORT}"

if [ "${NODE_RANK}" -eq 0 ]; then
    mkdir -p "${ckpts_home}"
fi

# Qwen3.5 GDN + megatron bshd path currently requires no_padding + a static batch size.
if [ "${PAD_MODE}" != "no_padding" ]; then
    echo "ERROR: PAD_MODE must be no_padding for the Qwen3.5 megatron bshd path."
    exit 1
fi

export WANDB_MODE=${WANDB_MODE:-offline}
export NCCL_DEBUG=WARN
export PYTORCH_ALLOC_CONF=expandable_segments:True
export HYDRA_FULL_ERROR=1
export PYTHONPATH=${PYTHONPATH:-}:/llm-align/liuchonghan/verl_lao

# Key Qwen3.5 settings:
#   engine.use_remove_padding=False - GDN requires bshd format (no THD)
#   engine.vanilla_mbridge=True     - use mbridge (not megatron-bridge)
ENGINE_CONFIG="\
engine=${BACKEND} \
optim=${BACKEND} \
optim.lr=${LR} \
optim.min_lr=${MIN_LR} \
optim.lr_warmup_steps=20 \
optim.weight_decay=0.1 \
optim.betas='[0.9,0.95]' \
optim.clip_grad=1.0 \
optim.lr_warmup_init=0 \
optim.lr_decay_style=cosine \
+optim.override_optimizer_config.optimizer_offload_fraction=0 \
+optim.override_optimizer_config.overlap_cpu_optimizer_d2h_h2d=False \
+optim.override_optimizer_config.use_precision_aware_optimizer=True \
+optim.override_optimizer_config.optimizer_cpu_offload=False \
engine.tensor_model_parallel_size=${TP_SIZE} \
engine.pipeline_model_parallel_size=${PP_SIZE} \
engine.virtual_pipeline_model_parallel_size=${VPP_SIZE} \
engine.context_parallel_size=${CP_SIZE} \
engine.expert_model_parallel_size=${EP_SIZE} \
engine.expert_tensor_parallel_size=${ETP_SIZE} \
engine.use_mbridge=True \
engine.vanilla_mbridge=True \
engine.dtype=${DTYPE} \
engine.use_remove_padding=False \
engine.override_transformer_config.attention_backend=auto \
+engine.override_transformer_config.recompute_method=uniform \
+engine.override_transformer_config.recompute_granularity=full \
+engine.override_transformer_config.recompute_num_layers=1"
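
# Note (added commentary, not in the commit): the leading "+" on the
# override_optimizer_config and recompute keys is Hydra's append syntax -
# "+a.b=1" inserts a key that is absent from the base config, whereas a
# bare "a.b=1" overrides a key that already exists there.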

torchrun \
    --nproc_per_node=${NUM_GPUS} \
    --nnodes=${NNODES} \
    --node_rank=${NODE_RANK} \
    --master_addr=${MASTER_ADDR} \
    --master_port=${MASTER_PORT} \
    -m verl.trainer.sft_trainer \
    "data.train_files=${TRAIN_FILES}" \
    data.train_batch_size=${TRAIN_BATCH_SIZE} \
    data.micro_batch_size_per_gpu=${MICRO_BATCH_SIZE} \
    data.max_length=${MAX_LENGTH} \
    data.pad_mode=${PAD_MODE} \
    data.truncation=${TRUNCATION} \
    data.use_dynamic_bsz=False \
    data.max_token_len_per_gpu=${MAX_TOKEN_LEN_PER_GPU} \
    data.num_workers=${NUM_WORKERS} \
    data.messages_key=messages \
    model.path=${MODEL_PATH} \
    model.use_remove_padding=True \
    model.trust_remote_code=True \
    model.enable_gradient_checkpointing=True \
    ${ENGINE_CONFIG} \
    trainer.test_freq=-1 \
    trainer.save_freq=1000 \
    trainer.max_ckpt_to_keep=10 \
    trainer.logger="['console']" \
    trainer.project_name="${project_name}" \
    trainer.experiment_name="${exp_name}" \
    trainer.total_epochs=${TOTAL_EPOCHS} \
    trainer.default_local_dir="${ckpts_home}" \
    trainer.resume_mode=${RESUME_MODE} \
    'checkpoint.save_contents=[hf_model]'
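
The 27B recipe above is driven entirely by environment variables, so a multi-node launch only needs the rendezvous settings overridden per node. A minimal sketch (the script name run_qwen3_5_27b_sft.sh and all paths are placeholders, not part of the commit):

    # On each of the 4 nodes, with NODE_RANK set to 0..3; values are illustrative.
    NNODES=4 NODE_RANK=0 \
    MASTER_ADDR=10.0.0.1 MASTER_PORT=8888 \
    MODEL_PATH=/path/to/qwen3_5_27b_ckpt \
    bash run_qwen3_5_27b_sft.sh

Assuming the usual Megatron batch arithmetic, the defaults (4 nodes × 8 GPUs = 32 GPUs, TP=8, PP=CP=1) give a data-parallel size of 32 / 8 = 4, so a global batch of 256 at micro-batch 8 per GPU implies 256 / (4 × 8) = 8 gradient-accumulation steps per optimizer step.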
Lines changed: 128 additions & 0 deletions
@@ -0,0 +1,128 @@
#!/usr/bin/env bash
set -xeuo pipefail


NUM_GPUS=${NUM_GPUS:-8}
NNODES=${WORLD_SIZE:-${NNODES:-1}}
NODE_RANK=${RANK:-${NODE_RANK:-0}}
MASTER_PORT=${MASTER_PORT:-8888}

RAW_MASTER_ADDR=${MASTER_ADDR:-127.0.0.1}
MASTER_ADDR=$(python3 -c "import socket; print(socket.getaddrinfo('${RAW_MASTER_ADDR}', None, socket.AF_INET)[0][4][0])" 2>/dev/null || echo "${RAW_MASTER_ADDR}")

TRAIN_FILES=${TRAIN_FILES:-/llm-align/liuchonghan/train_1.7b_trans_total_v3.parquet}

MODEL_PATH=${MODEL_PATH:-/llm-align/liuchonghan/Qwen3.5-2B}

TP_SIZE=${TP_SIZE:-1}
PP_SIZE=${PP_SIZE:-1}
VPP_SIZE=${VPP_SIZE:-null}
CP_SIZE=${CP_SIZE:-1}
EP_SIZE=${EP_SIZE:-1}
ETP_SIZE=${ETP_SIZE:-1}

TRAIN_BATCH_SIZE=${TRAIN_BATCH_SIZE:-256}
MICRO_BATCH_SIZE=${MICRO_BATCH_SIZE:-8}
MAX_LENGTH=${MAX_LENGTH:-2048}
MAX_TOKEN_LEN_PER_GPU=${MAX_TOKEN_LEN_PER_GPU:-${MAX_LENGTH}}
PAD_MODE=${PAD_MODE:-no_padding}
TRUNCATION=${TRUNCATION:-right}
NUM_WORKERS=${NUM_WORKERS:-0}
LR=${LR:-5e-6}
MIN_LR=${MIN_LR:-5e-7}
DTYPE=${DTYPE:-bfloat16}
TOTAL_EPOCHS=${TOTAL_EPOCHS:-2}

echo ">>> Data files: ${TRAIN_FILES}, total_epochs=${TOTAL_EPOCHS}"

BACKEND=megatron
RESUME_MODE=${RESUME_MODE:-disable}

project_name=verl_sft_qwen3_5_2b
exp_name=qwen3_5_2b-${BACKEND}-tp${TP_SIZE}-pp${PP_SIZE}-cp${CP_SIZE}
ckpts_home=${ckpts_home:-/llm-align/liuchonghan/ckpt_verl/sft/${project_name}/${exp_name}}

echo ">>> Node info: RANK ${NODE_RANK} / WORLD_SIZE ${NNODES}"
echo ">>> Comm info: MASTER ${MASTER_ADDR} : ${MASTER_PORT}"

if [ "${NODE_RANK}" -eq 0 ]; then
    mkdir -p "${ckpts_home}"
fi

# Qwen3.5 GDN + megatron bshd path currently requires no_padding + a static batch size.
if [ "${PAD_MODE}" != "no_padding" ]; then
    echo "ERROR: PAD_MODE must be no_padding for the Qwen3.5 megatron bshd path."
    exit 1
fi

export WANDB_MODE=${WANDB_MODE:-offline}
export NCCL_DEBUG=WARN
export PYTORCH_ALLOC_CONF=expandable_segments:True
export HYDRA_FULL_ERROR=1
export PYTHONPATH=${PYTHONPATH:-}:/llm-align/liuchonghan/verl_lao

# Key Qwen3.5 settings:
#   engine.use_remove_padding=False - GDN requires bshd format (no THD)
#   engine.vanilla_mbridge=True     - use mbridge (not megatron-bridge)
ENGINE_CONFIG="\
engine=${BACKEND} \
optim=${BACKEND} \
optim.lr=${LR} \
optim.min_lr=${MIN_LR} \
optim.lr_warmup_steps=20 \
optim.weight_decay=0.1 \
optim.betas='[0.9,0.95]' \
optim.clip_grad=1.0 \
optim.lr_warmup_init=0 \
optim.lr_decay_style=cosine \
+optim.override_optimizer_config.optimizer_offload_fraction=0 \
+optim.override_optimizer_config.overlap_cpu_optimizer_d2h_h2d=False \
+optim.override_optimizer_config.use_precision_aware_optimizer=True \
+optim.override_optimizer_config.optimizer_cpu_offload=False \
engine.tensor_model_parallel_size=${TP_SIZE} \
engine.pipeline_model_parallel_size=${PP_SIZE} \
engine.virtual_pipeline_model_parallel_size=${VPP_SIZE} \
engine.context_parallel_size=${CP_SIZE} \
engine.expert_model_parallel_size=${EP_SIZE} \
engine.expert_tensor_parallel_size=${ETP_SIZE} \
engine.use_mbridge=True \
engine.vanilla_mbridge=True \
engine.dtype=${DTYPE} \
engine.use_remove_padding=False \
engine.override_transformer_config.attention_backend=auto \
+engine.override_transformer_config.recompute_method=uniform \
+engine.override_transformer_config.recompute_granularity=full \
+engine.override_transformer_config.recompute_num_layers=1"

torchrun \
    --nproc_per_node=${NUM_GPUS} \
    --nnodes=${NNODES} \
    --node_rank=${NODE_RANK} \
    --master_addr=${MASTER_ADDR} \
    --master_port=${MASTER_PORT} \
    -m verl.trainer.sft_trainer \
    data.train_files="${TRAIN_FILES}" \
    data.train_batch_size=${TRAIN_BATCH_SIZE} \
    data.micro_batch_size_per_gpu=${MICRO_BATCH_SIZE} \
    data.max_length=${MAX_LENGTH} \
    data.pad_mode=${PAD_MODE} \
    data.truncation=${TRUNCATION} \
    data.use_dynamic_bsz=False \
    data.max_token_len_per_gpu=${MAX_TOKEN_LEN_PER_GPU} \
    data.num_workers=${NUM_WORKERS} \
    data.messages_key=messages \
    model.path=${MODEL_PATH} \
    model.use_remove_padding=True \
    model.trust_remote_code=True \
    model.enable_gradient_checkpointing=True \
    ${ENGINE_CONFIG} \
    trainer.test_freq=-1 \
    trainer.save_freq=500 \
    trainer.max_ckpt_to_keep=3 \
    trainer.logger="['console']" \
    trainer.project_name="${project_name}" \
    trainer.experiment_name="${exp_name}" \
    trainer.total_epochs=${TOTAL_EPOCHS} \
    trainer.default_local_dir="${ckpts_home}" \
    trainer.resume_mode=${RESUME_MODE} \
    'checkpoint.save_contents=[model,optimizer,extra,hf_model]'
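
A single-node sketch for the 2B recipe (the script name run_qwen3_5_2b_sft.sh and the paths are placeholders, not part of the commit). With the default NNODES=1 the script already targets one 8-GPU machine, so only the data and model paths need overriding:

    # One node, 8 GPUs; paths are illustrative.
    TRAIN_FILES=/data/my_sft.parquet \
    MODEL_PATH=/models/Qwen3.5-2B \
    bash run_qwen3_5_2b_sft.sh

With TP=1 on 8 GPUs the data-parallel size is 8, so a global batch of 256 at micro-batch 8 per GPU works out to 256 / (8 × 8) = 4 gradient-accumulation steps. Note this variant saves full training state ([model,optimizer,extra,hf_model]) every 500 steps, while the 27B recipe keeps only the HF model export.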
