Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/cpu_unit_tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ jobs:
run: |
pip3 install -r requirements-test.txt
pip3 install --no-deps -e .
pip3 install --upgrade "transformers<5.0.0"
pip3 install --upgrade "transformers>=5.0.0"
- name: Download datasets
run: |
python3 examples/data_preprocess/gsm8k.py --local_dataset_path ${HOME}/models/hf_data/gsm8k
Expand Down
165 changes: 165 additions & 0 deletions examples/grpo_trainer/run_qwen3_5-35b-megatron.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
#!/usr/bin/env bash
# Qwen3.5-35B-A3B MoE GRPO RL with Megatron (single node, 8 GPUs, geo3k dataset)
#
# notes on vllm:
# As of 20260225, the latest vllm nightly does not support qwen3.5 rollout. To use this script, you need to do one of:
# 1. wait until vllm supports qwen3.5 officially, and build a verl docker image with that version of vllm
# 2. build a verl docker image yourself with vllm from source code with qwen3.5 support (main branch 20260225 is OK)
# I succeeded in running this script with the main branch of vllm on 20260225, yet there are still some minor issues
# with vllm qwen3.5 during initialization that need to be fixed. Also, cuda_graph is somehow not working; this needs
# to be fixed either by the verl team with support for vllm 0.16, or by the vllm team.
# Requirements:
# - 8 GPUs (80GB each, e.g. 1x8 H100/H200)
# - Additional packages on top of the base image:
# pip install --upgrade transformers
# pip install flash-linear-attention
# pip install -U git+https://github.com/ISEEKYAN/mbridge.git
# - Megatron-LM==0.16.0
#
# Qwen3.5 architecture notes:
# Qwen3.5 uses Gated Delta Net (GDN) linear attention which currently does
# NOT support packed sequences (THD format) in Megatron-LM. Therefore:
# - model.use_remove_padding=False (deprecated option that will be removed in the future; forces bshd compute format)
# - actor.megatron.use_remove_padding=False (forces bshd compute format)
# - actor.use_dynamic_bsz=False (required for bshd mode)
#
# Once Megatron-LM adds THD support for Qwen3.5 GDN, use_remove_padding
# can be set to True for better performance.
#
# Tested parallelism config (8 GPUs / 1 node):
# TP=2 PP=1 CP=1 EP=8 ETP=1 GEN_TP=8
#

# Process environment for the training and rollout stack. Going by the names,
# these are presumably read by the CUDA runtime and by vLLM -- confirm against
# their documentation. They are exported before tracing is switched on, so the
# export itself does not show up in the xtrace output.
export \
  CUDA_DEVICE_MAX_CONNECTIONS=1 \
  VLLM_USE_V1=1 \
  VLLM_ALLREDUCE_USE_SYMM_MEM=0

# Strict mode: trace every command, stop on the first failing command, treat
# unset variables as errors, and fail a pipeline if any stage fails.
set -o xtrace -o errexit -o nounset -o pipefail

########################### Quick Config ###########################

# Parallelism knobs. Each one keeps a non-empty value inherited from the
# environment and otherwise falls back to the tested 8-GPU layout
# (TP=2 PP=1 CP=1 EP=8 ETP=1 GEN_TP=8).
: "${TP:=2}"
: "${PP:=1}"
: "${CP:=1}"
: "${EP:=8}"
: "${ETP:=1}"
: "${GEN_TP:=8}"

# One switch that is forwarded to the param/optimizer/grad offload options of
# the actor and to the param offload option of the reference model.
: "${ALL_OFFLOAD:=True}"

# Fixed run identity and algorithm choice (not overridable from the environment).
rollout_name='vllm'
project_name="verl_grpo_qwen3_5_35b_geo3k"
exp_name="qwen3_5_35b_megatron"
adv_estimator='grpo'

# Model and dataset locations; override via the environment when they live elsewhere.
: "${HF_MODEL_PATH:=Qwen3.5-35B-A3B}"
: "${train_path:=${HOME}/data/geo3k/train.parquet}"
: "${test_path:=${HOME}/data/geo3k/test.parquet}"

########################### Parameter Arrays ###########################

# Hydra overrides for the dataset section.
# The two path expansions are quoted so that a location containing spaces or
# glob characters still becomes exactly one array element, i.e. one argument
# on the trainer command line.
DATA=(
  "data.train_files=${train_path}"
  "data.val_files=${test_path}"
  data.train_batch_size=32
  data.max_prompt_length=1024
  data.max_response_length=2048
  data.truncation='error'
  data.filter_overlong_prompts=True
)

# Hydra overrides for the model section.
# The model path is quoted so a local checkpoint directory with spaces in its
# name stays a single argument. use_remove_padding=False follows the header
# note: Qwen3.5 GDN attention has no packed-sequence (THD) support yet.
MODEL=(
  "actor_rollout_ref.model.path=${HF_MODEL_PATH}"
  actor_rollout_ref.model.trust_remote_code=True
  actor_rollout_ref.model.use_remove_padding=False
)

# Hydra overrides for the actor (policy being trained).
# Every expansion is quoted so that a value supplied through the environment
# can never be split into several arguments or glob-expanded.
ACTOR=(
  # Optimisation and batching. Dynamic batch sizing is off because the header
  # notes it must be disabled for the bshd compute format.
  actor_rollout_ref.actor.optim.lr=1e-6
  actor_rollout_ref.actor.ppo_mini_batch_size=32
  actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=1
  actor_rollout_ref.actor.ppo_max_token_len_per_gpu=4096
  actor_rollout_ref.actor.use_dynamic_bsz=False

  # KL regularisation is applied as a loss term; entropy bonus disabled.
  actor_rollout_ref.actor.use_kl_loss=True
  actor_rollout_ref.actor.kl_loss_coef=0.01
  actor_rollout_ref.actor.kl_loss_type=low_var_kl
  actor_rollout_ref.actor.entropy_coeff=0

  # Megatron engine: mbridge weight bridge, no padding removal (bshd format).
  actor_rollout_ref.actor.megatron.use_mbridge=True
  actor_rollout_ref.actor.megatron.vanilla_mbridge=True
  actor_rollout_ref.actor.megatron.use_remove_padding=False

  # Parallel layout, taken from the Quick Config variables.
  "actor_rollout_ref.actor.megatron.tensor_model_parallel_size=${TP}"
  "actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=${PP}"
  "actor_rollout_ref.actor.megatron.context_parallel_size=${CP}"
  "actor_rollout_ref.actor.megatron.expert_model_parallel_size=${EP}"
  "actor_rollout_ref.actor.megatron.expert_tensor_parallel_size=${ETP}"

  # Offloading of parameters, optimizer state and gradients share one switch.
  "actor_rollout_ref.actor.megatron.param_offload=${ALL_OFFLOAD}"
  "actor_rollout_ref.actor.megatron.optimizer_offload=${ALL_OFFLOAD}"
  "actor_rollout_ref.actor.megatron.grad_offload=${ALL_OFFLOAD}"
  actor_rollout_ref.actor.megatron.dtype=bfloat16

  # Transformer-config overrides. In Hydra's override grammar a leading "+"
  # appends a new key, while "++" appends the key or overrides it if present.
  ++actor_rollout_ref.actor.megatron.override_transformer_config.attention_backend=auto
  +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_method=uniform
  +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_granularity=full
  +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_num_layers=1
  +actor_rollout_ref.actor.megatron.override_transformer_config.moe_aux_loss_coeff=0.01
  +actor_rollout_ref.actor.megatron.override_transformer_config.moe_z_loss_coeff=0.001

  # Optimizer-config overrides (CPU offload of the optimizer).
  +actor_rollout_ref.actor.optim.override_optimizer_config.optimizer_offload_fraction=1
  +actor_rollout_ref.actor.optim.override_optimizer_config.overlap_cpu_optimizer_d2h_h2d=True
  +actor_rollout_ref.actor.optim.override_optimizer_config.use_precision_aware_optimizer=True
  +actor_rollout_ref.actor.optim.override_optimizer_config.optimizer_cpu_offload=True
)

# Hydra overrides for the rollout (generation) engine.
# Expansions are quoted so each override is always exactly one argument,
# whatever value GEN_TP carries in from the environment.
ROLLOUT=(
  "actor_rollout_ref.rollout.name=${rollout_name}"
  "actor_rollout_ref.rollout.tensor_model_parallel_size=${GEN_TP}"
  actor_rollout_ref.rollout.gpu_memory_utilization=0.6
  actor_rollout_ref.rollout.n=5
  actor_rollout_ref.rollout.mode=async
  actor_rollout_ref.rollout.dtype=bfloat16
  # Log-prob computation uses fixed micro-batches (dynamic batch sizing off).
  actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=1
  actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=False
  actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=4096
)

# Hydra overrides for the reference model.
# It uses the same parallel layout variables as the actor; expansions are
# quoted so each override is always passed as a single argument.
REF=(
  actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=1
  actor_rollout_ref.ref.log_prob_use_dynamic_bsz=False
  actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=4096
  "actor_rollout_ref.ref.megatron.tensor_model_parallel_size=${TP}"
  "actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=${PP}"
  "actor_rollout_ref.ref.megatron.context_parallel_size=${CP}"
  "actor_rollout_ref.ref.megatron.expert_model_parallel_size=${EP}"
  "actor_rollout_ref.ref.megatron.expert_tensor_parallel_size=${ETP}"
  "actor_rollout_ref.ref.megatron.param_offload=${ALL_OFFLOAD}"
)

# Hydra overrides for the RL algorithm: advantage estimator from adv_estimator,
# and no KL term folded into the reward. The expansion is quoted for
# consistency with the other override arrays.
ALGORITHM=(
  "algorithm.adv_estimator=${adv_estimator}"
  algorithm.use_kl_in_reward=False
)

# Hydra overrides for the trainer loop: logging, run identity, cluster shape
# (1 node x 8 GPUs) and save/validation cadence. Expansions are quoted so the
# project and experiment names are always passed as single arguments.
TRAINER=(
  trainer.critic_warmup=0
  # The shell removes the single quotes; the trainer receives ["console","wandb"].
  trainer.logger='["console","wandb"]'
  "trainer.project_name=${project_name}"
  "trainer.experiment_name=${exp_name}"
  trainer.n_gpus_per_node=8
  trainer.nnodes=1
  trainer.save_freq=20
  trainer.val_before_train=False
  trainer.test_freq=5
  trainer.total_epochs=15
)

########################### Launch ###########################

# Assemble the complete trainer command line, then execute it.
# The override groups are appended in a fixed order and "$@" comes last, so any
# extra overrides given on this script's own command line are placed after the
# defaults defined above.
launch_cmd=(
  python3 -m verl.trainer.main_ppo
  --config-path=config
  --config-name=ppo_megatron_trainer.yaml
  "${DATA[@]}"
  "${ALGORITHM[@]}"
  "${MODEL[@]}"
  "${ROLLOUT[@]}"
  "${ACTOR[@]}"
  "${REF[@]}"
  "${TRAINER[@]}"
  "$@"
)
"${launch_cmd[@]}"
142 changes: 142 additions & 0 deletions examples/sft/gsm8k/run_qwen3_5_megatron.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
#!/usr/bin/env bash
# Qwen3.5-397B-A17B SFT with Megatron backend + mbridge
#
# Requirements:
# - 128+ GPUs (80GB each, e.g. 16x8 H100/H200)
# - Docker: verlai/verl:vllm015 (or equivalent)
# - Additional packages on top of the base image:
# pip install --upgrade transformers
# pip install flash-linear-attention
# pip install -U git+https://github.com/ISEEKYAN/mbridge.git
# - Megatron-LM==0.16.0
#
# Qwen3.5 architecture notes:
# Qwen3.5 uses Gated Delta Net (GDN) linear attention which currently does
# NOT support packed sequences (THD format) in Megatron-LM. Therefore:
# - engine.use_remove_padding=False (forces bshd compute format)
# - data.use_dynamic_bsz=False (required for bshd mode)
#
# Once https://github.com/NVIDIA/Megatron-LM/pull/2644 is merged, THD
# format will be supported and engine.use_remove_padding can be set to True
# for better performance.
#
# Tested parallelism config (128 GPUs / 16 nodes):
# TP=2 PP=4 EP=32 CP=1

# Strict mode: trace commands, stop on errors, reject unset variables, and
# propagate failures from any pipeline stage.
set -o xtrace -o errexit -o nounset -o pipefail

# Every ":=" setting below keeps a non-empty value inherited from the
# environment and otherwise takes the default written here.

# ============================================================
# Distributed
# ============================================================
: "${NUM_GPUS:=8}"            # forwarded to torchrun --nproc_per_node
: "${MASTER_ADDR:=localhost}" # forwarded to torchrun --master_addr
: "${MASTER_PORT:=29500}"     # forwarded to torchrun --master_port
: "${NNODES:=16}"             # forwarded to torchrun --nnodes
: "${NODE_RANK:=0}"           # forwarded to torchrun --node_rank

# ============================================================
# Data
# ============================================================
# Kept as a plain unquoted assignment so the leading "~" is tilde-expanded.
DATASET_DIR=${DATASET_DIR:-~/dataset}
: "${TRAIN_FILES:=${DATASET_DIR}/train.parquet}"

# ============================================================
# Model
# ============================================================
: "${MODEL_PATH:=Qwen/Qwen3.5-397B-A17B}"

# ============================================================
# Parallelism
# ============================================================
# Defaults match the tested 128-GPU layout from the header (TP=2 PP=4 EP=32 CP=1).
: "${TP_SIZE:=2}"
: "${PP_SIZE:=4}"
: "${VPP_SIZE:=null}"
: "${CP_SIZE:=1}"
: "${EP_SIZE:=32}"
: "${ETP_SIZE:=1}"

# ============================================================
# Training
# ============================================================
: "${TRAIN_BATCH_SIZE:=128}"
: "${MICRO_BATCH_SIZE:=2}"
: "${MAX_LENGTH:=2048}"
: "${LR:=2e-5}"
: "${MIN_LR:=2e-6}"
: "${DTYPE:=bfloat16}"

# The backend is fixed for this script; only the resume mode is overridable.
BACKEND='megatron'
: "${RESUME_MODE:=disable}"

# Run identity: the experiment name encodes the backend and the parallel layout.
project_name='verl_sft_qwen3_5'
printf -v exp_name 'qwen3_5-%s-tp%s-pp%s-cp%s-ep%s' \
  "${BACKEND}" "${TP_SIZE}" "${PP_SIZE}" "${CP_SIZE}" "${EP_SIZE}"

# Checkpoint directory: a non-empty ckpts_home from the environment is kept,
# otherwise a per-project/per-experiment path under the home directory is used.
# The directory is created up front.
ckpts_home=${ckpts_home:-~/verl/checkpoints/${project_name}/${exp_name}}
mkdir -p "${ckpts_home}"

# ============================================================
# Engine config
# ============================================================
# Hydra overrides for the Megatron engine and its optimizer, held in an array
# so that every override reaches the trainer as exactly one argument, with no
# word splitting or pathname expansion applied to it.
#
# optim.betas: the shell strips the single quotes while building the array, so
# the trainer receives the list literal [0.9,0.95]. (Inside a double-quoted
# string those quotes would be passed through literally and Hydra would read
# the value as a quoted string instead of a list.)
#
# Key Qwen3.5 settings:
#   engine.use_remove_padding=False  - GDN requires bshd format (no THD)
#   engine.vanilla_mbridge=True      - use mbridge (not megatron-bridge)
ENGINE_CONFIG=(
  "engine=${BACKEND}"
  "optim=${BACKEND}"
  "optim.lr=${LR}"
  "optim.min_lr=${MIN_LR}"
  optim.lr_warmup_steps=10
  optim.weight_decay=0.1
  optim.betas='[0.9,0.95]'
  optim.clip_grad=1.0
  optim.lr_warmup_init=0
  optim.lr_decay_style=cosine
  # A leading "+" appends a key that the base config does not define.
  +optim.override_optimizer_config.optimizer_offload_fraction=1
  +optim.override_optimizer_config.overlap_cpu_optimizer_d2h_h2d=True
  +optim.override_optimizer_config.use_precision_aware_optimizer=True
  +optim.override_optimizer_config.optimizer_cpu_offload=True
  "engine.tensor_model_parallel_size=${TP_SIZE}"
  "engine.pipeline_model_parallel_size=${PP_SIZE}"
  "engine.virtual_pipeline_model_parallel_size=${VPP_SIZE}"
  "engine.context_parallel_size=${CP_SIZE}"
  "engine.expert_model_parallel_size=${EP_SIZE}"
  "engine.expert_tensor_parallel_size=${ETP_SIZE}"
  engine.use_mbridge=True
  engine.vanilla_mbridge=True
  "engine.dtype=${DTYPE}"
  engine.use_remove_padding=False
  engine.override_transformer_config.attention_backend=auto
  +engine.override_transformer_config.recompute_method=uniform
  +engine.override_transformer_config.recompute_granularity=full
  +engine.override_transformer_config.recompute_num_layers=1
)

# ============================================================
# Launch
# ============================================================
# One torchrun invocation per node. All expansions are quoted so paths and
# names containing spaces stay single arguments.
torchrun \
  --nproc_per_node="${NUM_GPUS}" \
  --nnodes="${NNODES}" \
  --node_rank="${NODE_RANK}" \
  --master_addr="${MASTER_ADDR}" \
  --master_port="${MASTER_PORT}" \
  -m verl.trainer.sft_trainer \
  data.train_files="${TRAIN_FILES}" \
  data.train_batch_size="${TRAIN_BATCH_SIZE}" \
  data.micro_batch_size_per_gpu="${MICRO_BATCH_SIZE}" \
  data.max_length="${MAX_LENGTH}" \
  data.pad_mode=no_padding \
  data.truncation=error \
  data.use_dynamic_bsz=False \
  data.max_token_len_per_gpu="${MAX_LENGTH}" \
  data.messages_key=messages \
  model.path="${MODEL_PATH}" \
  model.use_remove_padding=False \
  model.trust_remote_code=True \
  "${ENGINE_CONFIG[@]}" \
  trainer.test_freq=-1 \
  trainer.save_freq=500 \
  trainer.logger="['console']" \
  trainer.project_name="${project_name}" \
  trainer.experiment_name="${exp_name}" \
  trainer.total_epochs=1 \
  trainer.default_local_dir="${ckpts_home}" \
  trainer.resume_mode="${RESUME_MODE}"
4 changes: 2 additions & 2 deletions tests/checkpoint_engine/test_special_server_adapter.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ async def _run_server_manager_without_resume(
)

# wait a while and update weights to interrupt the generation
await asyncio.sleep(3)
await asyncio.sleep(2)
await checkpoint_manager.update_weights(global_steps=global_steps)

outputs = await asyncio.gather(*tasks)
Expand Down Expand Up @@ -149,7 +149,7 @@ async def _run_server_manager_with_resume(
# 2. trainer update weights to rollout multiple times
for global_steps in range(initial_steps, initial_steps + train_steps):
# wait a while and update weights to interrupt the generation
await asyncio.sleep(3)
await asyncio.sleep(2)
await checkpoint_manager.update_weights(global_steps=global_steps)

# 3. wait for rollout generate responses finished
Expand Down
Loading
Loading