[megatron] support qwen3.5 models for megatron, bump mbridge + megatron-core to latest (NovaSky-AI#1425)

erictang000 · web-flow · commit 29c11ba429d9 · 2026-04-06T17:00:36.000-07:00
GPU CI: https://github.com/NovaSky-AI/SkyRL/actions/runs/23869520430 Megatron GPU CI: https://github.com/NovaSky-AI/SkyRL/actions/runs/23869278330 Megatron GPU CI #2: https://github.com/NovaSky-AI/SkyRL/actions/runs/24045414612 megatron gpu CI #3: https://github.com/NovaSky-AI/SkyRL/actions/runs/24054807024 WandB run for Qwen3.5-0.8B: https://wandb.ai/sky-posttraining-uc-berkeley/gsm8k_megatron/runs/5cm9tg0j <img width="555" height="625" alt="image" src="https://github.com/user-attachments/assets/d3867343-6bc7-49a3-9d29-6c62f20381b3" />  --- <a href="https://app.devin.ai/review/novasky-ai/skyrl/pull/1425" target="_blank"> <picture> <source media="(prefers-color-scheme: dark)" srcset="https://static.devin.ai/assets/gh-open-in-devin-review-dark.svg?v=1"> <img src="https://static.devin.ai/assets/gh-open-in-devin-review-light.svg?v=1" alt="Open with Devin"> </picture> </a>
diff --git a/examples/train/megatron/run_megatron_qwen3.5.sh b/examples/train/megatron/run_megatron_qwen3.5.sh
@@ -0,0 +1,70 @@
+set -x
+
+# Colocated GRPO training+generation for Qwen3.5-0.8B on GSM8K with Megatron.
+
+# uv run examples/train/gsm8k/gsm8k_dataset.py --output_dir $HOME/data/gsm8k
+# export WANDB_API_KEY=<your_key_here>
+# bash examples/train/megatron/run_megatron_qwen3.5.sh
+
+DATA_DIR="$HOME/data/gsm8k"
+LOGGER="wandb"  # change to "console" to print to stdout
+MODEL_NAME="Qwen/Qwen3.5-0.8B"
+
+INFERENCE_BACKEND="vllm" # currently only vllm is supported for megatron
+
+NUM_NODES=1
+NUM_GPUS=4
+
+MEGATRON_TP=1
+MEGATRON_PP=1
+MEGATRON_CP=1
+
+NUM_INFERENCE_ENGINES=1
+INFERENCE_ENGINE_TP=4
+
+# Qwen3.5 flags
+USE_SAMPLE_PACKING=false # sample packing is not yet supported for GDN layers in megatron - see: https://github.com/NVIDIA/Megatron-LM/pull/2644
+# Extra key=value overrides given to this script are forwarded verbatim via "$@" (quoted so values containing spaces survive).
+uv run --isolated --extra megatron -m skyrl.train.entrypoints.main_base \
+  data.train_data="['$DATA_DIR/train.parquet']" \
+  data.val_data="['$DATA_DIR/validation.parquet']" \
+  trainer.algorithm.advantage_estimator="grpo" \
+  trainer.policy.model.path="$MODEL_NAME" \
+  trainer.placement.colocate_all=true \
+  trainer.strategy=megatron \
+  trainer.placement.policy_num_nodes=$NUM_NODES \
+  trainer.placement.policy_num_gpus_per_node=$NUM_GPUS \
+  generator.inference_engine.num_engines=$NUM_INFERENCE_ENGINES \
+  generator.inference_engine.tensor_parallel_size=$INFERENCE_ENGINE_TP \
+  trainer.policy.megatron_config.tensor_model_parallel_size=$MEGATRON_TP \
+  trainer.policy.megatron_config.pipeline_model_parallel_size=$MEGATRON_PP \
+  trainer.policy.megatron_config.context_parallel_size=$MEGATRON_CP \
+  trainer.use_sample_packing=$USE_SAMPLE_PACKING \
+  trainer.epochs=20 \
+  trainer.eval_batch_size=1024 \
+  trainer.eval_before_train=false \
+  trainer.eval_interval=5 \
+  trainer.update_epochs_per_batch=1 \
+  trainer.train_batch_size=128 \
+  trainer.policy_mini_batch_size=64 \
+  trainer.micro_forward_batch_size_per_gpu=4 \
+  trainer.micro_train_batch_size_per_gpu=4 \
+  trainer.ckpt_interval=10 \
+  trainer.max_prompt_length=512 \
+  generator.sampling_params.max_generate_length=1024 \
+  trainer.policy.optimizer_config.lr=1.0e-6 \
+  trainer.algorithm.use_kl_loss=false \
+  generator.inference_engine.backend=$INFERENCE_BACKEND \
+  generator.inference_engine.run_engines_locally=true \
+  generator.inference_engine.weight_sync_backend=nccl \
+  generator.inference_engine.async_engine=true \
+  generator.batched=true \
+  environment.env_class=gsm8k \
+  generator.n_samples_per_prompt=5 \
+  generator.inference_engine.gpu_memory_utilization=0.6 \
+  trainer.logger="$LOGGER" \
+  trainer.project_name="gsm8k_megatron" \
+  trainer.run_name="gsm8k_megatron_tp${MEGATRON_TP}_pp${MEGATRON_PP}_cp${MEGATRON_CP}_qwen3.5-0.8b" \
+  trainer.resume_mode=null \
+  trainer.ckpt_path="$HOME/ckpts/gsm8k_megatron_ckpt" \
+  "$@"
diff --git a/pyproject.toml b/pyproject.toml
@@ -124,8 +124,10 @@ megatron = [
     "torch==2.10.0; sys_platform == 'linux'",
     "flashinfer-python==0.6.6; sys_platform == 'linux' and platform_machine == 'x86_64'",
     "torchvision; sys_platform == 'linux'",
-    "megatron-bridge==0.3.1; sys_platform == 'linux'",
-    "megatron-core==0.16.1; sys_platform == 'linux'",
+    # megatron-bridge requires Python 3.12+; give megatron-core the same marker
+    # so both packages are consistently available (or absent) together.
+    "megatron-bridge; sys_platform == 'linux' and python_version >= '3.12'",
+    "megatron-core; sys_platform == 'linux' and python_version >= '3.12'",
     "flashinfer-jit-cache==0.6.6; sys_platform == 'linux' and platform_machine == 'x86_64'",
     "nvidia-modelopt; sys_platform == 'linux'",
 ]
@@ -215,8 +217,8 @@ override-dependencies = [
     "mamba-ssm; sys_platform == 'never'",
     "causal-conv1d; sys_platform == 'never'",
     "transformer-engine[pytorch]==2.10.0; sys_platform == 'linux'",
-    "megatron-core==0.16.1; sys_platform == 'linux'",
     "transformers>=5.0.0,<=5.3.0; sys_platform == 'linux'",
+    "megatron-core>=0.16.0; sys_platform == 'linux'",
     "ml_dtypes>=0.5.0; sys_platform == 'linux'",
 ]
 
@@ -261,6 +263,9 @@ torchvision = [
     { index = "pytorch-cpu", marker = "sys_platform == 'darwin'" },
 ]
 harbor = { git = "https://github.com/laude-institute/harbor", rev = "8c040e1bb010201fd3c75bee3dede2407b9f57cd" }
+megatron-bridge = { git = "https://github.com/NVIDIA-NeMo/Megatron-Bridge", rev = "420a7da37afea5eb4e8d3899d540c830b9c4cda2", marker = "sys_platform == 'linux'" }
+# megatron-core comes from the Megatron-LM dev branch (https://github.com/NVIDIA/Megatron-LM/tree/dev); rev was its latest commit as of 2026-04-01
+megatron-core = { git = "https://github.com/NVIDIA/Megatron-LM", rev = "4ef64ebc468cd3da41a22d46a2db37163694e8e2", marker = "sys_platform == 'linux'" }
 
 [tool.black]
 line-length = 120
diff --git a/uv.lock b/uv.lock