Megatron VLM Support (Qwen2.5-VL series) (3/N) (#1210)

Zhuohao-Li · web-flow · commit f80b955cda2c · 2026-01-18T06:17:44.000+08:00
diff --git a/examples/geo3k_vlm/README.md b/examples/geo3k_vlm/README.md
@@ -2,6 +2,10 @@
 
 Training VLMs with FSDP or Megatron on single-turn reasoning task using GRPO on the [GEO3K dataset](https://huggingface.co/datasets/hiyouga/geometry3k). We used processed version [here](https://huggingface.co/datasets/chenhegu/geo3k_imgurl).
 
+Supported models:
+* Qwen2.5-VL
+* Qwen3-VL (Dense and MoE)
+
 Note: Please make sure the cudnn version in the environment is 9.16.0.29 to prevent severe performance regression in conv3d in torch 2.9 mentioned in https://github.com/pytorch/pytorch/issues/168167. Otherwise, you can reinstall cudnn with:
 ```bash
 pip install nvidia-cudnn-cu12==9.16.0.29
diff --git a/examples/geo3k_vlm/run_geo3k_vlm.sh b/examples/geo3k_vlm/run_geo3k_vlm.sh
@@ -15,6 +15,10 @@ DATASET_LOCAL_NAME=$(basename "$DATASET_NAME")
 
 # Validate MODEL_NAME
 VALID_MODELS="
+  Qwen2.5-VL-3B-Instruct
+  Qwen2.5-VL-7B-Instruct
+  Qwen2.5-VL-32B-Instruct
+  Qwen2.5-VL-72B-Instruct
   Qwen3-VL-2B-Instruct
   Qwen3-VL-4B-Instruct
   Qwen3-VL-8B-Instruct
@@ -80,6 +84,8 @@ fi
 # Common args
 CKPT_ARGS=(
    --hf-checkpoint /root/models/${MODEL_NAME}
+   # Qwen3-VL uses rotary base 5000000; for other models (e.g. Qwen2.5-VL), check rope_theta in the HF config.json and adjust or remove this flag
+   --rotary-base 5000000
 )
 
 ROLLOUT_ARGS=(
diff --git a/examples/geo3k_vlm/run_geo3k_vlm_sft.sh b/examples/geo3k_vlm/run_geo3k_vlm_sft.sh
@@ -6,6 +6,10 @@ DATASET_LOCAL_NAME=$(basename "$DATASET_NAME")
 
 # Validate MODEL_NAME
 VALID_MODELS="
+  Qwen2.5-VL-3B-Instruct
+  Qwen2.5-VL-7B-Instruct
+  Qwen2.5-VL-32B-Instruct
+  Qwen2.5-VL-72B-Instruct
   Qwen3-VL-2B-Instruct
   Qwen3-VL-4B-Instruct
   Qwen3-VL-8B-Instruct