[docs] Update the NPU dependency versions and scripts. (#9500)

hazelduan · web-flow · commit 749f6d4ba468 · 2026-06-09T19:39:27.000+08:00
diff --git a/docs/source/BestPractices/NPU-support.md b/docs/source/BestPractices/NPU-support.md
@@ -24,6 +24,7 @@
 | torch_npu | >= 2.7.1.post4  |
 
 基础环境准备请参照 [Ascend PyTorch 安装文档](https://gitcode.com/Ascend/pytorch)。本文示例实验环境为 8 * 昇腾910B3 64G。
+注：vLLM Ascend 系列官方推荐的版本配套已更新至 CANN 9.0.0、torch 2.9.0、torch_npu 2.9.0、vllm-ascend 0.18.0（A3）/ 0.19.1（A5），详情请参阅 [vLLM Ascend 安装文档](https://docs.vllm.ai/projects/ascend/en/v0.18.0/installation.html)。
 
 | 一级特性 | 特性                | 进展     |
 | -------- | ------------------- | -------- |
@@ -41,7 +42,7 @@
 |          | QLoRA               | 暂不支持 |
 | RLHF     | GRPO                | 已支持   |
 |          | PPO                 | 已支持   |
-| 性能优化 | FA 等融合算子       | 已支持   |
+| 性能优化 | FA 等融合算子        | 已支持    |
 |          | Liger-Kernel        | 暂不支持 |
 | 部署     | PT                  | 已支持   |
 |          | vLLM                | 已支持   |
@@ -61,11 +62,11 @@
 | SFT       | Qwen3-30B-A3B               | FSDP1/FSDP2/deepspeed | Atlas 900 A2 PODc |
 | SFT       | Qwen3-32B                   | FSDP1/FSDP2/deepspeed | Atlas 900 A2 PODc |
 | SFT       | Qwen3-VL-30B-A3B-Instruct   | FSDP1/FSDP2/deepspeed | Atlas 900 A2 PODc |
-| SFT       | Qwen3-Omni-30B-A3B-Instruct | FSDP1/FSDP2/deepspeed | Atlas 900 A2 PODc |
+| SFT       | Qwen3-Omni-30B-A3B-Instruct | FSDP1/FSDP2/deepspeed/Megatron | Atlas 900 A2 PODc/A3 SuperPoD |
 | SFT       | InternVL3-8B                | FSDP1/FSDP2/deepspeed | Atlas 900 A2 PODc |
 | SFT       | Ovis2.5-2B                  | FSDP1/FSDP2/deepspeed | Atlas 900 A2 PODc |
-| SFT       | Qwen3.5-27B                 | FSDP1/FSDP2/deepspeed | Atlas 900 A2 PODc |
-| SFT       | Qwen3.5-35B-A3B             | FSDP1/FSDP2/deepspeed | Atlas 900 A2 PODc |
+| SFT       | Qwen3.5-27B                 | FSDP1/FSDP2/deepspeed/Megatron | Atlas 900 A2 PODc/A3 SuperPoD |
+| SFT       | Qwen3.5-35B-A3B             | FSDP1/FSDP2/deepspeed/Megatron | Atlas 900 A2 PODc/A3 SuperPoD |
 
 ### 已验证 RL 组合
 
@@ -168,7 +169,7 @@ cd ms-swift
 pip install -e .
 
 # 安装 torch_npu
-pip install torch_npu==2.7.1.post4 decorator
+pip install torch_npu==2.9.0 decorator
 # 如果你想要使用 deepspeed（控制显存占用，训练速度会有一定下降）
 pip install deepspeed
 
@@ -198,16 +199,16 @@ print(torch.randn(10, device='npu:0'))
 如果需要使用 MindSpeed(Megatron-LM)，请按照下面引导安装必要依赖。
 
 ```shell
-# 1. 获取并切换 Megatron-LM 至 v0.15.3 版本
+# 1. 获取并切换 Megatron-LM 至 v0.16.0 版本
 git clone https://github.com/NVIDIA/Megatron-LM.git
 cd Megatron-LM
-git checkout v0.15.3
+git checkout v0.16.0
 cd ..
 
 # 2. 获取并安装 MindSpeed
 git clone https://gitcode.com/Ascend/MindSpeed.git
 cd MindSpeed
-git checkout core_r0.15.3
+git checkout core_r0.16.0
 pip install -e .
 cd ..
 
@@ -217,11 +218,14 @@ cd mcore-bridge
 pip install -e .
 cd ..
 
-# 4. 设置环境变量
+# 4. 获取并安装 triton-ascend
+pip install triton-ascend==3.2.1 --extra-index-url=https://triton-ascend.osinfra.cn/pypi/simple
+
+# 5. 设置环境变量
 export PYTHONPATH=$PYTHONPATH:<your_local_megatron_lm_path>
 export MEGATRON_LM_PATH=<your_local_megatron_lm_path>
 
-# 5. 如需回退到 transformers 的 GatedDeltaNet 实现，可关闭 Megatron GDN
+# 6. 如需回退到 transformers 的 GatedDeltaNet 实现，可关闭 Megatron GDN
 export USE_MCORE_GDN=0
 ```
 
@@ -258,8 +262,9 @@ Qwen3.5 modeling.chunk_gated_delta_rule
 
 - 该 patch 主要覆盖的是 **Qwen3.5 linear attention 的 gated-delta-rule 路径**；
 - 它并不等价于“将整个 fla 包完整替换为 MindSpeed”；
-- 若需要这条路径生效，请确保当前环境中可以正确导入 MindSpeed。
-- 精度对齐验证版本：torch 2.7.1 + MindSpeed 0.12.1 + flash-linear-attention 4.1.0 + triton-ascend 3.2.0 + transformers 5.2.0
+- 若需要这条路径生效，请确保当前环境中可以正确导入 MindSpeed 和 triton-ascend。
+- 精度对齐验证版本：torch 2.9.0 + MindSpeed 0.16.0 + flash-linear-attention 0.4.2 + triton-ascend 3.2.1 + transformers 5.2.0
+
 
 当前 Qwen3.5 在 NPU 上如果走 Megatron-SWIFT 训练，还需要额外注意版本和功能约束：
 
diff --git a/docs/source_en/BestPractices/NPU-support.md b/docs/source_en/BestPractices/NPU-support.md
@@ -22,6 +22,7 @@ Recommended base environment versions:
 | CANN      | >= 8.5.1        |
 | torch     | >= 2.7.1        |
 | torch_npu | >= 2.7.1.post4  |
+> Note: The officially recommended version compatibility matrix for the vLLM Ascend series has been updated to CANN 9.0.0, torch 2.9.0, torch_npu 2.9.0, vllm-ascend 0.18.0 for A3, and vllm-ascend 0.19.1 for A5. For details, see the [vLLM Ascend installation guide](https://docs.vllm.ai/projects/ascend/en/v0.18.0/installation.html).
 
 For base environment setup, see the [Ascend PyTorch installation guide](https://gitcode.com/Ascend/pytorch). The examples in this document were verified on 8 * Ascend 910B3 64G.
 
@@ -61,11 +62,11 @@ For base environment setup, see the [Ascend PyTorch installation guide](https://
 | SFT       | Qwen3-30B-A3B               | FSDP1/FSDP2/deepspeed | Atlas 900 A2 PODc |
 | SFT       | Qwen3-32B                   | FSDP1/FSDP2/deepspeed | Atlas 900 A2 PODc |
 | SFT       | Qwen3-VL-30B-A3B-Instruct   | FSDP1/FSDP2/deepspeed | Atlas 900 A2 PODc |
-| SFT       | Qwen3-Omni-30B-A3B-Instruct | FSDP1/FSDP2/deepspeed | Atlas 900 A2 PODc |
+| SFT       | Qwen3-Omni-30B-A3B-Instruct | FSDP1/FSDP2/deepspeed/Megatron | Atlas 900 A2 PODc/A3 SuperPoD |
 | SFT       | InternVL3-8B                | FSDP1/FSDP2/deepspeed | Atlas 900 A2 PODc |
 | SFT       | Ovis2.5-2B                  | FSDP1/FSDP2/deepspeed | Atlas 900 A2 PODc |
-| SFT       | Qwen3.5-27B                 | FSDP1/FSDP2/deepspeed | Atlas 900 A2 PODc |
-| SFT       | Qwen3.5-35B-A3B             | FSDP1/FSDP2/deepspeed | Atlas 900 A2 PODc |
+| SFT       | Qwen3.5-27B                 | FSDP1/FSDP2/deepspeed/Megatron | Atlas 900 A2 PODc/A3 SuperPoD |
+| SFT       | Qwen3.5-35B-A3B             | FSDP1/FSDP2/deepspeed/Megatron | Atlas 900 A2 PODc/A3 SuperPoD |
 
 ### Verified RL Combinations
 
@@ -170,7 +171,7 @@ cd ms-swift
 pip install -e .
 
 # Install torch_npu
-pip install torch_npu==2.7.1.post4 decorator
+pip install torch_npu==2.9.0 decorator
 # If you want to use deepspeed (to reduce memory usage, with some speed overhead)
 pip install deepspeed
 
@@ -200,16 +201,16 @@ print(torch.randn(10, device='npu:0'))
 If you need MindSpeed(Megatron-LM), install the required dependencies as follows.
 
 ```shell
-# 1. Clone Megatron-LM and switch to v0.15.3
+# 1. Clone Megatron-LM and switch to v0.16.0
 git clone https://github.com/NVIDIA/Megatron-LM.git
 cd Megatron-LM
-git checkout v0.15.3
+git checkout v0.16.0
 cd ..
 
 # 2. Clone and install MindSpeed
 git clone https://gitcode.com/Ascend/MindSpeed.git
 cd MindSpeed
-git checkout core_r0.15.3
+git checkout core_r0.16.0
 pip install -e .
 cd ..
 
@@ -219,11 +220,14 @@ cd mcore-bridge
 pip install -e .
 cd ..
 
-# 4. Set environment variables
+# 4. Download and install triton-ascend
+pip install triton-ascend==3.2.1 --extra-index-url=https://triton-ascend.osinfra.cn/pypi/simple
+
+# 5. Set environment variables
 export PYTHONPATH=$PYTHONPATH:<your_local_megatron_lm_path>
 export MEGATRON_LM_PATH=<your_local_megatron_lm_path>
 
-# 5. Disable Megatron GDN if you need to fall back to the transformers GatedDeltaNet implementation
+# 6. Disable Megatron GDN if you need to fall back to the transformers GatedDeltaNet implementation
 export USE_MCORE_GDN=0
 ```
 
@@ -262,7 +266,7 @@ Therefore:
 - This patch mainly covers the **gated-delta-rule path of Qwen3.5 linear attention**.
 - It is not equivalent to “fully replacing the entire fla package with MindSpeed”.
 - To make this path effective, ensure that MindSpeed can be imported correctly in the current environment.
-- Verified versions for accuracy alignment: torch 2.7.1 + MindSpeed 0.12.1 + flash-linear-attention 4.1.0 + triton-ascend 3.2.0 + transformers 5.2.0
+- Verified versions for accuracy alignment: torch 2.9.0 + MindSpeed 0.16.0 + flash-linear-attention 0.4.2 + triton-ascend 3.2.1 + transformers 5.2.0
 
 When running Qwen3.5 with Megatron-SWIFT on NPU, note the following version and feature constraints:
 
diff --git a/examples/ascend/train/qwen3_5/qwen3_lora_megatron_npu.sh b/examples/ascend/train/qwen3_5/qwen3_lora_megatron_npu.sh
@@ -0,0 +1,57 @@
+# NPU stability environment variables
+export HCCL_OP_BASE_FFTS_MODE_ENABLE=TRUE
+export MULTI_STREAM_MEMORY_REUSE=1
+# NPU memory management environment variables
+export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
+# NPU performance environment variables
+export TASK_QUEUE_ENABLE=2
+
+NPROC_PER_NODE=8 \
+ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
+megatron sft \
+    --model Qwen/Qwen3.5-35B-A3B \
+    --save_safetensors true \
+    --dataset 'AI-ModelScope/alpaca-gpt4-data-zh#500' \
+              'AI-ModelScope/alpaca-gpt4-data-en#500' \
+              'swift/self-cognition#500' \
+    --tuner_type lora \
+    --lora_rank 8 \
+    --lora_alpha 32 \
+    --target_modules all-linear \
+    \
+    --tensor_model_parallel_size 2 \
+    --expert_model_parallel_size 4 \
+    --moe_permute_fusion true \
+    --moe_grouped_gemm true \
+    --moe_shared_expert_overlap true \
+    --moe_aux_loss_coeff 1e-6 \
+    --sequence_parallel true \
+    --recompute_granularity full \
+    --recompute_method uniform \
+    --recompute_num_layers 1 \
+    \
+    --micro_batch_size 1 \
+    --global_batch_size 8 \
+    --finetune true \
+    --cross_entropy_loss_fusion true \
+    --gradient_accumulation_fusion false \
+    --masked_softmax_fusion false \
+    \
+    --lr 1e-4 \
+    --lr_warmup_fraction 0.05 \
+    --min_lr 1e-5 \
+    --num_train_epochs 16 \
+    \
+    --output_dir output/Qwen3.5-35B-A3B \
+    --save_steps 2000 \
+    --max_length 1024 \
+    --system 'You are a helpful assistant.' \
+    \
+    --dataloader_num_workers 4 \
+    --dataset_num_proc 4 \
+    --no_save_optim true \
+    --no_save_rng true \
+    \
+    --attention_backend flash \
+    --model_author swift \
+    --model_name swift-robot