# Pull the ModelScope image, give it a short local tag, and start the
# training container.
#
# The image reference is held in one variable so that `docker pull` and
# `docker tag` cannot drift apart.
image_ref=modelscope-registry.cn-hangzhou.cr.aliyuncs.com/modelscope-repo/modelscope:ubuntu22.04-cuda12.9.1-py312-torch2.10.0-vllm0.19.1-modelscope1.35.4-swift4.1.3
docker pull "$image_ref"
docker tag "$image_ref" swift4.1.3

# Start a detached container named "swift" that runs /bin/bash.
#   - /mnt and /root/log are bind-mounted at the same paths as on the host.
#   - The container uses the host network and host IPC namespace, sees all
#     GPUs, has an unlimited memlock ulimit and gets /dev/infiniband.
#   - NCCL_*, cache-location and PyTorch allocator variables are passed
#     with -e.
#   - NCCL_DEBUG was previously passed twice (INFO, then ERROR). Only one entry
#     is kept; ERROR is retained because it was the later entry and is
#     presumably the value that took effect. Switch it to INFO when NCCL
#     diagnostics are needed.
#   - TRAIN_NODE_RANK is forwarded from the calling shell; it is quoted so an
#     empty or whitespace-containing value cannot split the argument list.
# NOTE(review): --shm-size is likely ignored when --ipc host is used, and
# --privileged likely makes --device / --cap-add redundant — confirm against
# the Docker documentation before removing any of them.
docker run \
  -itd \
  -v /mnt:/mnt \
  -v /root/log:/root/log \
  --network host \
  --ipc host \
  --gpus all \
  --ulimit memlock=-1 \
  --shm-size 1024g \
  --privileged \
  --device /dev/infiniband \
  --cap-add IPC_LOCK \
  --name swift \
  -e NCCL_IB_HCA=mlx5_3,mlx5_7 \
  -e NCCL_P2P_LEVEL=NVL \
  -e NCCL_IB_GID_INDEX=0 \
  -e NCCL_IB_CUDA_SUPPORT=1 \
  -e NCCL_IB_DISABLE=0 \
  -e NCCL_SOCKET_IFNAME=enp93s0f0 \
  -e NCCL_NET_GDR_LEVEL=2 \
  -e NCCL_IB_MERGE_NICS=1 \
  -e NCCL_TIMEOUT=1800 \
  -e MODELSCOPE_CACHE=/mnt/temp/modelscope \
  -e HF_HOME=/mnt/temp/huggingface \
  -e PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True \
  -e TORCH_NCCL_AVOID_RECORD_STREAMS=1 \
  -e NCCL_DEBUG=ERROR \
  -e TRAIN_NODE_RANK="$TRAIN_NODE_RANK" \
  swift4.1.3 \
  /bin/bash
# Additional installs, executed inside the running "swift" container:
# an editable Liger-Kernel checkout, tilelang, an upgraded datasets package,
# and a SwanLab login (the key shown here is a placeholder).
swift_exec() {
  # Run the given command line inside the "swift" container.
  docker exec swift "$@"
}

swift_exec pip install -e /mnt/code/lf_test/Liger-Kernel
swift_exec pip install tilelang
swift_exec pip install -U datasets
swift_exec swanlab login -k xxxxxxxx
# Base configuration: environment variables for the training run.
# The entries are nested under ENV so that ENV is a mapping; when they sit at
# the same indentation as `ENV:`, ENV parses as null and the four variables
# become unrelated top-level keys.
# NOTE(review): presumably the launcher exports every ENV entry before
# starting training — confirm against the launcher code.
ENV:
  HF_ENDPOINT: https://hf-mirror.com
  PYTHONUNBUFFERED: 1
  CUDA_VISIBLE_DEVICES: 0,1,2,3,4,5,6,7
  SWIFT_USE_MCORE_GDN: 1
# Training visualization configuration (SwanLab).
# Project name the run is reported under (per the key name).
swanlab_project: megatron-swift
# workspace: megatron-swift-base # vl_experiment
# Experiment name shown for this run (per the key name).
swanlab_exp_name: Qwen3.5-35B-A3B-vl_instruct
# Data configuration.
# Local dataset location.
dataset: /mnt/datasets/local/ms_swift/vl
load_from_cache_file: true
# Shuffling is enabled both for the dataset and for the training dataloader.
dataset_shuffle: true
train_dataloader_shuffle: true
group_by_length: false
# NOTE(review): presumably the fraction of the dataset held out for
# evaluation — confirm against the ms-swift parameter documentation.
split_dataset_ratio: 0.01
# Packing is enabled; max_length below is 32768.
packing: true
# Boolean written in lowercase to match every other boolean in this file
# (it was previously `False`; the parsed value is unchanged).
add_non_thinking_prefix: false
max_length: 32768
# Worker/process counts for data loading, dataset preprocessing and packing.
dataloader_num_workers: 64
dataset_num_proc: 64
packing_num_proc: 64
# Model configuration.
# Local path of the model to train.
model: /mnt/models/Qwen3.5-35B-A3B
# NOTE(review): presumably "full" selects full-parameter tuning rather than
# an adapter method — confirm against the ms-swift documentation.
tuner_type: full
finetune: true
# use_liger_kernel: true
attention_backend: flash
# Freeze flags: per the flag names, the LLM stays trainable while the ViT and
# the aligner are frozen.
freeze_llm: false
freeze_vit: true
freeze_aligner: true
# NOTE(review): save_strategy is "epoch" while save_steps: 200 and
# eval_steps: 200 are also set in the trainer section — confirm which one
# governs checkpointing.
save_strategy: epoch
save_safetensors: true
expert_model_parallel_size: 8
# Directory that checkpoints are written to.
output_dir: "/mnt/checkpoints/ms-swift/Qwen3.5-35B-A3B-instruct"
# save_total_limit: 5
# Per the key names, optimizer state and RNG state are left out of saved
# checkpoints.
no_save_optim: true
no_save_rng: true
# Megatron configuration.
# Parallelism layout: tensor parallel size 4 with sequence parallelism on,
# context parallel size 1; the pipeline-parallel settings are commented out.
tensor_model_parallel_size: 4
sequence_parallel: true
# pipeline_model_parallel_size: 2
# virtual_pipeline_model_parallel_size: 0
context_parallel_size: 1
# Communication/computation overlap switches.
overlap_param_gather: true
overlap_grad_reduce: true
# MoE-specific settings.
moe_token_dispatcher_type: alltoall
moe_permute_fusion: true
moe_grouped_gemm: true
moe_shared_expert_overlap: true
moe_layer_recompute: false
# moe_aux_loss_coeff: 1.0e-6
# Activation recomputation: "full" granularity is active; "selective" is the
# alternative noted inline.
recompute_granularity: full # selective
recompute_method: uniform
recompute_num_layers: 1
cross_entropy_loss_fusion: true
# NOTE(review): presumably caps the tokens routed to each expert, which may
# drop tokens beyond the cap — confirm against the Megatron documentation.
moe_expert_capacity_factor: 2
# Optimizer settings: CPU offload is enabled with an offload fraction of 0.64.
use_precision_aware_optimizer: true
optimizer_cpu_offload: true
optimizer_offload_fraction: 0.64
# Trainer configuration.
# Batch sizing: micro batch of 1, global batch of 8.
# NOTE(review): the global batch size presumably has to be divisible by
# micro_batch_size times the data-parallel size — confirm for the node count
# in use.
micro_batch_size: 1
global_batch_size: 8
num_train_epochs: 2
# Learning-rate schedule: peak lr 1.0e-5, warmup fraction 0.05, floor
# (min_lr) 1.0e-6.
lr: 1.0e-5
lr_warmup_fraction: 0.05
logging_steps: 1
min_lr: 1.0e-6
# NOTE(review): step-based eval/save intervals are set here while
# save_strategy is "epoch" in the model section — confirm which setting
# takes effect.
eval_steps: 200
save_steps: 200
bf16: true
# Metrics are reported to SwanLab.
report_to: swanlab
# Per-node launch command template. {nnodes}, {master_addr}, {node_rank} and
# {yaml_path} appear to be placeholders that are substituted before the
# command is run.
# The variable assignments are joined to the command with a line continuation
# so they are passed in the environment of the `megatron` process; on a line
# of their own they would only set unexported shell variables. The duplicated
# `megatron sft` has been collapsed to a single invocation.
# NOTE(review): confirm whether the CLI expects the YAML file as
# `--config {yaml_path}` rather than as a bare positional argument.
NPROC_PER_NODE=8 NNODES={nnodes} MASTER_ADDR={master_addr} MASTER_PORT=29500 NODE_RANK={node_rank} \
  megatron sft {yaml_path}
Checklist / 检查清单
Bug Description / Bug 描述
How to Reproduce / 如何复现
节点信息
pip环境
训练脚本
启动命令
Additional Information / 补充信息