#!/usr/bin/env bash
# Multi-node SFT launch script (torchrun + verl, Megatron backend) for
# Qwen3.5-27B. All knobs come from environment variables with defaults below.
# -x: trace every command (noisy but useful for cluster debugging);
# -e: exit on any unhandled error; -u: error on unset variables;
# -o pipefail: a pipeline fails if any stage fails.
set -xeuo pipefail
| 3 | + |
| 4 | + |
# Cluster topology: one torchrun launcher per node. Scheduler-provided
# WORLD_SIZE / RANK take precedence over manually set NNODES / NODE_RANK.
: "${NUM_GPUS:=8}"
NNODES=${WORLD_SIZE:-${NNODES:-4}}
NODE_RANK=${RANK:-${NODE_RANK:-0}}
: "${MASTER_PORT:=8888}"

# Resolve the rendezvous host to an IPv4 literal; prints nothing and returns
# non-zero when resolution fails (python3 missing or DNS failure).
resolve_ipv4() {
  python3 -c "import socket; print(socket.getaddrinfo('${1}', None, socket.AF_INET)[0][4][0])" 2>/dev/null
}

RAW_MASTER_ADDR=${MASTER_ADDR:-127.0.0.1}
# Fall back to the raw (unresolved) value on any resolution failure.
MASTER_ADDR=$(resolve_ipv4 "${RAW_MASTER_ADDR}" || echo "${RAW_MASTER_ADDR}")
| 12 | + |
# --- Data / model ---
: "${TRAIN_FILES:=[/llm-align/liuchonghan/ins_dataset/ins_dataset/verl_parquet/Gemini3_translate_110w_1096687.parquet,/llm-align/liuchonghan/ins_dataset/ins_dataset/verl_parquet/Gemini_QA_mm_92w_920623.parquet]}"
: "${MODEL_PATH:=/llm-align/liuchonghan/qwen3_5_27b_sft_global_step_8000}"

# --- Megatron parallelism layout ---
: "${TP_SIZE:=8}"
: "${PP_SIZE:=1}"
: "${VPP_SIZE:=null}"    # 'null' keeps hydra's None (virtual PP disabled)
: "${CP_SIZE:=1}"
: "${EP_SIZE:=1}"
: "${ETP_SIZE:=1}"

# --- Batching / optimization ---
: "${TRAIN_BATCH_SIZE:=256}"
: "${MICRO_BATCH_SIZE:=8}"
: "${MAX_LENGTH:=6144}"
: "${MAX_TOKEN_LEN_PER_GPU:=${MAX_LENGTH}}"
: "${PAD_MODE:=no_padding}"
: "${TRUNCATION:=right}"
: "${NUM_WORKERS:=1}"    # 670w dataset is large; many workers easily cause CPU OOM, so default to 1
: "${LR:=5e-6}"
: "${MIN_LR:=5e-7}"
: "${DTYPE:=bfloat16}"
: "${TOTAL_EPOCHS:=2}"
| 35 | + |
# Run identity and startup banners (printed on every node for log correlation).
printf '%s\n' ">>> 数据文件: ${TRAIN_FILES}, total_epochs=${TOTAL_EPOCHS}"

BACKEND=megatron
: "${RESUME_MODE:=disable}"

project_name=verl_sft_qwen3_5_27b_translate
exp_name="qwen3_5_27b-${BACKEND}-tp${TP_SIZE}-pp${PP_SIZE}-cp${CP_SIZE}"
ckpts_home=${ckpts_home:-/llm-align/liuchonghan/ckpt_verl/sft/${project_name}/${exp_name}}

printf '%s\n' \
  ">>> 节点信息: RANK ${NODE_RANK} / WORLD_SIZE ${NNODES}" \
  ">>> 通信信息: MASTER ${MASTER_ADDR} : ${MASTER_PORT}"
| 47 | + |
# Only rank 0 creates the checkpoint directory (assumes a shared filesystem;
# other ranks rely on it existing by save time).
if [[ "${NODE_RANK}" -eq 0 ]]; then
  mkdir -p "${ckpts_home}"
fi

# Qwen3.5 GDN + megatron bshd path currently requires no_padding + static bsz.
if [[ "${PAD_MODE}" != "no_padding" ]]; then
  # Fix: diagnostics belong on stderr, not stdout.
  echo "ERROR: PAD_MODE must be no_padding for Qwen3.5 megatron bshd path." >&2
  exit 1
fi
| 57 | + |
# Logging / debugging / allocator knobs for the training processes.
export WANDB_MODE=${WANDB_MODE:-offline}
export NCCL_DEBUG=WARN
# NOTE(review): recent PyTorch reads PYTORCH_ALLOC_CONF; older releases only
# honor PYTORCH_CUDA_ALLOC_CONF — confirm against the installed torch version.
export PYTORCH_ALLOC_CONF=expandable_segments:True
export HYDRA_FULL_ERROR=1
# Append the local verl checkout. Fix: add the ':' separator only when
# PYTHONPATH is already non-empty — the old unconditional ':' left a leading
# empty entry, which Python interprets as the current working directory.
export PYTHONPATH=${PYTHONPATH:+${PYTHONPATH}:}/llm-align/liuchonghan/verl_lao
| 63 | + |
# Hydra override tokens for the Megatron engine, collected into a single
# whitespace-separated string. It is expanded UNQUOTED at the torchrun call
# site so each token word-splits into its own CLI argument — tokens must not
# contain spaces. '+key=value' is hydra syntax for adding a key that is
# absent from the base config.
# Key Qwen3.5 settings:
# engine.use_remove_padding=False - GDN requires bshd format (no THD)
# engine.vanilla_mbridge=True - use mbridge (not megatron-bridge)
ENGINE_CONFIG="\
 engine=${BACKEND} \
 optim=${BACKEND} \
 optim.lr=${LR} \
 optim.min_lr=${MIN_LR} \
 optim.lr_warmup_steps=20 \
 optim.weight_decay=0.1 \
 optim.betas='[0.9,0.95]' \
 optim.clip_grad=1.0 \
 optim.lr_warmup_init=0 \
 optim.lr_decay_style=cosine \
 +optim.override_optimizer_config.optimizer_offload_fraction=0 \
 +optim.override_optimizer_config.overlap_cpu_optimizer_d2h_h2d=False \
 +optim.override_optimizer_config.use_precision_aware_optimizer=True \
 +optim.override_optimizer_config.optimizer_cpu_offload=False \
 engine.tensor_model_parallel_size=${TP_SIZE} \
 engine.pipeline_model_parallel_size=${PP_SIZE} \
 engine.virtual_pipeline_model_parallel_size=${VPP_SIZE} \
 engine.context_parallel_size=${CP_SIZE} \
 engine.expert_model_parallel_size=${EP_SIZE} \
 engine.expert_tensor_parallel_size=${ETP_SIZE} \
 engine.use_mbridge=True \
 engine.vanilla_mbridge=True \
 engine.dtype=${DTYPE} \
 engine.use_remove_padding=False \
 engine.override_transformer_config.attention_backend=auto \
 +engine.override_transformer_config.recompute_method=uniform \
 +engine.override_transformer_config.recompute_granularity=full \
 +engine.override_transformer_config.recompute_num_layers=1"
| 96 | + |
# Launch one trainer process per GPU on this node; torchrun coordinates the
# NNODES-way rendezvous via MASTER_ADDR:MASTER_PORT.
# ${ENGINE_CONFIG} is intentionally left unquoted so it word-splits into
# individual hydra overrides.
# NOTE(review): model.use_remove_padding=True here while ENGINE_CONFIG sets
# engine.use_remove_padding=False — presumably the engine-level flag governs
# the megatron path; confirm this combination is intended.
torchrun \
 --nproc_per_node=${NUM_GPUS} \
 --nnodes=${NNODES} \
 --node_rank=${NODE_RANK} \
 --master_addr=${MASTER_ADDR} \
 --master_port=${MASTER_PORT} \
 -m verl.trainer.sft_trainer \
 "data.train_files=${TRAIN_FILES}" \
 data.train_batch_size=${TRAIN_BATCH_SIZE} \
 data.micro_batch_size_per_gpu=${MICRO_BATCH_SIZE} \
 data.max_length=${MAX_LENGTH} \
 data.pad_mode=${PAD_MODE} \
 data.truncation=${TRUNCATION} \
 data.use_dynamic_bsz=False \
 data.max_token_len_per_gpu=${MAX_TOKEN_LEN_PER_GPU} \
 data.num_workers=${NUM_WORKERS} \
 data.messages_key=messages \
 model.path=${MODEL_PATH} \
 model.use_remove_padding=True \
 model.trust_remote_code=True \
 model.enable_gradient_checkpointing=True \
 ${ENGINE_CONFIG} \
 trainer.test_freq=-1 \
 trainer.save_freq=1000 \
 trainer.max_ckpt_to_keep=10 \
 trainer.logger="['console']" \
 trainer.project_name="${project_name}" \
 trainer.experiment_name="${exp_name}" \
 trainer.total_epochs=${TOTAL_EPOCHS} \
 trainer.default_local_dir="${ckpts_home}" \
 trainer.resume_mode=${RESUME_MODE} \
 'checkpoint.save_contents=[hf_model]'