#!/usr/bin/env bash
# Example launcher: GRPO training with verl (Megatron-LM actor/ref backend +
# vLLM rollout) with MoE routing replay enabled.

set -x  # trace every command so the fully-expanded launch line appears in logs

NODES=1  # number of machines (passed as trainer.nnodes)

# Routing-replay modes:
#   R2: enable routing replay
#   R3: enable rollout routing replay
# If enabling R3, please also set actor_rollout_ref.rollout.enable_rollout_routing_replay=True.
# The R3 example is based on the vllm related pr https://github.com/vllm-project/vllm/pull/5322
ROUTING_REPLAY_MODE="R2"

# Fill these in before running.
DIST_CKPT_PATH=""   # NOTE(review): defined but never referenced by the command below — confirm whether it is still needed
HF_MODEL_PATH=""    # HuggingFace model path (actor_rollout_ref.model.path)
TRAIN_DATA_PATH=""  # training data (data.train_files)
TEST_DATA_PATH=""   # validation data (data.val_files)

export CUDA_DEVICE_MAX_CONNECTIONS=1 # For megatron communication/computation overlapping

# Parallelism layout for the Megatron actor/ref workers.
PP=1        # pipeline model parallel size
VPP=None    # NOTE(review): set but never passed to the command below — confirm intent
TP=2        # tensor model parallel size
EP=8        # expert model parallel size (MoE)
ETP=1       # expert tensor parallel size
VLLM_INFER_TP=2  # tensor parallel size of the vLLM rollout engine

offload=True                 # offload Megatron params/grads/optimizer state
gpu_memory_utilization=0.65  # fraction of GPU memory vLLM may claim
bs=8                         # global train batch size (data.train_batch_size)
micro_bs=3                   # micro batch size per GPU
use_dynamic_bsz=True         # pack sequences up to a token budget instead of fixed micro batches
max_prompt_length=1024
max_response_length=1024
ppo_mini_batch_size=8
# Per-GPU token budget used when use_dynamic_bsz packs sequences:
# twice the maximum full-sequence length (prompt + response).
actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2))
infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2))


# Experiment name encodes node count, batch sizes and the parallelism layout.
exper_name=Node${NODES}_bs${bs}_${PP}${TP}${EP}${ETP}_${VLLM_INFER_TP}_minbs${ppo_mini_batch_size}_micro_bs${micro_bs}
| 38 | + |
# Launch GRPO ("algorithm.adv_estimator=grpo") via verl's main PPO entry point,
# using the Megatron trainer config. Stdout/stderr are merged by the trailing 2>&1.
#
# Notes on the overrides below:
#  - "+key=value" entries append keys that do not exist in the base YAML
#    (here: Megatron override_transformer_config knobs for DeepEP, fusion
#    kernels, fp32 router dtype and full uniform recompute).
#  - actor_rollout_ref.actor.router_replay.mode consumes ROUTING_REPLAY_MODE.
#    NOTE(review): per the header comment, R3 additionally requires
#    actor_rollout_ref.rollout.enable_rollout_routing_replay=True, which this
#    command does NOT set — add it before switching ROUTING_REPLAY_MODE to R3.
#  - KL regularization is disabled (use_kl_loss=False, use_kl_in_reward=False);
#    kl_loss_coef/kl_loss_type are therefore inert as configured.
#  - save_freq=-1 disables checkpoint saving; validation runs every 10 steps
#    but not before training (val_before_train=False).
python3 -m verl.trainer.main_ppo --config-path=config \
    --config-name='ppo_megatron_trainer.yaml' \
    algorithm.adv_estimator=grpo \
    data.train_files=$TRAIN_DATA_PATH \
    data.val_files=$TEST_DATA_PATH \
    data.train_batch_size=$bs \
    data.max_prompt_length=$max_prompt_length \
    data.max_response_length=$max_response_length \
    data.filter_overlong_prompts=True \
    data.truncation='error' \
    actor_rollout_ref.model.use_fused_kernels=True \
    actor_rollout_ref.model.path=$HF_MODEL_PATH \
    actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \
    actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \
    actor_rollout_ref.actor.router_replay.mode=${ROUTING_REPLAY_MODE} \
    +actor_rollout_ref.actor.megatron.override_transformer_config.moe_enable_deepep=True \
    +actor_rollout_ref.actor.megatron.override_transformer_config.moe_token_dispatcher_type=flex \
    +actor_rollout_ref.actor.megatron.override_transformer_config.apply_rope_fusion=True \
    +actor_rollout_ref.actor.megatron.override_transformer_config.bias_activation_fusion=True \
    +actor_rollout_ref.actor.megatron.override_transformer_config.moe_router_dtype=fp32 \
    +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_method=uniform \
    +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_granularity=full \
    +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_num_layers=1 \
    +actor_rollout_ref.actor.megatron.override_transformer_config.gradient_accumulation_fusion=True \
    +actor_rollout_ref.actor.megatron.override_transformer_config.moe_permute_fusion=True \
    actor_rollout_ref.actor.megatron.param_offload=${offload} \
    actor_rollout_ref.actor.megatron.optimizer_offload=${offload} \
    actor_rollout_ref.actor.megatron.grad_offload=${offload} \
    actor_rollout_ref.actor.optim.lr=1e-6 \
    actor_rollout_ref.actor.ppo_mini_batch_size=$ppo_mini_batch_size \
    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=$micro_bs \
    actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=$PP \
    actor_rollout_ref.actor.megatron.tensor_model_parallel_size=$TP \
    actor_rollout_ref.actor.megatron.expert_model_parallel_size=$EP \
    actor_rollout_ref.actor.megatron.expert_tensor_parallel_size=$ETP \
    actor_rollout_ref.actor.use_kl_loss=False \
    actor_rollout_ref.actor.kl_loss_coef=0.001 \
    actor_rollout_ref.actor.kl_loss_type=low_var_kl \
    actor_rollout_ref.actor.entropy_coeff=0 \
    actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
    actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=$micro_bs \
    actor_rollout_ref.rollout.tensor_model_parallel_size=$VLLM_INFER_TP \
    actor_rollout_ref.rollout.name=vllm \
    actor_rollout_ref.rollout.mode=async \
    actor_rollout_ref.actor.megatron.use_mbridge=True \
    actor_rollout_ref.rollout.gpu_memory_utilization=$gpu_memory_utilization \
    actor_rollout_ref.rollout.n=8 \
    actor_rollout_ref.rollout.enable_chunked_prefill=True \
    actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \
    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=$micro_bs \
    actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=$PP \
    actor_rollout_ref.ref.megatron.tensor_model_parallel_size=$TP \
    actor_rollout_ref.ref.megatron.expert_model_parallel_size=$EP \
    actor_rollout_ref.ref.megatron.expert_tensor_parallel_size=$ETP \
    actor_rollout_ref.ref.megatron.param_offload=${offload} \
    actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
    actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
    algorithm.use_kl_in_reward=False \
    trainer.critic_warmup=0 \
    trainer.logger=['console'] \
    trainer.project_name='verl_grpo_example_gsm8k_math' \
    trainer.experiment_name="$exper_name" \
    trainer.nnodes=$NODES \
    trainer.n_gpus_per_node=8 \
    trainer.save_freq=-1 \
    trainer.test_freq=10 \
    trainer.total_training_steps=50000 \
    trainer.balance_batch=False \
    trainer.val_before_train=False 2>&1