# Validation frequency in steps; -1 disables periodic validation.
TEST_FREQ=${TEST_FREQ:--1}

# Save & Resume
RESUME_MODE=${RESUME_MODE:-disable}
SAVE_FREQ=${SAVE_FREQ:--1}
TOTAL_TRAIN_STEPS=${TOTAL_TRAIN_STEPS:-1}

train_traj_micro_bsz_per_gpu=2 # b
n_resp_per_prompt=4 # g
@@ -75,76 +75,80 @@ if [ $SKIP_SAVE_HF_MODEL -eq 1 ]; then
75
75
CHECKPOINT_CONTENTS=[' model' ,' optimizer' ,' extra' ]
76
76
fi
# Rollout engines to exercise, one full training run per engine.
# Override with a space-separated list, e.g. ENGINES="vllm".
# NOTE: this must be a bash *array* — the loop below iterates
# "${ENGINES[@]}"; a scalar default like "['vllm', ...]" would be passed
# verbatim as actor_rollout_ref.rollout.name and break every run.
read -r -a ENGINES <<< "${ENGINES:-vllm sglang sglang_async}"

exp_name="$(basename "${MODEL_ID,,}")-megatron-gsm8k-minimal"
# Run one end-to-end PPO training job per rollout engine. All Hydra
# overrides are identical across engines except rollout.name.
# Trailing "$@" forwards any extra CLI overrides from the caller
# (quoted so arguments containing spaces survive).
for ENGINE in "${ENGINES[@]}"; do
    python3 -m verl.trainer.main_ppo --config-path=config \
        --config-name='ppo_megatron_trainer.yaml' \
        algorithm.adv_estimator="${ADV_ESTIMATOR}" \
        data.train_files="${TRAIN_FILES}" \
        data.val_files="${VAL_FILES}" \
        data.train_batch_size=${train_prompt_bsz} \
        data.max_prompt_length=512 \
        data.max_response_length=512 \
        data.filter_overlong_prompts=True \
        data.truncation='error' \
        actor_rollout_ref.model.path="${MODEL_PATH}" \
        actor_rollout_ref.actor.optim.lr=1e-6 \
        actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
        actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=${train_traj_micro_bsz_per_gpu} \
        actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=$ACTOR_PP \
        actor_rollout_ref.actor.megatron.virtual_pipeline_model_parallel_size=$ACTOR_VPP \
        actor_rollout_ref.actor.megatron.context_parallel_size=$ACTOR_CP \
        actor_rollout_ref.actor.megatron.tensor_model_parallel_size=$ACTOR_TP \
        actor_rollout_ref.actor.megatron.param_offload=${ACTOR_PARAM_OFFLOAD} \
        actor_rollout_ref.actor.megatron.optimizer_offload=${ACTOR_OPTIMIZER_OFFLOAD} \
        actor_rollout_ref.actor.megatron.grad_offload=${ACTOR_GRAD_OFFLOAD} \
        actor_rollout_ref.actor.use_kl_loss=True \
        actor_rollout_ref.actor.kl_loss_coef=0.001 \
        actor_rollout_ref.actor.kl_loss_type=low_var_kl \
        actor_rollout_ref.actor.checkpoint.contents=$CHECKPOINT_CONTENTS \
        actor_rollout_ref.rollout.name="${ENGINE}" \
        actor_rollout_ref.rollout.tensor_model_parallel_size=$ROLLOUT_TP \
        actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
        actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
        actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=${train_traj_micro_bsz_per_gpu} \
        actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=$REF_PP \
        actor_rollout_ref.ref.megatron.virtual_pipeline_model_parallel_size=$REF_VPP \
        actor_rollout_ref.ref.megatron.context_parallel_size=$REF_CP \
        actor_rollout_ref.ref.megatron.tensor_model_parallel_size=$REF_TP \
        actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=${train_traj_micro_bsz_per_gpu} \
        actor_rollout_ref.ref.megatron.param_offload=${REF_PARAM_OFFLOAD} \
        critic.optim.lr=2e-5 \
        critic.model.path="${MODEL_PATH}" \
        critic.model.enable_gradient_checkpointing=False \
        critic.ppo_micro_batch_size_per_gpu=${train_traj_micro_bsz_per_gpu} \
        critic.megatron.pipeline_model_parallel_size=$CRITIC_PP \
        critic.megatron.virtual_pipeline_model_parallel_size=$CRITIC_VPP \
        critic.megatron.context_parallel_size=$CRITIC_CP \
        critic.megatron.tensor_model_parallel_size=$CRITIC_TP \
        critic.checkpoint.contents=$CHECKPOINT_CONTENTS \
        critic.megatron.param_offload=${CRITIC_PARAM_OFFLOAD} \
        critic.megatron.optimizer_offload=${CRITIC_OPTIMIZER_OFFLOAD} \
        critic.megatron.grad_offload=${CRITIC_GRAD_OFFLOAD} \
        reward_model.enable=True \
        reward_model.model.path="${MODEL_PATH}" \
        reward_model.micro_batch_size_per_gpu=${train_traj_micro_bsz_per_gpu} \
        reward_model.megatron.pipeline_model_parallel_size=$RM_PP \
        reward_model.megatron.virtual_pipeline_model_parallel_size=$RM_VPP \
        reward_model.megatron.context_parallel_size=$RM_CP \
        reward_model.megatron.tensor_model_parallel_size=$RM_TP \
        reward_model.megatron.param_offload=${RM_PARAM_OFFLOAD} \
        algorithm.use_kl_in_reward=False \
        algorithm.kl_penalty=kl \
        algorithm.kl_ctrl.kl_coef=0.001 \
        trainer.critic_warmup=0 \
        trainer.logger=['console'] \
        trainer.project_name='verl-test' \
        trainer.experiment_name="${exp_name}" \
        trainer.nnodes=1 \
        trainer.n_gpus_per_node=${NUM_GPUS} \
        trainer.val_before_train="${VAL_BEFORE_TRAIN}" \
        trainer.test_freq="${TEST_FREQ}" \
        trainer.save_freq="${SAVE_FREQ}" \
        trainer.resume_mode="${RESUME_MODE}" \
        trainer.total_epochs=2 \
        trainer.total_training_steps="${TOTAL_TRAIN_STEPS}" "$@"
done