
Commit 3d368eb

clean PR

1 parent 115d47d commit 3d368eb

File tree: 5 files changed (+90, −370 lines)


.github/workflows/e2e_ppo_trainer_megatron.yml

Lines changed: 7 additions & 82 deletions
@@ -40,51 +40,9 @@ permissions:
   contents: read

 jobs:
-  e2e_ppo_trainer_megatron-qwen:
-    runs-on: [L20x8]
-    timeout-minutes: 60 # Increase this timeout value as needed
-    env:
-      HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
-      HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
-      NO_PROXY: "localhost,127.0.0.1,hf-mirror.com"
-      HF_ENDPOINT: "https://hf-mirror.com"
-      HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
-    container:
-      image: whatcanyousee/verl:ngc-cu124-vllm0.8.5-sglang0.4.6-mcore0.12.0-te2.3
-      options: --gpus all --shm-size=10g
-    steps:
-      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-        with:
-          fetch-depth: 0
-      - name: Install the current repository
-        run: |
-          pip3 install --no-deps -e .[test]
-      - name: Prepare GSM8K dataset
-        run: |
-          python3 examples/data_preprocess/gsm8k.py
-      - name: Running GSM8K E2E training tests with 3D parallelism on 8 L20 GPUs with Megatron (Qwen) with validation and saving
-        run: |
-          ray stop --force
-          ALL_OFFLOAD=True VAL_BEFORE_TRAIN=True TEST_FREQ=1 SAVE_FREQ=1 bash tests/e2e/run_ppo_trainer_megatron.sh
-      - name: Running GSM8K E2E training tests with 3D parallelism on 8 L20 GPUs with Megatron (Qwen) after resuming
-        run: |
-          ray stop --force
-          RESUME_MODE=auto TOT_TRAIN_STEPS=2 bash tests/e2e/run_ppo_trainer_megatron.sh
-      - name: Test Megatron checkpoints merging function (Qwen Actor and Critic)
-        run: |
-          exp_name="qwen2.5-0.5b-megatron-gsm8k-minimal"
-          python scripts/model_merger.py test --backend megatron --tie-word-embedding --local_dir checkpoints/verl-test/${exp_name}/global_step_1/actor --test_hf_dir checkpoints/verl-test/${exp_name}/global_step_1/actor/huggingface
-          python scripts/model_merger.py test --backend megatron --is-value-model --local_dir checkpoints/verl-test/${exp_name}/global_step_1/critic --test_hf_dir checkpoints/verl-test/${exp_name}/global_step_1/critic/huggingface
-      - name: Running GRPO GSM8K E2E training tests with 3D parallelism on 8 L20 GPUs with Megatron (Qwen)
-        run: |
-          ray stop --force
-          ADV_ESTIMATOR=grpo TOT_TRAIN_STEPS=2 bash tests/e2e/run_ppo_trainer_megatron.sh
-      - name: clean up
-        run: |
-          rm -rf checkpoints
   e2e_ppo_trainer_megatron-deepseek:
     runs-on: [L20x8]
-    timeout-minutes: 60 # Increase this timeout value as needed
+    timeout-minutes: 90 # Increase this timeout value as needed
     env:
       HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
       HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
@@ -111,11 +69,11 @@ jobs:
       - name: Running GSM8K E2E training tests with 3D parallelism on 8 L20 GPUs with Megatron (DeepSeek)
         run: |
           ray stop --force
-          RESUME_MODE=auto MODEL_ID=deepseek-ai/deepseek-coder-1.3b-instruct TOT_TRAIN_STEPS=2 bash tests/e2e/run_ppo_trainer_megatron.sh
+          RESUME_MODE=auto MODEL_ID=deepseek-ai/deepseek-coder-1.3b-instruct TOTAL_TRAIN_STEPS=2 bash tests/e2e/run_ppo_trainer_megatron.sh
       - name: Running GRPO GSM8K E2E training tests with 3D parallelism on 8 L20 GPUs with Megatron (Deepseek)
         run: |
           ray stop --force
-          ADV_ESTIMATOR=grpo MODEL_ID=deepseek-ai/deepseek-coder-1.3b-instruct TOT_TRAIN_STEPS=2 bash tests/e2e/run_ppo_trainer_megatron.sh
+          ADV_ESTIMATOR=grpo MODEL_ID=deepseek-ai/deepseek-coder-1.3b-instruct TOTAL_TRAIN_STEPS=2 bash tests/e2e/run_ppo_trainer_megatron.sh
       - name: Test Megatron checkpoints merging function (DeepSeek Actor and Critic)
         run: |
           exp_name="deepseek-coder-1.3b-instruct-megatron-gsm8k-minimal"
@@ -126,7 +84,7 @@ jobs:
           rm -rf checkpoints
   e2e_ppo_trainer_megatron-qwen3:
     runs-on: [L20x8]
-    timeout-minutes: 30 # Increase this timeout value as needed
+    timeout-minutes: 90 # Increase this timeout value as needed
     env:
       HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
       HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
@@ -166,42 +124,9 @@ jobs:
       - name: clean up
         run: |
           rm -rf checkpoints
-  e2e_ppo_trainer_megatron-different-train-infer-tp-qwen:
-    runs-on: [L20x8]
-    timeout-minutes: 60 # Increase this timeout value as needed
-    env:
-      HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
-      HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
-      NO_PROXY: "localhost,127.0.0.1,hf-mirror.com"
-      HF_ENDPOINT: "https://hf-mirror.com"
-      HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
-    container:
-      image: whatcanyousee/verl:ngc-cu124-vllm0.8.5-sglang0.4.6-mcore0.12.0-te2.3
-      options: --gpus all --shm-size=10g
-    steps:
-      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-        with:
-          fetch-depth: 0
-      - name: Install the current repository
-        run: |
-          pip3 install --no-deps -e .[test]
-      - name: Prepare GSM8K dataset
-        run: |
-          python3 examples/data_preprocess/gsm8k.py
-      - name: Running GSM8K E2E training tests with 3D parallelism on 8 L20 GPUs with Megatron (Qwen) with train tp > infer tp
-        run: |
-          ray stop --force
-          VAL_BEFORE_TRAIN=True TEST_FREQ=1 SAVE_FREQ=1 TRAIN_TP=2 INFER_TP=1 bash tests/e2e/run_ppo_trainer_megatron.sh
-      - name: Running GSM8K E2E training tests with 3D parallelism on 8 L20 GPUs with Megatron (Qwen) with train tp < infer tp
-        run: |
-          ray stop --force
-          VAL_BEFORE_TRAIN=True TEST_FREQ=1 SAVE_FREQ=1 TRAIN_TP=1 INFER_TP=2 bash tests/e2e/run_ppo_trainer_megatron.sh
-      - name: clean up
-        run: |
-          rm -rf checkpoints
   e2e_ppo_trainer_megatron-different-train-infer-tp-qwen-tie-embedding:
     runs-on: [L20x8]
-    timeout-minutes: 60 # Increase this timeout value as needed
+    timeout-minutes: 90 # Increase this timeout value as needed
     env:
       HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
       HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
@@ -234,7 +159,7 @@ jobs:
           rm -rf checkpoints
   e2e_ppo_trainer_megatron-qwen-override-transformer-config:
     runs-on: [L20x8]
-    timeout-minutes: 60 # Increase this timeout value as needed
+    timeout-minutes: 90 # Increase this timeout value as needed
     env:
       HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
       HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
@@ -273,7 +198,7 @@ jobs:
           rm -rf checkpoints
   e2e_ppo_trainer_megatron-deepseek-override-transformer-config:
     runs-on: [L20x8]
-    timeout-minutes: 60 # Increase this timeout value as needed
+    timeout-minutes: 90 # Increase this timeout value as needed
     env:
       HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
      HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
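
Beyond removing the two Qwen jobs, the hunks above raise each shown job's timeout-minutes to 90 and rename TOT_TRAIN_STEPS to TOTAL_TRAIN_STEPS in the step commands, matching the rename in the test scripts below. A minimal sketch of how the workflow-side value reaches the trainer (all names are taken from this diff):

# The workflow step sets the variable for a single invocation:
TOTAL_TRAIN_STEPS=2 bash tests/e2e/run_ppo_trainer_megatron.sh
# The script applies a default when the variable is unset:
#   TOTAL_TRAIN_STEPS=${TOTAL_TRAIN_STEPS:-1}
# and forwards it to the trainer as a Hydra override:
#   trainer.total_training_steps="${TOTAL_TRAIN_STEPS}"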

tests/e2e/ppo_trainer/run_function_reward.sh

Lines changed: 2 additions & 2 deletions
@@ -29,7 +29,7 @@ TEST_FREQ=${TEST_FREQ:--1}
 # Save & Resume
 RESUME_MODE=${RESUME_MODE:-disable}
 SAVE_FREQ=${SAVE_FREQ:--1}
-TOT_TRAIN_STEPS=${TOT_TRAIN_STEPS:-1}
+TOTAL_TRAIN_STEPS=${TOTAL_TRAIN_STEPS:-1}

 # whether to save hf_model
 SAVE_HF_MODEL=${SAVE_HF_MODEL:-False}
@@ -115,7 +115,7 @@ python3 -m verl.trainer.main_ppo \
     trainer.save_freq="${SAVE_FREQ}" \
     trainer.resume_mode="${RESUME_MODE}" \
     trainer.total_epochs=2 \
-    trainer.total_training_steps="${TOT_TRAIN_STEPS}" $@ \
+    trainer.total_training_steps="${TOTAL_TRAIN_STEPS}" $@ \
     | tee "${output_file}"

 if [ "${CUSTOM_REWARD_FN}" = "True" ]; then

tests/e2e/ppo_trainer/run_model_reward.sh

Lines changed: 2 additions & 2 deletions
@@ -20,7 +20,7 @@ TEST_FREQ=${TEST_FREQ:--1}
 # Save & Resume
 RESUME_MODE=${RESUME_MODE:-disable}
 SAVE_FREQ=${SAVE_FREQ:--1}
-TOT_TRAIN_STEPS=${TOT_TRAIN_STEPS:-1}
+TOTAL_TRAIN_STEPS=${TOTAL_TRAIN_STEPS:-1}

 train_traj_micro_bsz_per_gpu=2 # b
 n_resp_per_prompt=4 # g
@@ -94,4 +94,4 @@ python3 -m verl.trainer.main_ppo \
     trainer.save_freq="${SAVE_FREQ}" \
     trainer.resume_mode="${RESUME_MODE}" \
     trainer.total_epochs=2 \
-    trainer.total_training_steps="${TOT_TRAIN_STEPS}" $@
+    trainer.total_training_steps="${TOTAL_TRAIN_STEPS}" $@
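
Both reward scripts end the trainer invocation with a bare $@, which forwards any extra command-line arguments as additional Hydra overrides. A usage sketch (the override values here are hypothetical):

# Extra arguments pass through $@ as Hydra overrides, so a one-off run can
# tweak settings without editing the script:
TOTAL_TRAIN_STEPS=2 bash tests/e2e/ppo_trainer/run_model_reward.sh \
    trainer.total_epochs=1 \
    trainer.test_freq=1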

tests/e2e/run_ppo_trainer_megatron.sh

Lines changed: 76 additions & 72 deletions
@@ -19,7 +19,7 @@ TEST_FREQ=${TEST_FREQ:--1}
 # Save & Resume
 RESUME_MODE=${RESUME_MODE:-disable}
 SAVE_FREQ=${SAVE_FREQ:--1}
-TOT_TRAIN_STEPS=${TOT_TRAIN_STEPS:-1}
+TOTAL_TRAIN_STEPS=${TOTAL_TRAIN_STEPS:-1}

 train_traj_micro_bsz_per_gpu=2 # b
 n_resp_per_prompt=4 # g
@@ -75,76 +75,80 @@ if [ $SKIP_SAVE_HF_MODEL -eq 1 ]; then
     CHECKPOINT_CONTENTS=['model','optimizer','extra']
 fi

+ENGINES=${ENGINES:-['vllm', 'sglang', 'sglang_async']}
+
 exp_name="$(basename "${MODEL_ID,,}")-megatron-gsm8k-minimal"

-python3 -m verl.trainer.main_ppo --config-path=config \
-    --config-name='ppo_megatron_trainer.yaml'\
-    algorithm.adv_estimator="${ADV_ESTIMATOR}" \
-    data.train_files="${TRAIN_FILES}" \
-    data.val_files="${VAL_FILES}" \
-    data.train_batch_size=${train_prompt_bsz} \
-    data.max_prompt_length=512 \
-    data.max_response_length=512 \
-    data.filter_overlong_prompts=True \
-    data.truncation='error' \
-    actor_rollout_ref.model.path="${MODEL_PATH}" \
-    actor_rollout_ref.actor.optim.lr=1e-6 \
-    actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
-    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=${train_traj_micro_bsz_per_gpu} \
-    actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=$ACTOR_PP \
-    actor_rollout_ref.actor.megatron.virtual_pipeline_model_parallel_size=$ACTOR_VPP \
-    actor_rollout_ref.actor.megatron.context_parallel_size=$ACTOR_CP \
-    actor_rollout_ref.actor.megatron.tensor_model_parallel_size=$ACTOR_TP \
-    actor_rollout_ref.actor.megatron.param_offload=${ACTOR_PARAM_OFFLOAD} \
-    actor_rollout_ref.actor.megatron.optimizer_offload=${ACTOR_OPTIMIZER_OFFLOAD} \
-    actor_rollout_ref.actor.megatron.grad_offload=${ACTOR_GRAD_OFFLOAD} \
-    actor_rollout_ref.actor.use_kl_loss=True \
-    actor_rollout_ref.actor.kl_loss_coef=0.001 \
-    actor_rollout_ref.actor.kl_loss_type=low_var_kl \
-    actor_rollout_ref.actor.checkpoint.contents=$CHECKPOINT_CONTENTS \
-    actor_rollout_ref.rollout.name="${ENGINE}" \
-    actor_rollout_ref.rollout.tensor_model_parallel_size=$ROLLOUT_TP \
-    actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
-    actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
-    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=${train_traj_micro_bsz_per_gpu} \
-    actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=$REF_PP \
-    actor_rollout_ref.ref.megatron.virtual_pipeline_model_parallel_size=$REF_VPP \
-    actor_rollout_ref.ref.megatron.context_parallel_size=$REF_CP \
-    actor_rollout_ref.ref.megatron.tensor_model_parallel_size=$REF_TP \
-    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=${train_traj_micro_bsz_per_gpu} \
-    actor_rollout_ref.ref.megatron.param_offload=${REF_PARAM_OFFLOAD} \
-    critic.optim.lr=2e-5 \
-    critic.model.path="${MODEL_PATH}" \
-    critic.model.enable_gradient_checkpointing=False \
-    critic.ppo_micro_batch_size_per_gpu=${train_traj_micro_bsz_per_gpu} \
-    critic.megatron.pipeline_model_parallel_size=$CRITIC_PP \
-    critic.megatron.virtual_pipeline_model_parallel_size=$CRITIC_VPP \
-    critic.megatron.context_parallel_size=$CRITIC_CP \
-    critic.megatron.tensor_model_parallel_size=$CRITIC_TP \
-    critic.checkpoint.contents=$CHECKPOINT_CONTENTS \
-    critic.megatron.param_offload=${CRITIC_PARAM_OFFLOAD} \
-    critic.megatron.optimizer_offload=${CRITIC_OPTIMIZER_OFFLOAD} \
-    critic.megatron.grad_offload=${CRITIC_GRAD_OFFLOAD} \
-    reward_model.enable=True \
-    reward_model.model.path="${MODEL_PATH}" \
-    reward_model.micro_batch_size_per_gpu=${train_traj_micro_bsz_per_gpu} \
-    reward_model.megatron.pipeline_model_parallel_size=$RM_PP \
-    reward_model.megatron.virtual_pipeline_model_parallel_size=$RM_VPP \
-    reward_model.megatron.context_parallel_size=$RM_CP \
-    reward_model.megatron.tensor_model_parallel_size=$RM_TP \
-    reward_model.megatron.param_offload=${RM_PARAM_OFFLOAD} \
-    algorithm.use_kl_in_reward=False \
-    algorithm.kl_penalty=kl \
-    algorithm.kl_ctrl.kl_coef=0.001 \
-    trainer.critic_warmup=0 \
-    trainer.logger=['console'] \
-    trainer.project_name='verl-test' \
-    trainer.experiment_name="${exp_name}" \
-    trainer.nnodes=1 \
-    trainer.n_gpus_per_node=${NUM_GPUS} \
-    trainer.val_before_train="${VAL_BEFORE_TRAIN}" \
-    trainer.test_freq="${TEST_FREQ}" \
-    trainer.save_freq="${SAVE_FREQ}" \
-    trainer.resume_mode="${RESUME_MODE}" \
-    trainer.total_epochs=2 \
-    trainer.total_training_steps="${TOT_TRAIN_STEPS}" $@
+for ENGINE in "${ENGINES[@]}"; do
+python3 -m verl.trainer.main_ppo --config-path=config \
+    --config-name='ppo_megatron_trainer.yaml'\
+    algorithm.adv_estimator="${ADV_ESTIMATOR}" \
+    data.train_files="${TRAIN_FILES}" \
+    data.val_files="${VAL_FILES}" \
+    data.train_batch_size=${train_prompt_bsz} \
+    data.max_prompt_length=512 \
+    data.max_response_length=512 \
+    data.filter_overlong_prompts=True \
+    data.truncation='error' \
+    actor_rollout_ref.model.path="${MODEL_PATH}" \
+    actor_rollout_ref.actor.optim.lr=1e-6 \
+    actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
+    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=${train_traj_micro_bsz_per_gpu} \
+    actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=$ACTOR_PP \
+    actor_rollout_ref.actor.megatron.virtual_pipeline_model_parallel_size=$ACTOR_VPP \
+    actor_rollout_ref.actor.megatron.context_parallel_size=$ACTOR_CP \
+    actor_rollout_ref.actor.megatron.tensor_model_parallel_size=$ACTOR_TP \
+    actor_rollout_ref.actor.megatron.param_offload=${ACTOR_PARAM_OFFLOAD} \
+    actor_rollout_ref.actor.megatron.optimizer_offload=${ACTOR_OPTIMIZER_OFFLOAD} \
+    actor_rollout_ref.actor.megatron.grad_offload=${ACTOR_GRAD_OFFLOAD} \
+    actor_rollout_ref.actor.use_kl_loss=True \
+    actor_rollout_ref.actor.kl_loss_coef=0.001 \
+    actor_rollout_ref.actor.kl_loss_type=low_var_kl \
+    actor_rollout_ref.actor.checkpoint.contents=$CHECKPOINT_CONTENTS \
+    actor_rollout_ref.rollout.name="${ENGINE}" \
+    actor_rollout_ref.rollout.tensor_model_parallel_size=$ROLLOUT_TP \
+    actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
+    actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
+    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=${train_traj_micro_bsz_per_gpu} \
+    actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=$REF_PP \
+    actor_rollout_ref.ref.megatron.virtual_pipeline_model_parallel_size=$REF_VPP \
+    actor_rollout_ref.ref.megatron.context_parallel_size=$REF_CP \
+    actor_rollout_ref.ref.megatron.tensor_model_parallel_size=$REF_TP \
+    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=${train_traj_micro_bsz_per_gpu} \
+    actor_rollout_ref.ref.megatron.param_offload=${REF_PARAM_OFFLOAD} \
+    critic.optim.lr=2e-5 \
+    critic.model.path="${MODEL_PATH}" \
+    critic.model.enable_gradient_checkpointing=False \
+    critic.ppo_micro_batch_size_per_gpu=${train_traj_micro_bsz_per_gpu} \
+    critic.megatron.pipeline_model_parallel_size=$CRITIC_PP \
+    critic.megatron.virtual_pipeline_model_parallel_size=$CRITIC_VPP \
+    critic.megatron.context_parallel_size=$CRITIC_CP \
+    critic.megatron.tensor_model_parallel_size=$CRITIC_TP \
+    critic.checkpoint.contents=$CHECKPOINT_CONTENTS \
+    critic.megatron.param_offload=${CRITIC_PARAM_OFFLOAD} \
+    critic.megatron.optimizer_offload=${CRITIC_OPTIMIZER_OFFLOAD} \
+    critic.megatron.grad_offload=${CRITIC_GRAD_OFFLOAD} \
+    reward_model.enable=True \
+    reward_model.model.path="${MODEL_PATH}" \
+    reward_model.micro_batch_size_per_gpu=${train_traj_micro_bsz_per_gpu} \
+    reward_model.megatron.pipeline_model_parallel_size=$RM_PP \
+    reward_model.megatron.virtual_pipeline_model_parallel_size=$RM_VPP \
+    reward_model.megatron.context_parallel_size=$RM_CP \
+    reward_model.megatron.tensor_model_parallel_size=$RM_TP \
+    reward_model.megatron.param_offload=${RM_PARAM_OFFLOAD} \
+    algorithm.use_kl_in_reward=False \
+    algorithm.kl_penalty=kl \
+    algorithm.kl_ctrl.kl_coef=0.001 \
+    trainer.critic_warmup=0 \
+    trainer.logger=['console'] \
+    trainer.project_name='verl-test' \
+    trainer.experiment_name="${exp_name}" \
+    trainer.nnodes=1 \
+    trainer.n_gpus_per_node=${NUM_GPUS} \
+    trainer.val_before_train="${VAL_BEFORE_TRAIN}" \
+    trainer.test_freq="${TEST_FREQ}" \
+    trainer.save_freq="${SAVE_FREQ}" \
+    trainer.resume_mode="${RESUME_MODE}" \
+    trainer.total_epochs=2 \
+    trainer.total_training_steps="${TOTAL_TRAIN_STEPS}" $@
+done
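
One caveat on the new loop: ENGINES=${ENGINES:-['vllm', 'sglang', 'sglang_async']} assigns a single literal string (brackets and commas included), not a bash array, so "${ENGINES[@]}" expands to that one string and the loop body runs exactly once. A minimal sketch of the conventional defaultable-array pattern, assuming the intent is one training run per rollout engine (an assumption about intent, not what this commit does):

# Hypothetical sketch, NOT the committed code: a bash array with a default.
if [ -z "${ENGINES+x}" ]; then          # only when ENGINES is unset
    ENGINES=(vllm sglang sglang_async)  # default engine list
fi
for ENGINE in "${ENGINES[@]}"; do       # one iteration per array element
    echo "would launch training with rollout engine: ${ENGINE}"
done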
