Skip to content

Commit 086a1e3

Browse files
authored
Add workflow for bug reproduction in GRPO training
This workflow is designed to reproduce a bug scenario from the local test script by running GRPO training with the TensorRT-LLM rollout backend and FSDP2.
1 parent f8b73fd commit 086a1e3

File tree

1 file changed

+184
-0
lines changed

1 file changed

+184
-0
lines changed

.github/workflows/main.yml

Lines changed: 184 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,184 @@
---
# Workflow to reproduce bug from test_trtllm_rollout_local_cpu.sh
# This workflow runs GRPO training with TensorRT-LLM rollout backend and FSDP2
# to reproduce the bug scenario from the local test script

name: e2e_ppo_grpo_trainer_trtllm_bug_repro

on:
  # Trigger the workflow on push or pull request,
  # but only for the main branch.
  push:
    branches:
      - main
      - "v0.*"
    paths:
      - "**/*.py"
      - "!verl/trainer/fsdp_sft_trainer.py"
      - "!recipe/**"
      - "!verl/workers/**/*dp_*.py"
  pull_request:
    branches:
      - main
      - "v0.*"
    paths:
      - "**/*.py"
      - "!docker/**"
      - "!**/*.md"
      - "!docs/**"
      - "!examples/**"
      - "!tests/**"
      - "!verl/trainer/main_*.py"
      - "!verl/trainer/fsdp_sft_trainer.py"
      - "!recipe/**"
      - "!verl/workers/**/*dp_*.py"
      # Entrypoints related to this bug reproduction
      - "verl/workers/rollout/trtllm_rollout/*"
      - "verl/trainer/main_ppo.py"
      # NOTE(review): must reference this workflow's actual path. The file
      # lives at .github/workflows/main.yml, so the previous value
      # (.github/workflows/e2e_ppo_grpo_trainer_trtllm_bug_repro.yml) never
      # matched and edits to this workflow could not re-trigger it.
      # Alternative fix: rename the file to match the workflow name — confirm
      # which is intended.
      - ".github/workflows/main.yml"
  # Allow manual trigger for debugging
  workflow_dispatch:

# Cancel jobs on the same ref if a new one is triggered
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}

# Declare permissions just read content.
permissions:
  contents: read

env:
  IMAGE: "verl-ci-cn-beijing.cr.volces.com/verlai/verl:trtllm1.2.0rc6"
  DYNAMIC_RUNNER_ENDPOINT: "https://sd10g3clalm04ug7alq90.apigateway-cn-beijing.volceapi.com/runner"

jobs:
  # Provision a dynamic VeMLP runner; its label is consumed by the training job
  # and its task id by the cleanup job.
  setup:
    if: github.repository_owner == 'verl-project'
    runs-on: ubuntu-latest
    outputs:
      runner-label: ${{ steps.create-runner.outputs.runner-label }}
      mlp-task-id: ${{ steps.create-runner.outputs.mlp-task-id }}
    steps:
      - uses: actions/checkout@v4
      - id: create-runner
        uses: volcengine/vemlp-github-runner@v1
        with:
          mode: "create"
          faas-url: "${{ env.DYNAMIC_RUNNER_ENDPOINT }}"
          mlp-image: "${{ env.IMAGE }}"

  e2e_grpo_trainer_trtllm_bug_repro:
    needs: setup
    # Fall back to the static 'L20x8' label if the dynamic runner was not created
    runs-on: ["${{ needs.setup.outputs.runner-label || 'L20x8' }}"]
    timeout-minutes: 60  # Increased timeout for bug reproduction
    env:
      HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
      HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
      NO_PROXY: "localhost,127.0.0.1,hf-mirror.com"
      HF_ENDPOINT: "https://hf-mirror.com"
      HF_HUB_ENABLE_HF_TRANSFER: "0"  # This is more stable
      # Environment variables from the bug reproduction script
      TOKENIZERS_PARALLELISM: "true"
      NCCL_DEBUG: "WARN"
      TRT_LLM_DISABLE_LOAD_WEIGHTS_IN_PARALLEL: "1"
      RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES: "1"
      RAY_DEDUP_LOGS: "0"
      RAY_IGNORE_UNHANDLED_ERRORS: "1"
      HYDRA_FULL_ERROR: "1"
      RAY_OBJECT_STORE_ALLOW_SLOW_STORAGE: "1"
      TRTLLM_LOGGER_LEVEL: "WARNING"
    steps:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683  # v4.2.2
        with:
          fetch-depth: 0
      - name: Install the current repository
        run: |
          pip3 install -r requirements-test.txt
          pip3 install --no-deps -e .
      - name: Prepare GSM8K dataset
        run: |
          python3 examples/data_preprocess/gsm8k.py --local_dataset_path ${HOME}/models/hf_data/gsm8k --local_save_dir ${PWD}/data/gsm8k
      - name: Set up TensorRT-LLM engine directory
        run: |
          export TRTLLM_ENGINE_DIR=${TRTLLM_ENGINE_DIR:-/tmp/trtllm_engines}
          mkdir -p "$TRTLLM_ENGINE_DIR"
          echo "TRTLLM_ENGINE_DIR=$TRTLLM_ENGINE_DIR" >> $GITHUB_ENV
      - name: Running GRPO E2E training test to reproduce bug (Qwen2.5-7B, FSDP2, TensorRT-LLM async)
        run: |
          ray stop --force
          # Set CUDA_VISIBLE_DEVICES to use 4 GPUs (matching the bug reproduction script)
          export CUDA_VISIBLE_DEVICES=0,1,2,3

          # Run the exact command from the bug reproduction script
          # Using GSM8K dataset as a substitute for the original training data
          python3 -m verl.trainer.main_ppo \
            algorithm.adv_estimator=grpo \
            algorithm.rollout_correction.rollout_is_threshold=2.0 \
            data.train_files="['${PWD}/data/gsm8k/train.parquet']" \
            data.val_files="['${PWD}/data/gsm8k/test.parquet']" \
            data.train_batch_size=512 \
            data.max_prompt_length=4096 \
            data.max_response_length=1024 \
            data.return_raw_chat=True \
            data.filter_overlong_prompts=True \
            data.truncation='error' \
            actor_rollout_ref.hybrid_engine=True \
            actor_rollout_ref.model.path="${HOME}/models/Qwen/Qwen2.5-7B-Instruct" \
            actor_rollout_ref.model.trust_remote_code=True \
            actor_rollout_ref.actor.optim.lr=1e-6 \
            actor_rollout_ref.model.use_remove_padding=True \
            actor_rollout_ref.actor.ppo_mini_batch_size=128 \
            actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=8 \
            actor_rollout_ref.actor.use_kl_loss=True \
            actor_rollout_ref.actor.kl_loss_coef=0.001 \
            actor_rollout_ref.actor.kl_loss_type=low_var_kl \
            actor_rollout_ref.actor.entropy_coeff=0 \
            actor_rollout_ref.model.enable_gradient_checkpointing=True \
            actor_rollout_ref.actor.strategy=fsdp2 \
            actor_rollout_ref.actor.fsdp_config.param_offload=False \
            actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
            actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=8 \
            actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
            actor_rollout_ref.rollout.name=trtllm \
            actor_rollout_ref.rollout.mode="async" \
            actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
            actor_rollout_ref.rollout.n=5 \
            actor_rollout_ref.rollout.max_num_seqs=256 \
            actor_rollout_ref.rollout.max_num_batched_tokens=16384 \
            +actor_rollout_ref.rollout.engine_kwargs.trtllm.batch_wait_timeout_iters=32 \
            +actor_rollout_ref.rollout.engine_kwargs.trtllm.batch_wait_max_tokens_ratio=0.5 \
            actor_rollout_ref.rollout.calculate_log_probs=True \
            actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=8 \
            actor_rollout_ref.ref.strategy=fsdp2 \
            actor_rollout_ref.ref.fsdp_config.param_offload=True \
            algorithm.use_kl_in_reward=False \
            reward_manager.name=naive \
            reward_manager.source=register \
            trainer.critic_warmup=0 \
            trainer.logger='["console"]' \
            trainer.project_name="verl_grpo_bug_repro" \
            trainer.experiment_name="qwen2.5-7b-trtllm-fsdp2-4gpus-bug-repro" \
            trainer.n_gpus_per_node=4 \
            trainer.nnodes=1 \
            trainer.save_freq=10 \
            trainer.test_freq=5 \
            trainer.resume_mode=disable \
            trainer.total_training_steps=1
      - name: Clean up
        if: always()
        run: |
          rm -rf checkpoints
          rm -rf ${TRTLLM_ENGINE_DIR:-/tmp/trtllm_engines}/*

  # Always tear down the dynamic runner, even if training failed.
  cleanup:
    runs-on: ubuntu-latest
    needs: [setup, e2e_grpo_trainer_trtllm_bug_repro]
    if: always()
    steps:
      - id: destroy-runner
        uses: volcengine/vemlp-github-runner@v1
        with:
          mode: "destroy"
          faas-url: "${{ env.DYNAMIC_RUNNER_ENDPOINT }}"
          mlp-task-id: "${{ needs.setup.outputs.mlp-task-id }}"

0 commit comments

Comments (0)