Skip to content

Commit cabfe26

Browse files
committed
Update cpu affinity workflow
1 parent f904162 commit cabfe26

File tree

2 files changed

+172
-1
lines changed

2 files changed

+172
-1
lines changed
Lines changed: 168 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,168 @@
# CI workflow: reproduce a TensorRT-LLM async-rollout bug via a minimal GRPO
# E2E training run on a dynamically provisioned Volcengine GPU runner.
name: e2e_ppo_grpo_trainer_trtllm_bug_repro

on:
  # Trigger the workflow on push or pull request,
  # but only for the main branch.
  push:
    branches:
      - main
      - "v0.*"
    paths:
      - "**/*.py"
      - "!verl/trainer/fsdp_sft_trainer.py"
      - "!recipe/**"
      - "!verl/workers/**/*dp_*.py"
      - "verl/workers/rollout/trtllm_rollout/*"
      - ".github/workflows/e2e_ppo_grpo_trainer_trtllm_bug_repro.yml"
  pull_request:
    branches:
      - main
      - "v0.*"
    paths:
      - "**/*.py"
      - "!docker/**"
      - "!**/*.md"
      - "!docs/**"
      - "!examples/**"
      - "!tests/**"
      - "!verl/trainer/main_*.py"
      - "!verl/trainer/fsdp_sft_trainer.py"
      - "!recipe/**"
      - "!verl/workers/**/*dp_*.py"
      - "verl/workers/rollout/trtllm_rollout/*"
      - ".github/workflows/e2e_ppo_grpo_trainer_trtllm_bug_repro.yml"
  workflow_dispatch: # Allow manual triggering

# Cancel jobs on the same ref if a new one is triggered
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}

# Declare permissions just read content.
permissions:
  contents: read

env:
  IMAGE: "verl-ci-cn-beijing.cr.volces.com/verlai/verl:trtllm1.2.0rc6"
  DYNAMIC_RUNNER_ENDPOINT: "https://sd10g3clalm04ug7alq90.apigateway-cn-beijing.volceapi.com/runner"

jobs:
  # Provision an ephemeral GPU runner; its label and task id are consumed by
  # the training job and the cleanup job below.
  setup:
    if: github.repository_owner == 'verl-project'
    runs-on: ubuntu-latest
    outputs:
      runner-label: ${{ steps.create-runner.outputs.runner-label }}
      mlp-task-id: ${{ steps.create-runner.outputs.mlp-task-id }}
    steps:
      - uses: actions/checkout@v4
      - id: create-runner
        uses: volcengine/vemlp-github-runner@v1
        with:
          mode: "create"
          faas-url: "${{ env.DYNAMIC_RUNNER_ENDPOINT }}"
          mlp-image: "${{ env.IMAGE }}"

  e2e_grpo_trainer_trtllm_bug_repro:
    needs: setup
    # Fall back to the static 'L20x8' label if runner creation produced none.
    runs-on: ["${{ needs.setup.outputs.runner-label || 'L20x8' }}"]
    timeout-minutes: 30
    env:
      HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
      HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
      NO_PROXY: "localhost,127.0.0.1,hf-mirror.com"
      HF_ENDPOINT: "https://hf-mirror.com"
      HF_HUB_ENABLE_HF_TRANSFER: "0"
      # Ray and TensorRT-LLM environment variables from the bug repro script
      TOKENIZERS_PARALLELISM: "true"
      NCCL_DEBUG: "WARN"
      TRT_LLM_DISABLE_LOAD_WEIGHTS_IN_PARALLEL: "1"
      RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES: "1"
      RAY_DEDUP_LOGS: "0"
      RAY_IGNORE_UNHANDLED_ERRORS: "1"
      HYDRA_FULL_ERROR: "1"
      RAY_OBJECT_STORE_ALLOW_SLOW_STORAGE: "1"
      TRTLLM_LOGGER_LEVEL: "WARNING"
    steps:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          fetch-depth: 0
      - name: Install the current repository
        run: |
          pip3 install -r requirements-test.txt
          pip3 install --no-deps -e .
      - name: Prepare GSM8K dataset
        run: |
          python3 examples/data_preprocess/gsm8k.py --local_dataset_path ${HOME}/models/hf_data/gsm8k --local_save_dir ${PWD}/data/gsm8k
      - name: Running GRPO E2E training test with TensorRT-LLM async rollout (Bug Repro)
        run: |
          ray stop --force
          DATADIR=${HOME}/data \
          python3 -m verl.trainer.main_ppo \
            algorithm.adv_estimator=grpo \
            algorithm.rollout_correction.rollout_is_threshold=2.0 \
            data.train_files="['${PWD}/data/gsm8k/train.parquet']" \
            data.val_files="['${PWD}/data/gsm8k/test.parquet']" \
            data.train_batch_size=512 \
            data.max_prompt_length=4096 \
            data.max_response_length=1024 \
            data.return_raw_chat=True \
            data.filter_overlong_prompts=True \
            data.truncation='error' \
            actor_rollout_ref.hybrid_engine=True \
            actor_rollout_ref.model.path="${HOME}/models/Qwen/Qwen2.5-0.5B-Instruct" \
            actor_rollout_ref.model.trust_remote_code=True \
            actor_rollout_ref.actor.optim.lr=1e-6 \
            actor_rollout_ref.model.use_remove_padding=True \
            actor_rollout_ref.actor.ppo_mini_batch_size=128 \
            actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=8 \
            actor_rollout_ref.actor.use_kl_loss=True \
            actor_rollout_ref.actor.kl_loss_coef=0.001 \
            actor_rollout_ref.actor.kl_loss_type=low_var_kl \
            actor_rollout_ref.actor.entropy_coeff=0 \
            actor_rollout_ref.model.enable_gradient_checkpointing=True \
            actor_rollout_ref.actor.strategy=fsdp2 \
            actor_rollout_ref.actor.fsdp_config.param_offload=False \
            actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
            actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=8 \
            actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
            actor_rollout_ref.rollout.name=trtllm \
            actor_rollout_ref.rollout.mode="async" \
            actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
            actor_rollout_ref.rollout.n=5 \
            actor_rollout_ref.rollout.max_num_seqs=256 \
            actor_rollout_ref.rollout.max_num_batched_tokens=16384 \
            +actor_rollout_ref.rollout.engine_kwargs.trtllm.batch_wait_timeout_iters=32 \
            +actor_rollout_ref.rollout.engine_kwargs.trtllm.batch_wait_max_tokens_ratio=0.5 \
            actor_rollout_ref.rollout.calculate_log_probs=True \
            actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=8 \
            actor_rollout_ref.ref.strategy=fsdp2 \
            actor_rollout_ref.ref.fsdp_config.param_offload=True \
            algorithm.use_kl_in_reward=False \
            reward_manager.name=naive \
            reward_manager.source=register \
            trainer.critic_warmup=0 \
            trainer.logger='["console"]' \
            trainer.project_name="verl_grpo_molecular_transformer" \
            trainer.experiment_name="qwen2.5-0.5b-trtllm-fsdp2-4gpus-bug-repro" \
            trainer.n_gpus_per_node=4 \
            trainer.nnodes=1 \
            trainer.save_freq=10 \
            trainer.test_freq=5 \
            trainer.resume_mode=disable \
            trainer.total_training_steps=1
      - name: clean up
        run: |
          rm -rf checkpoints

  # Always tear down the ephemeral runner, even if the training job failed.
  cleanup:
    runs-on: ubuntu-latest
    needs: [setup, e2e_grpo_trainer_trtllm_bug_repro]
    if: always()
    steps:
      - id: destroy-runner
        uses: volcengine/vemlp-github-runner@v1
        with:
          mode: "destroy"
          faas-url: "${{ env.DYNAMIC_RUNNER_ENDPOINT }}"
          mlp-task-id: "${{ needs.setup.outputs.mlp-task-id }}"

verl/workers/rollout/trtllm_rollout/trtllm_async_server.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -329,7 +329,10 @@ async def launch_servers(self):
329329
else f"trtllm_server_reward_{self.replica_rank}"
330330
)
331331

332-
runtime_env_vars = {"TLLM_NUMA_AWARE_WORKER_AFFINITY": "0"}
332+
runtime_env_vars = {
333+
"RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES": "1",
334+
# "TLLM_NUMA_AWARE_WORKER_AFFINITY": "0"
335+
}
333336
server = TRTLLMHttpServer.options(
334337
scheduling_strategy=ray.util.scheduling_strategies.NodeAffinitySchedulingStrategy(
335338
node_id=node_id,

0 commit comments

Comments
 (0)