Skip to content

Commit cf7c69a

Browse files
committed
Update cpu affinity workflow
1 parent 85168b8 commit cf7c69a

File tree

2 files changed

+7
-175
lines changed

2 files changed

+7
-175
lines changed

.github/workflows/e2e_ppo_grpo_trainer_trtllm_cpu_affinity_test.yml

Lines changed: 7 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
1-
name: e2e_ppo_grpo_trainer_trtllm_bug_repro
1+
name: e2e_ppo_grpo_trainer_trtllm_cpu_affinity_test
22

33
on:
44
# Trigger the workflow on push or pull request,
@@ -13,7 +13,7 @@ on:
1313
- "!recipe/**"
1414
- "!verl/workers/**/*dp_*.py"
1515
- "verl/workers/rollout/trtllm_rollout/*"
16-
- ".github/workflows/e2e_ppo_grpo_trainer_trtllm_bug_repro.yml"
16+
- ".github/workflows/e2e_ppo_grpo_trainer_trtllm_cpu_affinity_test.yml"
1717
pull_request:
1818
branches:
1919
- main
@@ -30,7 +30,7 @@ on:
3030
- "!recipe/**"
3131
- "!verl/workers/**/*dp_*.py"
3232
- "verl/workers/rollout/trtllm_rollout/*"
33-
- ".github/workflows/e2e_ppo_grpo_trainer_trtllm_bug_repro.yml"
33+
- ".github/workflows/e2e_ppo_grpo_trainer_trtllm_cpu_affinity_test.yml"
3434
workflow_dispatch: # Allow manual triggering
3535

3636
# Cancel jobs on the same ref if a new one is triggered
@@ -62,7 +62,7 @@ jobs:
6262
faas-url: "${{ env.DYNAMIC_RUNNER_ENDPOINT }}"
6363
mlp-image: "${{ env.IMAGE }}"
6464

65-
e2e_grpo_trainer_trtllm_bug_repro:
65+
e2e_grpo_trainer_trtllm_cpu_affinity_test:
6666
needs: setup
6767
runs-on: ["${{ needs.setup.outputs.runner-label || 'L20x8' }}"]
6868
timeout-minutes: 30
@@ -93,7 +93,7 @@ jobs:
9393
- name: Prepare GSM8K dataset
9494
run: |
9595
python3 examples/data_preprocess/gsm8k.py --local_dataset_path ${HOME}/models/hf_data/gsm8k --local_save_dir ${PWD}/data/gsm8k
96-
- name: Running GRPO E2E training test with TensorRT-LLM async rollout (Bug Repro)
96+
- name: Running GRPO E2E training test with TensorRT-LLM async rollout (CPU Affinity Test)
9797
run: |
9898
ray stop --force
9999
DATADIR=${HOME}/data \
@@ -143,7 +143,7 @@ jobs:
143143
trainer.critic_warmup=0 \
144144
trainer.logger='["console"]' \
145145
trainer.project_name="verl_grpo_molecular_transformer" \
146-
trainer.experiment_name="qwen2.5-0.5b-trtllm-fsdp2-4gpus-bug-repro" \
146+
trainer.experiment_name="qwen2.5-0.5b-trtllm-fsdp2-4gpus-cpu-affinity-test" \
147147
trainer.n_gpus_per_node=4 \
148148
trainer.nnodes=1 \
149149
trainer.save_freq=10 \
@@ -156,7 +156,7 @@ jobs:
156156
157157
cleanup:
158158
runs-on: ubuntu-latest
159-
needs: [setup, e2e_grpo_trainer_trtllm_bug_repro]
159+
needs: [setup, e2e_grpo_trainer_trtllm_cpu_affinity_test]
160160
if: always()
161161
steps:
162162
- id: destroy-runner

.github/workflows/main.yml

Lines changed: 0 additions & 168 deletions
This file was deleted.

0 commit comments

Comments
 (0)