1- name : e2e_ppo_grpo_trainer_trtllm_bug_repro
1+ name : e2e_ppo_grpo_trainer_trtllm_cpu_affinity_test
22
33on :
44 # Trigger the workflow on push or pull request,
1313 - " !recipe/**"
1414 - " !verl/workers/**/*dp_*.py"
1515 - " verl/workers/rollout/trtllm_rollout/*"
16- - " .github/workflows/e2e_ppo_grpo_trainer_trtllm_bug_repro .yml"
16+ - " .github/workflows/e2e_ppo_grpo_trainer_trtllm_cpu_affinity_test .yml"
1717 pull_request :
1818 branches :
1919 - main
3030 - " !recipe/**"
3131 - " !verl/workers/**/*dp_*.py"
3232 - " verl/workers/rollout/trtllm_rollout/*"
33- - " .github/workflows/e2e_ppo_grpo_trainer_trtllm_bug_repro .yml"
33+ - " .github/workflows/e2e_ppo_grpo_trainer_trtllm_cpu_affinity_test .yml"
3434 workflow_dispatch : # Allow manual triggering
3535
3636# Cancel jobs on the same ref if a new one is triggered
6262 faas-url : " ${{ env.DYNAMIC_RUNNER_ENDPOINT }}"
6363 mlp-image : " ${{ env.IMAGE }}"
6464
65- e2e_grpo_trainer_trtllm_bug_repro :
65+ e2e_grpo_trainer_trtllm_cpu_affinity_test :
6666 needs : setup
6767 runs-on : ["${{ needs.setup.outputs.runner-label || 'L20x8' }}"]
6868 timeout-minutes : 30
9393 - name : Prepare GSM8K dataset
9494 run : |
9595 python3 examples/data_preprocess/gsm8k.py --local_dataset_path ${HOME}/models/hf_data/gsm8k --local_save_dir ${PWD}/data/gsm8k
96- - name : Running GRPO E2E training test with TensorRT-LLM async rollout (Bug Repro )
96+ - name : Running GRPO E2E training test with TensorRT-LLM async rollout (CPU Affinity Test )
9797 run : |
9898 ray stop --force
9999 DATADIR=${HOME}/data \
@@ -143,7 +143,7 @@ jobs:
143143 trainer.critic_warmup=0 \
144144 trainer.logger='["console"]' \
145145 trainer.project_name="verl_grpo_molecular_transformer" \
146- trainer.experiment_name="qwen2.5-0.5b-trtllm-fsdp2-4gpus-bug-repro " \
146+ trainer.experiment_name="qwen2.5-0.5b-trtllm-fsdp2-4gpus-cpu-affinity-test " \
147147 trainer.n_gpus_per_node=4 \
148148 trainer.nnodes=1 \
149149 trainer.save_freq=10 \
@@ -156,7 +156,7 @@ jobs:
156156
157157 cleanup :
158158 runs-on : ubuntu-latest
159- needs : [setup, e2e_grpo_trainer_trtllm_bug_repro ]
159+ needs : [setup, e2e_grpo_trainer_trtllm_cpu_affinity_test ]
160160 if : always()
161161 steps :
162162 - id : destroy-runner
0 commit comments