[Audio support] merge into main #3
name: e2e_ppo_trainer
on:
  # Trigger the workflow on push or pull request,
  # but only for the main and v0.* branches.
  # For push, only anti-patterns are specified for now, so the filter is more
  # conservative and achieves higher coverage.
  push:
    branches:
      - main
      - v0.*
    paths:
      - "**/*.py"
      # Other entrypoints
      - "!verl/trainer/fsdp_sft_trainer.py"
      # Recipes
      - "!recipe/**"
      # Megatron
      - "!verl/workers/**/megatron_*.py"
  pull_request:
    branches:
      - main
      - v0.*
    paths:
      - "**/*.py"
      # Other entrypoints
      - "!**/*.md"
      - "!docker/**"
      - "!examples/**"
      - "!tests/**"
      - "!verl/trainer/main_*.py"
      - "!verl/trainer/fsdp_sft_trainer.py"
      # Docs
      - "!docs/**"
      # Recipes
      - "!recipe/**"
      # Megatron
      - "!verl/workers/**/megatron_*.py"
      # Entrypoints
      - ".github/workflows/e2e_ppo_trainer.yml"
      - "examples/data_preprocess/gsm8k.py"
      - "examples/data_preprocess/geo3k.py"
      - "tests/special_e2e/ppo_trainer"
      - "verl/trainer/main_ppo.py"
      - "verl/trainer/config/ppo_trainer.yaml"
# Cancel jobs on the same ref if a new one is triggered
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
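# For example, a new push to a pull request re-uses the same group
# ("e2e_ppo_trainer-refs/pull/<N>/merge"), so the previous in-flight run is cancelled;
# on refs/heads/main the expression evaluates to false and existing runs finish.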
# Declare read-only permissions for repository contents.
permissions:
  contents: read
jobs:
  pre_commit_for_ppo:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version: ["3.12"]
    steps:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
        with:
          python-version: ${{ matrix.python-version }}
      - name: Install the current repository
        run: |
          pip install -e .
      - name: Set ruff --output-format=github
        run: |
          sed -i 's/--output-format=full/--output-format=github/' .pre-commit-config.yaml
          git add .pre-commit-config.yaml
      - uses: pre-commit/[email protected]
        with:
          extra_args: "" # Overriding default "--all-files"
  e2e_ppo_trainer_vllm:
    runs-on: [L20x8]
    timeout-minutes: 60 # Increase this timeout value as needed
    env:
      HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
      HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
      NO_PROXY: "localhost,127.0.0.1,hf-mirror.com"
      HF_ENDPOINT: "https://hf-mirror.com"
      HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
    container:
      image: verlai/verl:app-verl0.5-transformers4.55.4-vllm0.10.0-mcore0.13.0-te2.2
      options: --gpus all --shm-size=10g
    steps:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          fetch-depth: 0
      - name: Install the current repository
        run: |
          pip3 install --no-deps -e .[test,vllm]
      - name: Prepare GSM8K dataset
        run: |
          ray stop --force
          python3 examples/data_preprocess/gsm8k.py
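      # The preprocessing script is assumed to write train/test parquet files under
      # $HOME/data/gsm8k (mirroring the $HOME/data/geo3k paths used by the VLM jobs below);
      # the function-reward steps that follow do not pass TRAIN_FILES/VAL_FILES, so they
      # presumably pick those defaults up inside run_function_reward.sh.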
      # HF sanity
      # - name: Running GSM8K E2E training tests on 1 L20 GPU with hf for sanity
      #   run: |
      #     ray stop --force
      #     bash tests/special_e2e/ppo_trainer/run_single_gpu.sh
      # # Engine interface sanity
      # - name: Running GSM8K E2E training tests on 1 L20 GPU with engine interface for sanity.
      #   run: |
      #     ray stop --force
      #     bash tests/special_e2e/ppo_trainer/run_single_gpu_with_engine.sh
      # Function RM
      - name: Running GSM8K E2E training tests on 8 L20 GPUs with rmpad using function rm with validation and saving (FSDP_SIZE=8)
        run: |
          ray stop --force
          VAL_BEFORE_TRAIN=True TEST_FREQ=1 SAVE_FREQ=1 SAVE_HF_MODEL=True VERL_EXP_NAME="qwen2.5-0.5b-function-reward-minimal-fsdp-size8" bash tests/special_e2e/ppo_trainer/run_function_reward.sh
      - name: Running GSM8K E2E training tests on 8 L20 GPUs with rmpad using function rm after resuming
        run: |
          ray stop --force
          RESUME_MODE=auto VERL_EXP_NAME="qwen2.5-0.5b-function-reward-minimal-fsdp-size8" bash tests/special_e2e/ppo_trainer/run_function_reward.sh
      - name: Test merging FSDP checkpoints (Qwen Actor)
        run: |
          exp_name="qwen2.5-0.5b-function-reward-minimal-fsdp-size8"
          python -m verl.model_merger test --backend fsdp --local_dir checkpoints/verl-test/${exp_name}/global_step_1/actor --test_hf_dir checkpoints/verl-test/${exp_name}/global_step_1/actor/huggingface
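      # The model_merger "test" subcommand appears to cross-check the sharded FSDP checkpoint
      # in --local_dir against the HF-format model that SAVE_HF_MODEL=True produced in
      # --test_hf_dir; the "merge" subcommand used in the LoRA step further below writes out
      # a merged HF model instead.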
      - name: Running GSM8K E2E training tests on 8 L20 GPUs with rmpad using function rm with validation and saving (DDP_SIZE=2, FSDP_SIZE=4)
        run: |
          ray stop --force
          VAL_BEFORE_TRAIN=True TEST_FREQ=1 SAVE_FREQ=1 SAVE_HF_MODEL=True FSDP_SIZE=4 VERL_EXP_NAME="qwen2.5-0.5b-function-reward-minimal-ddp-size2-fsdp-size4" bash tests/special_e2e/ppo_trainer/run_function_reward.sh
      - name: Test merging DDP+FSDP checkpoints (Qwen Actor)
        run: |
          exp_name="qwen2.5-0.5b-function-reward-minimal-ddp-size2-fsdp-size4"
          python -m verl.model_merger test --backend fsdp --local_dir checkpoints/verl-test/${exp_name}/global_step_1/actor --test_hf_dir checkpoints/verl-test/${exp_name}/global_step_1/actor/huggingface
      - name: Running GSM8K E2E training tests on 8 L20 GPUs with rmpad using function rm with validation and saving (FSDP2)
        run: |
          ray stop --force
          VAL_BEFORE_TRAIN=True TEST_FREQ=1 SAVE_FREQ=1 SAVE_HF_MODEL=True VERL_EXP_NAME="qwen2.5-0.5b-function-reward-minimal-fsdp2-size8" STRATEGY=fsdp2 bash tests/special_e2e/ppo_trainer/run_function_reward.sh
      - name: Test merging FSDP2 checkpoints (Qwen Actor)
        run: |
          exp_name="qwen2.5-0.5b-function-reward-minimal-fsdp2-size8"
          python -m verl.model_merger test --backend fsdp --local_dir checkpoints/verl-test/${exp_name}/global_step_1/actor --test_hf_dir checkpoints/verl-test/${exp_name}/global_step_1/actor/huggingface
      - name: Running GSM8K E2E without rmpad using function rm
        run: |
          ray stop --force
          RM_PAD=False bash tests/special_e2e/ppo_trainer/run_function_reward.sh
      - name: Running GSM8K E2E training tests on 8 L20 GPUs with rmpad using function rm (GRPO)
        run: |
          ray stop --force
          ADV_ESTIMATOR=grpo USE_KL=True bash tests/special_e2e/ppo_trainer/run_function_reward.sh
      - name: Running GSM8K E2E training tests on 8 L20 GPUs with rmpad using function rm (ReMax)
        run: |
          ray stop --force
          ADV_ESTIMATOR=remax USE_KL=True bash tests/special_e2e/ppo_trainer/run_function_reward.sh
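      # ADV_ESTIMATOR switches the advantage estimator exercised by run_function_reward.sh:
      # the earlier steps rely on the script's default (GAE, per ppo_trainer.yaml), while the
      # two steps above select grpo and remax, with USE_KL=True additionally enabling the KL terms.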
      - name: Running GSM8K E2E training tests on 8 L20 GPUs with rmpad using customized reward function
        run: |
          ray stop --force
          CUSTOM_REWARD_FN=True bash tests/special_e2e/ppo_trainer/run_function_reward.sh
      - name: Running GSM8K E2E training tests on 8 L20 GPUs with rmpad using function rm with in-reward kl and kl loss
        run: |
          ray stop --force
          USE_KL=True bash tests/special_e2e/ppo_trainer/run_function_reward.sh
      # LoRA tests
      - name: Running GSM8K E2E training tests on 8 L20 GPUs with grpo lora using function rm with use_shm
        run: |
          ray stop --force
          ADV_ESTIMATOR=grpo USE_SHM=True LORA_RANK=32 LOAD_FORMAT=safetensors bash tests/special_e2e/ppo_trainer/run_function_reward.sh
      - name: Running GSM8K E2E training tests on 8 L20 GPUs with grpo lora using function rm with use_shm and layered_summon
        run: |
          ray stop --force
          ADV_ESTIMATOR=grpo USE_SHM=True LORA_RANK=32 LOAD_FORMAT=safetensors LAYERED_SUMMON=True TOTAL_TRAIN_STEPS=1 SAVE_FREQ=1 FSDP_SIZE=4 VERL_EXP_NAME="qwen2.5-0.5b-function-reward-minimal" bash tests/special_e2e/ppo_trainer/run_function_reward.sh
      - name: Test GRPO LoRA checkpoints merging function
        run: |
          export EXP_NAME="qwen2.5-0.5b-function-reward-minimal"
          ls checkpoints/verl-test/${EXP_NAME}/global_step_1/actor
          cat checkpoints/verl-test/${EXP_NAME}/global_step_1/actor/huggingface/config.json
          python3 -m verl.model_merger merge --backend fsdp --local_dir checkpoints/verl-test/${EXP_NAME}/global_step_1/actor/ --target_dir checkpoints/verl-test/${EXP_NAME}/global_step_1/actor/huggingface
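      # If the merge succeeds, the target directory should be loadable as a plain HF checkpoint.
      # A minimal sanity check (a sketch, not part of this workflow) would be:
      #   python3 -c "from transformers import AutoModelForCausalLM; AutoModelForCausalLM.from_pretrained('checkpoints/verl-test/qwen2.5-0.5b-function-reward-minimal/global_step_1/actor/huggingface')"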
      - name: Running GSM8K E2E training tests on 8 L20 GPUs with grpo lora using function rm with use_shm and layered_summon with fsdp2
        run: |
          ray stop --force
          ADV_ESTIMATOR=grpo USE_SHM=True LORA_RANK=32 LOAD_FORMAT=safetensors LAYERED_SUMMON=True STRATEGY=fsdp2 bash tests/special_e2e/ppo_trainer/run_function_reward.sh
      - name: Running GRPO GSM8K E2E training tests with FSDP on 8 L20 GPUs (DeepSeek)
        run: |
          ray stop --force
          MODEL_ID=deepseek-ai/deepseek-coder-1.3b-instruct bash tests/special_e2e/ppo_trainer/run_function_reward.sh
      # Model RM
      - name: Running GSM8K E2E with rmpad using model rm
        run: |
          ray stop --force
          bash tests/special_e2e/ppo_trainer/run_model_reward.sh
      - name: Running GSM8K E2E without rmpad using model rm
        run: |
          ray stop --force
          RM_PAD=False bash tests/special_e2e/ppo_trainer/run_model_reward.sh
      - name: Running GSM8K E2E with rmpad using model rm and ulysses sp=2
        run: |
          ray stop --force
          SP_SIZE=2 bash tests/special_e2e/ppo_trainer/run_model_reward.sh
      - name: Running GSM8K E2E with rmpad using model rm and dynamic batch size
        run: |
          ray stop --force
          SEQ_BALANCE=True bash tests/special_e2e/ppo_trainer/run_model_reward.sh
      - name: Running GSM8K E2E with rmpad using model rm with Liger Kernel enabled
        run: |
          ray stop --force
          LIGER=True bash tests/special_e2e/ppo_trainer/run_model_reward.sh
      - name: Running GSM8K E2E with rmpad using model rm with Fused Kernel enabled
        run: |
          ray stop --force
          FUSED_KERNELS=True bash tests/special_e2e/ppo_trainer/run_model_reward.sh
      - name: Running GSM8K E2E with rmpad using model rm with Fused Kernel enabled (Triton backend)
        run: |
          ray stop --force
          FUSED_KERNELS=True FUSED_KERNEL_BACKEND=triton bash tests/special_e2e/ppo_trainer/run_model_reward.sh
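      # FUSED_KERNEL_BACKEND selects the fused-kernel implementation: the step above opts into
      # the Triton backend, while the preceding step leaves the default backend (torch, judging
      # by the "torch fused kernel" / "triton fused kernel" step names in the sglang VLM job below).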
      - name: Running GSM8K E2E training tests on vllm async
        run: |
          ray stop --force
          export VLLM_USE_V1=1
          ray start --head
          TOTAL_TRAIN_STEPS=2 ENGINE=vllm ROLLOUT_MODE=async bash tests/special_e2e/ppo_trainer/run_function_reward.sh
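      # VLLM_USE_V1=1 opts into vLLM's V1 engine, and "ray start --head" brings up a local Ray
      # cluster beforehand, which the async rollout path (ROLLOUT_MODE=async) evidently attaches
      # to rather than letting the trainer start its own.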
  e2e_ppo_trainer_vllm_vlm:
    runs-on: [L20x8]
    needs: pre_commit_for_ppo
    timeout-minutes: 40 # Increase this timeout value as needed
    env:
      HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
      HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
      NO_PROXY: "localhost,127.0.0.1,hf-mirror.com"
      HF_ENDPOINT: "https://hf-mirror.com"
      HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
    container:
      image: verlai/verl:app-verl0.5-transformers4.55.4-vllm0.10.0-mcore0.13.0-te2.2
      options: --gpus all --shm-size=50g # Visual dataloader requires large memory
    steps:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          fetch-depth: 0
      - name: Install the current repository
        run: |
          pip3 install --no-deps -e .[test,gpu,vllm,geo,trl]
          pip install "transformers[hf_xet]==4.54.0"
      # Geo3k
      - name: Prepare GEO3K dataset
        run: |
          python3 examples/data_preprocess/geo3k.py
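      # geo3k.py is expected to produce $HOME/data/geo3k/train.parquet and test.parquet,
      # which the TRAIN_FILES/VAL_FILES overrides in the steps below point at.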
      - name: Running GEO3K VLM GRPO E2E training tests on 8 L20 GPUs with rmpad using function rm
        run: |
          ray stop --force
          TRAIN_FILES=$HOME/data/geo3k/train.parquet VAL_FILES=$HOME/data/geo3k/test.parquet \
          MAX_PROMPT_LEN=1536 MAX_RESPONSE_LEN=1536 \
          MODEL_ID=Qwen/Qwen2-VL-2B-Instruct \
          ADV_ESTIMATOR=grpo RM_PAD=True USE_KL=True ENABLE_CHUNKED_PREFILL=False \
          SP_SIZE=2 \
          bash tests/special_e2e/ppo_trainer/run_function_reward.sh
      - name: Running GEO3K VLM PPO E2E training tests on 8 L20 GPUs with rmpad using function rm
        run: |
          ray stop --force
          TRAIN_FILES=$HOME/data/geo3k/train.parquet VAL_FILES=$HOME/data/geo3k/test.parquet \
          MAX_PROMPT_LEN=1536 MAX_RESPONSE_LEN=1536 \
          MODEL_ID=Qwen/Qwen2-VL-2B-Instruct \
          ADV_ESTIMATOR=gae RM_PAD=True USE_KL=True ENABLE_CHUNKED_PREFILL=False \
          SP_SIZE=2 \
          bash tests/special_e2e/ppo_trainer/run_function_reward.sh
      - name: Running GEO3K VLM GRPO E2E lora training tests on 8 L20 GPUs with rmpad using function rm
        run: |
          ray stop --force
          TRAIN_FILES=$HOME/data/geo3k/train.parquet VAL_FILES=$HOME/data/geo3k/test.parquet \
          MAX_PROMPT_LEN=1536 MAX_RESPONSE_LEN=1536 \
          MODEL_ID=Qwen/Qwen2-VL-2B-Instruct \
          ADV_ESTIMATOR=grpo RM_PAD=True USE_KL=True ENABLE_CHUNKED_PREFILL=False \
          SP_SIZE=2 \
          LORA_RANK=32 LORA_EXCLUDE=".*visual.*" \
          bash tests/special_e2e/ppo_trainer/run_function_reward.sh
  e2e_ppo_trainer_sglang:
    runs-on: [L20x8]
    needs: pre_commit_for_ppo
    timeout-minutes: 40 # Increase this timeout value as needed
    env:
      HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
      HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
      NO_PROXY: "localhost,127.0.0.1,hf-mirror.com"
      HF_ENDPOINT: "https://hf-mirror.com"
      HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
    container:
      image: verlai/verl:app-verl0.5-transformers4.55.4-sglang0.4.10.post2-mcore0.13.0-te2.2
      options: --gpus all --shm-size=10g
    steps:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          fetch-depth: 0
      - name: Install the current repository
        run: |
          pip3 install -e .[test,gpu,sglang]
      - name: Prepare GSM8K dataset
        run: |
          ray stop --force
          python3 examples/data_preprocess/gsm8k.py
      - name: Running GSM8K E2E training tests on 8 L20 GPUs with rmpad using function rm and save ckpt
        run: |
          ray stop --force
          ENGINE=sglang bash tests/special_e2e/ppo_trainer/run_function_reward.sh
      - name: Running GSM8K E2E training tests on sglang async
        run: |
          ray stop --force
          TOTAL_TRAIN_STEPS=2 ENGINE=sglang ROLLOUT_MODE=async bash tests/special_e2e/ppo_trainer/run_function_reward.sh
  e2e_ppo_trainer_sglang_vlm:
    runs-on: [L20x8]
    needs: pre_commit_for_ppo
    timeout-minutes: 60 # Increase this timeout value as needed
    env:
      HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
      HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
      NO_PROXY: "localhost,127.0.0.1,hf-mirror.com"
      HF_ENDPOINT: "https://hf-mirror.com"
      HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
    container:
      image: verlai/verl:app-verl0.5-transformers4.55.4-sglang0.4.10.post2-mcore0.13.0-te2.2
      options: --gpus all --shm-size=50g # Visual dataloader requires large memory
    steps:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          fetch-depth: 0
      - name: Install the current repository
        run: |
          pip3 install -e .[test,geo,gpu,sglang] --no-deps
          pip install "transformers[hf_xet]==4.54.0"
      # Geo3k
      - name: Prepare GEO3K dataset
        run: |
          ray stop --force
          python3 examples/data_preprocess/geo3k.py
      - name: Running GEO3K VLM E2E training tests on 8 L20 GPUs with rmpad using function rm
        run: |
          ray stop --force
          TRAIN_FILES=$HOME/data/geo3k/train.parquet VAL_FILES=$HOME/data/geo3k/test.parquet \
          MAX_PROMPT_LEN=1536 MAX_RESPONSE_LEN=1536 \
          MODEL_ID=Qwen/Qwen2-VL-2B-Instruct \
          ADV_ESTIMATOR=grpo RM_PAD=True USE_KL=True ENABLE_CHUNKED_PREFILL=False \
          ENGINE=sglang GPU_MEMORY_UTILIZATION=0.6 ACTOR_FSDP_PARAM_OFFLOAD=True \
          ACTOR_FSDP_OPTIMIZER_OFFLOAD=True REF_FSDP_PARAM_OFFLOAD=True \
          bash tests/special_e2e/ppo_trainer/run_function_reward.sh
      - name: Running GEO3K VLM E2E with rmpad using torch fused kernel (Qwen2.5-VL)
        run: |
          ray stop --force
          FUSED_KERNELS=True TRAIN_FILES=$HOME/data/geo3k/train.parquet VAL_FILES=$HOME/data/geo3k/test.parquet \
          MAX_PROMPT_LEN=1536 MAX_RESPONSE_LEN=1536 \
          MODEL_ID=Qwen/Qwen2.5-VL-3B-Instruct \
          ADV_ESTIMATOR=grpo RM_PAD=True USE_KL=True ENABLE_CHUNKED_PREFILL=False \
          ENGINE=sglang GPU_MEMORY_UTILIZATION=0.6 ACTOR_FSDP_PARAM_OFFLOAD=True \
          ACTOR_FSDP_OPTIMIZER_OFFLOAD=True REF_FSDP_PARAM_OFFLOAD=True \
          bash tests/special_e2e/ppo_trainer/run_function_reward.sh
      - name: Running GEO3K VLM E2E with rmpad using triton fused kernel (Qwen2.5-VL)
        run: |
          ray stop --force
          FUSED_KERNELS=True FUSED_KERNEL_BACKEND=triton \
          TRAIN_FILES=$HOME/data/geo3k/train.parquet VAL_FILES=$HOME/data/geo3k/test.parquet \
          MAX_PROMPT_LEN=1536 MAX_RESPONSE_LEN=1536 \
          MODEL_ID=Qwen/Qwen2.5-VL-3B-Instruct \
          ADV_ESTIMATOR=grpo RM_PAD=True USE_KL=True ENABLE_CHUNKED_PREFILL=False \
          ENGINE=sglang GPU_MEMORY_UTILIZATION=0.6 ACTOR_FSDP_PARAM_OFFLOAD=True \
          ACTOR_FSDP_OPTIMIZER_OFFLOAD=True REF_FSDP_PARAM_OFFLOAD=True \
          bash tests/special_e2e/ppo_trainer/run_function_reward.sh