PR Test (AMD) #14
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Display name of the workflow.
name: PR Test (AMD)

# Dynamic run-name for /rerun-stage commands to enable URL lookup.
# Resulting title:
#   "[stage-name] sha"  when a stage and pr_head_sha are both given (fork PRs)
#   "[stage-name]"      when only a stage is given (non-fork)
#   ""                  otherwise, so GitHub falls back to its default title
run-name: >-
  ${{ (inputs.target_stage || inputs.target_stage_select) &&
  (inputs.pr_head_sha &&
  format('[{0}] {1}', inputs.target_stage || inputs.target_stage_select, inputs.pr_head_sha) ||
  format('[{0}]', inputs.target_stage || inputs.target_stage_select)) ||
  '' }}
# Events that start this workflow: a cron schedule, pull requests against
# main, manual dispatch, and invocation from another workflow.
on:
  schedule:
    - cron: '0 */6 * * *'  # Run every 6 hours (UTC)

  # Only PRs targeting main that touch one of the listed paths.
  pull_request:
    branches:
      - main
    paths:
      - 'python/**'
      - 'scripts/ci/**'
      - 'test/**'
      - 'sgl-kernel/**'
      - '.github/workflows/pr-test-amd.yml'
      - 'docker/rocm.Dockerfile'

  # Manual runs. A stage can be chosen from the dropdown or typed as a
  # comma-separated list; the job conditions read
  # `inputs.target_stage || inputs.target_stage_select`, so the typed value
  # wins when it is non-empty.
  workflow_dispatch:
    inputs:
      target_stage_select:
        description: 'Select a stage to run from dropdown (leave empty for auto-detect)'
        type: choice
        required: false
        default: ''
        options:
          - ''
          - sgl-kernel-unit-test-amd
          - sgl-kernel-unit-test-2-gpu-amd
          - stage-a-test-1-gpu-small-amd
          - jit-kernel-unit-test-amd
          - stage-b-test-1-gpu-small-amd
          - stage-b-test-1-gpu-small-amd-nondeterministic
          - stage-b-test-1-gpu-small-amd-mi35x
          - stage-b-test-1-gpu-large-amd
          - stage-b-test-2-gpu-large-amd
          - multimodal-gen-test-1-gpu-amd
          - multimodal-gen-test-2-gpu-amd
          - stage-c-test-4-gpu-amd
          - stage-c-test-large-8-gpu-amd
          - stage-c-test-large-8-gpu-amd-mi35x
          - stage-b-test-large-8-gpu-disaggregation-amd
      target_stage:
        description: 'Or type comma-separated stage names (overrides dropdown if non-empty)'
        type: string
        required: false
        default: ''
      pr_head_sha:
        description: 'PR head SHA to checkout (for /rerun-stage on fork PRs)'
        type: string
        required: false
        default: ''
      aiter_ref:
        description: 'Override AITER commit (optional, leave empty to use Dockerfile default)'
        type: string
        required: false
        default: ''
      continue_on_error:
        description: 'Continue on error (do not fail the workflow on test failures)'
        type: boolean
        required: false
        default: false

  # Reusable-workflow entry point for other workflows.
  workflow_call:
    inputs:
      ref:
        description: 'Git ref (branch, tag, or SHA) to test. If not provided, uses the default branch.'
        type: string
        required: false
        default: ''
      run_all_tests:
        description: 'Run all tests (for releasing or testing purpose)'
        type: boolean
        required: false
        default: false
      aiter_ref:
        description: 'Override AITER commit (optional, leave empty to use Dockerfile default)'
        type: string
        required: false
        default: ''
      continue_on_error:
        description: 'Continue on error (do not fail the workflow on test failures)'
        type: boolean
        required: false
        default: false
# Workflow-wide environment: exposes the optional `aiter_ref` input to every
# job as AITER_COMMIT_OVERRIDE.
env:
  AITER_COMMIT_OVERRIDE: ${{ inputs.aiter_ref }}

# Concurrency policy.
# - Scheduled and run_all_tests runs get a unique group keyed on the run id,
#   so they never cancel each other.
# - All other runs share a group keyed on pr_head_sha, then ref, then
#   github.ref, so a new push cancels the stale run for the same branch.
# NOTE(review): GitHub documents that inside a called (workflow_call) workflow
# the `github` context belongs to the caller, so
# `github.event_name != 'workflow_call'` may always be true - confirm whether
# that clause has any effect.
concurrency:
  group: >-
    pr-test-amd-${{ (inputs.run_all_tests || github.event_name == 'schedule') &&
    format('full-{0}', github.run_id) ||
    inputs.pr_head_sha || inputs.ref || github.ref }}
  cancel-in-progress: >-
    ${{ !inputs.run_all_tests &&
    github.event_name != 'workflow_call' &&
    github.event_name != 'schedule' }}
| jobs: | |
# Delegates to the reusable pr-gate workflow, passing the caller's secrets
# through. Skipped on scheduled runs.
call-gate:
  uses: ./.github/workflows/pr-gate.yml
  secrets: inherit
  if: github.event_name != 'schedule'
# Decides which test groups need to run. Publishes one 'true'/'false' style
# output per area (main_package, sgl_kernel, jit_kernel, multimodal_gen) plus
# the effective continue_on_error flag that the test jobs read.
check-changes:
  needs:
    - call-gate
  # Runs regardless of the gate job's result; the downstream jobs apply their
  # own conditions.
  if: always()
  runs-on: ubuntu-latest
  outputs:
    # Each area uses the path-filter result when the filter step ran, and
    # falls back to the run_all_tests flag when the filter step was skipped.
    main_package: ${{ steps.filter.outputs.main_package || steps.run-mode.outputs.run_all_tests }}
    sgl_kernel: ${{ steps.filter.outputs.sgl_kernel || steps.run-mode.outputs.run_all_tests }}
    jit_kernel: ${{ steps.filter.outputs.jit_kernel || steps.run-mode.outputs.run_all_tests }}
    multimodal_gen: ${{ steps.filter.outputs.multimodal_gen || steps.run-mode.outputs.run_all_tests }}
    continue_on_error: ${{ steps.set-continue-on-error.outputs.continue_on_error }}
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

    # run_all_tests=true when the caller asked for it or the run is scheduled.
    - name: Determine run mode
      id: run-mode
      run: |
        if [[ "${{ inputs.run_all_tests }}" == "true" || "${{ github.event_name }}" == "schedule" ]]; then
          echo "run_all_tests=true" >> $GITHUB_OUTPUT
          echo "Run mode: ALL TESTS (run_all_tests=${{ inputs.run_all_tests }}, event=${{ github.event_name }})"
        else
          echo "run_all_tests=false" >> $GITHUB_OUTPUT
          echo "Run mode: FILTERED (triggered by ${{ github.event_name }})"
        fi

    # continue_on_error is forced on for full runs and otherwise follows the
    # continue_on_error input.
    - name: Set continue-on-error for schedule/full runs
      id: set-continue-on-error
      run: |
        if [[ "${{ steps.run-mode.outputs.run_all_tests }}" == "true" || "${{ inputs.continue_on_error }}" == "true" ]]; then
          echo "continue_on_error=true" >> $GITHUB_OUTPUT
          echo "Continue-on-error: ENABLED (run_all_tests=${{ steps.run-mode.outputs.run_all_tests }}, input=${{ inputs.continue_on_error }})"
        else
          echo "continue_on_error=false" >> $GITHUB_OUTPUT
          echo "Continue-on-error: DISABLED"
        fi

    # Path-based detection is only needed when not running everything.
    - name: Detect file changes
      id: filter
      if: steps.run-mode.outputs.run_all_tests != 'true'
      uses: dorny/paths-filter@v3
      with:
        filters: |
          main_package:
            - "python/sglang/!(multimodal_gen)/**/!(*.md)"
            - "python/pyproject_rocm.toml"
            - "python/pyproject_other.toml"
            - "scripts/ci/amd/*"
            - "scripts/ci/utils/*"
            - "test/**/!(*.md)"
            - ".github/workflows/pr-test-amd.yml"
          sgl_kernel:
            - "sgl-kernel/**/*.!(md|txt)"
            - ".github/workflows/pr-test-amd.yml"
          jit_kernel:
            - "python/sglang/jit_kernel/**"
            - ".github/workflows/pr-test-amd.yml"
          multimodal_gen:
            - "python/sglang/multimodal_gen/**/*.!(md|ipynb)"
            - "python/sglang/cli/**"
            - "python/sglang/jit_kernel/diffusion/**"
            - "python/sglang/jit_kernel/tests/diffusion/**"
            - "python/sglang/jit_kernel/benchmark/diffusion/**"
            - "python/pyproject_rocm.toml"
            - "python/pyproject_other.toml"
| # =============================================== sgl-kernel ==================================================== | |
# sgl-kernel unit tests on a single-GPU MI325 runner.
sgl-kernel-unit-test-amd:
  needs:
    - check-changes
  # Runs when this stage is explicitly requested, or, when no stage is
  # requested, when check-changes reports sgl_kernel changes.
  if: |
    always() &&
    (
      (contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',sgl-kernel-unit-test-amd,')) ||
      (
        !(inputs.target_stage || inputs.target_stage_select) &&
        needs.check-changes.outputs.sgl_kernel == 'true'
      )
    )
  runs-on: ${{ matrix.runner }}
  strategy:
    fail-fast: false
    matrix:
      runner:
        - linux-mi325-1gpu-sglang
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

    - name: Ensure VRAM is clear
      run: bash scripts/ci/amd/ensure_vram_clear.sh rocm

    - name: Start CI container
      env:
        GITHUB_WORKSPACE: ${{ github.workspace }}
      run: bash scripts/ci/amd/amd_ci_start_container.sh

    - name: Install dependencies
      run: |
        bash scripts/ci/amd/amd_ci_install_dependency.sh

    - name: Run test
      timeout-minutes: 14
      run: |
        # Run one pytest file inside the ci_sglang container.
        #   $1: working directory inside the container
        #   $2: test file name
        run_pytest() {
          docker exec -w "$1" ci_sglang python3 -m pytest "$2"
        }
        kernel_tests=/sglang-checkout/sgl-kernel/tests

        run_pytest "${kernel_tests}" test_moe_align.py
        run_pytest "${kernel_tests}" test_moe_topk_softmax.py
        run_pytest "${kernel_tests}/speculative" test_eagle_utils.py
        run_pytest "${kernel_tests}" test_apply_token_bitmask_inplace.py
        run_pytest "${kernel_tests}" test_activation.py
        run_pytest "${kernel_tests}" test_topk.py
        run_pytest "${kernel_tests}" test_kvcacheio.py
        run_pytest "${kernel_tests}" test_moe_topk_sigmoid.py
        run_pytest "${kernel_tests}" test_torch_defaults_reset.py
# sgl-kernel all-reduce determinism tests on a 2-GPU MI325 runner.
sgl-kernel-unit-test-2-gpu-amd:
  needs:
    - check-changes
  # Runs when this stage is explicitly requested, or, when no stage is
  # requested, when check-changes reports sgl_kernel changes.
  if: |
    always() &&
    (
      (contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',sgl-kernel-unit-test-2-gpu-amd,')) ||
      (
        !(inputs.target_stage || inputs.target_stage_select) &&
        needs.check-changes.outputs.sgl_kernel == 'true'
      )
    )
  runs-on: ${{ matrix.runner }}
  strategy:
    fail-fast: false
    matrix:
      runner:
        - linux-mi325-2gpu-sglang
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

    - name: Ensure VRAM is clear
      run: bash scripts/ci/amd/ensure_vram_clear.sh rocm

    - name: Start CI container
      env:
        GITHUB_WORKSPACE: ${{ github.workspace }}
      run: bash scripts/ci/amd/amd_ci_start_container.sh

    - name: Install dependencies
      run: |
        bash scripts/ci/amd/amd_ci_install_dependency.sh

    - name: Run test
      timeout-minutes: 20
      run: |
        # Run one pytest file from the sgl-kernel tests directory inside the
        # ci_sglang container.
        run_pytest() {
          docker exec -w /sglang-checkout/sgl-kernel/tests ci_sglang python3 -m pytest "$1"
        }

        run_pytest test_amd_deterministic_custom_allreduce.py
        run_pytest test_amd_nccl_allreduce_determinism.py
| # =============================================== primary ==================================================== | |
# Stage A: the stage-a-test-1-gpu-small-amd suite on a single-GPU MI325 runner.
stage-a-test-1-gpu-small-amd:
  needs:
    - check-changes
  # Runs when this stage is explicitly requested. Without an explicit stage it
  # runs only if nothing upstream failed or was cancelled and check-changes
  # reports main_package or sgl_kernel changes.
  if: |
    always() &&
    (
      (contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',stage-a-test-1-gpu-small-amd,')) ||
      (
        !(inputs.target_stage || inputs.target_stage_select) &&
        (!failure() && !cancelled()) &&
        ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
      )
    )
  runs-on: ${{ matrix.runner }}
  strategy:
    fail-fast: false
    matrix:
      runner:
        - linux-mi325-1gpu-sglang
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

    - name: Ensure VRAM is clear
      run: bash scripts/ci/amd/ensure_vram_clear.sh rocm

    - name: Start CI container
      env:
        GITHUB_WORKSPACE: ${{ github.workspace }}
      run: bash scripts/ci/amd/amd_ci_start_container.sh

    - name: Install dependencies
      run: |
        bash scripts/ci/amd/amd_ci_install_dependency.sh

    # --continue-on-error is appended only when check-changes enabled it.
    - name: Run test
      timeout-minutes: 10
      run: >-
        bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test"
        python3 run_suite.py --hw amd --suite stage-a-test-1-gpu-small-amd
        ${{ needs.check-changes.outputs.continue_on_error == 'true' && '--continue-on-error' || '' }}
# JIT kernel unit tests on a single-GPU MI325 runner.
jit-kernel-unit-test-amd:
  needs:
    - check-changes
  # Runs when this stage is explicitly requested, or, when no stage is
  # requested, when check-changes reports jit_kernel changes.
  if: |
    always() &&
    (
      (contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',jit-kernel-unit-test-amd,')) ||
      (
        !(inputs.target_stage || inputs.target_stage_select) &&
        needs.check-changes.outputs.jit_kernel == 'true'
      )
    )
  runs-on: ${{ matrix.runner }}
  strategy:
    fail-fast: false
    matrix:
      runner:
        - linux-mi325-1gpu-sglang
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

    - name: Ensure VRAM is clear
      run: bash scripts/ci/amd/ensure_vram_clear.sh rocm

    - name: Start CI container
      env:
        GITHUB_WORKSPACE: ${{ github.workspace }}
      run: bash scripts/ci/amd/amd_ci_start_container.sh

    - name: Install dependencies
      run: |
        bash scripts/ci/amd/amd_ci_install_dependency.sh

    - name: Run JIT kernel unit tests
      timeout-minutes: 10
      run: >-
        bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout"
        python3 -m pytest -q python/sglang/jit_kernel/tests/test_store_cache.py
| # =============================================== Wait Jobs for Sequential PR Execution ==================================================== | |
| # These jobs poll GitHub API to wait for previous stages to complete. | |
| # For PR runs: wait jobs run and enforce sequential execution via polling. | |
| # For scheduled runs: wait jobs are skipped, enabling parallel execution of all stages. | |
# Barrier job for pull requests: uses the repository's wait-for-jobs action
# to wait on the stage-a job before the stage-b jobs that depend on it.
wait-for-stage-a-amd:
  needs:
    - check-changes
    - call-gate
  # Only for pull_request events without an explicit target stage, when
  # main_package or sgl_kernel changed and the gate succeeded or was skipped.
  if: |
    always() &&
    !cancelled() &&
    github.event_name == 'pull_request' &&
    !(inputs.target_stage || inputs.target_stage_select) &&
    (needs.check-changes.outputs.main_package == 'true' || needs.check-changes.outputs.sgl_kernel == 'true') &&
    (needs.call-gate.result == 'success' || needs.call-gate.result == 'skipped')
  runs-on: ubuntu-latest
  outputs:
    stage_a_result: ${{ steps.wait.outputs.result }}
  steps:
    # The wait action is a local action, so the repository must be checked out.
    - uses: actions/checkout@v4

    - id: wait
      uses: ./.github/actions/wait-for-jobs
      with:
        stage-name: stage-a-amd
        max-wait-minutes: '240'
        jobs: '[{"prefix": "stage-a-test-1-gpu-small-amd", "expected_count": 1}]'
# Stage B: the stage-b-test-1-gpu-small-amd suite, split into 14 partitions
# on single-GPU MI325 runners.
stage-b-test-1-gpu-small-amd:
  needs:
    - check-changes
    - wait-for-stage-a-amd
  # Runs when this stage is explicitly requested. Without an explicit stage it
  # runs on schedule, or when nothing upstream failed or was cancelled, and
  # only if check-changes reports main_package or sgl_kernel changes.
  if: |
    always() &&
    (
      (contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',stage-b-test-1-gpu-small-amd,')) ||
      (
        !(inputs.target_stage || inputs.target_stage_select) &&
        ((github.event_name == 'schedule') || (!failure() && !cancelled())) &&
        ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
      )
    )
  runs-on: ${{ matrix.runner }}
  strategy:
    fail-fast: false
    matrix:
      runner:
        - linux-mi325-1gpu-sglang
      # One matrix entry per partition; the count must match
      # --auto-partition-size in the run step.
      part: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

    - name: Ensure VRAM is clear
      run: bash scripts/ci/amd/ensure_vram_clear.sh rocm

    - name: Start CI container
      env:
        GITHUB_WORKSPACE: ${{ github.workspace }}
      run: bash scripts/ci/amd/amd_ci_start_container.sh

    - name: Install dependencies
      run: bash scripts/ci/amd/amd_ci_install_dependency.sh

    # --continue-on-error is appended only when check-changes enabled it.
    - name: Run test
      timeout-minutes: 30
      run: >-
        bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test"
        python3 run_suite.py --hw amd --suite stage-b-test-1-gpu-small-amd
        --auto-partition-id ${{ matrix.part }} --auto-partition-size 14
        --timeout-per-file 1800
        ${{ needs.check-changes.outputs.continue_on_error == 'true' && '--continue-on-error' || '' }}
# Stage B: the stage-b-test-1-gpu-small-amd-nondeterministic suite on a
# single-GPU MI325 runner (not partitioned).
stage-b-test-1-gpu-small-amd-nondeterministic:
  needs:
    - check-changes
    - wait-for-stage-a-amd
  # Runs when this stage is explicitly requested. Without an explicit stage it
  # runs on schedule, or when nothing upstream failed or was cancelled, and
  # only if check-changes reports main_package or sgl_kernel changes.
  if: |
    always() &&
    (
      (contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',stage-b-test-1-gpu-small-amd-nondeterministic,')) ||
      (
        !(inputs.target_stage || inputs.target_stage_select) &&
        ((github.event_name == 'schedule') || (!failure() && !cancelled())) &&
        ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
      )
    )
  runs-on: ${{ matrix.runner }}
  strategy:
    fail-fast: false
    matrix:
      runner:
        - linux-mi325-1gpu-sglang
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

    - name: Ensure VRAM is clear
      run: bash scripts/ci/amd/ensure_vram_clear.sh rocm

    - name: Start CI container
      env:
        GITHUB_WORKSPACE: ${{ github.workspace }}
      run: bash scripts/ci/amd/amd_ci_start_container.sh

    - name: Install dependencies
      run: bash scripts/ci/amd/amd_ci_install_dependency.sh

    # --continue-on-error is appended only when check-changes enabled it.
    - name: Run test
      timeout-minutes: 30
      run: >-
        bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test"
        python3 run_suite.py --hw amd --suite stage-b-test-1-gpu-small-amd-nondeterministic
        --timeout-per-file 1800
        ${{ needs.check-changes.outputs.continue_on_error == 'true' && '--continue-on-error' || '' }}
# Stage B: the stage-b-test-1-gpu-small-amd-mi35x suite on a single-GPU MI35x
# runner.
stage-b-test-1-gpu-small-amd-mi35x:
  needs:
    - check-changes
    - wait-for-stage-a-amd
  # Runs when this stage is explicitly requested. Without an explicit stage it
  # runs on schedule, or when nothing upstream failed or was cancelled, and
  # only if check-changes reports main_package or sgl_kernel changes.
  if: |
    always() &&
    (
      (contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',stage-b-test-1-gpu-small-amd-mi35x,')) ||
      (
        !(inputs.target_stage || inputs.target_stage_select) &&
        ((github.event_name == 'schedule') || (!failure() && !cancelled())) &&
        ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
      )
    )
  runs-on: ${{ matrix.runner }}
  strategy:
    fail-fast: false
    matrix:
      runner:
        - linux-mi35x-gpu-1
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

    - name: Ensure VRAM is clear
      run: bash scripts/ci/amd/ensure_vram_clear.sh rocm

    - name: Start CI container
      env:
        GITHUB_WORKSPACE: ${{ github.workspace }}
      run: bash scripts/ci/amd/amd_ci_start_container.sh

    - name: Install dependencies
      run: bash scripts/ci/amd/amd_ci_install_dependency.sh

    # --continue-on-error is appended only when check-changes enabled it.
    - name: Run test
      timeout-minutes: 30
      run: >-
        bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test"
        python3 run_suite.py --hw amd --suite stage-b-test-1-gpu-small-amd-mi35x
        ${{ needs.check-changes.outputs.continue_on_error == 'true' && '--continue-on-error' || '' }}
# Stage B: the stage-b-test-1-gpu-large-amd suite, split into 3 partitions on
# single-GPU MI325 runners.
stage-b-test-1-gpu-large-amd:
  needs:
    - check-changes
    - wait-for-stage-a-amd
  # Runs when this stage is explicitly requested. Without an explicit stage it
  # runs on schedule, or when nothing upstream failed or was cancelled, and
  # only if check-changes reports main_package or sgl_kernel changes.
  if: |
    always() &&
    (
      (contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',stage-b-test-1-gpu-large-amd,')) ||
      (
        !(inputs.target_stage || inputs.target_stage_select) &&
        ((github.event_name == 'schedule') || (!failure() && !cancelled())) &&
        ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
      )
    )
  runs-on: ${{ matrix.runner }}
  strategy:
    fail-fast: false
    matrix:
      runner:
        - linux-mi325-1gpu-sglang
      # One matrix entry per partition; the count must match
      # --auto-partition-size in the run step.
      part: [0, 1, 2]
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

    - name: Ensure VRAM is clear
      run: bash scripts/ci/amd/ensure_vram_clear.sh rocm

    - name: Start CI container
      env:
        GITHUB_WORKSPACE: ${{ github.workspace }}
      run: bash scripts/ci/amd/amd_ci_start_container.sh

    - name: Install dependencies
      run: bash scripts/ci/amd/amd_ci_install_dependency.sh

    # --continue-on-error is appended only when check-changes enabled it.
    - name: Run test
      timeout-minutes: 30
      run: >-
        bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test"
        python3 run_suite.py --hw amd --suite stage-b-test-1-gpu-large-amd
        --auto-partition-id ${{ matrix.part }} --auto-partition-size 3
        --timeout-per-file 1800
        ${{ needs.check-changes.outputs.continue_on_error == 'true' && '--continue-on-error' || '' }}
# Stage B: the stage-b-test-2-gpu-large-amd suite, split into 2 partitions on
# 2-GPU MI325 runners.
stage-b-test-2-gpu-large-amd:
  needs:
    - check-changes
    - wait-for-stage-a-amd
  # Runs when this stage is explicitly requested. Without an explicit stage it
  # runs on schedule, or when nothing upstream failed or was cancelled, and
  # only if check-changes reports main_package or sgl_kernel changes.
  if: |
    always() &&
    (
      (contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',stage-b-test-2-gpu-large-amd,')) ||
      (
        !(inputs.target_stage || inputs.target_stage_select) &&
        ((github.event_name == 'schedule') || (!failure() && !cancelled())) &&
        ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
      )
    )
  runs-on: ${{ matrix.runner }}
  strategy:
    fail-fast: false
    matrix:
      runner:
        - linux-mi325-2gpu-sglang
      # One matrix entry per partition; the count must match
      # --auto-partition-size in the run step.
      part: [0, 1]
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

    - name: Ensure VRAM is clear
      run: bash scripts/ci/amd/ensure_vram_clear.sh rocm

    - name: Start CI container
      env:
        GITHUB_WORKSPACE: ${{ github.workspace }}
      run: bash scripts/ci/amd/amd_ci_start_container.sh

    - name: Install dependencies
      run: bash scripts/ci/amd/amd_ci_install_dependency.sh

    # --continue-on-error is appended only when check-changes enabled it.
    - name: Run test
      timeout-minutes: 30
      run: >-
        bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test"
        python3 run_suite.py --hw amd --suite stage-b-test-2-gpu-large-amd
        --auto-partition-id ${{ matrix.part }} --auto-partition-size 2
        --timeout-per-file 1800
        ${{ needs.check-changes.outputs.continue_on_error == 'true' && '--continue-on-error' || '' }}
# Diffusion (multimodal_gen) server tests, 1-GPU suite, on single-GPU MI325
# runners. The suite is split into 4 partitions that run one at a time.
multimodal-gen-test-1-gpu-amd:
  needs: [check-changes]
  # Runs when this stage is explicitly requested, or, when no stage is
  # requested, when check-changes reports multimodal_gen changes.
  if: |
    always() &&
    (
      (contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',multimodal-gen-test-1-gpu-amd,')) ||
      (
        !(inputs.target_stage || inputs.target_stage_select) &&
        needs.check-changes.outputs.multimodal_gen == 'true'
      )
    )
  strategy:
    fail-fast: false
    max-parallel: 1  # Run one at a time to avoid eviction from resource exhaustion during AITER kernel JIT
    matrix:
      runner: [linux-mi325-1gpu-sglang]
      part: [0, 1, 2, 3]  # 4 partitions; must match --total-partitions in the run step
  runs-on: ${{ matrix.runner }}
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

    - name: Ensure VRAM is clear
      run: bash scripts/ci/amd/ensure_vram_clear.sh rocm

    # NOTE(review): the artifact pattern names a CUDA wheel and no producing
    # job is visible in this workflow chunk - confirm this step is still needed.
    - name: Download artifacts
      if: needs.check-changes.outputs.sgl_kernel == 'true'
      uses: actions/download-artifact@v4
      with:
        path: sgl-kernel/dist/
        merge-multiple: true
        pattern: wheel-python3.10-cuda12.9

    - name: Start CI container
      run: bash scripts/ci/amd/amd_ci_start_container.sh
      env:
        GITHUB_WORKSPACE: ${{ github.workspace }}

    - name: Install dependencies
      run: |
        bash scripts/ci/amd/amd_ci_install_dependency.sh diffusion

    - name: Setup kernel caches
      run: |
        # Use the persistent /sgl-data directory (mounted from /home/runner/sgl-data)
        # This directory persists across container restarts on the self-hosted runner
        docker exec ci_sglang mkdir -p /sgl-data/aiter-kernels /sgl-data/miopen-cache /sgl-data/hf-cache/hub

        # Clear pre-built AITER kernels from Docker image to avoid segfaults
        # The image may have stale/incompatible kernels at /sgl-workspace/aiter/aiter/jit/
        #
        # The glob must be expanded INSIDE the container: `docker exec` runs
        # the command without a shell, so an unquoted `*.so` would be expanded
        # (or left literal) by the host shell and rm would delete nothing.
        echo "Clearing pre-built AITER kernels from Docker image..."
        docker exec ci_sglang sh -c 'rm -rf /sgl-workspace/aiter/aiter/jit/*.so' 2>/dev/null || true
        docker exec ci_sglang sh -c 'rm -rf /sgl-data/aiter-kernels/*.so' 2>/dev/null || true
        echo "AITER kernels cleared - will be rebuilt on first use"

        # Create persistent cache marker if /sgl-data is a real mount (not ephemeral)
        # This tells the test cleanup code to NOT delete downloaded models
        if docker exec ci_sglang test -d /sgl-data && docker exec ci_sglang mountpoint -q /sgl-data 2>/dev/null; then
          docker exec ci_sglang touch /sgl-data/hf-cache/.persistent_cache
          echo "Created .persistent_cache marker - HF cache will persist"
        else
          echo "WARNING: /sgl-data is not a mount point - models will be cleaned up after each test"
        fi

        # Check MIOpen cache (VAE convolution kernels)
        miopen_files=$(docker exec ci_sglang find /sgl-data/miopen-cache -name "*.udb" 2>/dev/null | wc -l || echo "0")
        echo "Found ${miopen_files} MIOpen cache files"

    # Informational only: prints memory, disk, cache and VRAM state.
    - name: Diagnose HF cache and system resources
      run: |
        echo "=== System Memory Status ==="
        free -h
        echo ""
        echo "=== Disk Space ==="
        df -h /home/runner/sgl-data 2>/dev/null || df -h
        echo ""
        echo "=== HF Cache Directory Structure ==="
        docker exec ci_sglang ls -la /sgl-data/hf-cache/ 2>/dev/null || echo "HF cache dir not found"
        docker exec ci_sglang ls -la /sgl-data/hf-cache/hub/ 2>/dev/null || echo "HF hub cache not found"
        echo ""
        echo "=== Checking for cached diffusion models (1-GPU tests) ==="
        # Models used in 1-GPU tests: Wan2.1-T2V-1.3B, HunyuanVideo, Qwen-Image, FLUX.1, FLUX.2
        for model in "Wan-AI--Wan2.1-T2V-1.3B-Diffusers" "tencent--HunyuanVideo" "Qwen--Qwen-Image" "black-forest-labs--FLUX.1-dev" "black-forest-labs--FLUX.2-dev"; do
          cache_path="/sgl-data/hf-cache/hub/models--${model}"
          if docker exec ci_sglang test -d "$cache_path"; then
            size=$(docker exec ci_sglang du -sh "$cache_path" 2>/dev/null | cut -f1)
            echo "✓ CACHED: $model ($size)"
          else
            echo "✗ NOT CACHED: $model"
          fi
        done
        echo ""
        echo "=== GPU Memory Status ==="
        docker exec ci_sglang rocm-smi --showmeminfo vram 2>/dev/null || echo "rocm-smi not available"

    - name: Run diffusion server tests (1-GPU)
      timeout-minutes: 90
      run: |
        # AMD CI: All 1-GPU tests except FLUX.2 (FLUX.1 covers same code path)
        # Tests: T2V, T2I, I2V, LoRA
        #
        # HF download env vars:
        # - HF_HUB_ENABLE_HF_TRANSFER=1: Use faster hf_transfer for downloads (if available)
        # - HF_HUB_DISABLE_SYMLINKS_WARNING=1: Suppress symlink warnings
        docker exec \
          -e SGLANG_E2E_TOLERANCE=0.3 \
          -e SGLANG_STAGE_TIME_TOLERANCE=0.2 \
          -e SGLANG_NON_DENOISE_STAGE_TIME_TOLERANCE=0.6 \
          -e SGLANG_DENOISE_STEP_TOLERANCE=0.6 \
          -e SGLANG_DENOISE_AGG_TOLERANCE=0.3 \
          -e SGLANG_SKIP_CONSISTENCY=1 \
          -e SGLANG_TEST_NUM_INFERENCE_STEPS=5 \
          -e AITER_JIT_DIR=/sgl-data/aiter-kernels \
          -e MIOPEN_USER_DB_PATH=/sgl-data/miopen-cache \
          -e HF_HUB_ENABLE_HF_TRANSFER=1 \
          -e HF_HUB_DISABLE_SYMLINKS_WARNING=1 \
          -w /sglang-checkout/python \
          ci_sglang python3 sglang/multimodal_gen/test/run_suite.py \
          --suite 1-gpu \
          --partition-id ${{ matrix.part }} \
          --total-partitions 4 \
          -k "not flux_2"

        # Post-test diagnostics
        echo "=== Post-test System Memory Status ==="
        free -h
# Diffusion (multimodal_gen) server tests, 2-GPU suite, on 2-GPU MI325
# runners. The suite is split into 2 partitions that run one at a time.
multimodal-gen-test-2-gpu-amd:
  needs: [check-changes]
  # Runs when this stage is explicitly requested, or, when no stage is
  # requested, when check-changes reports multimodal_gen changes.
  if: |
    always() &&
    (
      (contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',multimodal-gen-test-2-gpu-amd,')) ||
      (
        !(inputs.target_stage || inputs.target_stage_select) &&
        needs.check-changes.outputs.multimodal_gen == 'true'
      )
    )
  strategy:
    fail-fast: false
    max-parallel: 1  # Run one at a time to avoid eviction from resource exhaustion during AITER kernel JIT
    matrix:
      runner: [linux-mi325-2gpu-sglang]
      part: [0, 1]  # 2 partitions; must match --total-partitions in the run step
  runs-on: ${{ matrix.runner }}
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

    - name: Ensure VRAM is clear
      run: bash scripts/ci/amd/ensure_vram_clear.sh rocm

    # NOTE(review): the artifact pattern names a CUDA wheel and no producing
    # job is visible in this workflow chunk - confirm this step is still needed.
    - name: Download artifacts
      if: needs.check-changes.outputs.sgl_kernel == 'true'
      uses: actions/download-artifact@v4
      with:
        path: sgl-kernel/dist/
        merge-multiple: true
        pattern: wheel-python3.10-cuda12.9

    - name: Start CI container
      run: bash scripts/ci/amd/amd_ci_start_container.sh
      env:
        GITHUB_WORKSPACE: ${{ github.workspace }}

    - name: Install dependencies
      run: |
        bash scripts/ci/amd/amd_ci_install_dependency.sh diffusion

    - name: Setup kernel caches
      run: |
        # Use the persistent /sgl-data directory (mounted from /home/runner/sgl-data)
        docker exec ci_sglang mkdir -p /sgl-data/aiter-kernels /sgl-data/miopen-cache /sgl-data/hf-cache/hub

        # Clear pre-built AITER kernels from Docker image to avoid segfaults
        # The image may have stale/incompatible kernels at /sgl-workspace/aiter/aiter/jit/
        #
        # The glob must be expanded INSIDE the container: `docker exec` runs
        # the command without a shell, so an unquoted `*.so` would be expanded
        # (or left literal) by the host shell and rm would delete nothing.
        echo "Clearing pre-built AITER kernels from Docker image..."
        docker exec ci_sglang sh -c 'rm -rf /sgl-workspace/aiter/aiter/jit/*.so' 2>/dev/null || true
        docker exec ci_sglang sh -c 'rm -rf /sgl-data/aiter-kernels/*.so' 2>/dev/null || true
        echo "AITER kernels cleared - will be rebuilt on first use"

        # Create persistent cache marker if /sgl-data is a real mount (not ephemeral)
        # This tells the test cleanup code to NOT delete downloaded models
        if docker exec ci_sglang test -d /sgl-data && docker exec ci_sglang mountpoint -q /sgl-data 2>/dev/null; then
          docker exec ci_sglang touch /sgl-data/hf-cache/.persistent_cache
          echo "Created .persistent_cache marker - HF cache will persist"
        else
          echo "WARNING: /sgl-data is not a mount point - models will be cleaned up after each test"
        fi

        # Check MIOpen cache (VAE convolution kernels)
        miopen_files=$(docker exec ci_sglang find /sgl-data/miopen-cache -name "*.udb" 2>/dev/null | wc -l || echo "0")
        echo "Found ${miopen_files} MIOpen cache files"

    # Informational only: prints memory, disk, cache and VRAM state.
    - name: Diagnose HF cache and system resources
      run: |
        echo "=== System Memory Status ==="
        free -h
        echo ""
        echo "=== Disk Space ==="
        df -h /home/runner/sgl-data 2>/dev/null || df -h
        echo ""
        echo "=== HF Cache Directory Structure ==="
        docker exec ci_sglang ls -la /sgl-data/hf-cache/ 2>/dev/null || echo "HF cache dir not found"
        docker exec ci_sglang ls -la /sgl-data/hf-cache/hub/ 2>/dev/null || echo "HF hub cache not found"
        echo ""
        echo "=== Checking for cached diffusion models (2-GPU tests) ==="
        # Models used in 2-GPU tests: Wan2.2-T2V-A14B, Wan2.1-T2V-14B, Qwen-Image, FLUX.1
        for model in "Wan-AI--Wan2.2-T2V-A14B-Diffusers" "Wan-AI--Wan2.1-T2V-14B-Diffusers" "Qwen--Qwen-Image" "black-forest-labs--FLUX.1-dev"; do
          cache_path="/sgl-data/hf-cache/hub/models--${model}"
          if docker exec ci_sglang test -d "$cache_path"; then
            size=$(docker exec ci_sglang du -sh "$cache_path" 2>/dev/null | cut -f1)
            echo "✓ CACHED: $model ($size)"
          else
            echo "✗ NOT CACHED: $model"
          fi
        done
        echo ""
        echo "=== GPU Memory Status ==="
        docker exec ci_sglang rocm-smi --showmeminfo vram 2>/dev/null || echo "rocm-smi not available"

    - name: Run diffusion server tests (2-GPU)
      timeout-minutes: 80
      run: |
        # AMD CI: All 2-GPU tests including LoRA
        # Tests: T2V, T2I, I2V, LoRA
        #
        # HF download env vars:
        # - HF_HUB_ENABLE_HF_TRANSFER=1: Use faster hf_transfer for downloads (if available)
        # - HF_HUB_DISABLE_SYMLINKS_WARNING=1: Suppress symlink warnings
        docker exec \
          -e SGLANG_E2E_TOLERANCE=0.3 \
          -e SGLANG_STAGE_TIME_TOLERANCE=0.2 \
          -e SGLANG_NON_DENOISE_STAGE_TIME_TOLERANCE=0.6 \
          -e SGLANG_DENOISE_STEP_TOLERANCE=0.6 \
          -e SGLANG_DENOISE_AGG_TOLERANCE=0.3 \
          -e SGLANG_SKIP_CONSISTENCY=1 \
          -e SGLANG_TEST_NUM_INFERENCE_STEPS=5 \
          -e AITER_JIT_DIR=/sgl-data/aiter-kernels \
          -e MIOPEN_USER_DB_PATH=/sgl-data/miopen-cache \
          -e HF_HUB_ENABLE_HF_TRANSFER=1 \
          -e HF_HUB_DISABLE_SYMLINKS_WARNING=1 \
          -w /sglang-checkout/python \
          ci_sglang python3 sglang/multimodal_gen/test/run_suite.py \
          --suite 2-gpu \
          --partition-id ${{ matrix.part }} \
          --total-partitions 2

        # Post-test diagnostics
        echo "=== Post-test System Memory Status ==="
        free -h
# Stage-b gate: runs the repo-local wait-for-jobs action and republishes its
# `result` output as this job's `stage_b_result` output.
wait-for-stage-b-amd:
  needs:
    - check-changes
    - call-gate
    - wait-for-stage-a-amd
  # Runs only for untargeted pull_request events that touched main_package or
  # sgl_kernel, and only when both the stage-a wait job and call-gate ended as
  # 'success' or 'skipped'.
  if: |
    always() &&
    !cancelled() &&
    github.event_name == 'pull_request' &&
    !(inputs.target_stage || inputs.target_stage_select) &&
    (needs.check-changes.outputs.main_package == 'true' || needs.check-changes.outputs.sgl_kernel == 'true') &&
    (needs.wait-for-stage-a-amd.result == 'success' || needs.wait-for-stage-a-amd.result == 'skipped') &&
    (needs.call-gate.result == 'success' || needs.call-gate.result == 'skipped')
  runs-on: ubuntu-latest
  outputs:
    stage_b_result: ${{ steps.wait.outputs.result }}
  steps:
    # The action below is referenced by a path inside the repository, so the
    # repository has to be checked out first.
    - uses: actions/checkout@v4
    - id: wait
      uses: ./.github/actions/wait-for-jobs
      with:
        stage-name: stage-b-amd
        # JSON list of job-name prefixes and how many matching jobs to expect.
        # NOTE(review): expected_count presumably has to track the matrix sizes
        # of the stage-b jobs - confirm whenever those matrices change.
        jobs: |
          [
            {"prefix": "stage-b-test-1-gpu-small-amd", "expected_count": 14},
            {"prefix": "stage-b-test-2-gpu-large-amd", "expected_count": 2}
          ]
        max-wait-minutes: '480'
# Stage C: 4-GPU test suite on the MI325 4-GPU runner (single partition).
stage-c-test-4-gpu-amd:
  needs:
    - check-changes
    - call-gate
    - wait-for-stage-b-amd
  # Run when this stage is explicitly targeted, or - for untargeted runs - on
  # schedule events or when nothing upstream failed or was cancelled, provided
  # check-changes reported main_package or sgl_kernel changes.
  if: |
    always() &&
    (
      (contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',stage-c-test-4-gpu-amd,')) ||
      (
        !(inputs.target_stage || inputs.target_stage_select) &&
        ((github.event_name == 'schedule') || (!failure() && !cancelled())) &&
        ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
      )
    )
  strategy:
    fail-fast: false
    matrix:
      runner: [linux-mi325-4gpu-sglang]
      part: [0]
  runs-on: ${{ matrix.runner }}
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        # Prefer an explicitly supplied PR head SHA, then an explicit ref,
        # and fall back to the commit that triggered the workflow.
        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

    - name: Ensure VRAM is clear
      run: bash scripts/ci/amd/ensure_vram_clear.sh rocm

    - name: Start CI container
      env:
        GITHUB_WORKSPACE: ${{ github.workspace }}
      run: bash scripts/ci/amd/amd_ci_start_container.sh

    - name: Install dependencies
      run: bash scripts/ci/amd/amd_ci_install_dependency.sh

    - name: Run test
      timeout-minutes: 60
      # The final line expands to --continue-on-error only when check-changes
      # set continue_on_error to 'true'; otherwise it expands to nothing.
      run: |
        bash scripts/ci/amd/amd_ci_exec.sh \
          -e NCCL_CUMEM_ENABLE=0 \
          -e NCCL_NVLS_ENABLE=0 \
          -e RCCL_MSCCL_ENABLE=0 \
          -e SGLANG_USE_ROCM700A=1 \
          -w "/sglang-checkout/test" \
          python3 run_suite.py \
          --hw amd \
          --suite stage-c-test-4-gpu-amd \
          --auto-partition-id ${{ matrix.part }} \
          --auto-partition-size 1 \
          --timeout-per-file 1800 \
          --enable-retry \
          --max-attempts 2 \
          --retry-wait-seconds 120 \
          --retry-timeout-increase 0 \
          ${{ needs.check-changes.outputs.continue_on_error == 'true' && '--continue-on-error' || '' }}
# Stage C: large 8-GPU test suite on the MI325 8-GPU runner, split into three
# partitions (matrix.part 0..2, --auto-partition-size 3).
stage-c-test-large-8-gpu-amd:
  needs:
    - check-changes
    - call-gate
    - wait-for-stage-b-amd
  # Run when this stage is explicitly targeted, or - for untargeted runs - on
  # schedule events or when nothing upstream failed or was cancelled, provided
  # check-changes reported main_package or sgl_kernel changes.
  if: |
    always() &&
    (
      (contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',stage-c-test-large-8-gpu-amd,')) ||
      (
        !(inputs.target_stage || inputs.target_stage_select) &&
        ((github.event_name == 'schedule') || (!failure() && !cancelled())) &&
        ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
      )
    )
  env:
    # NOTE(review): presumably consumed by the CI scripts invoked below;
    # it duplicates the matrix runner label - confirm it is still needed.
    RUNNER_LABELS: linux-mi325-8gpu-sglang
  strategy:
    fail-fast: false
    matrix:
      runner: [linux-mi325-8gpu-sglang]
      part: [0, 1, 2]
  runs-on: ${{ matrix.runner }}
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        # Prefer an explicitly supplied PR head SHA, then an explicit ref,
        # and fall back to the commit that triggered the workflow.
        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

    - name: Ensure VRAM is clear
      run: bash scripts/ci/amd/ensure_vram_clear.sh rocm

    - name: Start CI container
      env:
        GITHUB_WORKSPACE: ${{ github.workspace }}
      run: bash scripts/ci/amd/amd_ci_start_container.sh

    - name: Install dependencies
      run: bash scripts/ci/amd/amd_ci_install_dependency.sh

    # Pre-check: launches the RCCL test script on 8 processes inside the
    # container with NCCL/RCCL debug logging enabled.
    - name: Test RCCL multi-GPU communication
      timeout-minutes: 5
      run: |
        echo "Testing RCCL multi-GPU communication with debug info..."
        docker exec ci_sglang bash -c "cd /sglang-checkout && NCCL_DEBUG=INFO RCCL_DEBUG=INFO torchrun --nproc_per_node=8 scripts/ci/amd/test_rccl_multi_gpu.py"

    - name: Run test
      # NOTE(review): --timeout-per-file 3600 s equals this 60-minute step
      # timeout, so a single slow file can exhaust the whole step.
      timeout-minutes: 60
      # The final line expands to --continue-on-error only when check-changes
      # set continue_on_error to 'true'; otherwise it expands to nothing.
      run: |
        bash scripts/ci/amd/amd_ci_exec.sh \
          -w "/sglang-checkout/test" \
          python3 run_suite.py \
          --hw amd \
          --suite stage-c-test-large-8-gpu-amd \
          --auto-partition-id ${{ matrix.part }} \
          --auto-partition-size 3 \
          --timeout-per-file 3600 \
          ${{ needs.check-changes.outputs.continue_on_error == 'true' && '--continue-on-error' || '' }}
# Stage C: large 8-GPU test suite on the MI35x 8-GPU runner, split into two
# partitions (matrix.part 0..1, --auto-partition-size 2).
stage-c-test-large-8-gpu-amd-mi35x:
  needs:
    - check-changes
    - call-gate
    - wait-for-stage-b-amd
  # Run when this stage is explicitly targeted, or - for untargeted runs - on
  # schedule events or when nothing upstream failed or was cancelled, provided
  # check-changes reported main_package or sgl_kernel changes.
  if: |
    always() &&
    (
      (contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',stage-c-test-large-8-gpu-amd-mi35x,')) ||
      (
        !(inputs.target_stage || inputs.target_stage_select) &&
        ((github.event_name == 'schedule') || (!failure() && !cancelled())) &&
        ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
      )
    )
  strategy:
    fail-fast: false
    matrix:
      runner: [linux-mi35x-gpu-8]
      part: [0, 1]
  runs-on: ${{ matrix.runner }}
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        # Prefer an explicitly supplied PR head SHA, then an explicit ref,
        # and fall back to the commit that triggered the workflow.
        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

    - name: Ensure VRAM is clear
      run: bash scripts/ci/amd/ensure_vram_clear.sh rocm

    - name: Start CI container
      env:
        GITHUB_WORKSPACE: ${{ github.workspace }}
      run: bash scripts/ci/amd/amd_ci_start_container.sh

    - name: Install dependencies
      run: bash scripts/ci/amd/amd_ci_install_dependency.sh

    - name: Run test
      # NOTE(review): --timeout-per-file 3600 s equals this 60-minute step
      # timeout, so a single slow file can exhaust the whole step.
      timeout-minutes: 60
      # The final line expands to --continue-on-error only when check-changes
      # set continue_on_error to 'true'; otherwise it expands to nothing.
      run: |
        bash scripts/ci/amd/amd_ci_exec.sh \
          -w "/sglang-checkout/test" \
          python3 run_suite.py \
          --hw amd \
          --suite stage-c-test-large-8-gpu-amd-mi35x \
          --auto-partition-id ${{ matrix.part }} \
          --auto-partition-size 2 \
          --timeout-per-file 3600 \
          ${{ needs.check-changes.outputs.continue_on_error == 'true' && '--continue-on-error' || '' }}
| # =============================================== Disaggregation ==================================================== | |
# Disaggregation tests on the MI35x 8-GPU fabric runner: probes the host RDMA
# setup, starts the disaggregation CI container, verifies RDMA inside it, runs
# an Aiter RMSNorm pre-check, and finally runs the disaggregation suite.
stage-b-test-large-8-gpu-35x-disaggregation-amd:
  needs:
    - check-changes
    - wait-for-stage-a-amd
  # Targeted runs: accept both the stage name offered in the workflow_dispatch
  # dropdown ("...-8-gpu-disaggregation-amd") and this job's own id
  # ("...-8-gpu-35x-disaggregation-amd"). Previously only the former matched,
  # so targeting the job by its real id selected nothing.
  # Untargeted runs: schedule events, or when nothing upstream failed or was
  # cancelled, provided check-changes reported main_package or sgl_kernel changes.
  if: |
    always() &&
    (
      (contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',stage-b-test-large-8-gpu-disaggregation-amd,')) ||
      (contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',stage-b-test-large-8-gpu-35x-disaggregation-amd,')) ||
      (
        !(inputs.target_stage || inputs.target_stage_select) &&
        ((github.event_name == 'schedule') || (!failure() && !cancelled())) &&
        ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
      )
    )
  strategy:
    fail-fast: false
    matrix:
      runner: [linux-mi35x-gpu-8.fabric]
  runs-on: ${{ matrix.runner }}
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        # Prefer an explicitly supplied PR head SHA, then an explicit ref,
        # and fall back to the commit that triggered the workflow.
        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

    - name: Ensure VRAM is clear
      run: bash scripts/ci/amd/ensure_vram_clear.sh rocm

    # Diagnostic-only probe of the host: every check has an echo fallback and
    # `set +e` keeps a failing probe from failing the step. Its one functional
    # effect is exporting SGLANG_TEST_RDMA_DEVICE for later steps.
    - name: Check Host RDMA Environment
      id: rdma_detect
      run: |
        set +e
        echo "=== Checking Host RDMA Environment ==="
        echo ""
        echo "=== 1. Ionic driver library check ==="
        ls -l /usr/lib/x86_64-linux-gnu/libibverbs/libionic* 2>/dev/null || echo "libionic not found in standard path"
        echo ""
        echo "=== 2. Infiniband devices ==="
        ls -la /dev/infiniband/ 2>/dev/null || echo "/dev/infiniband not found"
        ls -la /sys/class/infiniband/ 2>/dev/null || echo "/sys/class/infiniband not found"
        echo ""
        echo "=== 3. ibv_devinfo ==="
        which ibv_devinfo 2>/dev/null && ibv_devinfo 2>&1 || echo "ibv_devinfo not available"
        echo ""
        echo "=== 4. Kernel modules ==="
        lsmod 2>/dev/null | grep -E "ib_|rdma|ionic" || echo "No RDMA kernel modules loaded"
        echo ""
        echo "=== 5. Detect RDMA Devices for test environment ==="
        if [ -d "/sys/class/infiniband" ]; then
          # Comma-join the device names found under /sys/class/infiniband.
          RDMA_DEVS=$(ls /sys/class/infiniband | paste -sd "," -)
          echo "Detected RDMA Devices: $RDMA_DEVS"
          echo "SGLANG_TEST_RDMA_DEVICE=$RDMA_DEVS" >> "$GITHUB_ENV"
        else
          echo "No RDMA devices found in /sys/class/infiniband"
          # Export an empty value so the variable is always defined downstream.
          echo "SGLANG_TEST_RDMA_DEVICE=" >> "$GITHUB_ENV"
        fi
        echo ""
        echo "=== Host RDMA Check Complete ==="

    - name: Start Special Container
      env:
        GITHUB_WORKSPACE: ${{ github.workspace }}
      run: bash scripts/ci/amd/amd_ci_start_container_disagg.sh

    - name: Install dependencies
      run: bash scripts/ci/amd/amd_ci_install_dependency.sh

    # Reports the RDMA state as seen from inside the container. A zero HCA
    # count only prints a warning; this step does not fail the job on it.
    - name: Verify RDMA in Container
      run: |
        docker exec -u root ci_sglang bash -c '
          echo "=== Container RDMA Verification ==="
          echo "Device nodes:"
          ls -la /dev/infiniband/
          echo ""
          echo "Provider libraries:"
          ls /usr/lib/x86_64-linux-gnu/libibverbs/ | grep -E "ionic|mlx" || echo "No Ionic/Mellanox providers"
          echo ""
          echo "HCA devices:"
          HCA_COUNT=$(ibv_devinfo -list 2>&1 | grep -oE "^[0-9]+ HCAs? found" | grep -oE "^[0-9]+" || echo "0")
          ibv_devinfo -list
          if [ "$HCA_COUNT" -gt 0 ]; then
            echo ""
            echo "=== SUCCESS: RDMA setup complete. Found $HCA_COUNT HCA(s) ==="
          else
            echo ""
            echo "=== WARNING: No HCAs detected. RDMA tests may fail ==="
          fi
        '

    - name: Run Aiter Op Test (RMSNorm)
      timeout-minutes: 10
      run: |
        echo "Running pre-check: test_rmsnorm2d.py"
        docker exec \
          -e MAX_JOBS=192 \
          ci_sglang \
          python /sgl-workspace/aiter/op_tests/test_rmsnorm2d.py

    - name: Run test_disaggregation
      timeout-minutes: 60
      # Forwards the RDMA device list detected on the host into the test run.
      # The trailing expression expands to --continue-on-error only when
      # check-changes set continue_on_error to 'true'.
      run: |
        bash scripts/ci/amd/amd_ci_exec.sh \
          -e SGLANG_TEST_RDMA_DEVICE="${{ env.SGLANG_TEST_RDMA_DEVICE }}" \
          -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-large-8-gpu-35x-disaggregation-amd --timeout-per-file 1800 ${{ needs.check-changes.outputs.continue_on_error == 'true' && '--continue-on-error' || '' }}
# Final aggregation job: always runs, prints the result of every job listed in
# `needs`, and fails if any of them ended as 'failure' or 'cancelled'.
# Jobs with any other result (e.g. 'skipped') do not fail this job.
pr-test-amd-finish:
  needs:
    - call-gate
    - check-changes
    - sgl-kernel-unit-test-amd
    - sgl-kernel-unit-test-2-gpu-amd
    - multimodal-gen-test-1-gpu-amd
    - multimodal-gen-test-2-gpu-amd
    - wait-for-stage-a-amd
    - stage-a-test-1-gpu-small-amd
    - jit-kernel-unit-test-amd
    - wait-for-stage-b-amd
    - stage-b-test-1-gpu-small-amd
    - stage-b-test-1-gpu-small-amd-nondeterministic
    - stage-b-test-1-gpu-small-amd-mi35x
    - stage-b-test-1-gpu-large-amd
    - stage-b-test-2-gpu-large-amd
    - stage-b-test-large-8-gpu-35x-disaggregation-amd
    - stage-c-test-4-gpu-amd
    - stage-c-test-large-8-gpu-amd
    - stage-c-test-large-8-gpu-amd-mi35x
  if: always()
  runs-on: ubuntu-latest
  steps:
    - name: Check all dependent job statuses
      env:
        # The `needs` context (results AND job outputs) is handed over as an
        # environment variable instead of being pasted into the script text:
        # a single quote inside any job output would otherwise terminate the
        # shell literal and break - or inject into - the script.
        NEEDS_JSON: ${{ toJson(needs) }}
      run: |
        # Job names are the JSON object's keys, in the order they appear.
        job_names=$(jq -r 'keys_unsorted[]' <<< "$NEEDS_JSON")
        failed=0
        for job in $job_names; do
          # For each job, extract its result
          result=$(jq -r --arg j "$job" '.[$j].result' <<< "$NEEDS_JSON")
          # Print the job name and its result
          echo "$job: $result"
          # Remember failures/cancellations but keep going, so the log lists
          # the status of every job before the step fails.
          if [[ "$result" == "failure" || "$result" == "cancelled" ]]; then
            failed=1
          fi
        done
        if [[ "$failed" -ne 0 ]]; then
          echo "The above jobs failed."
          exit 1
        fi
        # No job reported failure or cancellation
        echo "All jobs completed successfully"
        exit 0