Nightly Test (AMD) #83
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Nightly AMD GPU test workflow: triggers, manual/reusable inputs, and
# concurrency settings. The job definitions follow under `jobs:`.
name: Nightly Test (AMD)

on:
  # Cron trigger: minute 0, hour 0, every day.
  schedule:
    - cron: "0 0 * * *"

  # Pushes to main that modify the version file.
  push:
    branches:
      - main
    paths:
      - python/sglang/version.py

  # Manual runs: pick a single job id from the list, or "all".
  workflow_dispatch:
    inputs:
      job_filter:
        description: 'Select which job to run (leave empty or "all" to run all jobs)'
        required: false
        type: choice
        default: "all"
        options:
          - "all"
          # MI30x Unit Tests
          - "nightly-test-1-gpu-unit"
          # MI30x Accuracy Tests (GSM8K / MMMU)
          - "nightly-accuracy-2-gpu"
          - "nightly-accuracy-2-gpu-vlm"
          - "nightly-accuracy-8-gpu"
          - "nightly-accuracy-8-gpu-deepseek-r1"
          # MI30x Accuracy + Performance Tests (combined)
          - "nightly-8-gpu-grok1-int4"
          - "nightly-8-gpu-grok2"
          - "nightly-8-gpu-deepseek-v31"
          # MI35x jobs
          - "nightly-test-1-gpu-mi35x"
          - "nightly-accuracy-8-gpu-mi35x"
          - "nightly-accuracy-8-gpu-mi35x-deepseek-r1"
          - "nightly-8-gpu-mi35x-grok1-int4"
          - "nightly-8-gpu-mi35x-grok2"
          - "nightly-8-gpu-mi35x-deepseek-r1-mxfp4"

  # Reusable-workflow entry point: callers may pin a git ref and a job filter.
  workflow_call:
    inputs:
      ref:
        description: "Git ref (branch, tag, or SHA) to test. If not provided, uses the default branch."
        required: false
        type: string
        default: ""
      job_filter:
        description: 'Select which job to run (leave empty or "all" to run all jobs)'
        required: false
        type: string
        default: "all"

concurrency:
  # One group per tested ref; the explicit `ref` input wins over github.ref.
  group: nightly-test-amd-${{ inputs.ref || github.ref }}
  # NOTE(review): GitHub documents that a called workflow sees the CALLER's
  # github context, so github.event_name is probably never 'workflow_call'
  # here and this expression is likely always true - confirm the intent.
  cancel-in-progress: ${{ github.event_name != 'workflow_call' }}

jobs:
# ============================================== MI30x Unit Tests ==============================================
# 1-GPU unit tests (LoRA, debug utils, scheduler, etc.) - MI30x only.
# Enabled in the upstream repository (or on pull_request events) when
# job_filter is empty, "all", or this job's id.
  nightly-test-1-gpu-unit:
    runs-on: linux-mi325-gpu-1
    if: >-
      (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request')
      && (inputs.job_filter == '' || inputs.job_filter == 'all'
      || inputs.job_filter == 'nightly-test-1-gpu-unit')
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          # Use the explicit `ref` input when given, else the triggering ref.
          ref: ${{ inputs.ref || github.ref }}

      - name: Setup docker
        env:
          GITHUB_WORKSPACE: ${{ github.workspace }}
        # Create the summary file up front, then start the CI container.
        run: |
          touch github_summary.md
          bash scripts/ci/amd_ci_start_container.sh

      - name: Install dependencies
        run: bash scripts/ci/amd_ci_install_dependency.sh

      - name: Nightly Unit Test (1-GPU)
        timeout-minutes: 60
        # Record the suite's exit status, publish github_summary.md to the
        # step summary (best effort), then exit with the recorded status.
        run: |
          bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test \
            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
            python3 run_suite.py --hw amd --suite nightly-amd-1-gpu --nightly --timeout-per-file 600 --continue-on-error || TEST_EXIT_CODE=$?
          echo "$(<github_summary.md )" >> $GITHUB_STEP_SUMMARY || true
          exit ${TEST_EXIT_CODE:-0}
# ============================================== MI30x Accuracy Tests ==============================================
# 2-GPU accuracy tests: GSM8K eval - MI30x only.
# Enabled in the upstream repository (or on pull_request events) when
# job_filter is empty, "all", or this job's id.
  nightly-accuracy-2-gpu:
    runs-on: linux-mi325-gpu-2
    if: >-
      (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request')
      && (inputs.job_filter == '' || inputs.job_filter == 'all'
      || inputs.job_filter == 'nightly-accuracy-2-gpu')
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          # Use the explicit `ref` input when given, else the triggering ref.
          ref: ${{ inputs.ref || github.ref }}

      - name: Setup docker
        env:
          GITHUB_WORKSPACE: ${{ github.workspace }}
        # Create the summary file up front, then start the CI container.
        run: |
          touch github_summary.md
          bash scripts/ci/amd_ci_start_container.sh

      - name: Install dependencies
        run: bash scripts/ci/amd_ci_install_dependency.sh

      # NOTE(review): this is the only test step in the workflow without a
      # `timeout-minutes`; confirm whether the default job timeout is intended.
      - name: Nightly Test (2-GPU)
        # Record the suite's exit status, publish github_summary.md to the
        # step summary (best effort), then exit with the recorded status.
        run: |
          bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test \
            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
            python3 run_suite.py --hw amd --suite nightly-amd --nightly --timeout-per-file 7200 || TEST_EXIT_CODE=$?
          echo "$(<github_summary.md )" >> $GITHUB_STEP_SUMMARY || true
          exit ${TEST_EXIT_CODE:-0}
# 2-GPU VLM accuracy tests: vision-language models, MMMU evaluation.
# Enabled in the upstream repository (or on pull_request events) when
# job_filter is empty, "all", or this job's id.
  nightly-accuracy-2-gpu-vlm:
    runs-on: linux-mi325-gpu-2
    if: >-
      (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request')
      && (inputs.job_filter == '' || inputs.job_filter == 'all'
      || inputs.job_filter == 'nightly-accuracy-2-gpu-vlm')
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          # Use the explicit `ref` input when given, else the triggering ref.
          ref: ${{ inputs.ref || github.ref }}

      - name: Setup docker
        env:
          GITHUB_WORKSPACE: ${{ github.workspace }}
        # Create the summary file up front, then start the CI container.
        run: |
          touch github_summary.md
          bash scripts/ci/amd_ci_start_container.sh

      - name: Install dependencies
        run: bash scripts/ci/amd_ci_install_dependency.sh

      - name: Nightly Accuracy Test (2-GPU VLM MMMU)
        timeout-minutes: 180
        # Record the suite's exit status, publish github_summary.md to the
        # step summary (best effort), then exit with the recorded status.
        run: |
          bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test \
            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
            python3 run_suite.py --hw amd --suite nightly-amd-accuracy-2-gpu-vlm --nightly --timeout-per-file 7200 || TEST_EXIT_CODE=$?
          echo "$(<github_summary.md )" >> $GITHUB_STEP_SUMMARY || true
          exit ${TEST_EXIT_CODE:-0}
# 8-GPU accuracy tests: GPT-OSS and Grok1-FP8 (accuracy only).
# Enabled in the upstream repository (or on pull_request events) when
# job_filter is empty, "all", or this job's id.
  nightly-accuracy-8-gpu:
    if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-accuracy-8-gpu')
    runs-on: linux-mi325-gpu-8
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          # Use the explicit `ref` input when given, else the triggering ref.
          ref: ${{ inputs.ref || github.ref }}

      - name: Setup docker
        # Create the summary file up front, then start the CI container.
        run: |
          touch github_summary.md
          bash scripts/ci/amd_ci_start_container.sh
        env:
          GITHUB_WORKSPACE: ${{ github.workspace }}

      - name: Install dependencies
        run: bash scripts/ci/amd_ci_install_dependency.sh

      # Both test steps below share github_summary.md. Each one truncates the
      # file first (as the other multi-step jobs in this workflow do) so a
      # step publishes only its own summary and not the previous step's.
      - name: Accuracy Test (8-GPU GPT-OSS)
        timeout-minutes: 180
        # Record the suite's exit status, publish github_summary.md to the
        # step summary (best effort), then exit with the recorded status.
        run: |
          > github_summary.md  # Clear summary file
          bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test \
            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
            python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-gpt-oss --nightly --timeout-per-file 7200 || TEST_EXIT_CODE=$?
          echo "$(<github_summary.md )" >> $GITHUB_STEP_SUMMARY || true
          exit ${TEST_EXIT_CODE:-0}

      - name: Accuracy Test (8-GPU Grok1-FP8)
        timeout-minutes: 60
        # Same pattern as above; additionally passes RCCL_MSCCL_ENABLE=0.
        run: |
          > github_summary.md  # Clear summary file
          bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test \
            -e RCCL_MSCCL_ENABLE=0 \
            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
            python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-grok1-fp8 --nightly --timeout-per-file 3600 || TEST_EXIT_CODE=$?
          echo "$(<github_summary.md )" >> $GITHUB_STEP_SUMMARY || true
          exit ${TEST_EXIT_CODE:-0}
# 8-GPU DeepSeek-R1 accuracy test (kept as its own job because of the long
# model loading time).
# Enabled in the upstream repository (or on pull_request events) when
# job_filter is empty, "all", or this job's id.
  nightly-accuracy-8-gpu-deepseek-r1:
    runs-on: linux-mi325-gpu-8
    if: >-
      (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request')
      && (inputs.job_filter == '' || inputs.job_filter == 'all'
      || inputs.job_filter == 'nightly-accuracy-8-gpu-deepseek-r1')
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          # Use the explicit `ref` input when given, else the triggering ref.
          ref: ${{ inputs.ref || github.ref }}

      - name: Setup docker
        env:
          GITHUB_WORKSPACE: ${{ github.workspace }}
        # Create the summary file up front, then start the CI container.
        run: |
          touch github_summary.md
          bash scripts/ci/amd_ci_start_container.sh

      - name: Install dependencies
        run: bash scripts/ci/amd_ci_install_dependency.sh

      - name: Accuracy Test (8-GPU DeepSeek-R1)
        timeout-minutes: 240
        # Record the suite's exit status, publish github_summary.md to the
        # step summary (best effort), then exit with the recorded status.
        run: |
          bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test \
            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
            python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-deepseek-r1 --nightly --timeout-per-file 7200 || TEST_EXIT_CODE=$?
          echo "$(<github_summary.md )" >> $GITHUB_STEP_SUMMARY || true
          exit ${TEST_EXIT_CODE:-0}
# ============================================== MI30x Combined Accuracy + Performance Tests ==============================================
# 8-GPU Grok1-INT4: accuracy step followed by a performance step.
# Enabled in the upstream repository (or on pull_request events) when
# job_filter is empty, "all", or this job's id.
  nightly-8-gpu-grok1-int4:
    runs-on: linux-mi325-gpu-8
    if: >-
      (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request')
      && (inputs.job_filter == '' || inputs.job_filter == 'all'
      || inputs.job_filter == 'nightly-8-gpu-grok1-int4')
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          # Use the explicit `ref` input when given, else the triggering ref.
          ref: ${{ inputs.ref || github.ref }}

      - name: Setup docker
        env:
          GITHUB_WORKSPACE: ${{ github.workspace }}
        # Create the summary file up front, then start the CI container.
        run: |
          touch github_summary.md
          bash scripts/ci/amd_ci_start_container.sh

      - name: Install dependencies
        run: bash scripts/ci/amd_ci_install_dependency.sh

      - name: Accuracy Test (8-GPU Grok1-INT4)
        timeout-minutes: 60
        # Truncate the shared summary file, run the suite while recording its
        # exit status, publish the summary, then exit with that status.
        run: |
          > github_summary.md  # Clear summary file
          bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test \
            -e RCCL_MSCCL_ENABLE=0 \
            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
            python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-grok1-int4 --nightly --timeout-per-file 3600 || TEST_EXIT_CODE=$?
          echo "$(<github_summary.md )" >> $GITHUB_STEP_SUMMARY || true
          exit ${TEST_EXIT_CODE:-0}

      - name: Performance Test (8-GPU Grok1-INT4)
        timeout-minutes: 60
        # A perf failure must not fail the job once accuracy has passed.
        continue-on-error: true
        run: |
          > github_summary.md  # Clear summary file
          bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test \
            -e RCCL_MSCCL_ENABLE=0 \
            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
            python3 run_suite.py --hw amd --suite nightly-perf-8-gpu-grok1-int4 --nightly --timeout-per-file 3600 || TEST_EXIT_CODE=$?
          echo "$(<github_summary.md )" >> $GITHUB_STEP_SUMMARY || true
          exit ${TEST_EXIT_CODE:-0}
# 8-GPU Grok2: accuracy step followed by a performance step.
# Enabled in the upstream repository (or on pull_request events) when
# job_filter is empty, "all", or this job's id.
  nightly-8-gpu-grok2:
    runs-on: linux-mi325-gpu-8
    if: >-
      (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request')
      && (inputs.job_filter == '' || inputs.job_filter == 'all'
      || inputs.job_filter == 'nightly-8-gpu-grok2')
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          # Use the explicit `ref` input when given, else the triggering ref.
          ref: ${{ inputs.ref || github.ref }}

      - name: Setup docker
        env:
          GITHUB_WORKSPACE: ${{ github.workspace }}
        # Create the summary file up front, then start the CI container.
        run: |
          touch github_summary.md
          bash scripts/ci/amd_ci_start_container.sh

      - name: Install dependencies
        run: bash scripts/ci/amd_ci_install_dependency.sh

      - name: Accuracy Test (8-GPU Grok2)
        timeout-minutes: 60
        # Truncate the shared summary file, run the suite while recording its
        # exit status, publish the summary, then exit with that status.
        run: |
          > github_summary.md  # Clear summary file
          bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test \
            -e RCCL_MSCCL_ENABLE=0 \
            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
            python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-grok2 --nightly --timeout-per-file 3600 || TEST_EXIT_CODE=$?
          echo "$(<github_summary.md )" >> $GITHUB_STEP_SUMMARY || true
          exit ${TEST_EXIT_CODE:-0}

      - name: Performance Test (8-GPU Grok2)
        timeout-minutes: 60
        # A perf failure must not fail the job once accuracy has passed.
        continue-on-error: true
        run: |
          > github_summary.md  # Clear summary file
          bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test \
            -e RCCL_MSCCL_ENABLE=0 \
            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
            python3 run_suite.py --hw amd --suite nightly-perf-8-gpu-grok2 --nightly --timeout-per-file 3600 || TEST_EXIT_CODE=$?
          echo "$(<github_summary.md )" >> $GITHUB_STEP_SUMMARY || true
          exit ${TEST_EXIT_CODE:-0}
# 8-GPU DeepSeek-V3.1: accuracy step followed by a performance step.
# Enabled in the upstream repository (or on pull_request events) when
# job_filter is empty, "all", or this job's id.
  nightly-8-gpu-deepseek-v31:
    runs-on: linux-mi325-gpu-8
    if: >-
      (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request')
      && (inputs.job_filter == '' || inputs.job_filter == 'all'
      || inputs.job_filter == 'nightly-8-gpu-deepseek-v31')
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          # Use the explicit `ref` input when given, else the triggering ref.
          ref: ${{ inputs.ref || github.ref }}

      - name: Setup docker
        env:
          GITHUB_WORKSPACE: ${{ github.workspace }}
        # Create the summary file up front, then start the CI container.
        run: |
          touch github_summary.md
          bash scripts/ci/amd_ci_start_container.sh

      - name: Install dependencies
        run: bash scripts/ci/amd_ci_install_dependency.sh

      # NOTE(review): the accuracy step passes SGLANG_USE_AITER=1 while the
      # perf step passes SGLANG_USE_ROCM700A=1 - confirm the difference is
      # intentional.
      - name: Accuracy Test (8-GPU DeepSeek-V3.1)
        timeout-minutes: 120
        # Truncate the shared summary file, run the suite while recording its
        # exit status, publish the summary, then exit with that status.
        run: |
          > github_summary.md  # Clear summary file
          bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test \
            -e SGLANG_USE_AITER=1 \
            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
            python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-deepseek-v31 --nightly --timeout-per-file 3600 || TEST_EXIT_CODE=$?
          echo "$(<github_summary.md )" >> $GITHUB_STEP_SUMMARY || true
          exit ${TEST_EXIT_CODE:-0}

      - name: Performance Test (8-GPU DeepSeek-V3.1)
        timeout-minutes: 300
        # A perf failure must not fail the job once accuracy has passed.
        continue-on-error: true
        run: |
          > github_summary.md  # Clear summary file
          bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test \
            -e SGLANG_USE_ROCM700A=1 \
            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
            python3 run_suite.py --hw amd --suite nightly-perf-8-gpu-deepseek-v31 --nightly --timeout-per-file 18000 || TEST_EXIT_CODE=$?
          echo "$(<github_summary.md )" >> $GITHUB_STEP_SUMMARY || true
          exit ${TEST_EXIT_CODE:-0}
# ============================================== MI35x Tests ==============================================
# MI35x 1-GPU tests: platform-agnostic tests that may work on CDNA4 (gfx950).
# Enabled in the upstream repository (or on pull_request events) when
# job_filter is empty, "all", or this job's id.
  nightly-test-1-gpu-mi35x:
    runs-on: linux-mi35x-gpu-1
    if: >-
      (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request')
      && (inputs.job_filter == '' || inputs.job_filter == 'all'
      || inputs.job_filter == 'nightly-test-1-gpu-mi35x')
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          # Use the explicit `ref` input when given, else the triggering ref.
          ref: ${{ inputs.ref || github.ref }}

      - name: Setup docker
        env:
          GITHUB_WORKSPACE: ${{ github.workspace }}
        # Create the summary file up front, then start the CI container.
        run: |
          touch github_summary.md
          bash scripts/ci/amd_ci_start_container.sh

      - name: Install dependencies
        run: |
          bash scripts/ci/amd_ci_install_dependency.sh
          # Install tabulate for run_suite.py (missing in MI35x container)
          bash scripts/ci/amd_ci_exec.sh pip install tabulate

      - name: Nightly Test MI35x (1-GPU)
        timeout-minutes: 60
        # Record the suite's exit status, publish github_summary.md to the
        # step summary (best effort), then exit with the recorded status.
        run: |
          bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test \
            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
            python3 run_suite.py --hw amd --suite nightly-amd-1-gpu-mi35x --nightly --timeout-per-file 600 --continue-on-error || TEST_EXIT_CODE=$?
          echo "$(<github_summary.md )" >> $GITHUB_STEP_SUMMARY || true
          exit ${TEST_EXIT_CODE:-0}
# MI35x 8-GPU accuracy tests: GPT-OSS (accuracy only).
# Enabled in the upstream repository (or on pull_request events) when
# job_filter is empty, "all", or this job's id.
  nightly-accuracy-8-gpu-mi35x:
    runs-on: linux-mi35x-gpu-8
    if: >-
      (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request')
      && (inputs.job_filter == '' || inputs.job_filter == 'all'
      || inputs.job_filter == 'nightly-accuracy-8-gpu-mi35x')
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          # Use the explicit `ref` input when given, else the triggering ref.
          ref: ${{ inputs.ref || github.ref }}

      - name: Setup docker
        env:
          GITHUB_WORKSPACE: ${{ github.workspace }}
        # Create the summary file up front, then start the CI container.
        run: |
          touch github_summary.md
          bash scripts/ci/amd_ci_start_container.sh

      - name: Install dependencies
        run: |
          bash scripts/ci/amd_ci_install_dependency.sh
          # Install tabulate for run_suite.py (missing in MI35x container)
          bash scripts/ci/amd_ci_exec.sh pip install tabulate

      - name: Accuracy Test MI35x (8-GPU GPT-OSS)
        timeout-minutes: 180
        # Record the suite's exit status, publish github_summary.md to the
        # step summary (best effort), then exit with the recorded status.
        run: |
          bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test \
            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
            python3 run_suite.py --hw amd --suite nightly-amd-8-gpu-mi35x --nightly --timeout-per-file 7200 || TEST_EXIT_CODE=$?
          echo "$(<github_summary.md )" >> $GITHUB_STEP_SUMMARY || true
          exit ${TEST_EXIT_CODE:-0}
# MI35x 8-GPU DeepSeek-R1-0528 accuracy test (kept as its own job because of
# the long model loading time).
# Enabled in the upstream repository (or on pull_request events) when
# job_filter is empty, "all", or this job's id.
  nightly-accuracy-8-gpu-mi35x-deepseek-r1:
    runs-on: linux-mi35x-gpu-8
    if: >-
      (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request')
      && (inputs.job_filter == '' || inputs.job_filter == 'all'
      || inputs.job_filter == 'nightly-accuracy-8-gpu-mi35x-deepseek-r1')
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          # Use the explicit `ref` input when given, else the triggering ref.
          ref: ${{ inputs.ref || github.ref }}

      - name: Setup docker
        env:
          GITHUB_WORKSPACE: ${{ github.workspace }}
        # Create the summary file up front, then start the CI container.
        run: |
          touch github_summary.md
          bash scripts/ci/amd_ci_start_container.sh

      - name: Install dependencies
        run: |
          bash scripts/ci/amd_ci_install_dependency.sh
          # Install tabulate for run_suite.py (missing in MI35x container)
          bash scripts/ci/amd_ci_exec.sh pip install tabulate

      - name: Accuracy Test MI35x (8-GPU DeepSeek-R1-0528)
        timeout-minutes: 240
        # Record the suite's exit status, publish github_summary.md to the
        # step summary (best effort), then exit with the recorded status.
        run: |
          bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test \
            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
            python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-mi35x-deepseek-r1 --nightly --timeout-per-file 7200 || TEST_EXIT_CODE=$?
          echo "$(<github_summary.md )" >> $GITHUB_STEP_SUMMARY || true
          exit ${TEST_EXIT_CODE:-0}
# MI35x 8-GPU Grok1-INT4: accuracy step followed by a performance step.
# Enabled in the upstream repository (or on pull_request events) when
# job_filter is empty, "all", or this job's id.
  nightly-8-gpu-mi35x-grok1-int4:
    runs-on: linux-mi35x-gpu-8
    if: >-
      (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request')
      && (inputs.job_filter == '' || inputs.job_filter == 'all'
      || inputs.job_filter == 'nightly-8-gpu-mi35x-grok1-int4')
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          # Use the explicit `ref` input when given, else the triggering ref.
          ref: ${{ inputs.ref || github.ref }}

      - name: Setup docker
        env:
          GITHUB_WORKSPACE: ${{ github.workspace }}
        # Create the summary file up front, then start the CI container.
        run: |
          touch github_summary.md
          bash scripts/ci/amd_ci_start_container.sh

      - name: Install dependencies
        run: |
          bash scripts/ci/amd_ci_install_dependency.sh
          # Install tabulate for run_suite.py (missing in MI35x container)
          bash scripts/ci/amd_ci_exec.sh pip install tabulate

      - name: Accuracy Test MI35x (8-GPU Grok1-INT4)
        timeout-minutes: 60
        # Truncate the shared summary file, run the suite while recording its
        # exit status, publish the summary, then exit with that status.
        run: |
          > github_summary.md  # Clear summary file
          bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test \
            -e RCCL_MSCCL_ENABLE=0 \
            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
            python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-mi35x-grok1-int4 --nightly --timeout-per-file 3600 || TEST_EXIT_CODE=$?
          echo "$(<github_summary.md )" >> $GITHUB_STEP_SUMMARY || true
          exit ${TEST_EXIT_CODE:-0}

      - name: Performance Test MI35x (8-GPU Grok1-INT4)
        timeout-minutes: 60
        # A perf failure must not fail the job once accuracy has passed.
        continue-on-error: true
        run: |
          > github_summary.md  # Clear summary file
          bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test \
            -e RCCL_MSCCL_ENABLE=0 \
            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
            python3 run_suite.py --hw amd --suite nightly-perf-8-gpu-mi35x-grok1-int4 --nightly --timeout-per-file 3600 || TEST_EXIT_CODE=$?
          echo "$(<github_summary.md )" >> $GITHUB_STEP_SUMMARY || true
          exit ${TEST_EXIT_CODE:-0}
# MI35x 8-GPU Grok2: accuracy step followed by a performance step.
# Enabled in the upstream repository (or on pull_request events) when
# job_filter is empty, "all", or this job's id.
  nightly-8-gpu-mi35x-grok2:
    runs-on: linux-mi35x-gpu-8
    if: >-
      (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request')
      && (inputs.job_filter == '' || inputs.job_filter == 'all'
      || inputs.job_filter == 'nightly-8-gpu-mi35x-grok2')
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          # Use the explicit `ref` input when given, else the triggering ref.
          ref: ${{ inputs.ref || github.ref }}

      - name: Setup docker
        env:
          GITHUB_WORKSPACE: ${{ github.workspace }}
        # Create the summary file up front, then start the CI container.
        run: |
          touch github_summary.md
          bash scripts/ci/amd_ci_start_container.sh

      - name: Install dependencies
        run: |
          bash scripts/ci/amd_ci_install_dependency.sh
          # Install tabulate for run_suite.py (missing in MI35x container)
          bash scripts/ci/amd_ci_exec.sh pip install tabulate

      - name: Accuracy Test MI35x (8-GPU Grok2)
        timeout-minutes: 60
        # Truncate the shared summary file, run the suite while recording its
        # exit status, publish the summary, then exit with that status.
        run: |
          > github_summary.md  # Clear summary file
          bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test \
            -e RCCL_MSCCL_ENABLE=0 \
            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
            python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-mi35x-grok2 --nightly --timeout-per-file 3600 || TEST_EXIT_CODE=$?
          echo "$(<github_summary.md )" >> $GITHUB_STEP_SUMMARY || true
          exit ${TEST_EXIT_CODE:-0}

      - name: Performance Test MI35x (8-GPU Grok2)
        timeout-minutes: 60
        # A perf failure must not fail the job once accuracy has passed.
        continue-on-error: true
        run: |
          > github_summary.md  # Clear summary file
          bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test \
            -e RCCL_MSCCL_ENABLE=0 \
            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
            python3 run_suite.py --hw amd --suite nightly-perf-8-gpu-mi35x-grok2 --nightly --timeout-per-file 3600 || TEST_EXIT_CODE=$?
          echo "$(<github_summary.md )" >> $GITHUB_STEP_SUMMARY || true
          exit ${TEST_EXIT_CODE:-0}
# MI35x 8-GPU DeepSeek-R1-MXFP4: accuracy step followed by a performance step.
# Enabled in the upstream repository (or on pull_request events) when
# job_filter is empty, "all", or this job's id.
  nightly-8-gpu-mi35x-deepseek-r1-mxfp4:
    runs-on: linux-mi35x-gpu-8
    if: >-
      (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request')
      && (inputs.job_filter == '' || inputs.job_filter == 'all'
      || inputs.job_filter == 'nightly-8-gpu-mi35x-deepseek-r1-mxfp4')
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          # Use the explicit `ref` input when given, else the triggering ref.
          ref: ${{ inputs.ref || github.ref }}

      - name: Setup docker
        env:
          GITHUB_WORKSPACE: ${{ github.workspace }}
        # Create the summary file up front, then start the CI container.
        run: |
          touch github_summary.md
          bash scripts/ci/amd_ci_start_container.sh

      - name: Install dependencies
        run: |
          bash scripts/ci/amd_ci_install_dependency.sh
          # Install tabulate for run_suite.py (missing in MI35x container)
          bash scripts/ci/amd_ci_exec.sh pip install tabulate

      - name: Accuracy Test MI35x (8-GPU DeepSeek-R1-MXFP4)
        timeout-minutes: 180
        # Truncate the shared summary file, run the suite while recording its
        # exit status, publish the summary, then exit with that status.
        run: |
          > github_summary.md  # Clear summary file
          bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test \
            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
            python3 run_suite.py --hw amd --suite nightly-amd-8-gpu-mi35x-deepseek-r1-mxfp4 --nightly --timeout-per-file 7200 || TEST_EXIT_CODE=$?
          echo "$(<github_summary.md )" >> $GITHUB_STEP_SUMMARY || true
          exit ${TEST_EXIT_CODE:-0}

      # NOTE(review): unlike every other perf step in this workflow, this one
      # invokes the test file directly instead of going through run_suite.py -
      # confirm that is intentional.
      - name: Performance Test MI35x (8-GPU DeepSeek-R1-MXFP4)
        timeout-minutes: 300
        # A perf failure must not fail the job once accuracy has passed.
        continue-on-error: true
        run: |
          > github_summary.md  # Clear summary file
          bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test \
            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
            python3 registered/amd/perf/mi35x/test_deepseek_r1_mxfp4_perf_mi35x.py || TEST_EXIT_CODE=$?
          echo "$(<github_summary.md )" >> $GITHUB_STEP_SUMMARY || true
          exit ${TEST_EXIT_CODE:-0}
# Aggregate gate: always() makes this job run regardless of how the jobs it
# depends on ended. It fails when any of them reports `failure` or
# `cancelled`; any other result is not treated as an error by the script.
  check-all-jobs:
    runs-on: ubuntu-latest
    if: >-
      always() && (github.repository == 'sgl-project/sglang'
      || github.event_name == 'pull_request'
      || github.event_name == 'workflow_dispatch')
    needs:
      # MI30x Unit Tests
      - nightly-test-1-gpu-unit
      # MI30x Accuracy Tests
      - nightly-accuracy-2-gpu
      - nightly-accuracy-2-gpu-vlm
      - nightly-accuracy-8-gpu
      - nightly-accuracy-8-gpu-deepseek-r1
      # MI30x Combined Accuracy + Performance Tests
      - nightly-8-gpu-grok1-int4
      - nightly-8-gpu-grok2
      - nightly-8-gpu-deepseek-v31
      # MI35x jobs
      - nightly-test-1-gpu-mi35x
      - nightly-accuracy-8-gpu-mi35x
      - nightly-accuracy-8-gpu-mi35x-deepseek-r1
      - nightly-8-gpu-mi35x-grok1-int4
      - nightly-8-gpu-mi35x-grok2
      - nightly-8-gpu-mi35x-deepseek-r1-mxfp4
    steps:
      - name: Check if any job failed
        # The ${{ contains(...) }} expressions are substituted with the text
        # "true" or "false" before the shell script runs.
        run: |
          if [[ "${{ contains(needs.*.result, 'failure') }}" == "true" ]]; then
            echo "One or more nightly test jobs failed"
            exit 1
          fi
          if [[ "${{ contains(needs.*.result, 'cancelled') }}" == "true" ]]; then
            echo "One or more nightly test jobs were cancelled"
            exit 1
          fi
          echo "All nightly test jobs passed"