fix: extend moe alltoall top-k specializations #3049
Workflow file for this run
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # CI workflow using AWS self-hosted runners. | |
| # Runs AOT build tests and GPU unit tests on push/PR to main. | |
| # Uses ci/bash.sh for Docker execution (same as Jenkins). | |
| # | |
| # Permission Control: | |
| # - Push to main: Always runs | |
| # - PR from org members (ci-users team): Runs automatically | |
| # - PR from external contributors: Requires 'run-ci' label | |
| # (added via @flashinfer-bot run command from authorized user) | |
| # - Draft PRs: Skipped until marked ready for review, even with the 'run-ci' label | |
| # | |
| # Rerun Strategy: | |
| # - Spot jobs run with fail-fast: true | |
| # - Background monitor checks AWS metadata for spot termination notice | |
| # - If termination detected, writes marker to log (captured by GitHub) | |
| # - Analyze job checks logs for marker to decide if should rerun | |
| # - Spot termination: rerun the whole job group (every matrix entry) on on-demand runners | |
| # - Real failure: no rerun, workflow fails fast | |
| name: PR Test | |
| # Triggers: pushes to main, pull-request activity targeting main, and manual dispatch. | |
| on: | |
| push: | |
| branches: [main] | |
| pull_request: | |
| branches: [main] | |
| # 'labeled' re-triggers the workflow when a label is added (the gate job looks for 'run-ci'); | |
| # 'ready_for_review' re-triggers it when a draft is marked ready (the gate job skips drafts). | |
| types: [opened, synchronize, reopened, labeled, ready_for_review] | |
| workflow_dispatch: | |
| # Manual-run switches; read by the job-level `if:` conditions and by the results summary. | |
| inputs: | |
| skip_aot: | |
| description: 'Skip AOT build tests' | |
| type: boolean | |
| default: false | |
| skip_gpu: | |
| description: 'Skip GPU tests' | |
| type: boolean | |
| default: false | |
| # One run per ref: a newer run in the same group cancels the one still in progress. | |
| concurrency: | |
| group: pr-test-${{ github.ref }} | |
| cancel-in-progress: true | |
| permissions: | |
| contents: read | |
| pull-requests: write | |
| actions: read | |
| env: | |
| # Not referenced elsewhere in this file; presumably consumed by ci/bash.sh or the scripts - TODO confirm. | |
| EXECUTOR_NUMBER: "0" | |
| # Extended regex (used with `grep -E`) of changed paths that do not need CI. | |
| # The build is skipped only when every changed file matches. | |
| # NOTE(review): '\.txt$' also matches files such as CMakeLists.txt or requirements.txt - confirm this is intended. | |
| SKIP_CI_PATTERNS: '\.md$|\.txt$|^docs/|^docker/|^licenses/|^LICENSE$|^NOTICE$|^benchmarks/|^\.github/CODEOWNERS$' | |
| jobs: | |
| # --------------------------------------------------------------------------- | |
| # Gate - Check if PR is authorized to run CI | |
| # Decision order: non-PR event -> draft PR -> 'run-ci' label -> ci-users team | |
| # membership (or author_association when no API token is available). | |
| # --------------------------------------------------------------------------- | |
| gate: | |
| name: Permission Check | |
| runs-on: ubuntu-latest | |
| outputs: | |
| # 'true' or 'false'; downstream jobs compare against the string 'true' | |
| authorized: ${{ steps.check.outputs.authorized }} | |
| steps: | |
| - name: Check authorization | |
| id: check | |
| env: | |
| GH_TOKEN: ${{ secrets.FLASHINFER_GITHUB_TOKEN }} | |
| # PR metadata is handed to the script as environment variables instead of being | |
| # expanded into the script text, so the values are never parsed as shell code. | |
| AUTHOR: ${{ github.event.pull_request.user.login }} | |
| ORG: ${{ github.repository_owner }} | |
| ASSOC: ${{ github.event.pull_request.author_association }} | |
| run: | | |
| # Always allow push to main and workflow_dispatch | |
| if [[ "${{ github.event_name }}" != "pull_request" ]]; then | |
| echo "authorized=true" >> "$GITHUB_OUTPUT" | |
| echo "Not a PR, authorized" | |
| exit 0 | |
| fi | |
| # Skip draft PRs to save CI capacity | |
| if [[ "${{ github.event.pull_request.draft }}" == "true" ]]; then | |
| echo "authorized=false" >> "$GITHUB_OUTPUT" | |
| echo "Draft PR, skipping CI" | |
| exit 0 | |
| fi | |
| # Check if PR has run-ci label | |
| if [[ "${{ contains(github.event.pull_request.labels.*.name, 'run-ci') }}" == "true" ]]; then | |
| echo "authorized=true" >> "$GITHUB_OUTPUT" | |
| echo "PR has run-ci label, authorized" | |
| exit 0 | |
| fi | |
| # Check if PR author is a member of ci-users team | |
| # (AUTHOR, ORG and ASSOC are provided by this step's env block) | |
| TEAM="ci-users" | |
| echo "Checking if $AUTHOR is a member of $ORG/$TEAM..." | |
| if [[ -z "$GH_TOKEN" ]]; then | |
| echo "::warning::FLASHINFER_GITHUB_TOKEN not set, falling back to association check" | |
| # Fallback: check if author has write access | |
| if [[ "$ASSOC" =~ ^(OWNER|MEMBER|COLLABORATOR)$ ]]; then | |
| echo "authorized=true" >> "$GITHUB_OUTPUT" | |
| echo "PR author has $ASSOC access, authorized" | |
| else | |
| echo "authorized=false" >> "$GITHUB_OUTPUT" | |
| echo "PR author is $ASSOC, not authorized" | |
| fi | |
| exit 0 | |
| fi | |
| # Check team membership | |
| # If the API call fails, deny authorization but let the step succeed | |
| MEMBERS=$(gh api \ | |
| -H "Accept: application/vnd.github+json" \ | |
| -H "X-GitHub-Api-Version: 2022-11-28" \ | |
| --paginate \ | |
| "/orgs/${ORG}/teams/${TEAM}/members" \ | |
| --jq '.[].login' 2>&1) || { | |
| echo "::warning::Failed to get team members: $MEMBERS" | |
| echo "authorized=false" >> "$GITHUB_OUTPUT" | |
| exit 0 | |
| } | |
| # -x: whole-line match only; -F: treat the login as a literal string, not a regex | |
| # (bot logins such as 'name[bot]' would otherwise be read as a bracket expression) | |
| if echo "$MEMBERS" | grep -qxF -- "$AUTHOR"; then | |
| echo "authorized=true" >> "$GITHUB_OUTPUT" | |
| echo "$AUTHOR is a member of $TEAM, authorized" | |
| else | |
| echo "authorized=false" >> "$GITHUB_OUTPUT" | |
| echo "$AUTHOR is not a member of $TEAM, not authorized" | |
| fi | |
| # --------------------------------------------------------------------------- | |
| # Setup - Read docker tag and check if build should be skipped | |
| # --------------------------------------------------------------------------- | |
| # Runs only for authorized events; its outputs drive every build/test job below. | |
| setup: | |
| name: Setup | |
| needs: gate | |
| if: needs.gate.outputs.authorized == 'true' | |
| runs-on: ubuntu-latest | |
| outputs: | |
| # docker_tag: tag applied to every flashinfer-ci-* image used by later jobs | |
| # skip_build: 'true' when all changed files match SKIP_CI_PATTERNS | |
| docker_tag: ${{ steps.get-tag.outputs.tag }} | |
| skip_build: ${{ steps.check.outputs.skip }} | |
| steps: | |
| - uses: actions/checkout@v4 | |
| with: | |
| # Full history; presumably so the PR base and head commits exist locally for `git diff` below | |
| fetch-depth: 0 | |
| - name: Get Docker Tag | |
| id: get-tag | |
| run: | | |
| # Take the second ':'-separated field of the cu129 line in ci/docker-tags.yml, with spaces removed. | |
| # The same tag is reused for the other CUDA images in later jobs. | |
| TAG=$(grep 'flashinfer/flashinfer-ci-cu129:' ci/docker-tags.yml | cut -d':' -f2 | tr -d ' ') | |
| if [ -z "$TAG" ]; then | |
| echo "::error::Failed to extract Docker tag from ci/docker-tags.yml" | |
| exit 1 | |
| fi | |
| echo "tag=$TAG" >> $GITHUB_OUTPUT | |
| echo "Docker tag: $TAG" | |
| - name: Check Skip Conditions | |
| id: check | |
| run: | | |
| # Pushes and manual runs are never skipped; only PRs are checked against the skip patterns | |
| if [ "${{ github.event_name }}" != "pull_request" ]; then | |
| echo "skip=false" >> $GITHUB_OUTPUT | |
| exit 0 | |
| fi | |
| # Use PR event SHAs for reliable diff (avoids issues with origin refs) | |
| BASE_SHA="${{ github.event.pull_request.base.sha }}" | |
| HEAD_SHA="${{ github.event.pull_request.head.sha }}" | |
| # Three-dot diff: files changed on the PR side since the merge base | |
| CHANGED=$(git diff --name-only "$BASE_SHA...$HEAD_SHA") | |
| # TODO (yongwww): Add back ^\.github/ before merging to main | |
| # SKIP_CI_PATTERNS is defined in the workflow-level env block | |
| SKIP_PATTERNS="$SKIP_CI_PATTERNS" | |
| # Assume skippable until one changed file falls outside the patterns | |
| SKIP=true | |
| while IFS= read -r file; do | |
| if [ -n "$file" ] && ! echo "$file" | grep -qE "$SKIP_PATTERNS"; then | |
| SKIP=false | |
| break | |
| fi | |
| done <<< "$CHANGED" | |
| echo "skip=$SKIP" >> $GITHUB_OUTPUT | |
| if [ "$SKIP" == "true" ]; then | |
| echo "::notice::Skipping build - only docs/config files changed" | |
| fi | |
| # --------------------------------------------------------------------------- | |
| # AOT Build Import Tests (Spot + On-Demand Rerun) | |
| # --------------------------------------------------------------------------- | |
| # First attempt on spot runners: one job per (arch, cuda) pair, 8 in total. | |
| aot-build-import: | |
| name: AOT Build Import (${{ matrix.arch }}, ${{ matrix.cuda }}) | |
| needs: [gate, setup] | |
| # Skipped when unauthorized, when only skip-listed files changed, or when skip_aot was set on a manual run. | |
| if: | | |
| needs.gate.outputs.authorized == 'true' && | |
| needs.setup.outputs.skip_build != 'true' && | |
| github.event.inputs.skip_aot != 'true' | |
| runs-on: | |
| - self-hosted | |
| - linux | |
| - ${{ matrix.arch }} | |
| - cpu | |
| - spot | |
| timeout-minutes: 360 | |
| strategy: | |
| # The first failing matrix job cancels the remaining ones. | |
| fail-fast: true | |
| matrix: | |
| arch: [x64, arm64] | |
| cuda: [cu126, cu128, cu129, cu130] | |
| env: | |
| # CUDA-specific CI image, tagged with the value resolved by the setup job | |
| DOCKER_IMAGE: flashinfer/flashinfer-ci-${{ matrix.cuda }}:${{ needs.setup.outputs.docker_tag }} | |
| steps: | |
| - name: Cleanup | |
| run: | | |
| # Best-effort: every command ends in `|| true`, so a cleanup failure never fails the step | |
| # Stop all Docker containers to free memory | |
| docker stop $(docker ps -q) 2>/dev/null || true | |
| docker rm $(docker ps -aq) 2>/dev/null || true | |
| # Clean workspace and caches | |
| sudo rm -rf ${{ github.workspace }}/* || true | |
| # Second pass removes dot-entries; the glob excludes '.' and '..' | |
| sudo rm -rf ${{ github.workspace }}/.[!.]* || true | |
| rm -rf ~/.cache/flashinfer_jit || true | |
| docker image prune -f || true | |
| docker builder prune -f --filter "until=24h" || true | |
| - uses: actions/checkout@v4 | |
| with: | |
| submodules: recursive | |
| # Launched in the background; per the workflow header it watches for a spot termination notice and logs a marker. | |
| - name: Start spot termination monitor | |
| run: ./scripts/task_monitor_spot.sh & | |
| - name: Show Node Info | |
| run: ./scripts/task_show_node_info.sh | |
| env: | |
| NODE_NAME: ${{ runner.name }} | |
| WORKSPACE: ${{ github.workspace }} | |
| BUILD_NUMBER: ${{ github.run_number }} | |
| # Runs the build/import test script through ci/bash.sh with the --no-gpu flag | |
| - name: Run Test | |
| run: bash ci/bash.sh ${DOCKER_IMAGE} --no-gpu ./scripts/task_test_jit_cache_package_build_import.sh | |
| # Runs only when the spot AOT job failed or was cancelled; decides whether that was a spot termination. | |
| analyze-aot-failure: | |
| name: Analyze AOT Failure | |
| needs: [setup, aot-build-import] | |
| if: "!cancelled() && (needs.aot-build-import.result == 'failure' || needs.aot-build-import.result == 'cancelled')" | |
| runs-on: ubuntu-latest | |
| outputs: | |
| is_spot_termination: ${{ steps.analyze.outputs.is_spot_termination }} | |
| # Empty unless the "Build rerun matrix" step ran | |
| rerun_matrix: ${{ steps.matrix.outputs.rerun_matrix }} | |
| steps: | |
| - name: Checkout scripts | |
| uses: actions/checkout@v4 | |
| with: | |
| # Only the scripts directory is checked out | |
| sparse-checkout: scripts | |
| sparse-checkout-cone-mode: false | |
| - name: Analyze failure from job logs | |
| id: analyze | |
| env: | |
| GH_TOKEN: ${{ github.token }} | |
| # Arguments: job-name filter, repository, run id. The script is expected to set the | |
| # is_spot_termination step output. | |
| # NOTE(review): the filter looks like a jq expression applied to job names - confirm in task_analyze_spot.sh | |
| run: ./scripts/task_analyze_spot.sh 'startswith("AOT")' '${{ github.repository }}' '${{ github.run_id }}' | |
| - name: Build rerun matrix | |
| id: matrix | |
| if: steps.analyze.outputs.is_spot_termination == 'true' | |
| run: | | |
| # Emit every arch/cuda combination (the same 8 pairs as the spot matrix) as a JSON "include" list | |
| MATRIX='{"include":[' | |
| for arch in x64 arm64; do | |
| for cuda in cu126 cu128 cu129 cu130; do | |
| MATRIX+='{"arch":"'$arch'","cuda":"'$cuda'"},' | |
| done | |
| done | |
| # Drop the trailing comma and close the JSON document | |
| MATRIX="${MATRIX%,}]}" | |
| echo "rerun_matrix=$MATRIX" >> $GITHUB_OUTPUT | |
| # Second attempt on on-demand runners; runs only after a detected spot termination. | |
| # Same steps as the spot job, minus the spot termination monitor. | |
| aot-build-import-rerun: | |
| name: AOT Build Import Rerun (${{ matrix.arch }}, ${{ matrix.cuda }}) | |
| needs: [setup, analyze-aot-failure] | |
| if: | | |
| !cancelled() && | |
| needs.analyze-aot-failure.outputs.is_spot_termination == 'true' && | |
| needs.analyze-aot-failure.outputs.rerun_matrix != '' | |
| runs-on: | |
| - self-hosted | |
| - linux | |
| - ${{ matrix.arch }} | |
| - cpu | |
| - on-demand | |
| timeout-minutes: 360 | |
| strategy: | |
| fail-fast: true | |
| # Matrix is the JSON produced by the analyze job's "Build rerun matrix" step | |
| matrix: ${{ fromJSON(needs.analyze-aot-failure.outputs.rerun_matrix) }} | |
| env: | |
| # CUDA-specific CI image, tagged with the value resolved by the setup job | |
| DOCKER_IMAGE: flashinfer/flashinfer-ci-${{ matrix.cuda }}:${{ needs.setup.outputs.docker_tag }} | |
| steps: | |
| - name: Cleanup | |
| run: | | |
| # Best-effort: every command ends in `|| true`, so a cleanup failure never fails the step | |
| # Stop all Docker containers to free memory | |
| docker stop $(docker ps -q) 2>/dev/null || true | |
| docker rm $(docker ps -aq) 2>/dev/null || true | |
| # Clean workspace and caches | |
| sudo rm -rf ${{ github.workspace }}/* || true | |
| # Second pass removes dot-entries; the glob excludes '.' and '..' | |
| sudo rm -rf ${{ github.workspace }}/.[!.]* || true | |
| rm -rf ~/.cache/flashinfer_jit || true | |
| docker image prune -f || true | |
| docker builder prune -f --filter "until=24h" || true | |
| - uses: actions/checkout@v4 | |
| with: | |
| submodules: recursive | |
| - name: Show Node Info | |
| run: ./scripts/task_show_node_info.sh | |
| env: | |
| NODE_NAME: ${{ runner.name }} | |
| WORKSPACE: ${{ github.workspace }} | |
| BUILD_NUMBER: ${{ github.run_number }} | |
| # Same command as the spot job's "Run Test" step | |
| - name: Run Test | |
| run: bash ci/bash.sh ${DOCKER_IMAGE} --no-gpu ./scripts/task_test_jit_cache_package_build_import.sh | |
| # --------------------------------------------------------------------------- | |
| # GPU JIT Tests - SM86 (A10G) - Spot + On-Demand Rerun | |
| # --------------------------------------------------------------------------- | |
| # First attempt on spot runners: five shards, one job per shard. | |
| gpu-tests-a10g: | |
| name: JIT Unittest ${{ matrix.shard }} (A10G) | |
| needs: [gate, setup] | |
| # Skipped when unauthorized, when only skip-listed files changed, or when skip_gpu was set on a manual run. | |
| if: | | |
| needs.gate.outputs.authorized == 'true' && | |
| needs.setup.outputs.skip_build != 'true' && | |
| github.event.inputs.skip_gpu != 'true' | |
| runs-on: [self-hosted, linux, x64, gpu, sm86, spot] | |
| timeout-minutes: 360 | |
| strategy: | |
| # The first failing shard cancels the remaining ones. | |
| fail-fast: true | |
| matrix: | |
| shard: [1, 2, 3, 4, 5] | |
| env: | |
| # GPU tests always use the cu129 image, tagged with the value resolved by the setup job | |
| DOCKER_IMAGE: flashinfer/flashinfer-ci-cu129:${{ needs.setup.outputs.docker_tag }} | |
| steps: | |
| - name: Cleanup | |
| run: | | |
| # Best-effort: every command ends in `|| true`, so a cleanup failure never fails the step | |
| # Stop all Docker containers to free memory | |
| docker stop $(docker ps -q) 2>/dev/null || true | |
| docker rm $(docker ps -aq) 2>/dev/null || true | |
| # Clean workspace and caches | |
| sudo rm -rf ${{ github.workspace }}/* || true | |
| # Second pass removes dot-entries; the glob excludes '.' and '..' | |
| sudo rm -rf ${{ github.workspace }}/.[!.]* || true | |
| rm -rf ~/.cache/flashinfer_jit || true | |
| docker image prune -f || true | |
| docker builder prune -f --filter "until=24h" || true | |
| # Print GPU state for the log; a failure here is ignored | |
| nvidia-smi || true | |
| - uses: actions/checkout@v4 | |
| with: | |
| submodules: recursive | |
| # Launched in the background; per the workflow header it watches for a spot termination notice and logs a marker. | |
| - name: Start spot termination monitor | |
| run: ./scripts/task_monitor_spot.sh & | |
| - name: Show Node Info | |
| run: ./scripts/task_show_node_info.sh | |
| env: | |
| NODE_NAME: ${{ runner.name }} | |
| WORKSPACE: ${{ github.workspace }} | |
| BUILD_NUMBER: ${{ github.run_number }} | |
| # The shard number selects the script: task_jit_run_tests_part<N>.sh | |
| - name: Run JIT Unittest Part ${{ matrix.shard }} | |
| run: bash ci/bash.sh ${DOCKER_IMAGE} ./scripts/task_jit_run_tests_part${{ matrix.shard }}.sh | |
| # Runs only when the spot A10G job failed or was cancelled; decides whether that was a spot termination. | |
| analyze-gpu-a10g-failure: | |
| name: Analyze GPU A10G Failure | |
| needs: [setup, gpu-tests-a10g] | |
| if: "!cancelled() && (needs.gpu-tests-a10g.result == 'failure' || needs.gpu-tests-a10g.result == 'cancelled')" | |
| runs-on: ubuntu-latest | |
| outputs: | |
| is_spot_termination: ${{ steps.analyze.outputs.is_spot_termination }} | |
| # Empty unless the "Build rerun matrix" step ran | |
| rerun_matrix: ${{ steps.matrix.outputs.rerun_matrix }} | |
| steps: | |
| - name: Checkout scripts | |
| uses: actions/checkout@v4 | |
| with: | |
| # Only the scripts directory is checked out | |
| sparse-checkout: scripts | |
| sparse-checkout-cone-mode: false | |
| - name: Analyze failure from job logs | |
| id: analyze | |
| env: | |
| GH_TOKEN: ${{ github.token }} | |
| # Arguments: job-name filter, repository, run id. The script is expected to set the | |
| # is_spot_termination step output. | |
| # NOTE(review): 'contains("A10G")' would also match this job's own name and the rerun job's name - | |
| # confirm how task_analyze_spot.sh applies the filter. | |
| run: ./scripts/task_analyze_spot.sh 'contains("A10G")' '${{ github.repository }}' '${{ github.run_id }}' | |
| - name: Build rerun matrix | |
| id: matrix | |
| if: steps.analyze.outputs.is_spot_termination == 'true' | |
| run: | | |
| # Fixed list of all five shards, mirroring the spot job's matrix | |
| echo 'rerun_matrix={"include":[{"shard":1},{"shard":2},{"shard":3},{"shard":4},{"shard":5}]}' >> $GITHUB_OUTPUT | |
| # Second attempt on on-demand runners; runs only after a detected spot termination. | |
| # Same steps as the spot job, minus the spot termination monitor. | |
| gpu-tests-a10g-rerun: | |
| name: JIT Rerun ${{ matrix.shard }} (A10G) | |
| needs: [setup, analyze-gpu-a10g-failure] | |
| if: | | |
| !cancelled() && | |
| needs.analyze-gpu-a10g-failure.outputs.is_spot_termination == 'true' && | |
| needs.analyze-gpu-a10g-failure.outputs.rerun_matrix != '' | |
| runs-on: [self-hosted, linux, x64, gpu, sm86, on-demand] | |
| timeout-minutes: 360 | |
| strategy: | |
| fail-fast: true | |
| # Matrix is the JSON produced by the analyze job's "Build rerun matrix" step | |
| matrix: ${{ fromJSON(needs.analyze-gpu-a10g-failure.outputs.rerun_matrix) }} | |
| env: | |
| # cu129 image, tagged with the value resolved by the setup job | |
| DOCKER_IMAGE: flashinfer/flashinfer-ci-cu129:${{ needs.setup.outputs.docker_tag }} | |
| steps: | |
| - name: Cleanup | |
| run: | | |
| # Best-effort: every command ends in `|| true`, so a cleanup failure never fails the step | |
| # Stop all Docker containers to free memory | |
| docker stop $(docker ps -q) 2>/dev/null || true | |
| docker rm $(docker ps -aq) 2>/dev/null || true | |
| # Clean workspace and caches | |
| sudo rm -rf ${{ github.workspace }}/* || true | |
| # Second pass removes dot-entries; the glob excludes '.' and '..' | |
| sudo rm -rf ${{ github.workspace }}/.[!.]* || true | |
| rm -rf ~/.cache/flashinfer_jit || true | |
| docker image prune -f || true | |
| docker builder prune -f --filter "until=24h" || true | |
| # Print GPU state for the log; a failure here is ignored | |
| nvidia-smi || true | |
| - uses: actions/checkout@v4 | |
| with: | |
| submodules: recursive | |
| - name: Show Node Info | |
| run: ./scripts/task_show_node_info.sh | |
| env: | |
| NODE_NAME: ${{ runner.name }} | |
| WORKSPACE: ${{ github.workspace }} | |
| BUILD_NUMBER: ${{ github.run_number }} | |
| # The shard number selects the script: task_jit_run_tests_part<N>.sh | |
| - name: Run JIT Unittest Part ${{ matrix.shard }} | |
| run: bash ci/bash.sh ${DOCKER_IMAGE} ./scripts/task_jit_run_tests_part${{ matrix.shard }}.sh | |
| # --------------------------------------------------------------------------- | |
| # GPU JIT Tests - SM75 (T4) - Spot + On-Demand Rerun | |
| # --------------------------------------------------------------------------- | |
| # First attempt on a spot runner: a single job (no matrix) that runs only test part 3. | |
| gpu-tests-t4: | |
| name: JIT Unittest (T4) | |
| needs: [gate, setup] | |
| # Skipped when unauthorized, when only skip-listed files changed, or when skip_gpu was set on a manual run. | |
| if: | | |
| needs.gate.outputs.authorized == 'true' && | |
| needs.setup.outputs.skip_build != 'true' && | |
| github.event.inputs.skip_gpu != 'true' | |
| runs-on: [self-hosted, linux, x64, gpu, sm75, spot] | |
| timeout-minutes: 360 | |
| env: | |
| # cu129 image, tagged with the value resolved by the setup job | |
| DOCKER_IMAGE: flashinfer/flashinfer-ci-cu129:${{ needs.setup.outputs.docker_tag }} | |
| steps: | |
| - name: Cleanup | |
| run: | | |
| # Best-effort: every command ends in `|| true`, so a cleanup failure never fails the step | |
| # Stop all Docker containers to free memory | |
| docker stop $(docker ps -q) 2>/dev/null || true | |
| docker rm $(docker ps -aq) 2>/dev/null || true | |
| # Clean workspace and caches | |
| sudo rm -rf ${{ github.workspace }}/* || true | |
| # Second pass removes dot-entries; the glob excludes '.' and '..' | |
| sudo rm -rf ${{ github.workspace }}/.[!.]* || true | |
| rm -rf ~/.cache/flashinfer_jit || true | |
| docker image prune -f || true | |
| docker builder prune -f --filter "until=24h" || true | |
| # Print GPU state for the log; a failure here is ignored | |
| nvidia-smi || true | |
| - uses: actions/checkout@v4 | |
| with: | |
| submodules: recursive | |
| # Launched in the background; per the workflow header it watches for a spot termination notice and logs a marker. | |
| - name: Start spot termination monitor | |
| run: ./scripts/task_monitor_spot.sh & | |
| - name: Show Node Info | |
| run: ./scripts/task_show_node_info.sh | |
| env: | |
| NODE_NAME: ${{ runner.name }} | |
| WORKSPACE: ${{ github.workspace }} | |
| BUILD_NUMBER: ${{ github.run_number }} | |
| # Only part 3 of the JIT test scripts is run on T4 | |
| - name: Run JIT Unittest Part 3 (T4) | |
| run: bash ci/bash.sh ${DOCKER_IMAGE} ./scripts/task_jit_run_tests_part3.sh | |
| # Runs only when the spot T4 job failed or was cancelled; decides whether that was a spot termination. | |
| # No rerun matrix is produced here because the T4 job has no matrix. | |
| analyze-gpu-t4-failure: | |
| name: Analyze GPU T4 Failure | |
| needs: [setup, gpu-tests-t4] | |
| if: "!cancelled() && (needs.gpu-tests-t4.result == 'failure' || needs.gpu-tests-t4.result == 'cancelled')" | |
| runs-on: ubuntu-latest | |
| outputs: | |
| is_spot_termination: ${{ steps.analyze.outputs.is_spot_termination }} | |
| steps: | |
| - name: Checkout scripts | |
| uses: actions/checkout@v4 | |
| with: | |
| # Only the scripts directory is checked out | |
| sparse-checkout: scripts | |
| sparse-checkout-cone-mode: false | |
| - name: Analyze failure from job logs | |
| id: analyze | |
| env: | |
| GH_TOKEN: ${{ github.token }} | |
| # Arguments: job-name filter, repository, run id. The script is expected to set the | |
| # is_spot_termination step output. | |
| # NOTE(review): 'contains("T4")' would also match this job's own name and the rerun job's name - | |
| # confirm how task_analyze_spot.sh applies the filter. | |
| run: ./scripts/task_analyze_spot.sh 'contains("T4")' '${{ github.repository }}' '${{ github.run_id }}' | |
| # Second attempt on an on-demand runner; runs only after a detected spot termination. | |
| # Same steps as the spot job, minus the spot termination monitor. | |
| gpu-tests-t4-rerun: | |
| name: JIT Rerun (T4) | |
| needs: [setup, analyze-gpu-t4-failure] | |
| if: | | |
| !cancelled() && | |
| needs.analyze-gpu-t4-failure.outputs.is_spot_termination == 'true' | |
| runs-on: [self-hosted, linux, x64, gpu, sm75, on-demand] | |
| timeout-minutes: 360 | |
| env: | |
| # cu129 image, tagged with the value resolved by the setup job | |
| DOCKER_IMAGE: flashinfer/flashinfer-ci-cu129:${{ needs.setup.outputs.docker_tag }} | |
| steps: | |
| - name: Cleanup | |
| run: | | |
| # Best-effort: every command ends in `|| true`, so a cleanup failure never fails the step | |
| # Stop all Docker containers to free memory | |
| docker stop $(docker ps -q) 2>/dev/null || true | |
| docker rm $(docker ps -aq) 2>/dev/null || true | |
| # Clean workspace and caches | |
| sudo rm -rf ${{ github.workspace }}/* || true | |
| # Second pass removes dot-entries; the glob excludes '.' and '..' | |
| sudo rm -rf ${{ github.workspace }}/.[!.]* || true | |
| rm -rf ~/.cache/flashinfer_jit || true | |
| docker image prune -f || true | |
| docker builder prune -f --filter "until=24h" || true | |
| # Print GPU state for the log; a failure here is ignored | |
| nvidia-smi || true | |
| - uses: actions/checkout@v4 | |
| with: | |
| submodules: recursive | |
| - name: Show Node Info | |
| run: ./scripts/task_show_node_info.sh | |
| env: | |
| NODE_NAME: ${{ runner.name }} | |
| WORKSPACE: ${{ github.workspace }} | |
| BUILD_NUMBER: ${{ github.run_number }} | |
| # Same command as the spot job: only part 3 of the JIT test scripts | |
| - name: Run JIT Unittest Part 3 (T4) | |
| run: bash ci/bash.sh ${DOCKER_IMAGE} ./scripts/task_jit_run_tests_part3.sh | |
| # --------------------------------------------------------------------------- | |
| # GPU JIT Tests - H100 (Hopper) - Capacity Block | |
| # Requires manually purchased CB via AWS Console | |
| # --------------------------------------------------------------------------- | |
| # Single job with no spot/on-demand label and no analyze/rerun pair. | |
| gpu-tests-h100: | |
| name: JIT Unittest (H100) | |
| needs: [gate, setup] | |
| # Skipped when unauthorized, when only skip-listed files changed, or when skip_gpu was set on a manual run. | |
| if: | | |
| needs.gate.outputs.authorized == 'true' && | |
| needs.setup.outputs.skip_build != 'true' && | |
| github.event.inputs.skip_gpu != 'true' | |
| runs-on: [self-hosted, linux, x64, gpu, h100] | |
| timeout-minutes: 360 | |
| env: | |
| # cu129 image, tagged with the value resolved by the setup job | |
| DOCKER_IMAGE: flashinfer/flashinfer-ci-cu129:${{ needs.setup.outputs.docker_tag }} | |
| steps: | |
| - name: Cleanup | |
| run: | | |
| # Shared multi-runner node: only stop containers started by THIS runner (identified by workspace mount) | |
| # NOTE(review): the pattern requires a '/' after the workspace path, so a container that mounts the | |
| # workspace directory itself would only match if its mount JSON carries a trailing slash - confirm against ci/bash.sh | |
| for cid in $(docker ps -q); do | |
| if docker inspect "$cid" --format '{{json .Mounts}}' 2>/dev/null | grep -qF "${{ github.workspace }}/"; then | |
| docker stop "$cid" 2>/dev/null || true | |
| fi | |
| done | |
| sudo rm -rf ${{ github.workspace }}/* || true | |
| # Second pass removes dot-entries; the glob excludes '.' and '..' | |
| sudo rm -rf ${{ github.workspace }}/.[!.]* || true | |
| rm -rf ~/.cache/flashinfer_jit || true | |
| # NOTE(review): `docker container prune` removes every stopped container on the host, not only this runner's - confirm acceptable | |
| docker container prune -f 2>/dev/null || true | |
| docker image prune -f || true | |
| docker builder prune -f --filter "until=24h" || true | |
| # CUDA_VISIBLE_DEVICES is not set in this file; presumably provided by the runner environment - TODO confirm | |
| echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES" | |
| nvidia-smi -i "$CUDA_VISIBLE_DEVICES" || true | |
| - uses: actions/checkout@v4 | |
| with: | |
| submodules: recursive | |
| - name: Show Node Info | |
| run: ./scripts/task_show_node_info.sh | |
| env: | |
| NODE_NAME: ${{ runner.name }} | |
| WORKSPACE: ${{ github.workspace }} | |
| BUILD_NUMBER: ${{ github.run_number }} | |
| # Unlike the A10G/T4 jobs, this runs task_run_unit_tests.sh rather than a JIT test part script | |
| - name: Run H100 Kernel Tests | |
| run: bash ci/bash.sh ${DOCKER_IMAGE} ./scripts/task_run_unit_tests.sh | |
| # --------------------------------------------------------------------------- | |
| # Test Results Summary | |
| # --------------------------------------------------------------------------- | |
| # Final job: always runs unless the workflow is cancelled, and turns the results of all | |
| # other jobs into a single pass/fail status plus a step summary. | |
| test-results-summary: | |
| name: Test Results Summary | |
| if: "!cancelled()" | |
| needs: | |
| - gate | |
| - setup | |
| - aot-build-import | |
| - analyze-aot-failure | |
| - aot-build-import-rerun | |
| - gpu-tests-a10g | |
| - analyze-gpu-a10g-failure | |
| - gpu-tests-a10g-rerun | |
| - gpu-tests-t4 | |
| - analyze-gpu-t4-failure | |
| - gpu-tests-t4-rerun | |
| - gpu-tests-h100 | |
| runs-on: ubuntu-latest | |
| steps: | |
| # Despite its name, this step only checks out the repository (with full history) | |
| # so the next step can run `git diff`; it runs for unauthorized PRs only. | |
| - name: Check Authorization | |
| if: needs.gate.outputs.authorized != 'true' | |
| uses: actions/checkout@v4 | |
| with: | |
| fetch-depth: 0 | |
| - name: Check skip patterns for unauthorized PRs | |
| if: needs.gate.outputs.authorized != 'true' | |
| id: skip-check | |
| run: | | |
| # Same skip-pattern check as the setup job, repeated here because setup does not run for unauthorized PRs | |
| if [ "${{ github.event_name }}" != "pull_request" ]; then | |
| echo "skip=false" >> "$GITHUB_OUTPUT" | |
| exit 0 | |
| fi | |
| BASE_SHA="${{ github.event.pull_request.base.sha }}" | |
| HEAD_SHA="${{ github.event.pull_request.head.sha }}" | |
| CHANGED=$(git diff --name-only "$BASE_SHA...$HEAD_SHA") | |
| # SKIP_CI_PATTERNS is defined in the workflow-level env block | |
| SKIP_PATTERNS="$SKIP_CI_PATTERNS" | |
| # Assume skippable until one changed file falls outside the patterns | |
| SKIP=true | |
| while IFS= read -r file; do | |
| if [ -n "$file" ] && ! echo "$file" | grep -qE "$SKIP_PATTERNS"; then | |
| SKIP=false | |
| break | |
| fi | |
| done <<< "$CHANGED" | |
| echo "skip=$SKIP" >> "$GITHUB_OUTPUT" | |
| - name: Report unauthorized PR status | |
| if: needs.gate.outputs.authorized != 'true' | |
| run: | | |
| # Unauthorized PRs pass only when every changed file is skip-listed; otherwise the job fails | |
| # so the missing CI run is visible on the PR | |
| echo "## Test Results Summary" >> $GITHUB_STEP_SUMMARY | |
| if [ "${{ steps.skip-check.outputs.skip }}" == "true" ]; then | |
| echo "CI not required (docs/config only changes)" >> $GITHUB_STEP_SUMMARY | |
| echo "::notice::Docs-only PR — no CI needed" | |
| exit 0 | |
| fi | |
| echo "CI skipped (pending authorization)" >> $GITHUB_STEP_SUMMARY | |
| echo "A contributor in @flashinfer-ai/ci-users can comment \`@flashinfer-bot run\` to approve." >> $GITHUB_STEP_SUMMARY | |
| echo "::warning::CI skipped — pending authorization" | |
| exit 1 | |
| - name: Check Results | |
| if: needs.gate.outputs.authorized == 'true' | |
| run: | | |
| echo "## Test Results Summary" >> $GITHUB_STEP_SUMMARY | |
| # Helper function to check job status | |
| # Args: $1 display name, $2 skip flag ('true' = skipped by input), $3 spot job result, | |
| # $4 is_spot_termination flag from the analyze job, $5 rerun job result | |
| # Writes the name and a status line to the step summary; returns 1 when the group failed, | |
| # i.e. neither the spot job succeeded nor a spot-terminated run was rescued by a successful rerun | |
| check_status() { | |
| local name=$1 skip=$2 spot=$3 spot_term=$4 rerun=$5 | |
| echo "$name" >> $GITHUB_STEP_SUMMARY | |
| if [ "$skip" == "true" ]; then | |
| echo "- Status: Skipped" >> $GITHUB_STEP_SUMMARY | |
| elif [ "$spot" == "success" ]; then | |
| echo "- Status: Passed (spot)" >> $GITHUB_STEP_SUMMARY | |
| elif [ "$spot_term" == "true" ] && [ "$rerun" == "success" ]; then | |
| echo "- Status: Passed (on-demand rerun)" >> $GITHUB_STEP_SUMMARY | |
| else | |
| echo "- Status: Failed" >> $GITHUB_STEP_SUMMARY | |
| return 1 | |
| fi | |
| return 0 | |
| } | |
| echo "Test Results Summary" >> $GITHUB_STEP_SUMMARY | |
| echo "" >> $GITHUB_STEP_SUMMARY | |
| # Nothing to evaluate when the setup job decided the build could be skipped | |
| if [ "${{ needs.setup.outputs.skip_build }}" == "true" ]; then | |
| echo "Build skipped (docs/config only changes)" >> $GITHUB_STEP_SUMMARY | |
| exit 0 | |
| fi | |
| # Evaluate every group before exiting so the summary lists all of them | |
| FAILED=false | |
| check_status "AOT Build Import Tests" \ | |
| "${{ github.event.inputs.skip_aot }}" \ | |
| "${{ needs.aot-build-import.result }}" \ | |
| "${{ needs.analyze-aot-failure.outputs.is_spot_termination }}" \ | |
| "${{ needs.aot-build-import-rerun.result }}" || FAILED=true | |
| echo "" >> $GITHUB_STEP_SUMMARY | |
| check_status "GPU Tests (A10G)" \ | |
| "${{ github.event.inputs.skip_gpu }}" \ | |
| "${{ needs.gpu-tests-a10g.result }}" \ | |
| "${{ needs.analyze-gpu-a10g-failure.outputs.is_spot_termination }}" \ | |
| "${{ needs.gpu-tests-a10g-rerun.result }}" || FAILED=true | |
| echo "" >> $GITHUB_STEP_SUMMARY | |
| check_status "GPU Tests (T4)" \ | |
| "${{ github.event.inputs.skip_gpu }}" \ | |
| "${{ needs.gpu-tests-t4.result }}" \ | |
| "${{ needs.analyze-gpu-t4-failure.outputs.is_spot_termination }}" \ | |
| "${{ needs.gpu-tests-t4-rerun.result }}" || FAILED=true | |
| # H100 tests (no rerun logic yet - CB instances don't get spot terminated) | |
| echo "" >> $GITHUB_STEP_SUMMARY | |
| H100="${{ needs.gpu-tests-h100.result }}" | |
| echo "GPU Tests (H100): $H100" >> $GITHUB_STEP_SUMMARY | |
| # A 'skipped' H100 result is tolerated; any other non-success result fails the run unless skip_gpu was set | |
| if [ "$H100" != "success" ] && [ "$H100" != "skipped" ] && [ "${{ github.event.inputs.skip_gpu }}" != "true" ]; then | |
| FAILED=true | |
| fi | |
| echo "" >> $GITHUB_STEP_SUMMARY | |
| if [ "$FAILED" == "true" ]; then | |
| echo "Result: Tests Failed" >> $GITHUB_STEP_SUMMARY | |
| exit 1 | |
| fi | |
| echo "Result: Tests Passed" >> $GITHUB_STEP_SUMMARY | |