GPU Tests #188
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # Copyright (c) 2024-2026, NVIDIA CORPORATION. | |
| # | |
| # Licensed under the Apache License, Version 2.0 (the "License"); | |
| # you may not use this file except in compliance with the License. | |
| # You may obtain a copy of the License at | |
| # | |
| # http://www.apache.org/licenses/LICENSE-2.0 | |
| # | |
| # Unless required by applicable law or agreed to in writing, software | |
| # distributed under the License is distributed on an "AS IS" BASIS, | |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| # See the License for the specific language governing permissions and | |
| # limitations under the License. | |
| # --------------------------------------------------------------------------- | |
| # GPU tests run on NVIDIA on-prem self-hosted runners and use the copy-pr-bot | |
| # pattern: PRs are tested via push events to pull-request/* branches rather | |
| # than pull_request events. | |
| # See: https://docs.gha-runners.nvidia.com/platform/apps/copy-pr-bot/ | |
| # --------------------------------------------------------------------------- | |
name: GPU Tests

# Triggers:
#   - schedule:          cron expression "0 2 * * *" (every day at 02:00)
#   - push:              branches matching pull-request/<digits>
#   - workflow_dispatch: manual runs
on:
  schedule:
    - cron: "0 2 * * *"
  push:
    branches: ["pull-request/[0-9]+"]
  workflow_dispatch:

# Shell used by every `run:` step: bash with command tracing (-x),
# exit on first error (-e), error on unset variables (-u), and
# failure propagation through pipelines (-o pipefail).
defaults:
  run:
    shell: bash -x -e -u -o pipefail {0}

# Runs are grouped per workflow + ref; starting a new run in a group
# cancels the run already in progress for that group.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
# Change detection: publishes `src` and `test` outputs that the GPU jobs
# use to decide whether they need to run.
#
# This job is intentionally left without an `if` guard. A skipped `needs`
# dependency causes downstream jobs to be skipped as well, even when their
# own `if` condition passes, so this job must always run.
# Original author's note: dorny/paths-filter reports `true` for every filter
# on workflow_dispatch because there is no base to diff against, so manually
# dispatched runs still execute the GPU jobs.
changes:
  name: Detect changes
  runs-on: linux-amd64-cpu4
  permissions:
    contents: read
  outputs:
    src: ${{ steps.changes.outputs.src }}
    test: ${{ steps.changes.outputs.test }}
  steps:
    - uses: actions/checkout@v6
    # Repository-local composite action; its `src` / `test` outputs are
    # re-exported as the job outputs above.
    - id: changes
      name: Detect changes
      uses: ./.github/actions/detect-changes
# GPU smoke tests: runs `make test-smoke-gpu` on a self-hosted GPU runner.
gpu-smoke-test:
  name: GPU Smoke Tests
  needs: changes
  # Run when source or test files changed, on manual dispatch, and on the
  # nightly schedule. The explicit `schedule` clause keeps the cron run from
  # depending on what change detection reports for a scheduled event (which
  # has no pull-request diff); without it the nightly run could skip the
  # tests whenever the detected change set is empty.
  if: >-
    needs.changes.outputs.src == 'true'
    || needs.changes.outputs.test == 'true'
    || github.event_name == 'workflow_dispatch'
    || github.event_name == 'schedule'
  timeout-minutes: 30
  runs-on: nemo-ci-aws-gpu-x2
  strategy:
    fail-fast: false
    matrix:
      python-version: ["3.11"]
  steps:
    - name: checkout
      uses: actions/checkout@v6
      with:
        # Full history rather than a shallow clone.
        fetch-depth: 0
    # NOTE(review): no sudo here, so this presumably runs as root inside a
    # container on the runner - confirm.
    - name: Install make
      run: apt-get update && apt-get install -y --no-install-recommends make
    - name: Setup Python environment
      uses: ./.github/actions/setup-python-env
      with:
        python-version: ${{ matrix.python-version }}
        bootstrap-tools: "true"
    # NOTE(review): `cu128` looks like a CUDA 12.8 selector for the Makefile
    # target - confirm against the Makefile.
    - name: Bootstrap CUDA environment
      run: make bootstrap-nss cu128
    # Informational only: prints CUDA availability and device count; it does
    # not fail the job when no GPU is visible.
    - name: Check GPU availability
      run: |
        uv run python -c "import torch; print('cuda available:', torch.cuda.is_available()); print('device count:', torch.cuda.device_count())"
    # Step-level timeout (20 min) is tighter than the job-level one (30 min).
    - name: Run GPU smoke tests
      timeout-minutes: 20
      run: make test-smoke-gpu
# GPU end-to-end tests: runs `make test-e2e` on a self-hosted GPU runner.
gpu-e2e-test:
  name: GPU E2E Tests
  needs: changes
  # Run when source or test files changed, on manual dispatch, and on the
  # nightly schedule. The explicit `schedule` clause keeps the cron run from
  # depending on what change detection reports for a scheduled event (which
  # has no pull-request diff); without it the nightly run could skip the
  # tests whenever the detected change set is empty.
  if: >-
    needs.changes.outputs.src == 'true'
    || needs.changes.outputs.test == 'true'
    || github.event_name == 'workflow_dispatch'
    || github.event_name == 'schedule'
  timeout-minutes: 55
  runs-on: nemo-ci-aws-gpu-x2
  steps:
    - name: checkout
      uses: actions/checkout@v6
      with:
        # Full history rather than a shallow clone.
        fetch-depth: 0
    # NOTE(review): no sudo here, so this presumably runs as root inside a
    # container on the runner - confirm.
    - name: Install make
      run: apt-get update && apt-get install -y --no-install-recommends make
    - name: Setup Python environment
      uses: ./.github/actions/setup-python-env
      with:
        python-version: "3.11"
        bootstrap-tools: "true"
    # NOTE(review): `cu128` looks like a CUDA 12.8 selector for the Makefile
    # target - confirm against the Makefile.
    - name: Bootstrap CUDA environment
      run: make bootstrap-nss cu128
    # Informational only: prints CUDA availability and device count; it does
    # not fail the job when no GPU is visible.
    - name: Check GPU availability
      run: |
        uv run python -c "import torch; print('cuda available:', torch.cuda.is_available()); print('device count:', torch.cuda.device_count())"
    # Step-level timeout (45 min) is tighter than the job-level one (55 min).
    - name: Run GPU E2E tests
      timeout-minutes: 45
      run: make test-e2e
# ---------------------------------------------------------------------------
# Single required status check for branch protection.
# Smoke tests are required; E2E problems produce a warning but don't block.
#
# A required job counts as passing only when its result is `success` or
# `skipped` (skipped = nothing relevant changed). Any other result, such as
# `failure` or `cancelled`, fails this gate, so a required job that was
# cancelled can no longer be reported as green.
# ---------------------------------------------------------------------------
gpu-ci-status:
  name: GPU CI Status
  # Run even when upstream jobs failed or were skipped, but not when the
  # workflow run itself was cancelled. Wrapped in ${{ }} because a plain
  # YAML scalar may not start with `!`.
  if: ${{ !cancelled() }}
  needs: [changes, gpu-smoke-test, gpu-e2e-test]
  runs-on: linux-amd64-cpu4
  steps:
    - name: Check job results
      # Results are handed to the script through the environment instead of
      # being interpolated into the script text.
      env:
        CHANGES_RESULT: ${{ needs.changes.result }}
        SMOKE_RESULT: ${{ needs.gpu-smoke-test.result }}
        E2E_RESULT: ${{ needs.gpu-e2e-test.result }}
      run: |
        echo "changes: ${CHANGES_RESULT}"
        echo "gpu-smoke-test: ${SMOKE_RESULT}"
        echo "gpu-e2e-test: ${E2E_RESULT}"

        # `changes` has no `if` guard, so anything but success means the
        # GPU jobs were gated on missing or invalid outputs.
        if [[ "${CHANGES_RESULT}" != "success" ]]; then
          echo "::error::Change detection did not succeed (result: ${CHANGES_RESULT})"
          exit 1
        fi

        # Required: smoke tests must have passed or been legitimately skipped.
        case "${SMOKE_RESULT}" in
          success|skipped) ;;
          *)
            echo "::error::GPU smoke tests did not pass (required; result: ${SMOKE_RESULT})"
            exit 1
            ;;
        esac

        # Informational: E2E problems are surfaced but never block the merge.
        case "${E2E_RESULT}" in
          success|skipped) ;;
          *)
            echo "::warning::GPU E2E tests did not pass (result: ${E2E_RESULT}; informational, does not block merge)"
            ;;
        esac

        echo "All required GPU jobs passed (or were skipped)."