GPU Tests #218
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # Copyright (c) 2024-2026, NVIDIA CORPORATION. | |
| # | |
| # Licensed under the Apache License, Version 2.0 (the "License"); | |
| # you may not use this file except in compliance with the License. | |
| # You may obtain a copy of the License at | |
| # | |
| # http://www.apache.org/licenses/LICENSE-2.0 | |
| # | |
| # Unless required by applicable law or agreed to in writing, software | |
| # distributed under the License is distributed on an "AS IS" BASIS, | |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| # See the License for the specific language governing permissions and | |
| # limitations under the License. | |
| # --------------------------------------------------------------------------- | |
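| # GPU tests run on NVIDIA on-prem self-hosted runners. The workflow is designed | |
| # for the copy-pr-bot pattern, where PRs are tested via push events to | |
| # pull-request/* branches rather than pull_request events. That push trigger is | |
| # currently commented out, so runs start only nightly or by manual dispatch. | |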
| # See: https://docs.gha-runners.nvidia.com/platform/apps/copy-pr-bot/ | |
| # --------------------------------------------------------------------------- | |
name: GPU Tests

# Triggers: manual dispatch with a suite selector, plus a nightly schedule.
on:
  workflow_dispatch:
    inputs:
      suite:
        description: "GPU test suite to run"
        type: choice
        required: true
        default: all
        options: [all, smoke, e2e]
  schedule:
    # Nightly at 02:00 UTC.
    - cron: "0 2 * * *"
  # disabled for now to avoid running on PRs
  # push:
  #   branches:
  #     - "pull-request/[0-9]+"
# Default shell for every `run` step in this workflow:
#   -x           trace each command before it executes
#   -e           exit on the first failing command
#   -u           treat unset variables as errors
#   -o pipefail  a pipeline fails if any stage fails
# `{0}` is the placeholder GitHub replaces with the generated script path.
defaults:
  run:
    shell: "bash -x -e -u -o pipefail {0}"
# One active run per workflow + ref: starting a new run cancels the one that is
# still in progress for the same group.
concurrency:
  group: "${{ github.workflow }}-${{ github.ref }}"
  cancel-in-progress: true
jobs:
  # Runs the repository-local detect-changes action and exposes its
  # `src_test_deps` output, which gates the GPU jobs on non-manual runs.
  # Skipped on workflow_dispatch, where the caller selects the suite directly.
  changes:
    name: Detect changes
    if: github.event_name != 'workflow_dispatch'
    runs-on: linux-amd64-cpu4
    permissions:
      contents: read
    outputs:
      src_test_deps: ${{ steps.changes.outputs.src_test_deps }}
    steps:
      - uses: actions/checkout@v6
      - name: Detect changes
        id: changes
        uses: ./.github/actions/detect-changes

  gpu-smoke-test:
    name: GPU Smoke Tests
    needs: changes
    # `changes` is intentionally skipped on workflow_dispatch. A status-check
    # function in `if` keeps that skipped dependency from skipping this job as
    # well. `!cancelled()` is used instead of `always()` so that a cancelled
    # run (for example one superseded via `cancel-in-progress`) does not still
    # start work on a GPU runner.
    #
    # The job runs when:
    #   - manual dispatch selected `all` or `smoke`, or
    #   - any other event and change detection reported src_test_deps == 'true'.
    if: >-
      ${{
        !cancelled() &&
        (
          github.event_name == 'workflow_dispatch' ||
          needs.changes.outputs.src_test_deps == 'true'
        ) &&
        (
          github.event_name != 'workflow_dispatch' ||
          inputs.suite == 'all' ||
          inputs.suite == 'smoke'
        )
      }}
    # NOTE(review): the per-step limits below add up to 70 minutes, which is
    # more than this 30-minute job limit; the job limit applies first.
    timeout-minutes: 30
    runs-on: linux-amd64-gpu-a100-latest-1
    steps:
      - name: checkout
        uses: actions/checkout@v6
        with:
          fetch-depth: 0
      - name: Setup GPU test environment
        uses: ./.github/actions/setup-gpu-test-env
      - name: Run GPU smoke tests - train only
        timeout-minutes: 10
        run: make test-smoke-gpu-train-only
      - name: Run GPU smoke tests - generation
        timeout-minutes: 10
        run: make test-smoke-gpu-generation
      - name: Run GPU smoke tests - resume
        timeout-minutes: 10
        run: make test-smoke-gpu-resume
      - name: Run GPU smoke tests - structured generation
        timeout-minutes: 10
        run: make test-smoke-gpu-structured-generation
      - name: Run GPU smoke tests - timeseries
        timeout-minutes: 10
        run: make test-smoke-gpu-timeseries
      - name: Run GPU smoke tests - SmolLM2
        timeout-minutes: 20
        run: make test-smoke-gpu-smollm2

  gpu-e2e-test:
    name: GPU E2E Tests
    needs: changes
    # `changes` is intentionally skipped on workflow_dispatch. A status-check
    # function in `if` keeps that skipped dependency from skipping this job as
    # well. `!cancelled()` is used instead of `always()` so that a cancelled
    # run (for example one superseded via `cancel-in-progress`) does not still
    # start work on a GPU runner.
    #
    # The job runs when:
    #   - manual dispatch selected `all` or `e2e`, or
    #   - any other event and change detection reported src_test_deps == 'true'.
    if: >-
      ${{
        !cancelled() &&
        (
          github.event_name == 'workflow_dispatch' ||
          needs.changes.outputs.src_test_deps == 'true'
        ) &&
        (
          github.event_name != 'workflow_dispatch' ||
          inputs.suite == 'all' ||
          inputs.suite == 'e2e'
        )
      }}
    timeout-minutes: 60
    runs-on: linux-amd64-gpu-a100-latest-1
    steps:
      - name: checkout
        uses: actions/checkout@v6
        with:
          fetch-depth: 0
      - name: Setup GPU test environment
        uses: ./.github/actions/setup-gpu-test-env
      - name: Run GPU E2E tests
        timeout-minutes: 45
        run: make test-e2e

  # ---------------------------------------------------------------------------
  # Single required status check for branch protection.
  # Smoke tests are required; E2E failures produce a warning but don't block.
  # ---------------------------------------------------------------------------
  gpu-ci-status:
    name: GPU CI Status
    # Report even when upstream jobs failed or were skipped, but not when the
    # run was cancelled. Wrapped in `${{ }}` because a plain YAML scalar that
    # starts with `!` would be parsed as a tag.
    if: ${{ !cancelled() }}
    needs: [changes, gpu-smoke-test, gpu-e2e-test]
    runs-on: linux-amd64-cpu4
    steps:
      - name: Check job results
        # Job results are handed to the script through the environment rather
        # than being spliced into the shell text.
        env:
          CHANGES_RESULT: ${{ needs.changes.result }}
          SMOKE_RESULT: ${{ needs.gpu-smoke-test.result }}
          E2E_RESULT: ${{ needs.gpu-e2e-test.result }}
        run: |
          echo "changes: ${CHANGES_RESULT}"
          echo "gpu-smoke-test: ${SMOKE_RESULT}"
          echo "gpu-e2e-test: ${E2E_RESULT}"
          # A broken change detector or a failed smoke suite blocks the check.
          if [[ "${CHANGES_RESULT}" == "failure" ]]; then
            echo "::error::Change detection failed"
            exit 1
          fi
          if [[ "${SMOKE_RESULT}" == "failure" ]]; then
            echo "::error::GPU smoke tests failed (required)"
            exit 1
          fi
          # E2E failures are surfaced as a warning only.
          if [[ "${E2E_RESULT}" == "failure" ]]; then
            echo "::warning::GPU E2E tests failed (informational, does not block merge)"
          fi
          echo "All required GPU jobs passed (or were skipped)."