Skip to content

GPU Tests

GPU Tests #218

Workflow file for this run

# Copyright (c) 2024-2026, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ---------------------------------------------------------------------------
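# GPU tests run on NVIDIA on-prem self-hosted runners. The workflow is designed
# for the copy-pr-bot pattern, in which PRs are tested via push events to
# pull-request/* branches rather than pull_request events. That push trigger is
# currently commented out below, so the workflow runs only on the nightly
# schedule and on manual dispatch.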
# See: https://docs.gha-runners.nvidia.com/platform/apps/copy-pr-bot/
# ---------------------------------------------------------------------------
name: GPU Tests

# Triggers: a nightly schedule plus manual dispatch with a selectable suite.
# The copy-pr-bot push trigger is kept below, commented out.
on:
  schedule:
    # Nightly at 02:00 UTC.
    - cron: '0 2 * * *'
  # disabled for now to avoid running on PRs
  # push:
  #   branches:
  #     - "pull-request/[0-9]+"
  workflow_dispatch:
    inputs:
      # Read by the GPU jobs' `if:` conditions as `inputs.suite`.
      suite:
        description: "GPU test suite to run"
        required: true
        default: all
        type: choice
        options:
          - all
          - smoke
          - e2e
# Default shell for every `run:` step in this workflow:
#   -x           trace each command as it executes
#   -e           abort on the first failing command
#   -u           treat unset variables as errors
#   -o pipefail  a pipeline fails if any stage fails
defaults:
  run:
    shell: bash -x -e -u -o pipefail {0}

# At most one run per workflow + ref; starting a new run cancels the run that
# is still in progress for the same group.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true
jobs:
  # Runs the repository-local detect-changes action and exposes its
  # `src_test_deps` output, which gates the GPU jobs on non-manual runs.
  # Skipped on workflow_dispatch, where the suite is chosen explicitly.
  changes:
    name: Detect changes
    if: github.event_name != 'workflow_dispatch'
    runs-on: linux-amd64-cpu4
    permissions:
      contents: read
    outputs:
      src_test_deps: ${{ steps.changes.outputs.src_test_deps }}
    steps:
      - uses: actions/checkout@v6
      - name: Detect changes
        id: changes
        uses: ./.github/actions/detect-changes

  gpu-smoke-test:
    name: GPU Smoke Tests
    needs: changes
    # `changes` is intentionally skipped on workflow_dispatch. `!cancelled()`
    # replaces the implicit `success()` check, so manual runs bypass that
    # skipped dependency and run the selected GPU suite. Unlike `always()`,
    # it does not start this job once the run has been cancelled (for example
    # by `cancel-in-progress`), which would otherwise tie up a GPU runner.
    #
    # The job runs when:
    #   - the run is manual and the suite is `all` or `smoke`, or
    #   - the run is not manual and change detection reported `true`.
    if: >-
      ${{
        !cancelled() &&
        (
          github.event_name == 'workflow_dispatch' ||
          needs.changes.outputs.src_test_deps == 'true'
        ) &&
        (
          github.event_name != 'workflow_dispatch' ||
          inputs.suite == 'all' ||
          inputs.suite == 'smoke'
        )
      }}
    # NOTE(review): the job-level limit (30) is lower than the sum of the
    # per-step limits below (70), so the job can be stopped while every step
    # is still within its own limit. Confirm the intended overall budget.
    timeout-minutes: 30
    runs-on: linux-amd64-gpu-a100-latest-1
    steps:
      - name: checkout
        uses: actions/checkout@v6
        with:
          fetch-depth: 0
      - name: Setup GPU test environment
        uses: ./.github/actions/setup-gpu-test-env
      - name: Run GPU smoke tests - train only
        timeout-minutes: 10
        run: make test-smoke-gpu-train-only
      - name: Run GPU smoke tests - generation
        timeout-minutes: 10
        run: make test-smoke-gpu-generation
      - name: Run GPU smoke tests - resume
        timeout-minutes: 10
        run: make test-smoke-gpu-resume
      - name: Run GPU smoke tests - structured generation
        timeout-minutes: 10
        run: make test-smoke-gpu-structured-generation
      - name: Run GPU smoke tests - timeseries
        timeout-minutes: 10
        run: make test-smoke-gpu-timeseries
      - name: Run GPU smoke tests - SmolLM2
        timeout-minutes: 20
        run: make test-smoke-gpu-smollm2

  gpu-e2e-test:
    name: GPU E2E Tests
    needs: changes
    # `changes` is intentionally skipped on workflow_dispatch. `!cancelled()`
    # replaces the implicit `success()` check, so manual runs bypass that
    # skipped dependency and run the selected GPU suite. Unlike `always()`,
    # it does not start this job once the run has been cancelled (for example
    # by `cancel-in-progress`), which would otherwise tie up a GPU runner.
    #
    # The job runs when:
    #   - the run is manual and the suite is `all` or `e2e`, or
    #   - the run is not manual and change detection reported `true`.
    if: >-
      ${{
        !cancelled() &&
        (
          github.event_name == 'workflow_dispatch' ||
          needs.changes.outputs.src_test_deps == 'true'
        ) &&
        (
          github.event_name != 'workflow_dispatch' ||
          inputs.suite == 'all' ||
          inputs.suite == 'e2e'
        )
      }}
    timeout-minutes: 60
    runs-on: linux-amd64-gpu-a100-latest-1
    steps:
      - name: checkout
        uses: actions/checkout@v6
        with:
          fetch-depth: 0
      - name: Setup GPU test environment
        uses: ./.github/actions/setup-gpu-test-env
      - name: Run GPU E2E tests
        timeout-minutes: 45
        run: make test-e2e

  # ---------------------------------------------------------------------------
  # Single required status check for branch protection.
  # Smoke tests are required; E2E failures produce a warning but don't block.
  # ---------------------------------------------------------------------------
  gpu-ci-status:
    name: GPU CI Status
    # Run even when upstream jobs failed or were skipped, but not when the
    # whole run was cancelled.
    if: ${{ !cancelled() }}
    needs: [changes, gpu-smoke-test, gpu-e2e-test]
    runs-on: linux-amd64-cpu4
    steps:
      - name: Check job results
        # Job results are handed to the script as environment variables rather
        # than being interpolated into the script text.
        env:
          CHANGES_RESULT: ${{ needs.changes.result }}
          SMOKE_RESULT: ${{ needs.gpu-smoke-test.result }}
          E2E_RESULT: ${{ needs.gpu-e2e-test.result }}
        run: |
          echo "changes: ${CHANGES_RESULT}"
          echo "gpu-smoke-test: ${SMOKE_RESULT}"
          echo "gpu-e2e-test: ${E2E_RESULT}"

          # A job passes only if it succeeded or was skipped. Any other result
          # (`failure`, or `cancelled` for an individually stopped job) counts
          # as not passing, so a required job cannot slip through the gate.
          passed() {
            [[ "$1" == "success" || "$1" == "skipped" ]]
          }

          if ! passed "${CHANGES_RESULT}"; then
            echo "::error::Change detection failed (result: ${CHANGES_RESULT})"
            exit 1
          fi
          if ! passed "${SMOKE_RESULT}"; then
            echo "::error::GPU smoke tests failed (required; result: ${SMOKE_RESULT})"
            exit 1
          fi
          if ! passed "${E2E_RESULT}"; then
            echo "::warning::GPU E2E tests failed (informational, does not block merge; result: ${E2E_RESULT})"
          fi
          echo "All required GPU jobs passed (or were skipped)."