Skip to content

GPU Tests

GPU Tests #178

Workflow file for this run

# Copyright (c) 2024-2026, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ---------------------------------------------------------------------------
# GPU tests run on NVIDIA on-prem self-hosted runners and use the copy-pr-bot
# pattern: PRs are tested via push events to pull-request/* branches rather
# than pull_request events.
# See: https://docs.gha-runners.nvidia.com/platform/apps/copy-pr-bot/
# ---------------------------------------------------------------------------
name: GPU Tests

# Triggers: pushes to copy-pr-bot mirror branches, a daily cron run at 02:00,
# and manual dispatch from the Actions UI.
on:
  push:
    branches:
      - 'pull-request/[0-9]+'
  schedule:
    - cron: "0 2 * * *"
  workflow_dispatch:

# Every `run` step uses bash with command tracing (-x), exit on error (-e),
# errors on unset variables (-u) and pipefail.
defaults:
  run:
    shell: bash -x -e -u -o pipefail {0}

# At most one active run per workflow + ref; a newer run cancels the one that
# is still in progress.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
changes:
  # Intentionally unguarded (no `if`). Per the workflow authors' note,
  # dorny/paths-filter reports `true` for every filter on workflow_dispatch
  # because there is no base to diff against, so the E2E job always runs.
  # A guard here would skip this job, and a skipped `needs` dependency makes
  # downstream jobs skip as well, even when their own `if` condition passes.
  name: Detect changes
  runs-on: linux-amd64-cpu4
  permissions:
    contents: read
  outputs:
    # Re-exported from the `changes` step so dependent jobs can read them
    # through `needs.changes.outputs.*`.
    src: "${{ steps.changes.outputs.src }}"
    test: "${{ steps.changes.outputs.test }}"
  steps:
    - uses: actions/checkout@v6
    - id: changes
      name: Detect changes
      uses: ./.github/actions/detect-changes
gpu-smoke-test:
  name: GPU Smoke Tests
  needs: changes
  # Run when source or test files changed, and unconditionally for manual
  # dispatches and for the scheduled (cron) run, so the scheduled run is never
  # skipped just because change detection reported no differences.
  if: >-
    ${{ needs.changes.outputs.src == 'true'
    || needs.changes.outputs.test == 'true'
    || github.event_name == 'workflow_dispatch'
    || github.event_name == 'schedule' }}
  # Job-level cap; the test step below carries its own, tighter limit.
  timeout-minutes: 30
  runs-on: nemo-ci-aws-gpu-x2
  strategy:
    fail-fast: false
    matrix:
      # Quoted so the version stays a string rather than a float.
      python-version: ["3.11"]
  steps:
    - name: checkout
      uses: actions/checkout@v6
      with:
        # 0 fetches the complete history instead of a shallow clone.
        fetch-depth: 0
    - name: Install make
      run: apt-get update && apt-get install -y --no-install-recommends make
    - name: Setup Python environment
      uses: ./.github/actions/setup-python-env
      with:
        python-version: ${{ matrix.python-version }}
        bootstrap-tools: "true"
    - name: Bootstrap CUDA environment
      run: make bootstrap-nss cu128
    # Informational only: prints CUDA availability and the device count; it
    # does not fail the job when no GPU is visible.
    - name: Check GPU availability
      run: |
        uv run python -c "import torch; print('cuda available:', torch.cuda.is_available()); print('device count:', torch.cuda.device_count())"
    - name: Run GPU smoke tests
      timeout-minutes: 20
      run: make test-smoke-gpu
gpu-e2e-test:
  name: GPU E2E Tests
  needs: changes
  # Run when source or test files changed, and unconditionally for manual
  # dispatches and for the scheduled (cron) run, so the scheduled run is never
  # skipped just because change detection reported no differences.
  if: >-
    ${{ needs.changes.outputs.src == 'true'
    || needs.changes.outputs.test == 'true'
    || github.event_name == 'workflow_dispatch'
    || github.event_name == 'schedule' }}
  # Job-level cap; the test step below carries its own, tighter limit.
  timeout-minutes: 55
  runs-on: nemo-ci-aws-gpu-x2
  steps:
    - name: checkout
      uses: actions/checkout@v6
      with:
        # 0 fetches the complete history instead of a shallow clone.
        fetch-depth: 0
    - name: Install make
      run: apt-get update && apt-get install -y --no-install-recommends make
    - name: Setup Python environment
      uses: ./.github/actions/setup-python-env
      with:
        # Quoted so the version stays a string rather than a float.
        python-version: "3.11"
        bootstrap-tools: "true"
    - name: Bootstrap CUDA environment
      run: make bootstrap-nss cu128
    # Informational only: prints CUDA availability and the device count; it
    # does not fail the job when no GPU is visible.
    - name: Check GPU availability
      run: |
        uv run python -c "import torch; print('cuda available:', torch.cuda.is_available()); print('device count:', torch.cuda.device_count())"
    - name: Run GPU E2E tests
      timeout-minutes: 45
      run: make test-e2e
# ---------------------------------------------------------------------------
# Single required status check for branch protection.
# Smoke tests are required; E2E failures produce a warning but don't block.
# ---------------------------------------------------------------------------
gpu-ci-status:
  name: GPU CI Status
  # Runs even when a needed job failed or was skipped, but not when the
  # workflow run itself was cancelled.
  if: always() && !cancelled()
  needs: [changes, gpu-smoke-test, gpu-e2e-test]
  runs-on: linux-amd64-cpu4
  steps:
    - name: Check job results
      # Results are handed to the script through the environment instead of
      # being interpolated into the shell source.
      env:
        CHANGES_RESULT: ${{ needs.changes.result }}
        SMOKE_RESULT: ${{ needs.gpu-smoke-test.result }}
        E2E_RESULT: ${{ needs.gpu-e2e-test.result }}
      run: |
        echo "changes: ${CHANGES_RESULT}"
        echo "gpu-smoke-test: ${SMOKE_RESULT}"
        echo "gpu-e2e-test: ${E2E_RESULT}"

        # `changes` has no `if` guard, so anything other than success means
        # change detection did not complete and the skips below cannot be trusted.
        if [[ "${CHANGES_RESULT}" != "success" ]]; then
          echo "::error::Change detection did not succeed (result: ${CHANGES_RESULT})"
          exit 1
        fi

        # Allow-list the passing states: a job can also end as `cancelled`
        # (for example when it is cancelled at its timeout-minutes limit),
        # which a plain `== failure` comparison would let through.
        case "${SMOKE_RESULT}" in
          success | skipped) ;;
          *)
            echo "::error::GPU smoke tests did not pass (required, result: ${SMOKE_RESULT})"
            exit 1
            ;;
        esac

        # E2E problems are surfaced as a warning only and never block the merge.
        case "${E2E_RESULT}" in
          success | skipped) ;;
          *)
            echo "::warning::GPU E2E tests did not pass (informational, does not block merge, result: ${E2E_RESULT})"
            ;;
        esac

        echo "All required GPU jobs passed (or were skipped)."