Skip to content

Commit 10f4e40

Browse files
committed
feat: adding generator for cuda deps
Signed-off-by: Aaron Gonzales <aagonzales@nvidia.com>
1 parent fc118d2 commit 10f4e40

27 files changed

Lines changed: 5077 additions & 2225 deletions

.claude/commands/bootstrap.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,8 @@ Set up the development environment from scratch.
1414
```bash
1515
make bootstrap-nss cpu # CPU-only (macOS or Linux without GPU)
1616
make bootstrap-nss cuda # CUDA 12.8 (Linux with NVIDIA GPU)
17-
make bootstrap-nss engine # Engine dependencies only (no torch)
17+
make bootstrap-nss cu130 # CUDA 13.0 (Linux with NVIDIA GPU)
18+
make bootstrap-nss docs # Documentation dependencies only
1819
make bootstrap-nss dev # Minimal dev dependencies only
1920
```
2021

.cursor/rules/repo-navigation.mdc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ Cursor discovers skills directly from `.agents/skills/` -- no symlinks needed.
2727

2828
| File | Purpose |
2929
|------|---------|
30-
| `pyproject.toml` | Package metadata, dependencies, extras (cpu/cu128/engine), uv config |
30+
| `pyproject.toml` | Package metadata, dependencies, runtime extras (cpu/cu128/cu130), uv config |
3131
| `pytest.ini` | Test markers, pytest options, timeout, parallelism |
3232
| `ruff.toml` | Ruff linting and formatting rules |
3333
| `mkdocs.yml` | Documentation site config (MkDocs Material) |

.cursor/setup-worktree.sh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,10 +16,10 @@ fi
1616

1717
# Bare --frozen installs the base environment. For GPU dev work (ty, import
1818
# checks, GPU tests) run the full command manually after setup:
19-
# uv sync --frozen --extra cu128 --extra engine --group dev
19+
# uv sync --frozen --extra cu128 --group dev
2020
uv sync --frozen
2121
echo "Venv ready: $(pwd)/.venv"
22-
echo "Note: for GPU extras run: uv sync --frozen --extra cu128 --extra engine --group dev"
22+
echo "Note: for GPU extras run: uv sync --frozen --extra cu128 --group dev"
2323

2424
for _envfile in .env .env.local mise.local.toml .local.envrc; do
2525
if [ -f "$ROOT_WORKTREE_PATH/$_envfile" ]; then

.github/actions/setup-gpu-test-env/action.yml

Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,3 +67,90 @@ runs:
6767
shell: bash
6868
run: |
6969
uv run python -c "import torch; print('cuda available:', torch.cuda.is_available()); print('device count:', torch.cuda.device_count())"
70+
71+
# Copyright (c) 2024-2026, NVIDIA CORPORATION.
72+
#
73+
# Licensed under the Apache License, Version 2.0 (the "License");
74+
# you may not use this file except in compliance with the License.
75+
# You may obtain a copy of the License at
76+
#
77+
# http://www.apache.org/licenses/LICENSE-2.0
78+
#
79+
# Unless required by applicable law or agreed to in writing, software
80+
# distributed under the License is distributed on an "AS IS" BASIS,
81+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
82+
# See the License for the specific language governing permissions and
83+
# limitations under the License.
84+
85+
# Composite action shared by the GPU test jobs: makes sure `make` exists,
# prepares uv and Python, bootstraps the requested CUDA extra, and prints a
# short report of the GPU/torch environment.
# NOTE(review): an action.yml may define `name`/`description`/`inputs`/`runs`
# only once -- confirm no earlier definition remains in this file, otherwise
# the duplicate top-level keys make the action metadata invalid.
name: "Setup GPU Test Environment"
description: "Common setup for GPU test workflows: make, Python, CUDA dependencies, and GPU availability"

inputs:
  python-version:
    description: "Python version to use (defaults to .python-version when empty)"
    required: false
    default: ""
  bootstrap-tools:
    description: "Whether to install dev tools via mise"
    required: false
    default: "true"
  cuda-extra:
    description: "CUDA dependency extra to bootstrap"
    required: false
    default: "cu128"

runs:
  using: "composite"
  steps:
    - name: Install make
      shell: bash
      run: |
        # Nothing to do when make is already on PATH.
        if command -v make >/dev/null 2>&1; then
          echo "make is already installed: $(command -v make)"
          exit 0
        fi

        sudo apt-get update
        sudo apt-get install -y --no-install-recommends make

    - name: Setup uv cache
      uses: astral-sh/setup-uv@v6
      with:
        enable-cache: true
        # Files listed here key the uv cache.
        cache-dependency-glob: |
          .python-version
          pyproject.toml
          uv.lock

    - name: Setup Python environment
      uses: ./.github/actions/setup-python-env
      with:
        # Calling jobs check the repository out before invoking this action.
        checkout: "false"
        python-version: ${{ inputs.python-version }}
        bootstrap-tools: ${{ inputs.bootstrap-tools }}

    - name: Bootstrap CUDA environment
      shell: bash
      env:
        # Pass the input through the environment rather than expanding it
        # inside the script text, so its value is never parsed as shell code.
        CUDA_EXTRA: ${{ inputs.cuda-extra }}
      run: |
        # Intentionally unquoted: an empty value adds no make goal, matching
        # what direct interpolation of the input produced.
        # shellcheck disable=SC2086
        make bootstrap-nss ${CUDA_EXTRA}

    - name: Report GPU environment
      shell: bash
      run: |
        # Driver/GPU summary when the NVIDIA tooling is present.
        if command -v nvidia-smi >/dev/null 2>&1; then
          nvidia-smi --query-gpu=driver_version,name --format=csv,noheader
        else
          echo "nvidia-smi: not available"
        fi
        # Interpreter, torch build, and CUDA visibility from the project venv.
        uv run python - <<'PY'
        import platform
        import torch

        print("python:", platform.python_version())
        print("machine:", platform.machine())
        print("torch:", torch.__version__)
        print("torch cuda:", torch.version.cuda)
        print("cuda available:", torch.cuda.is_available())
        if torch.cuda.is_available():
            print("device count:", torch.cuda.device_count())
            print("device 0:", torch.cuda.get_device_name(0))
        PY

.github/copy-pr-bot.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,8 @@
1313
#
1414
# On-demand runs: comment `/sync` on any open PR to trigger GPU test runs
1515
# immediately without pushing a new commit. The bot pushes the current HEAD to
16-
# pull-request/<number>, which fires gpu-tests.yml and posts the GPU CI Status
17-
# check result back to the PR. Useful for draft PRs or re-running flaky tests.
16+
# pull-request/<number>. GPU workflow runs require the pull-request/* push
17+
# trigger in gpu-tests.yml, which is currently disabled.
1818

1919
enabled: true
2020
auto_sync_draft: false

.github/workflows/README.md

Lines changed: 19 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ All workflows that use `.github/actions/setup-python-env` now default to the ver
1111

1212
| Workflow | Trigger | Description |
1313
| -------------------------------------------------- | --------------------------- | ---------------------------------------------------------------------------------------------------------- |
14-
| [ci-checks.yml](ci-checks.yml) | Push to `main`, PRs, manual | Format, typecheck, unit tests, and CPU smoke tests |
14+
| [ci-checks.yml](ci-checks.yml) | Push to `main`, PRs, manual | Format, lock/generated dependency checks, typecheck, unit tests, and CPU smoke tests |
1515
| [gpu-tests.yml](gpu-tests.yml) | Nightly, manual | GPU smoke tests (required) and E2E tests |
1616
| [conventional-commit.yml](conventional-commit.yml) | PRs | Validates PR titles follow conventional commit format |
1717
| [docs.yml](docs.yml) | Push to `main` (docs paths) | Publishes `main` docs as the `latest` GitHub Pages version |
@@ -20,9 +20,9 @@ All workflows that use `.github/actions/setup-python-env` now default to the ver
2020

2121
## Pull Request Testing (copy-pr-bot)
2222

23-
GPU tests on PRs are currently disabled due to internal constraints. We hope to reenable them asap. The rest of this information is kept for posterity, but it is also relevant to the external tests ran for unit and cpu smoke tests.
23+
GPU tests on PRs are currently disabled due to internal constraints. The `pull-request/*` push trigger is commented out in `gpu-tests.yml`, so copy-pr-bot syncs do not start GPU workflow runs until that trigger is reenabled.
2424

25-
GPU tests (`gpu-tests.yml`) run on NVIDIA self-hosted runners, which block `pull_request`-triggered jobs. They use the [copy-pr-bot](https://docs.gha-runners.nvidia.com/platform/apps/copy-pr-bot/) pattern instead:
25+
When PR GPU tests are reenabled, `gpu-tests.yml` should use the [copy-pr-bot](https://docs.gha-runners.nvidia.com/platform/apps/copy-pr-bot/) pattern because NVIDIA self-hosted runners block `pull_request`-triggered jobs:
2626

2727
1. When a PR is opened by a trusted user with trusted changes, `copy-pr-bot` automatically copies the code to a `pull-request/<number>` branch
2828
2. The push to `pull-request/<number>` triggers the GPU workflow
@@ -35,7 +35,7 @@ CPU checks (`ci-checks.yml`) run on GitHub-hosted `ubuntu-latest` runners and us
3535

3636
### On-demand GPU test runs
3737

38-
To trigger a GPU test run on an open PR without waiting for the auto-sync, comment `/sync` on the PR. copy-pr-bot will push the current HEAD to `pull-request/<number>`, which fires `gpu-tests.yml` and posts the `GPU CI Status` check result back to the PR -- the same check as the automatic trigger.
38+
The `/sync` command pushes the current PR HEAD to `pull-request/<number>`. While the `pull-request/*` push trigger is disabled, that push does not fire `gpu-tests.yml` or post a `GPU CI Status` check.
3939

4040
Use `/sync` when:
4141

@@ -49,7 +49,7 @@ Use `/sync` when:
4949
flowchart LR
5050
subgraph triggers [Triggers]
5151
push[Push to main]
52-
cpb[copy-pr-bot push to pull-request/*]
52+
schedule[Nightly schedule]
5353
pr[Pull Request event]
5454
manual[Manual Dispatch]
5555
end
@@ -92,8 +92,9 @@ flowchart LR
9292
publishArtifactory[Publish to Artifactory/PyPI]
9393
end
9494
95-
push --> ci & gpu
96-
cpb --> gpu
95+
push --> ci
96+
schedule --> gpu
97+
manual --> gpu
9798
pr --> ci & conventional & secrets
9899
tag[Tag push v[0-9]*] --> release
99100
@@ -111,7 +112,7 @@ The `ci-checks.yml` workflow runs on every push to `main` and on pull requests.
111112
| Job | `make` target | What it checks |
112113
| --- | --- | --- |
113114
| Format | `format-check` | `ruff format --check` + `ruff check` + SPDX copyright headers |
114-
| Format (lock) | `lock-check` | `uv.lock` matches `pyproject.toml` |
115+
| Format (lock) | `lock-check` | `uv.lock` matches `pyproject.toml`; generated CUDA dependency sections match `cuda_deps.toml` |
115116
| Typecheck | `typecheck` | `ty check` (excludes per `pyproject.toml [tool.ty.src]`) |
116117
| Unit Tests | `test-ci` | pytest with coverage (excludes slow, e2e, gpu, smoke) |
117118
| Smoke Tests | `test-smoke` | CPU smoke tests (training/generation hot paths, tiny models) |
@@ -126,7 +127,7 @@ To replicate CI locally:
126127

127128
```bash
128129
make check # format-check + typecheck
129-
make lock-check # verify uv.lock
130+
make lock-check # verify uv.lock and generated CUDA dependency sections
130131
make test # unit tests
131132
make test-smoke # CPU smoke tests
132133
```
@@ -135,13 +136,14 @@ All jobs run on `ubuntu-latest` (GitHub-hosted).
135136

136137
## GPU Tests Workflow
137138

138-
The `gpu-tests.yml` workflow runs nightly at 02:00 UTC, and can also be triggered manually via `workflow_dispatch`. Manual dispatch includes a `suite` dropdown with `all`, `smoke`, and `e2e` options. There are several key jobs:
139+
The `gpu-tests.yml` workflow runs nightly at 02:00 UTC, and can also be triggered manually via `workflow_dispatch`. Manual dispatch includes a `suite` dropdown with `all`, `smoke`, and `e2e` options. GPU jobs run on Python 3.11 and matrix over CUDA runtime extras. There are several key jobs:
139140

140141
- GPU Smoke Tests: Quick smoke tests on a gpu runner with a 30-minute job timeout and 20-minute step timeout. Required for merge.
141142
- GPU E2E Tests: End-to-end tests on a gpu runner with a 60-minute job timeout and 45-minute step timeout. Informational -- failures produce a warning but don't block merge.
142144
- GPU CI Status: Aggregation job -- single required check for branch protection. Fails if smoke tests fail; warns if E2E tests fail.
143145

144-
The `changes` (Detect Changes) job is skipped on `workflow_dispatch`. GPU jobs use `always()` in their job conditions so manual runs can bypass the skipped dependency and run the selected suite. On scheduled runs, `changes` gates GPU jobs to source and test changes.
146+
The `changes` (Detect Changes) job is skipped on `workflow_dispatch`. GPU jobs use `always()` in their job conditions so manual runs can bypass the skipped dependency and run the selected suite. On scheduled runs, `changes` gates GPU jobs to source, test, dependency (`pyproject.toml` or `uv.lock`), and CI workflow/action changes.
145147

146148
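GPU jobs use `.github/actions/setup-gpu-test-env` for shared GPU setup: installing `make`, setting up Python (the `python-version` input, which `gpu-tests.yml` sets to 3.11; the action falls back to `.python-version` when the input is empty), bootstrapping the CUDA dependencies for the extra selected by the `cuda-extra` input, and reporting GPU availability.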
147149

@@ -151,14 +153,19 @@ To trigger manually from the CLI (produces a run but not a PR status check):
151153
gh workflow run gpu-tests.yml --ref <branch-name> -f suite=all
152154
gh workflow run gpu-tests.yml --ref <branch-name> -f suite=smoke
153155
gh workflow run gpu-tests.yml --ref <branch-name> -f suite=e2e
154159
```
155160

156-
To trigger from the PR UI and get a status check result, use `/sync` -- see [On-demand GPU test runs](#on-demand-gpu-test-runs) above.
161+
PR status checks from `/sync` require the `pull-request/*` push trigger to be reenabled -- see [On-demand GPU test runs](#on-demand-gpu-test-runs) above.
157162

158163
### Runners
159164

160165
Internal runners and projects are defined in an internal repo, `nv-gha-runners/enterprise-runner-configuration`.
168+
162169
| Workflow | Job | Runner Label | Type |
163170
| --- | --- | --- | --- |
164171
| CI Checks | All jobs | `ubuntu-latest` | GitHub-hosted |

.github/workflows/gpu-tests.yml

Lines changed: 53 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -13,22 +13,27 @@
1313
# limitations under the License.
1414

1515
# ---------------------------------------------------------------------------
16-
# GPU tests run on NVIDIA on-prem self-hosted runners and use the copy-pr-bot
17-
# pattern: PRs are tested via push events to pull-request/* branches rather
18-
# than pull_request events.
16+
# GPU tests run on NVIDIA on-prem self-hosted runners. PR copy-pr-bot testing
17+
# uses push events to pull-request/* branches rather than pull_request events,
18+
# but that push trigger is currently disabled below.
1919
# See: https://docs.gha-runners.nvidia.com/platform/apps/copy-pr-bot/
2020
# ---------------------------------------------------------------------------
2121

2222
name: GPU Tests
2323

2424
# Workflow triggers: a nightly schedule plus manual dispatch with a selectable
# test suite. The copy-pr-bot push trigger is kept below, commented out.
on:
  schedule:
    # Nightly at 02:00 UTC.
    - cron: '0 2 * * *'
  # disabled for now to avoid running on PRs
  # push:
  #   branches:
  #     - "pull-request/[0-9]+"
  workflow_dispatch:
    inputs:
      # Read by the GPU jobs' `if:` conditions to choose which suite(s) run.
      suite:
        description: "GPU test suite to run"
        required: true
        default: all
        type: choice
        options:
          - all
          - smoke
          - e2e
4358

4459
defaults:
4560
run:
@@ -53,20 +68,23 @@ jobs:
5368
# Publishes the change-detection outputs (src/test/deps/ci) that the GPU jobs
# read in their `if:` conditions.
changes:
  name: Detect changes
  # Not run for manual dispatches; the GPU jobs handle the skipped dependency
  # themselves.
  if: github.event_name != 'workflow_dispatch'
  runs-on: linux-amd64-cpu4
  permissions:
    contents: read
  outputs:
    src: ${{ steps.changes.outputs.src }}
    test: ${{ steps.changes.outputs.test }}
    deps: ${{ steps.changes.outputs.deps }}
    ci: ${{ steps.changes.outputs.ci }}
  steps:
    - uses: actions/checkout@v6
    - name: Detect changes
      id: changes
      uses: ./.github/actions/detect-changes

6886
gpu-smoke-test:
69-
name: GPU Smoke Tests
87+
name: GPU Smoke Tests (Python 3.11, ${{ matrix.cuda-extra }})
7088
needs: changes
7189
# `changes` is intentionally skipped on workflow_dispatch. `always()` lets
7290
# manual runs bypass that skipped dependency and run the selected GPU suite.
@@ -76,16 +94,27 @@ jobs:
7694
(
7795
github.event_name == 'workflow_dispatch' ||
7896
needs.changes.outputs.src == 'true' ||
79-
needs.changes.outputs.test == 'true'
97+
needs.changes.outputs.test == 'true' ||
98+
needs.changes.outputs.deps == 'true' ||
99+
needs.changes.outputs.ci == 'true'
80100
) &&
81101
(
82102
github.event_name != 'workflow_dispatch' ||
83103
inputs.suite == 'all' ||
84104
inputs.suite == 'smoke'
85105
)
86106
}}
107+
continue-on-error: ${{ matrix.required == false }}
87108
timeout-minutes: 30
88109
runs-on: linux-amd64-gpu-a100-latest-1
110+
strategy:
111+
fail-fast: false
112+
matrix:
113+
include:
114+
- cuda-extra: cu128
115+
required: true
116+
- cuda-extra: cu130
117+
required: false
89118
steps:
90119
- name: checkout
91120
uses: actions/checkout@v6
@@ -94,13 +123,16 @@ jobs:
94123

95124
- name: Setup GPU test environment
96125
uses: ./.github/actions/setup-gpu-test-env
126+
with:
127+
python-version: "3.11"
128+
cuda-extra: ${{ matrix.cuda-extra }}
97129

98130
- name: Run GPU smoke tests
99131
timeout-minutes: 20
100132
run: make test-smoke-gpu
101133

102134
gpu-e2e-test:
103-
name: GPU E2E Tests
135+
name: GPU E2E Tests (Python 3.11, ${{ matrix.cuda-extra }})
104136
needs: changes
105137
# `changes` is intentionally skipped on workflow_dispatch. `always()` lets
106138
# manual runs bypass that skipped dependency and run the selected GPU suite.
@@ -110,16 +142,27 @@ jobs:
110142
(
111143
github.event_name == 'workflow_dispatch' ||
112144
needs.changes.outputs.src == 'true' ||
113-
needs.changes.outputs.test == 'true'
145+
needs.changes.outputs.test == 'true' ||
146+
needs.changes.outputs.deps == 'true' ||
147+
needs.changes.outputs.ci == 'true'
114148
) &&
115149
(
116150
github.event_name != 'workflow_dispatch' ||
117151
inputs.suite == 'all' ||
118152
inputs.suite == 'e2e'
119153
)
120154
}}
155+
continue-on-error: ${{ matrix.required == false }}
121156
timeout-minutes: 60
122157
runs-on: linux-amd64-gpu-a100-latest-1
158+
strategy:
159+
fail-fast: false
160+
matrix:
161+
include:
162+
- cuda-extra: cu128
163+
required: true
164+
- cuda-extra: cu130
165+
required: false
123166
steps:
124167
- name: checkout
125168
uses: actions/checkout@v6
@@ -128,6 +171,9 @@ jobs:
128171

129172
- name: Setup GPU test environment
130173
uses: ./.github/actions/setup-gpu-test-env
174+
with:
175+
python-version: "3.11"
176+
cuda-extra: ${{ matrix.cuda-extra }}
131177

132178
- name: Run GPU E2E tests
133179
timeout-minutes: 45

0 commit comments

Comments
 (0)