NVIDIA-NeMo
diff --git a/.agents/skills/diagnose-failures/SKILL.md b/.agents/skills/diagnose-failures/SKILL.md
Lines changed: 4 additions & 4 deletions
diff --git a/.agents/skills/git-worktrees/SKILL.md b/.agents/skills/git-worktrees/SKILL.md
Lines changed: 2 additions & 2 deletions
diff --git a/.agents/skills/uv-build/SKILL.md b/.agents/skills/uv-build/SKILL.md
Lines changed: 5 additions & 5 deletions
diff --git a/.claude/commands/bootstrap.md b/.claude/commands/bootstrap.md
Lines changed: 2 additions & 2 deletions
diff --git a/.cursor/setup-worktree.sh b/.cursor/setup-worktree.sh
Lines changed: 2 additions & 2 deletions
diff --git a/.github/actions/setup-gpu-test-env/action.yml b/.github/actions/setup-gpu-test-env/action.yml
Lines changed: 1 addition & 1 deletion
diff --git a/.github/workflows/gpu-tests.yml b/.github/workflows/gpu-tests.yml
Lines changed: 18 additions & 0 deletions
diff --git a/AGENTS.md b/AGENTS.md
Lines changed: 1 addition & 1 deletion
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
Lines changed: 1 addition & 1 deletion
diff --git a/Makefile b/Makefile
Lines changed: 7 additions & 7 deletions
@@ -55,7 +55,7 @@ Common `ty` error patterns:
 
 | Error | Likely cause | Fix |
 |-------|-------------|-----|
-| `unresolved-import` | Missing extra in venv | Run `uv sync --frozen --extra cu128 --extra engine --group dev` |
+| `unresolved-import` | Missing extra in venv | Run `uv sync --frozen --extra cu129 --extra engine --group dev` |
 | `unresolved-attribute` | Computed property treated as config field | Check if the attribute is a `@property`, not a Pydantic field |
 | `possibly-unbound` | Variable assigned only in one branch | Add an `else` branch or initialise before the conditional |
 | `invalid-argument-type` | Wrong type passed to function | Check the function signature; use `cast()` only as a last resort |
@@ -87,8 +87,8 @@ gh run view <run-id> --log-failed
 
 ## Import / Dependency Errors
 
-- Check if the import requires an extras gate: `cpu`, `cu128`, or `engine`
-- Common: `vllm`, `torch`, `unsloth` need `cpu` or `cu128` extra
+- Check if the import requires an extras gate: `cpu`, `cu129`, or `engine`
+- Common: `vllm`, `torch`, `unsloth` need `cpu` or `cu129` extra
 - Use the `diagnose-deps` skill for lockfile diff diagnosis after `uv lock`
 - Run: `uv run tools/diff-lockfile.py` to see what changed
 
@@ -97,7 +97,7 @@ gh run view <run-id> --log-failed
 | Error | Likely Cause | Fix |
 |-------|-------------|-----|
 | `CUDA out of memory` | Batch too large or model too big | Reduce `batch_size` or use quantization |
-| `CUDA not available` | Wrong extra installed | Reinstall with `make bootstrap-nss cu128` |
+| `CUDA not available` | Wrong extra installed | Reinstall with `make bootstrap-nss cu129` |
 | `NCCL error` | Multi-GPU issues | Use `CUDA_VISIBLE_DEVICES=0` for single-GPU |
 
 ### Running GPU / e2e Tests
 
@@ -29,10 +29,10 @@ uv sync --frozen
 
 This creates a local `.venv` in the worktree. With uv's cache the install takes ~2-3 seconds on a warm cache.
 
-If you need different extras (e.g. `cu128` vs `cpu`), pass them explicitly:
+If you need different extras (e.g. `cu129` vs `cpu`), pass them explicitly:
 
 ```bash
-uv sync --frozen --extra cu128 --extra engine --group dev
+uv sync --frozen --extra cu129 --extra engine --group dev
 ```
 
 Never run bare `uv sync` without `--frozen` -- it re-locks `uv.lock` and creates dirty state.
 
@@ -19,7 +19,7 @@ make bootstrap-tools && make bootstrap-nss cpu
 # Pick a variant:
 make bootstrap-nss dev       # dev tools only (no engine/torch)
 make bootstrap-nss cpu       # + engine + CPU PyTorch
-make bootstrap-nss cu128     # + engine + CUDA 12.8 PyTorch
+make bootstrap-nss cu129     # + engine + CUDA 12.9 PyTorch
 make bootstrap-nss engine    # + engine (no torch)
 ```
 
@@ -30,11 +30,11 @@ Under the hood: `uv sync --frozen --extra <extra> [--extra engine] --group dev`
 | Extra | What it installs |
 |-------|------------------|
 | `cpu` | PyTorch CPU, faiss-cpu, flashinfer (Linux only) |
-| `cu128` | PyTorch+CUDA 12.8, faiss-gpu, flashinfer-jit-cache |
+| `cu129` | PyTorch+CUDA 12.9, faiss-gpu, flashinfer-jit-cache |
 | `engine` | ML pipeline deps (outlines, wandb, tiktoken, etc.) -- no torch |
 | `microservices` | `nemo-microservices` from local path |
 
-`cpu` and `cu128` conflict -- you must pick one, never both. Enforced in `[tool.uv] conflicts`.
+`cpu` and `cu129` conflict -- you must pick one, never both. Enforced in `[tool.uv] conflicts`.
 
 ## Index Management
 
@@ -43,9 +43,9 @@ PyTorch wheels come from dedicated indexes, not PyPI:
 | Index | URL | Used for |
 |-------|-----|----------|
 | `pytorch-cpu` | `download.pytorch.org/whl/cpu` | torch, torchvision (CPU, Linux) |
-| `pytorch-cu128` | `download.pytorch.org/whl/cu128` | torch, torchvision, triton, xformers (CUDA) |
+| `pytorch-cu129` | `download.pytorch.org/whl/cu129` | torch, torchvision, triton (CUDA) |
 | `nv-shared-pypi-local` | NVIDIA Artifactory | Internal NVIDIA packages |
-| `flashinfer-jit-cache` | `flashinfer.ai/whl/cu128` | FlashInfer JIT cache |
+| `flashinfer-jit-cache` | `flashinfer.ai/whl/cu129` | FlashInfer JIT cache |
 | `nvidia-pypi-public` | `pypi.nvidia.com` | Public NVIDIA packages |
 
 All indexes are `explicit = true` (only used when a package is mapped to them in `[tool.uv.sources]`).
 
@@ -13,9 +13,9 @@ Set up the development environment from scratch.
 2. Install Python dependencies (choose one):
    ```bash
    make bootstrap-nss cpu    # CPU-only (macOS or Linux without GPU)
-   make bootstrap-nss cuda   # CUDA 12.8 (Linux with NVIDIA GPU)
+   make bootstrap-nss cuda   # CUDA 12.9 (Linux with NVIDIA GPU)
    make bootstrap-nss engine # Engine dependencies only (no torch)
    make bootstrap-nss dev    # Minimal dev dependencies only
    ```
 
-Note: `cuda` is an alias for `cu128`. Both are equivalent.
+Note: `cuda` is an alias for `cu129`. Both are equivalent.
@@ -16,10 +16,10 @@ fi
 
 # Bare --frozen installs the base environment. For GPU dev work (ty, import
 # checks, GPU tests) run the full command manually after setup:
-#   uv sync --frozen --extra cu128 --extra engine --group dev
+#   uv sync --frozen --extra cu129 --extra engine --group dev
 uv sync --frozen
 echo "Venv ready: $(pwd)/.venv"
-echo "Note: for GPU extras run: uv sync --frozen --extra cu128 --extra engine --group dev"
+echo "Note: for GPU extras run: uv sync --frozen --extra cu129 --extra engine --group dev"
 
 for _envfile in .env .env.local mise.local.toml .local.envrc; do
     if [ -f "$ROOT_WORKTREE_PATH/$_envfile" ]; then
 
@@ -27,7 +27,7 @@ inputs:
   cuda-extra:
     description: "CUDA dependency extra to bootstrap"
     required: false
-    default: "cu128"
+    default: "cu129"
 
 runs:
   using: "composite"
 
@@ -126,8 +126,9 @@ jobs:
         with:
           fetch-depth: 0
 
+      # No `cuda-extra` is passed here, so the action's default (cu129) applies.
       - name: Setup GPU test environment
         uses: ./.github/actions/setup-gpu-test-env
 
       - name: Run GPU E2E tests
         timeout-minutes: 45
 
@@ -24,7 +24,7 @@ Common commands: `make test` (unit tests), `make format` (auto-fix formatting +
 The canonical `uv sync` command for a full GPU/dev environment is:
 
 ```bash
-uv sync --frozen --extra cu128 --extra engine --group dev
+uv sync --frozen --extra cu129 --extra engine --group dev
 ```
 
 Bare `uv sync --frozen` (without extras) installs an incomplete environment -- `ty`, import checks, and GPU tests will fail.
 
@@ -59,7 +59,7 @@ Please read our [Code of Conduct](CODE_OF_CONDUCT.md) before contributing.
 
    # Install Python dependencies (choose one)
    make bootstrap-nss cpu    # CPU-only (macOS or Linux without GPU)
-   make bootstrap-nss cuda   # CUDA 12.8 (Linux with NVIDIA GPU)
+   make bootstrap-nss cuda   # CUDA 12.9 (Linux with NVIDIA GPU)
    make bootstrap-nss engine # Engine dependencies only
    make bootstrap-nss dev    # Minimal dev dependencies only
   ```
 
@@ -13,7 +13,7 @@ NSS_ROOT_PATH := $(shell pwd)
 # Normalize architecture names
 ifeq ($(ARCH),x86_64)
 	ARCH := amd64
-	PYTORCH_DEPS := cu128
+	PYTORCH_DEPS := cu129
 	export BUILD_ARCH ?= linux/amd64
 endif
 ifeq ($(ARCH),aarch64)
@@ -82,12 +82,12 @@ verify-python-version: ## Verify Python version and install if necessary
 	uv venv --seed --allow-existing --python 3.11
 
 .PHONY: bootstrap-python
-bootstrap-python: .venv ## Bootstrap Python dependencies. Set PYTORCH_DEPS to 'cpu' or 'cu128'. Here mostly for legacy usage.
+bootstrap-python: .venv ## Bootstrap Python dependencies. Set PYTORCH_DEPS to 'cpu' or 'cu129'. Here mostly for legacy usage.
 	uv sync --frozen --extra ${PYTORCH_DEPS} --extra engine --group dev
 
 # Dynamic targets for bootstrap-nss
 # Usage: make bootstrap-nss {dev,engine,cpu,cuda}
-BOOTSTRAP_EXTRAS := dev engine cpu cuda cu128
+BOOTSTRAP_EXTRAS := dev engine cpu cuda cu129
 $(BOOTSTRAP_EXTRAS):
 	@:
 
@@ -97,9 +97,9 @@ bootstrap-nss: .venv ## Bootstrap Python dependencies. Usage: make bootstrap-nss
 	@echo "~~~~~~"
 	@echo "attempting to install nss package with primary extra: $(EXTRA)"
 	@if [ "$(EXTRA)" = "cuda" ]; then \
-		uv sync --frozen --extra cu128 --extra engine --group dev; \
-	elif [ "$(EXTRA)" = "cu128" ]; then \
-		uv sync --frozen --extra cu128 --extra engine --group dev; \
+		uv sync --frozen --extra cu129 --extra engine --group dev; \
+	elif [ "$(EXTRA)" = "cu129" ]; then \
+		uv sync --frozen --extra cu129 --extra engine --group dev; \
 	elif [ "$(EXTRA)" = "cpu" ]; then \
 		uv sync --frozen --extra cpu --extra engine --group dev; \
 	elif [ "$(EXTRA)" = "engine" ]; then \
@@ -469,7 +469,7 @@ NSS_DATASETS := clinc_oos dow_jones_index
 
 define nss_combo_test
 test-nss-$(1)-$(2)-ci: ## Run pytest test for $(shell echo $(1) | tr '_' '-') config with $(shell echo $(2) | tr '_' '-') dataset
-	$(MAKE) bootstrap-nss cu128
+	$(MAKE) bootstrap-nss cu129
 	$(PYTEST_NO_XDIST_CMD) -vv $(PYTEST_CI_OPTS) $(NSS_ROOT_PATH)/tests/e2e/test_dataset_config.py -k "test_$(2)_dataset[$(subst _,-,$(1))]"
 endef