kaito-project · robert-cronin · Jun 30, 2026 · Jun 23, 2026 · Jun 23, 2026 · Jun 24, 2026
@@ -240,3 +240,25 @@ jobs:
             exit 1
           fi
           echo "✅ ldflags injection landed: DynamoVersion=${DYNAMO_VERSION}"
+
+  gpu-e2e-check:
+    # Cluster-free gate for the GPU e2e module (test/e2e/gpu). The full suite
+    # needs a GPU cluster and runs out-of-band via scripts/gpu-e2e.sh, but the
+    # module must still stay formatted, vet-clean, and compilable, and its
+    # cluster-free packages (sched, e2eutil) carry real unit tests. This job
+    # runs all of that on a plain runner so the e2e code can't silently rot
+    # between GPU runs.
+    runs-on: ubuntu-latest # plain hosted runner; this job never talks to a cluster
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0
+
+      - name: Setup Go
+        uses: actions/setup-go@4a3601121dd01d1626a1e23e37211e3254c1c06c # v6.4.0
+        with:
+          go-version: "1.25"
+          cache-dependency-path: test/e2e/gpu/go.mod # NOTE(review): setup-go's default dependency file is go.sum; confirm go.mod is intended
+
+      - name: Run GPU e2e module checks
+        run: make gpu-e2e-check # Makefile target: gofmt, go vet, -tags=e2e compile, unit tests
@@ -59,6 +59,9 @@ go.work.sum
 # Provider build outputs
 providers/*/bin/
 
+# GPU e2e per-run result bundles
+gpu-e2e-results/
+
 # Playwright
 /test-results/
 /playwright-report/

@@ -1,4 +1,4 @@
-.PHONY: install dev dev-frontend dev-backend build compile lint test test-coverage test-coverage-backend test-coverage-frontend clean help providers-test verify-versions test-verify-versions
+.PHONY: install dev dev-frontend dev-backend build compile lint test test-coverage test-coverage-backend test-coverage-frontend clean help providers-test gpu-e2e gpu-e2e-check verify-versions test-verify-versions
 .PHONY: controller-build controller-docker-build controller-install controller-deploy controller-generate generate-deploy-manifests
 .PHONY: model-downloader-docker-build setup-gateway cleanup-gateway
 
@@ -59,6 +59,8 @@ help:
 	@echo ""
 	@echo "Provider Targets:"
 	@echo "  providers-test         Run all provider tests"
+	@echo "  gpu-e2e                Run GPU e2e suite on a GPU cluster (GPU_E2E_ARGS=...)"
+	@echo "  gpu-e2e-check          Cluster-free checks for the GPU e2e module (gofmt, vet, compile, unit tests)"
 	@echo ""
 	@echo "Cluster Setup Targets:"
 	@echo "  setup-gateway          Install Gateway API CRDs, Istio, BBR, and the inference Gateway"
@@ -200,6 +202,31 @@ providers-test: verify-versions
 	cd providers/vllm && go test ./...
 	@echo "✅ Provider tests completed"
 
+# Run the GPU end-to-end suite against a pre-existing GPU cluster. All logic lives
+# in scripts/gpu-e2e.sh; GPU_E2E_ARGS is forwarded unquoted, so it splits into flags.
+# Example: make gpu-e2e GPU_E2E_ARGS="--provider all --registry <your-registry>"
+gpu-e2e:
+	@bash scripts/gpu-e2e.sh $(GPU_E2E_ARGS)
+
+# Cluster-free validation of the GPU e2e module (test/e2e/gpu). Runs in CI on a
+# plain runner: it never touches a cluster. Three guarantees:
+#   1. gofmt   — the module stays formatted (gofmt runs once; its own errors fail too).
+#   2. vet + compile under -tags=e2e — the cluster-coupled suite keeps building
+#      even though CI never runs it (catches selector/API drift at PR time).
+#   3. unit tests for the cluster-free packages (sched, e2eutil) — the
+#      classifier, response parser, and storage-class injector are exercised
+#      for real. These carry no build tag, so `go test` picks them up directly.
+gpu-e2e-check:
+	@echo "▶ gofmt"
+	@out="$$(gofmt -l test/e2e/gpu)"; rc=$$?; [ -z "$$out" ] || { echo "❌ gofmt: run 'gofmt -w test/e2e/gpu'"; echo "$$out"; exit 1; }; [ "$$rc" -eq 0 ] || { echo "❌ gofmt exited with status $$rc"; exit 1; }
+	@echo "▶ go vet (-tags=e2e)"
+	go vet -C test/e2e/gpu -tags=e2e ./...
+	@echo "▶ compile e2e suite (-tags=e2e)"
+	go test -C test/e2e/gpu -tags=e2e -c -o /dev/null ./
+	@echo "▶ unit tests (cluster-free packages)"
+	go test -C test/e2e/gpu ./sched/ ./e2eutil/
+	@echo "✅ GPU e2e module checks passed"
+
 # Generate deploy manifests for controller and dashboard
 generate-deploy-manifests:
 	cd controller && $(MAKE) kustomize

@@ -721,6 +721,80 @@ curl http://localhost:5000/v1/chat/completions \
   }'
 ```
 
+## GPU End-to-End Testing
+
+`make gpu-e2e` runs a real-GPU end-to-end suite that deploys each inference
+provider through a `ModelDeployment`, drives it to `Running`, and asserts that
+inference actually serves through the inference gateway. Unlike the CPU/mocker
+e2e lanes, it requires real GPU hardware and an already-provisioned cluster — it
+never creates or deletes the cluster.
+
+The harness (`scripts/gpu-e2e.sh`) builds and pushes the controller and provider
+images, installs any missing upstream operator, deploys everything, then runs
+the Go suite under `test/e2e/gpu/`. Providers covered: **Dynamo, vLLM, KAITO**
+(KubeRay is not yet supported).
+
+### Cluster preconditions
+
+The harness installs none of these (except a missing operator, via the provider's `setup-<provider>` target):
+
+- **GPU nodes** with the NVIDIA GPU Operator and NFD enabled, so nodes advertise
+  `nvidia.com/gpu` and the `nvidia.com/gpu.present=true` label.
+- **An RWX-capable StorageClass.** The Dynamo model-cache PVC defaults to
+  `ReadWriteMany`; Azure Disk classes are `ReadWriteOnce` and will leave the PVC
+  `Pending`. The default is `azurefile-premium`; override with `--storage-class`.
+- **The inference gateway** (Gateway API CRDs + GAIE + Istio + BBR + a `Gateway`
+  named `inference-gateway`). On a fresh cluster,
+  `make -C providers/dynamo setup-dynamo` installs it; otherwise it must already
+  be present and `Programmed`. The suite fails fast if it is missing.
+- **Pull access to the pushed images.** The manager manifests carry no
+  `imagePullSecret`, so the images must be public or the nodes must have pull
+  access. New registry repositories often default to private — make them public
+  once.
+
+### Running it
+
+```bash
+# All three providers, building+pushing images to your registry:
+make gpu-e2e GPU_E2E_ARGS="--provider all --registry <your-registry>"
+
+# A single provider:
+make gpu-e2e GPU_E2E_ARGS="--provider vllm --registry <your-registry>"
+
+# Re-test without rebuilding (requires an explicit, already-pushed tag):
+make gpu-e2e GPU_E2E_ARGS="--provider dynamo --skip-build \
+    --registry <your-registry> --img-tag <tag>"
+
+# Run the Go suite directly against an already-deployed cluster (no rebuild):
+go test -C test/e2e/gpu -tags=e2e -v -run 'TestGPUProviders/vllm' ./
+```
+
+Flags are passed to the script via `GPU_E2E_ARGS`; pass them inside the quotes,
+not as bare `make` arguments. See `scripts/gpu-e2e.sh --help` for the full list.
+Key flags: `--provider`, `--registry` (required when building), `--img-tag`,
+`--storage-class`, `--skip-install`, `--skip-build`, `--keep`.
+
+### Environment knobs
+
+The script forwards these to the Go suite; you can also set them directly when
+running `go test`:
+
+| Variable | Meaning |
+|----------|---------|
+| `GPU_E2E_STORAGE_CLASS` | RWX StorageClass injected into the Dynamo fixture and asserted on (default `azurefile-premium`). Set by `--storage-class`. |
+| `GPU_E2E_KEEP` | When `true`, leave `ModelDeployment`s running after the test for inspection. Set by `--keep`. |
+| `GPU_E2E_RESULTS_DIR` | Optional override for where per-case result bundles are written (default `test/e2e/gpu/gpu-e2e-results/<timestamp>/`). |
+| `GPU_E2E_RUN_TS` | Optional fixed timestamp for the results directory name. |
+
+### Outcomes
+
+Each case ends as **PASS**, **FAIL**, or **SKIP**. A `SKIP` means the cluster
+lacks the capacity to schedule that case (more GPUs requested than any node has,
+or no GPU free before the scheduling deadline) — it does not fail the run. Only a
+genuine error (a broken deployment, failed inference, or orphaned resources after
+delete) is a `FAIL`. Per-case logs and a `result` marker are written under the
+results directory.
+
 ## Troubleshooting
 
 ### Controller not reconciling

@@ -212,6 +212,20 @@ Replace `provider.name` with your gateway implementation (`istio`, `gke`, or omi
 
 See the [upstream multi-model guide](https://gateway-api-inference-extension.sigs.k8s.io/guides/serving-multiple-inference-pools-latest/) for full details.
 
+> **Known limitation — BBR restart on each new model.** BBR builds its model
+> registry only at startup and does not dynamically watch InferencePools, so the
+> controller triggers a rolling restart of the shared BBR Deployment once per new
+> `ModelDeployment` (tracked by the `airunway.ai/bbr-restarted` annotation). The
+> restart is **not zero-downtime**: while BBR is restarting, its registry is
+> incomplete, so an in-flight request for an *already-serving* model can miss its
+> `X-Gateway-Model-Name` header and mis-route to another model's InferencePool.
+> With disaggregated Dynamo serving this surfaces as an HTTP 500,
+> `Worker ID required (--direct-route)`, on a concurrent aggregated request.
+> This mainly affects deploying multiple models close together; once all models
+> are settled, routing is correct and stable. A zero-downtime BBR reload (or a
+> BBR that watches InferencePools) would remove the window. The GPU e2e suite
+> leaves disaggregated serving out of its default matrix for this reason.
+
 ### Auto-detection with Multiple Gateways
 
 When no explicit gateway is configured and multiple Gateway resources exist in the cluster, the controller looks for one labeled with:

@@ -128,7 +128,8 @@ test-e2e:
 
 ## Run the CPU-only mocker e2e tests (aggregated + disaggregated). Requires a cluster
 ## with the Dynamo platform (see setup-dynamo-mocker) and the dynamo provider deployed.
-## No -run filter: the GPU lane (TestDynamoProviderE2E) self-skips without DYNAMO_INSTALLED,
-## so dropping it lets the unit-style TestInjectMockerAnnotation run here too.
+## No -run filter: the GPU lanes (TestDynamoMultiNodeE2E, TestDynamoStorageValidationE2E)
+## self-skip without DYNAMO_INSTALLED, so omitting the filter also lets the unit-style
+## TestInjectMockerAnnotation run here.
 test-e2e-mocker:
 	DYNAMO_MOCKER=true go test -count=1 -tags=e2e -v -timeout 30m ./test/e2e/