feat(recipes): add GKE COS training overlays for H100 #352

Workflow file for this run

.github/workflows/gpu-h100-inference-test.yaml at dc517cd

	# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.

	# Workflow identity, triggers, token permissions and run de-duplication.
	name: GPU Inference Test (nvkind + H100)

	on:
	  schedule:
	    - cron: '15 6,18 * * *' # Every 12 hours (2x daily), offset from T4 smoke test
	  push:
	    branches:
	      - "pull-request/[0-9]+"
	    # Keep this list in sync with everything the job below consumes:
	    # every repository-local action it calls and every file it reads.
	    paths:
	      - '.github/workflows/gpu-h100-inference-test.yaml'
	      - '.github/actions/gpu-cluster-setup/**'
	      - '.github/actions/gpu-operator-install/**'
	      - '.github/actions/gpu-snapshot-validate/**'
	      - '.github/actions/install-karpenter-kwok/**'
	      - '.github/actions/setup-build-tools/**'
	      - '.github/actions/aicr-build/**'
	      - 'validators/*/Dockerfile'
	      - 'pkg/evidence/**'
	      - 'pkg/validator/checks/conformance/**'
	      - 'pkg/validator/checks/deployment/**'
	      - '.github/actions/gpu-test-cleanup/**'
	      - '.github/actions/load-versions/**'
	      - 'tests/manifests/**'
	      - 'tests/chainsaw/chainsaw-config.yaml'
	      - 'tests/chainsaw/ai-conformance/**'
	      - 'recipes/components/dynamo-platform/**'
	      - 'recipes/components/prometheus-adapter/**'
	      - 'recipes/overlays/kind.yaml'
	      - 'recipes/overlays/kind-inference.yaml'
	      - 'recipes/overlays/h100-kind-inference.yaml'
	      - 'recipes/overlays/h100-kind-inference-dynamo.yaml'
	      - 'kwok/manifests/karpenter/**'
	      - 'kwok/scripts/install-karpenter-kwok.sh'
	  workflow_dispatch: {} # Allow manual runs

	# Read-only repository access for the workflow token.
	permissions:
	  contents: read

	# One run per workflow + ref; a newer run cancels the one still in progress.
	concurrency:
	  group: ${{ github.workflow }}-${{ github.ref }}
	  cancel-in-progress: true

	jobs:

	  # Single job; the step list follows the `steps:` key below.
	  gpu-inference-test:
	    name: GPU Inference Test (nvkind + H100)
	    runs-on: linux-amd64-gpu-h100-latest-1
	    timeout-minutes: 45

	    # Referenced by the steps as ${{ env.KIND_CLUSTER_NAME }} and, inside
	    # scripts, as the kubectl context "kind-${KIND_CLUSTER_NAME}".
	    env:
	      KIND_CLUSTER_NAME: gpu-inference-test

	    steps:

	# Check out the repository at a pinned action commit; credentials are not
	# persisted into the local git configuration.
	- name: Checkout Code
	  uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
	  with:
	    persist-credentials: false

	# Repository-local action at ./.github/actions/gpu-cluster-setup (no inputs passed).
	- name: Set up GPU cluster
	  uses: ./.github/actions/gpu-cluster-setup

	# Repository-local action at ./.github/actions/aicr-build (no inputs passed).
	# NOTE(review): presumably produces the ./aicr binary invoked by the
	# "Validate cluster" step; confirm against the action definition.
	- name: Build aicr
	  uses: ./.github/actions/aicr-build

	# Repository-local GPU operator install, parameterized for the bundle
	# method, the h100 accelerator and the dynamo platform.
	- name: Install GPU operator (bundle)
	  uses: ./.github/actions/gpu-operator-install
	  with:
	    method: bundle
	    accelerator: h100
	    platform: dynamo

	# --- Snapshot and validation ---

	# Passes the expected GPU model, a minimum GPU count of one and the kind
	# cluster name to the repository-local gpu-snapshot-validate action.
	- name: Snapshot and validate GPU
	  uses: ./.github/actions/gpu-snapshot-validate
	  with:
	    gpu_model: H100
	    min_gpu_count: "1"
	    cluster_name: ${{ env.KIND_CLUSTER_NAME }}

	# --- Install Karpenter before validation so cluster-autoscaling check passes ---

	# Must stay ahead of the "Validate cluster" step for the reason above.
	- name: Install Karpenter + KWOK (setup)
	  uses: ./.github/actions/install-karpenter-kwok
	  with:
	    cluster_name: ${{ env.KIND_CLUSTER_NAME }}

	# --- Validate cluster (Go conformance checks run inside K8s Jobs) ---
	# Includes self-contained secure-accelerator-access check (creates its own
	# DRA test resources, validates, and cleans up automatically).

	# Runs the deployment and conformance phases against the gpu-operator
	# namespace. Writes validation-result.yaml and the conformance-evidence
	# directory, both uploaded later by "Upload conformance evidence".
	# NOTE(review): recipe.yaml is not created in this file; presumably an
	# earlier local action generates it. Confirm.
	- name: Validate cluster
	  run: |
	    AICR_VALIDATOR_IMAGE_REGISTRY=ko.local \
	    ./aicr validate \
	      --recipe recipe.yaml \
	      --phase deployment \
	      --phase conformance \
	      --namespace gpu-operator \
	      --kubeconfig="${HOME}/.kube/config" \
	      --require-gpu \
	      --image=ko.local:smoke-test \
	      --timeout=10m \
	      --output=validation-result.yaml \
	      --evidence-dir=conformance-evidence

	# The step id lets later steps read its outputs
	# (steps.versions.outputs.chainsaw is consumed by "Install chainsaw").
	- name: Load versions
	  id: versions
	  uses: ./.github/actions/load-versions

	# Installs chainsaw through the shared build-tools action, at the version
	# exposed by the "Load versions" step.
	- name: Install chainsaw
	  uses: ./.github/actions/setup-build-tools
	  with:
	    install_chainsaw: "true"
	    chainsaw_version: "${{ steps.versions.outputs.chainsaw }}"

	# Runs the chainsaw tests under tests/chainsaw/ai-conformance/kind using
	# the shared chainsaw configuration file.
	- name: Run chainsaw health checks
	  run: |
	    chainsaw test \
	      --test-dir tests/chainsaw/ai-conformance/kind \
	      --config tests/chainsaw/chainsaw-config.yaml

	# --- Dynamo vLLM inference smoke test ---

	# Creates the "dynamo" scheduler queue, applies the smoke-test manifest and
	# polls the DynamoGraphDeployment Ready condition (60 attempts, 10 s apart).
	- name: Deploy Dynamo vLLM smoke test
	  run: |
	    # Create kai-scheduler queue for Dynamo (grove-operator sets kai.scheduler/queue=dynamo).
	    # The kai-scheduler chart creates default-parent-queue + default-queue on install,
	    # but Dynamo needs its own queue as a child of the parent.
	    kubectl --context="kind-${KIND_CLUSTER_NAME}" apply -f - <<'EOF'
	    apiVersion: scheduling.run.ai/v2
	    kind: Queue
	    metadata:
	      name: dynamo
	    spec:
	      parentQueue: default-parent-queue
	      resources:
	        gpu:
	          quota: 0
	          limit: -1
	          overQuotaWeight: 1
	        cpu:
	          quota: 0
	          limit: -1
	          overQuotaWeight: 1
	        memory:
	          quota: 0
	          limit: -1
	          overQuotaWeight: 1
	    EOF

	    kubectl --context="kind-${KIND_CLUSTER_NAME}" apply \
	      -f tests/manifests/dynamo-vllm-smoke-test.yaml -n dynamo-system

	    echo "Waiting for DynamoGraphDeployment to be reconciled..."
	    PHASE=""
	    for i in $(seq 1 60); do
	      # `|| true`: the default Actions shell runs with `bash -e`, so a single
	      # failed kubectl call here would otherwise abort the step instead of
	      # being retried on the next iteration.
	      PHASE=$(kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dynamo-system \
	        get dynamographdeployment vllm-smoke-test \
	        -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}' 2>/dev/null) || true
	      if [[ "${PHASE}" == "True" ]]; then
	        echo "DynamoGraphDeployment is ready."
	        break
	      fi
	      echo "Waiting for DGD readiness... (${i}/60)"
	      kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dynamo-system get pods 2>/dev/null || true
	      sleep 10
	    done

	    # The loop may have run out of attempts; dump the resource for triage.
	    if [[ "${PHASE}" != "True" ]]; then
	      echo "::error::DynamoGraphDeployment did not become ready within 10 minutes"
	      kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dynamo-system \
	        get dynamographdeployment vllm-smoke-test -o yaml 2>/dev/null || true
	      exit 1
	    fi

	    echo "Dynamo pods:"
	    kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dynamo-system get pods

	# Port-forwards the smoke-test frontend to localhost:8000, waits for a model
	# to be listed on /v1/models (30 attempts, 10 s apart), then sends one chat
	# completion and fails if the returned message content is empty or null.
	- name: Validate Dynamo inference
	  run: |
	    kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dynamo-system \
	      port-forward svc/vllm-smoke-test-frontend 8000:8000 &
	    PF_PID=$!

	    # Register the trap before any waiting so the background port-forward
	    # is killed on every exit path.
	    cleanup() { kill "${PF_PID}" 2>/dev/null || true; }
	    trap cleanup EXIT
	    sleep 3

	    echo "=== Waiting for /v1/models (model registration may take time after worker ready) ==="
	    MODELS='{"data":[]}'
	    for i in $(seq 1 30); do
	      # --max-time bounds each probe so a stalled connection cannot hang the loop.
	      MODELS=$(curl -sf --max-time 10 http://localhost:8000/v1/models 2>/dev/null || echo '{"data":[]}')
	      if echo "${MODELS}" | jq -e '.data | length > 0' >/dev/null 2>&1; then
	        echo "Models available after ${i} attempt(s)."
	        break
	      fi
	      echo "Waiting for model registration... (${i}/30)"
	      sleep 10
	    done
	    echo "${MODELS}" | jq .
	    if ! echo "${MODELS}" | jq -e '.data | length > 0' >/dev/null 2>&1; then
	      echo "::error::No models reported by frontend after 5 minutes"
	      exit 1
	    fi

	    echo "=== Sending chat completion ==="
	    # Bounded by --max-time and reported explicitly: without this a failed or
	    # hung request would end the step with no message (or at the job timeout).
	    # NOTE(review): 120 s is an assumed upper bound for a 30-token reply; tune if needed.
	    RESPONSE=$(curl -sf --max-time 120 http://localhost:8000/v1/chat/completions \
	      -H "Content-Type: application/json" \
	      -d '{"model":"Qwen/Qwen3-0.6B","messages":[{"role":"user","content":"What is 2+2?"}],"max_tokens":30,"stream":false}') || {
	      echo "::error::Chat completion request to vLLM frontend failed"
	      exit 1
	    }
	    echo "${RESPONSE}" | jq .

	    CONTENT=$(echo "${RESPONSE}" | jq -r '.choices[0].message.content')
	    if [[ -z "${CONTENT}" || "${CONTENT}" == "null" ]]; then
	      echo "::error::Empty response from vLLM"
	      exit 1
	    fi
	    echo "Dynamo vLLM inference smoke test passed."

	# --- Evidence collection ---

	# Runs even when earlier steps failed (if: always()). Invokes the Go program
	# under tests/chainsaw/ai-conformance/ with the kind test directory plus the
	# listed cluster assertion files.
	- name: Collect AI conformance evidence
	  if: always()
	  run: |
	    go run ./tests/chainsaw/ai-conformance/ \
	      --dir tests/chainsaw/ai-conformance/kind \
	      --file tests/chainsaw/ai-conformance/cluster/assert-crds.yaml \
	      --file tests/chainsaw/ai-conformance/cluster/assert-cert-manager.yaml \
	      --file tests/chainsaw/ai-conformance/cluster/assert-monitoring.yaml \
	      --file tests/chainsaw/ai-conformance/cluster/assert-kgateway.yaml \
	      --file tests/chainsaw/ai-conformance/cluster/assert-skyhook.yaml \
	      --file tests/chainsaw/ai-conformance/cluster/assert-dra-driver.yaml \
	      --file tests/chainsaw/ai-conformance/cluster/assert-kai-scheduler.yaml \
	      --file tests/chainsaw/ai-conformance/cluster/assert-dynamo.yaml \
	      --kubeconfig="${HOME}/.kube/config" \
	      --debug

	# Uploads the outputs of the "Validate cluster" step as one artifact.
	# Runs regardless of earlier failures and only warns when nothing is found.
	- name: Upload conformance evidence
	  if: always()
	  uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0
	  with:
	    name: conformance-evidence
	    path: |
	      conformance-evidence/
	      validation-result.yaml
	    if-no-files-found: warn

	# Best-effort state dump that runs only when an earlier step failed. Every
	# command discards stderr and ends in `|| true` so one failing query never
	# stops the remaining diagnostics.
	- name: Debug diagnostics
	  if: failure()
	  run: |
	    # Shorthand: kubectl pinned to the kind cluster context.
	    k() { kubectl --context="kind-${KIND_CLUSTER_NAME}" "$@"; }

	    echo "=== ClusterPolicy status ==="
	    k get clusterpolicy -o yaml 2>/dev/null || true
	    echo "=== GPU Operator pods ==="
	    k -n gpu-operator get pods -o wide 2>/dev/null || true
	    echo "=== Non-running pods (all namespaces) ==="
	    k get pods -A --field-selector=status.phase!=Running,status.phase!=Succeeded 2>/dev/null || true
	    echo "=== Recent events (gpu-operator) ==="
	    k -n gpu-operator get events --sort-by='.lastTimestamp' 2>/dev/null | tail -30 || true
	    echo "=== Dynamo pods ==="
	    k -n dynamo-system get pods -o wide 2>/dev/null || true
	    echo "=== DynamoGraphDeployment status ==="
	    k -n dynamo-system get dynamographdeployment -o yaml 2>/dev/null || true
	    echo "=== Dynamo vLLM frontend logs ==="
	    k -n dynamo-system \
	      logs deployment/vllm-smoke-test-frontend --tail=200 2>/dev/null || true
	    echo "=== Dynamo vLLM frontend previous logs ==="
	    k -n dynamo-system \
	      logs deployment/vllm-smoke-test-frontend --previous --tail=200 2>/dev/null || true
	    echo "=== Dynamo vLLM worker logs ==="
	    k -n dynamo-system \
	      logs deployment/vllm-smoke-test-vllmdecodeworker --tail=200 2>/dev/null || true
	    echo "=== Dynamo vLLM worker previous logs ==="
	    k -n dynamo-system \
	      logs deployment/vllm-smoke-test-vllmdecodeworker --previous --tail=200 2>/dev/null || true
	    echo "=== Dynamo operator logs ==="
	    k -n dynamo-system \
	      logs deployment/dynamo-operator-controller-manager --tail=100 -c manager 2>/dev/null || true
	    echo "=== Recent events (dynamo-system) ==="
	    k -n dynamo-system get events --sort-by='.lastTimestamp' 2>/dev/null | tail -30 || true
	    echo "=== Custom metrics API ==="
	    for METRIC in gpu_utilization gpu_memory_used gpu_power_usage; do
	      echo "--- ${METRIC} ---"
	      for NS in gpu-operator dynamo-system; do
	        k get --raw \
	          "/apis/custom.metrics.k8s.io/v1beta1/namespaces/${NS}/pods/*/${METRIC}" 2>/dev/null | jq . || true
	      done
	    done
	    echo "=== prometheus-adapter pods ==="
	    k -n monitoring get pods -l app.kubernetes.io/name=prometheus-adapter -o wide 2>/dev/null || true
	    echo "=== kgateway pods ==="
	    k -n kgateway-system get pods -o wide 2>/dev/null || true
	    echo "=== GatewayClass status ==="
	    k get gatewayclass -o yaml 2>/dev/null || true
	    echo "=== Gateway status ==="
	    k get gateways -A -o yaml 2>/dev/null || true
	    echo "=== DCGM Exporter pods ==="
	    k -n gpu-operator \
	      get pods -l app=nvidia-dcgm-exporter -o wide 2>/dev/null || true
	    echo "=== Monitoring pods ==="
	    k -n monitoring get pods -o wide 2>/dev/null || true
	    echo "=== DRA ResourceSlices ==="
	    k get resourceslices -o wide 2>/dev/null || true
	    echo "=== Node status ==="
	    k get nodes -o wide 2>/dev/null || true

	# Always deletes the smoke-test resources; missing resources and kubectl
	# errors are ignored so cleanup never fails the job.
	- name: Dynamo vLLM cleanup
	  if: always()
	  run: |
	    kubectl --context="kind-${KIND_CLUSTER_NAME}" delete \
	      -f tests/manifests/dynamo-vllm-smoke-test.yaml \
	      -n dynamo-system --ignore-not-found 2>/dev/null || true

	# Final step, executed on every outcome (if: always()); hands the cluster
	# name and an artifact name prefix to the repository-local cleanup action.
	- name: GPU Test Cleanup
	  if: always()
	  uses: ./.github/actions/gpu-test-cleanup
	  with:
	    cluster_name: ${{ env.KIND_CLUSTER_NAME }}
	    artifact_name_prefix: gpu-inference-test-debug

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

feat(recipes): add GKE COS training overlays for H100 #352

Workflow file

feat(recipes): add GKE COS training overlays for H100 #352

Uh oh!

Workflow file for this run