feat(recipes): add GKE COS training overlays for H100 #352
Workflow file for this run
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. | |
| # | |
| # Licensed under the Apache License, Version 2.0 (the "License"); | |
| # you may not use this file except in compliance with the License. | |
| # You may obtain a copy of the License at | |
| # | |
| # http://www.apache.org/licenses/LICENSE-2.0 | |
| # | |
| # Unless required by applicable law or agreed to in writing, software | |
| # distributed under the License is distributed on an "AS IS" BASIS, | |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| # See the License for the specific language governing permissions and | |
| # limitations under the License. | |
| name: GPU Inference Test (nvkind + H100) | |
| on: | |
| schedule: | |
| - cron: '15 6,18 * * *' # Every 12 hours (2x daily), offset from T4 smoke test | |
| push: | |
| branches: | |
| - "pull-request/[0-9]+" | |
| paths: | |
| - '.github/workflows/gpu-h100-inference-test.yaml' | |
| - '.github/actions/gpu-cluster-setup/**' | |
| - '.github/actions/gpu-operator-install/**' | |
| - '.github/actions/aicr-build/**' | |
| - 'validators/*/Dockerfile' | |
| - 'pkg/evidence/**' | |
| - 'pkg/validator/checks/conformance/**' | |
| - 'pkg/validator/checks/deployment/**' | |
| - '.github/actions/gpu-test-cleanup/**' | |
| - '.github/actions/load-versions/**' | |
| - 'tests/manifests/**' | |
| - 'tests/chainsaw/ai-conformance/**' | |
| - 'recipes/components/dynamo-platform/**' | |
| - 'recipes/components/prometheus-adapter/**' | |
| - 'recipes/overlays/kind.yaml' | |
| - 'recipes/overlays/kind-inference.yaml' | |
| - 'recipes/overlays/h100-kind-inference.yaml' | |
| - 'recipes/overlays/h100-kind-inference-dynamo.yaml' | |
| - 'kwok/manifests/karpenter/**' | |
| - 'kwok/scripts/install-karpenter-kwok.sh' | |
| workflow_dispatch: {} # Allow manual runs | |
| permissions: | |
| contents: read | |
| concurrency: | |
| group: ${{ github.workflow }}-${{ github.ref }} | |
| cancel-in-progress: true | |
| jobs: | |
| gpu-inference-test: | |
| name: GPU Inference Test (nvkind + H100) | |
| runs-on: linux-amd64-gpu-h100-latest-1 | |
| timeout-minutes: 45 | |
| env: | |
| KIND_CLUSTER_NAME: gpu-inference-test | |
| steps: | |
| - name: Checkout Code | |
| uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 | |
| with: | |
| persist-credentials: false | |
| - name: Set up GPU cluster | |
| uses: ./.github/actions/gpu-cluster-setup | |
| - name: Build aicr | |
| uses: ./.github/actions/aicr-build | |
| - name: Install GPU operator (bundle) | |
| uses: ./.github/actions/gpu-operator-install | |
| with: | |
| method: bundle | |
| accelerator: h100 | |
| platform: dynamo | |
| # --- Snapshot and validation --- | |
| - name: Snapshot and validate GPU | |
| uses: ./.github/actions/gpu-snapshot-validate | |
| with: | |
| gpu_model: H100 | |
| min_gpu_count: '1' | |
| cluster_name: ${{ env.KIND_CLUSTER_NAME }} | |
| # --- Install Karpenter before validation so cluster-autoscaling check passes --- | |
| - name: Install Karpenter + KWOK (setup) | |
| uses: ./.github/actions/install-karpenter-kwok | |
| with: | |
| cluster_name: ${{ env.KIND_CLUSTER_NAME }} | |
| # --- Validate cluster (Go conformance checks run inside K8s Jobs) --- | |
| # Includes self-contained secure-accelerator-access check (creates its own | |
| # DRA test resources, validates, and cleans up automatically). | |
| - name: Validate cluster | |
| run: | | |
| AICR_VALIDATOR_IMAGE_REGISTRY=ko.local \ | |
| ./aicr validate \ | |
| --recipe recipe.yaml \ | |
| --phase deployment \ | |
| --phase conformance \ | |
| --namespace gpu-operator \ | |
| --kubeconfig="${HOME}/.kube/config" \ | |
| --require-gpu \ | |
| --image=ko.local:smoke-test \ | |
| --timeout=10m \ | |
| --output=validation-result.yaml \ | |
| --evidence-dir=conformance-evidence | |
| - name: Load versions | |
| id: versions | |
| uses: ./.github/actions/load-versions | |
| - name: Install chainsaw | |
| uses: ./.github/actions/setup-build-tools | |
| with: | |
| install_chainsaw: 'true' | |
| chainsaw_version: '${{ steps.versions.outputs.chainsaw }}' | |
| - name: Run chainsaw health checks | |
| run: | | |
| chainsaw test \ | |
| --test-dir tests/chainsaw/ai-conformance/kind \ | |
| --config tests/chainsaw/chainsaw-config.yaml | |
| # --- Dynamo vLLM inference smoke test --- | |
| - name: Deploy Dynamo vLLM smoke test | |
| run: | | |
| # Create kai-scheduler queue for Dynamo (grove-operator sets kai.scheduler/queue=dynamo). | |
| # The kai-scheduler chart creates default-parent-queue + default-queue on install, | |
| # but Dynamo needs its own queue as a child of the parent. | |
| kubectl --context="kind-${KIND_CLUSTER_NAME}" apply -f - <<'EOF' | |
| apiVersion: scheduling.run.ai/v2 | |
| kind: Queue | |
| metadata: | |
| name: dynamo | |
| spec: | |
| parentQueue: default-parent-queue | |
| resources: | |
| gpu: | |
| quota: 0 | |
| limit: -1 | |
| overQuotaWeight: 1 | |
| cpu: | |
| quota: 0 | |
| limit: -1 | |
| overQuotaWeight: 1 | |
| memory: | |
| quota: 0 | |
| limit: -1 | |
| overQuotaWeight: 1 | |
| EOF | |
| kubectl --context="kind-${KIND_CLUSTER_NAME}" apply \ | |
| -f tests/manifests/dynamo-vllm-smoke-test.yaml -n dynamo-system | |
| echo "Waiting for DynamoGraphDeployment to be reconciled..." | |
| for i in $(seq 1 60); do | |
| PHASE=$(kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dynamo-system \ | |
| get dynamographdeployment vllm-smoke-test \ | |
| -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}' 2>/dev/null) | |
| if [[ "${PHASE}" == "True" ]]; then | |
| echo "DynamoGraphDeployment is ready." | |
| break | |
| fi | |
| echo "Waiting for DGD readiness... (${i}/60)" | |
| kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dynamo-system get pods 2>/dev/null || true | |
| sleep 10 | |
| done | |
| if [[ "${PHASE}" != "True" ]]; then | |
| echo "::error::DynamoGraphDeployment did not become ready within 10 minutes" | |
| kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dynamo-system \ | |
| get dynamographdeployment vllm-smoke-test -o yaml 2>/dev/null || true | |
| exit 1 | |
| fi | |
| echo "Dynamo pods:" | |
| kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dynamo-system get pods | |
| - name: Validate Dynamo inference | |
| run: | | |
| kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dynamo-system \ | |
| port-forward svc/vllm-smoke-test-frontend 8000:8000 & | |
| PF_PID=$! | |
| sleep 3 | |
| cleanup() { kill "${PF_PID}" 2>/dev/null || true; } | |
| trap cleanup EXIT | |
| echo "=== Waiting for /v1/models (model registration may take time after worker ready) ===" | |
| for i in $(seq 1 30); do | |
| MODELS=$(curl -sf http://localhost:8000/v1/models 2>/dev/null || echo '{"data":[]}') | |
| if echo "${MODELS}" | jq -e '.data | length > 0' >/dev/null 2>&1; then | |
| echo "Models available after ${i} attempt(s)." | |
| break | |
| fi | |
| echo "Waiting for model registration... (${i}/30)" | |
| sleep 10 | |
| done | |
| echo "${MODELS}" | jq . | |
| if ! echo "${MODELS}" | jq -e '.data | length > 0' >/dev/null 2>&1; then | |
| echo "::error::No models reported by frontend after 5 minutes" | |
| exit 1 | |
| fi | |
| echo "=== Sending chat completion ===" | |
| RESPONSE=$(curl -sf http://localhost:8000/v1/chat/completions \ | |
| -H "Content-Type: application/json" \ | |
| -d '{"model":"Qwen/Qwen3-0.6B","messages":[{"role":"user","content":"What is 2+2?"}],"max_tokens":30,"stream":false}') | |
| echo "${RESPONSE}" | jq . | |
| CONTENT=$(echo "${RESPONSE}" | jq -r '.choices[0].message.content') | |
| if [[ -z "${CONTENT}" || "${CONTENT}" == "null" ]]; then | |
| echo "::error::Empty response from vLLM" | |
| exit 1 | |
| fi | |
| echo "Dynamo vLLM inference smoke test passed." | |
| # --- Evidence collection --- | |
| - name: Collect AI conformance evidence | |
| if: always() | |
| run: | | |
| go run ./tests/chainsaw/ai-conformance/ \ | |
| --dir tests/chainsaw/ai-conformance/kind \ | |
| --file tests/chainsaw/ai-conformance/cluster/assert-crds.yaml \ | |
| --file tests/chainsaw/ai-conformance/cluster/assert-cert-manager.yaml \ | |
| --file tests/chainsaw/ai-conformance/cluster/assert-monitoring.yaml \ | |
| --file tests/chainsaw/ai-conformance/cluster/assert-kgateway.yaml \ | |
| --file tests/chainsaw/ai-conformance/cluster/assert-skyhook.yaml \ | |
| --file tests/chainsaw/ai-conformance/cluster/assert-dra-driver.yaml \ | |
| --file tests/chainsaw/ai-conformance/cluster/assert-kai-scheduler.yaml \ | |
| --file tests/chainsaw/ai-conformance/cluster/assert-dynamo.yaml \ | |
| --kubeconfig="${HOME}/.kube/config" \ | |
| --debug | |
| - name: Upload conformance evidence | |
| if: always() | |
| uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0 | |
| with: | |
| name: conformance-evidence | |
| path: | | |
| conformance-evidence/ | |
| validation-result.yaml | |
| if-no-files-found: warn | |
| - name: Debug diagnostics | |
| if: failure() | |
| run: | | |
| echo "=== ClusterPolicy status ===" | |
| kubectl --context="kind-${KIND_CLUSTER_NAME}" get clusterpolicy -o yaml 2>/dev/null || true | |
| echo "=== GPU Operator pods ===" | |
| kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator get pods -o wide 2>/dev/null || true | |
| echo "=== Non-running pods (all namespaces) ===" | |
| kubectl --context="kind-${KIND_CLUSTER_NAME}" get pods -A --field-selector=status.phase!=Running,status.phase!=Succeeded 2>/dev/null || true | |
| echo "=== Recent events (gpu-operator) ===" | |
| kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator get events --sort-by='.lastTimestamp' 2>/dev/null | tail -30 || true | |
| echo "=== Dynamo pods ===" | |
| kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dynamo-system get pods -o wide 2>/dev/null || true | |
| echo "=== DynamoGraphDeployment status ===" | |
| kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dynamo-system get dynamographdeployment -o yaml 2>/dev/null || true | |
| echo "=== Dynamo vLLM frontend logs ===" | |
| kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dynamo-system \ | |
| logs deployment/vllm-smoke-test-frontend --tail=200 2>/dev/null || true | |
| echo "=== Dynamo vLLM frontend previous logs ===" | |
| kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dynamo-system \ | |
| logs deployment/vllm-smoke-test-frontend --previous --tail=200 2>/dev/null || true | |
| echo "=== Dynamo vLLM worker logs ===" | |
| kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dynamo-system \ | |
| logs deployment/vllm-smoke-test-vllmdecodeworker --tail=200 2>/dev/null || true | |
| echo "=== Dynamo vLLM worker previous logs ===" | |
| kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dynamo-system \ | |
| logs deployment/vllm-smoke-test-vllmdecodeworker --previous --tail=200 2>/dev/null || true | |
| echo "=== Dynamo operator logs ===" | |
| kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dynamo-system \ | |
| logs deployment/dynamo-operator-controller-manager --tail=100 -c manager 2>/dev/null || true | |
| echo "=== Recent events (dynamo-system) ===" | |
| kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dynamo-system get events --sort-by='.lastTimestamp' 2>/dev/null | tail -30 || true | |
| echo "=== Custom metrics API ===" | |
| for METRIC in gpu_utilization gpu_memory_used gpu_power_usage; do | |
| echo "--- ${METRIC} ---" | |
| for NS in gpu-operator dynamo-system; do | |
| kubectl --context="kind-${KIND_CLUSTER_NAME}" get --raw \ | |
| "/apis/custom.metrics.k8s.io/v1beta1/namespaces/${NS}/pods/*/${METRIC}" 2>/dev/null | jq . || true | |
| done | |
| done | |
| echo "=== prometheus-adapter pods ===" | |
| kubectl --context="kind-${KIND_CLUSTER_NAME}" -n monitoring get pods -l app.kubernetes.io/name=prometheus-adapter -o wide 2>/dev/null || true | |
| echo "=== kgateway pods ===" | |
| kubectl --context="kind-${KIND_CLUSTER_NAME}" -n kgateway-system get pods -o wide 2>/dev/null || true | |
| echo "=== GatewayClass status ===" | |
| kubectl --context="kind-${KIND_CLUSTER_NAME}" get gatewayclass -o yaml 2>/dev/null || true | |
| echo "=== Gateway status ===" | |
| kubectl --context="kind-${KIND_CLUSTER_NAME}" get gateways -A -o yaml 2>/dev/null || true | |
| echo "=== DCGM Exporter pods ===" | |
| kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator \ | |
| get pods -l app=nvidia-dcgm-exporter -o wide 2>/dev/null || true | |
| echo "=== Monitoring pods ===" | |
| kubectl --context="kind-${KIND_CLUSTER_NAME}" -n monitoring get pods -o wide 2>/dev/null || true | |
| echo "=== DRA ResourceSlices ===" | |
| kubectl --context="kind-${KIND_CLUSTER_NAME}" get resourceslices -o wide 2>/dev/null || true | |
| echo "=== Node status ===" | |
| kubectl --context="kind-${KIND_CLUSTER_NAME}" get nodes -o wide 2>/dev/null || true | |
| - name: Dynamo vLLM cleanup | |
| if: always() | |
| run: | | |
| kubectl --context="kind-${KIND_CLUSTER_NAME}" delete \ | |
| -f tests/manifests/dynamo-vllm-smoke-test.yaml \ | |
| -n dynamo-system --ignore-not-found 2>/dev/null || true | |
| - name: GPU Test Cleanup | |
| if: always() | |
| uses: ./.github/actions/gpu-test-cleanup | |
| with: | |
| cluster_name: ${{ env.KIND_CLUSTER_NAME }} | |
| artifact_name_prefix: gpu-inference-test-debug |