# PR #328 — feat(validator): add Kubeflow Trainer support to robust-controller check
# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
---
name: GPU Inference Test (nvkind + H100)

on:
  schedule:
    - cron: '15 6,18 * * *' # Every 12 hours (2x daily), offset from T4 smoke test
  push:
    branches:
      - "pull-request/[0-9]+"
    # Only run when files that affect this workflow's test surface change.
    paths:
      - '.github/workflows/gpu-h100-inference-test.yaml'
      - '.github/actions/gpu-cluster-setup/**'
      - '.github/actions/gpu-operator-install/**'
      - '.github/actions/aicr-build/**'
      - 'validators/*/Dockerfile'
      - 'pkg/evidence/**'
      - 'pkg/validator/checks/conformance/**'
      - 'pkg/validator/checks/deployment/**'
      - '.github/actions/gpu-test-cleanup/**'
      - '.github/actions/load-versions/**'
      - 'tests/manifests/**'
      - 'tests/chainsaw/ai-conformance/**'
      - 'recipes/components/dynamo-platform/**'
      - 'recipes/components/prometheus-adapter/**'
      - 'recipes/overlays/kind.yaml'
      - 'recipes/overlays/kind-inference.yaml'
      - 'recipes/overlays/h100-kind-inference.yaml'
      - 'recipes/overlays/h100-kind-inference-dynamo.yaml'
      - 'kwok/manifests/karpenter/**'
      - 'kwok/scripts/install-karpenter-kwok.sh'
  workflow_dispatch: {} # Allow manual runs

# Least-privilege token: this workflow only reads repository contents.
permissions:
  contents: read

# Cancel an in-flight run for the same ref when a newer one starts.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true
jobs:
  gpu-inference-test:
    name: GPU Inference Test (nvkind + H100)
    # Self-hosted H100 runner; the whole job is bounded to 45 minutes.
    runs-on: linux-amd64-gpu-h100-latest-1
    timeout-minutes: 45
    env:
      KIND_CLUSTER_NAME: gpu-inference-test
    steps:
      - name: Checkout Code
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
        with:
          persist-credentials: false

      - name: Set up GPU cluster
        uses: ./.github/actions/gpu-cluster-setup

      - name: Build aicr
        uses: ./.github/actions/aicr-build

      - name: Install GPU operator (bundle)
        uses: ./.github/actions/gpu-operator-install
        with:
          method: bundle
          accelerator: h100
          platform: dynamo

      # --- Snapshot and validation ---
      - name: Snapshot and validate GPU
        uses: ./.github/actions/gpu-snapshot-validate
        with:
          gpu_model: H100
          min_gpu_count: '1'
          cluster_name: ${{ env.KIND_CLUSTER_NAME }}

      # --- Install Karpenter before validation so cluster-autoscaling check passes ---
      - name: Install Karpenter + KWOK (setup)
        uses: ./.github/actions/install-karpenter-kwok
        with:
          cluster_name: ${{ env.KIND_CLUSTER_NAME }}

      # --- Validate cluster (Go conformance checks run inside K8s Jobs) ---
      # Includes self-contained secure-accelerator-access check (creates its own
      # DRA test resources, validates, and cleans up automatically).
      - name: Validate cluster
        run: |
          AICR_VALIDATOR_IMAGE_REGISTRY=ko.local \
          ./aicr validate \
            --recipe recipe.yaml \
            --phase deployment \
            --phase conformance \
            --namespace gpu-operator \
            --kubeconfig="${HOME}/.kube/config" \
            --require-gpu \
            --image=ko.local:smoke-test \
            --timeout=10m \
            --output=validation-result.yaml \
            --evidence-dir=conformance-evidence

      - name: Load versions
        id: versions
        uses: ./.github/actions/load-versions

      - name: Install chainsaw
        uses: ./.github/actions/setup-build-tools
        with:
          install_chainsaw: 'true'
          chainsaw_version: '${{ steps.versions.outputs.chainsaw }}'

      - name: Run chainsaw health checks
        run: |
          chainsaw test \
            --test-dir tests/chainsaw/ai-conformance/kind \
            --config tests/chainsaw/chainsaw-config.yaml

      # --- Dynamo vLLM inference smoke test ---
      - name: Deploy Dynamo vLLM smoke test
        run: |
          # Create kai-scheduler queue for Dynamo (grove-operator sets kai.scheduler/queue=dynamo).
          # The kai-scheduler chart creates default-parent-queue + default-queue on install,
          # but Dynamo needs its own queue as a child of the parent.
          kubectl --context="kind-${KIND_CLUSTER_NAME}" apply -f - <<'EOF'
          apiVersion: scheduling.run.ai/v2
          kind: Queue
          metadata:
            name: dynamo
          spec:
            parentQueue: default-parent-queue
            resources:
              gpu:
                quota: 0
                limit: -1
                overQuotaWeight: 1
              cpu:
                quota: 0
                limit: -1
                overQuotaWeight: 1
              memory:
                quota: 0
                limit: -1
                overQuotaWeight: 1
          EOF
          kubectl --context="kind-${KIND_CLUSTER_NAME}" apply \
            -f tests/manifests/dynamo-vllm-smoke-test.yaml -n dynamo-system
          echo "Waiting for DynamoGraphDeployment to be reconciled..."
          # Poll the Ready condition for up to 10 minutes (60 x 10s).
          for i in $(seq 1 60); do
            PHASE=$(kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dynamo-system \
              get dynamographdeployment vllm-smoke-test \
              -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}' 2>/dev/null)
            if [[ "${PHASE}" == "True" ]]; then
              echo "DynamoGraphDeployment is ready."
              break
            fi
            echo "Waiting for DGD readiness... (${i}/60)"
            kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dynamo-system get pods 2>/dev/null || true
            sleep 10
          done
          if [[ "${PHASE}" != "True" ]]; then
            echo "::error::DynamoGraphDeployment did not become ready within 10 minutes"
            kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dynamo-system \
              get dynamographdeployment vllm-smoke-test -o yaml 2>/dev/null || true
            exit 1
          fi
          echo "Dynamo pods:"
          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dynamo-system get pods

      - name: Validate Dynamo inference
        run: |
          # Port-forward the frontend service; the trap tears it down on any exit path.
          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dynamo-system \
            port-forward svc/vllm-smoke-test-frontend 8000:8000 &
          PF_PID=$!
          sleep 3
          cleanup() { kill "${PF_PID}" 2>/dev/null || true; }
          trap cleanup EXIT
          echo "=== Waiting for /v1/models (model registration may take time after worker ready) ==="
          # Poll for up to 5 minutes (30 x 10s) until at least one model is registered.
          for i in $(seq 1 30); do
            MODELS=$(curl -sf http://localhost:8000/v1/models 2>/dev/null || echo '{"data":[]}')
            if echo "${MODELS}" | jq -e '.data | length > 0' >/dev/null 2>&1; then
              echo "Models available after ${i} attempt(s)."
              break
            fi
            echo "Waiting for model registration... (${i}/30)"
            sleep 10
          done
          echo "${MODELS}" | jq .
          if ! echo "${MODELS}" | jq -e '.data | length > 0' >/dev/null 2>&1; then
            echo "::error::No models reported by frontend after 5 minutes"
            exit 1
          fi
          echo "=== Sending chat completion ==="
          RESPONSE=$(curl -sf http://localhost:8000/v1/chat/completions \
            -H "Content-Type: application/json" \
            -d '{"model":"Qwen/Qwen3-0.6B","messages":[{"role":"user","content":"What is 2+2?"}],"max_tokens":30,"stream":false}')
          echo "${RESPONSE}" | jq .
          CONTENT=$(echo "${RESPONSE}" | jq -r '.choices[0].message.content')
          if [[ -z "${CONTENT}" || "${CONTENT}" == "null" ]]; then
            echo "::error::Empty response from vLLM"
            exit 1
          fi
          echo "Dynamo vLLM inference smoke test passed."

      # --- Evidence collection ---
      - name: Collect AI conformance evidence
        if: always()
        run: |
          go run ./tests/chainsaw/ai-conformance/ \
            --dir tests/chainsaw/ai-conformance/kind \
            --file tests/chainsaw/ai-conformance/cluster/assert-crds.yaml \
            --file tests/chainsaw/ai-conformance/cluster/assert-cert-manager.yaml \
            --file tests/chainsaw/ai-conformance/cluster/assert-monitoring.yaml \
            --file tests/chainsaw/ai-conformance/cluster/assert-kgateway.yaml \
            --file tests/chainsaw/ai-conformance/cluster/assert-skyhook.yaml \
            --file tests/chainsaw/ai-conformance/cluster/assert-dra-driver.yaml \
            --file tests/chainsaw/ai-conformance/cluster/assert-kai-scheduler.yaml \
            --file tests/chainsaw/ai-conformance/cluster/assert-dynamo.yaml \
            --kubeconfig="${HOME}/.kube/config" \
            --debug

      - name: Upload conformance evidence
        if: always()
        uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f # v6.0.0
        with:
          name: conformance-evidence
          path: |
            conformance-evidence/
            validation-result.yaml
          if-no-files-found: warn

      # Best-effort diagnostics on failure; every command tolerates absence
      # of the resource it inspects (2>/dev/null || true).
      - name: Debug diagnostics
        if: failure()
        run: |
          echo "=== ClusterPolicy status ==="
          kubectl --context="kind-${KIND_CLUSTER_NAME}" get clusterpolicy -o yaml 2>/dev/null || true
          echo "=== GPU Operator pods ==="
          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator get pods -o wide 2>/dev/null || true
          echo "=== Non-running pods (all namespaces) ==="
          kubectl --context="kind-${KIND_CLUSTER_NAME}" get pods -A --field-selector=status.phase!=Running,status.phase!=Succeeded 2>/dev/null || true
          echo "=== Recent events (gpu-operator) ==="
          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator get events --sort-by='.lastTimestamp' 2>/dev/null | tail -30 || true
          echo "=== Dynamo pods ==="
          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dynamo-system get pods -o wide 2>/dev/null || true
          echo "=== DynamoGraphDeployment status ==="
          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dynamo-system get dynamographdeployment -o yaml 2>/dev/null || true
          echo "=== Dynamo vLLM frontend logs ==="
          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dynamo-system \
            logs deployment/vllm-smoke-test-frontend --tail=200 2>/dev/null || true
          echo "=== Dynamo vLLM frontend previous logs ==="
          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dynamo-system \
            logs deployment/vllm-smoke-test-frontend --previous --tail=200 2>/dev/null || true
          echo "=== Dynamo vLLM worker logs ==="
          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dynamo-system \
            logs deployment/vllm-smoke-test-vllmdecodeworker --tail=200 2>/dev/null || true
          echo "=== Dynamo vLLM worker previous logs ==="
          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dynamo-system \
            logs deployment/vllm-smoke-test-vllmdecodeworker --previous --tail=200 2>/dev/null || true
          echo "=== Dynamo operator logs ==="
          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dynamo-system \
            logs deployment/dynamo-operator-controller-manager --tail=100 -c manager 2>/dev/null || true
          echo "=== Recent events (dynamo-system) ==="
          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dynamo-system get events --sort-by='.lastTimestamp' 2>/dev/null | tail -30 || true
          echo "=== Custom metrics API ==="
          for METRIC in gpu_utilization gpu_memory_used gpu_power_usage; do
            echo "--- ${METRIC} ---"
            for NS in gpu-operator dynamo-system; do
              kubectl --context="kind-${KIND_CLUSTER_NAME}" get --raw \
                "/apis/custom.metrics.k8s.io/v1beta1/namespaces/${NS}/pods/*/${METRIC}" 2>/dev/null | jq . || true
            done
          done
          echo "=== prometheus-adapter pods ==="
          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n monitoring get pods -l app.kubernetes.io/name=prometheus-adapter -o wide 2>/dev/null || true
          echo "=== kgateway pods ==="
          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n kgateway-system get pods -o wide 2>/dev/null || true
          echo "=== GatewayClass status ==="
          kubectl --context="kind-${KIND_CLUSTER_NAME}" get gatewayclass -o yaml 2>/dev/null || true
          echo "=== Gateway status ==="
          kubectl --context="kind-${KIND_CLUSTER_NAME}" get gateways -A -o yaml 2>/dev/null || true
          echo "=== DCGM Exporter pods ==="
          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator \
            get pods -l app=nvidia-dcgm-exporter -o wide 2>/dev/null || true
          echo "=== Monitoring pods ==="
          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n monitoring get pods -o wide 2>/dev/null || true
          echo "=== DRA ResourceSlices ==="
          kubectl --context="kind-${KIND_CLUSTER_NAME}" get resourceslices -o wide 2>/dev/null || true
          echo "=== Node status ==="
          kubectl --context="kind-${KIND_CLUSTER_NAME}" get nodes -o wide 2>/dev/null || true

      - name: Dynamo vLLM cleanup
        if: always()
        run: |
          kubectl --context="kind-${KIND_CLUSTER_NAME}" delete \
            -f tests/manifests/dynamo-vllm-smoke-test.yaml \
            -n dynamo-system --ignore-not-found 2>/dev/null || true

      - name: GPU Test Cleanup
        if: always()
        uses: ./.github/actions/gpu-test-cleanup
        with:
          cluster_name: ${{ env.KIND_CLUSTER_NAME }}
          artifact_name_prefix: gpu-inference-test-debug