2525 - ' .github/actions/gpu-cluster-setup/**'
2626 - ' .github/actions/gpu-operator-install/**'
2727 - ' .github/actions/aicr-build/**'
28+ - ' Dockerfile.validator'
29+ - ' pkg/validator/checks/conformance/**'
2830 - ' .github/actions/gpu-test-cleanup/**'
2931 - ' .github/actions/load-versions/**'
3032 - ' tests/manifests/**'
@@ -107,6 +109,39 @@ jobs:
107109 fi
108110 echo "Snapshot correctly detected ${GPU_COUNT}x ${GPU_MODEL}"
109111
# --- Deploy DRA test pod (prerequisite for secure-accelerator-access check) ---

- name: Deploy DRA GPU test
  run: |
    kubectl --context="kind-${KIND_CLUSTER_NAME}" apply \
      -f docs/conformance/cncf/manifests/dra-gpu-test.yaml

    echo "Waiting for DRA GPU test pod to complete..."
    # Poll the pod phase rather than `kubectl wait --for=jsonpath=...=Succeeded`:
    # that form blocks for the entire --timeout even when the pod has already
    # reached Failed (image-pull error, scheduling failure), stalling the job
    # before we get to the diagnostic dump below.
    PHASE=""
    for _ in {1..60}; do  # 60 x 2s ≈ 120s, matching the previous --timeout=120s
      PHASE=$(kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dra-test \
        get pod/dra-gpu-test -o jsonpath='{.status.phase}' 2>/dev/null || true)
      if [[ "${PHASE}" == "Succeeded" || "${PHASE}" == "Failed" ]]; then
        break
      fi
      sleep 2
    done

    if [[ "${PHASE}" == "Succeeded" ]]; then
      echo "DRA GPU allocation test passed."
    else
      echo "::error::DRA GPU test pod did not succeed"
      # Best-effort diagnostics; the pod may not exist or have no logs yet.
      kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dra-test \
        logs pod/dra-gpu-test 2>/dev/null || true
      kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dra-test \
        get pod/dra-gpu-test -o yaml 2>/dev/null || true
      exit 1
    fi

    echo "=== DRA GPU test logs ==="
    kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dra-test \
      logs pod/dra-gpu-test
135+
# --- Install Karpenter before validation so cluster-autoscaling check passes ---

- name: Install Karpenter + KWOK (setup)
  run: bash kwok/scripts/validate-cluster-autoscaling.sh --setup
140+
141+ # --- Validate cluster (Go conformance checks run inside K8s Jobs) ---
142+ # Replaces previous bash assertion steps for: inference-gateway,
143+ # accelerator-metrics, pod-autoscaling, secure-accelerator-access.
144+
110145 - name : Validate cluster
111146 run : |
112147 ./aicr validate \
@@ -131,43 +166,6 @@ jobs:
131166 --test-dir tests/chainsaw/ai-conformance/kind \
132167 --config tests/chainsaw/chainsaw-config.yaml
133168
134- # --- Inference Gateway validation (CNCF AI Conformance #6) ---
135-
136- - name : Validate inference gateway
137- run : |
138- echo "=== GatewayClass ==="
139- GC_STATUS=$(kubectl --context="kind-${KIND_CLUSTER_NAME}" \
140- get gatewayclass kgateway \
141- -o jsonpath='{.status.conditions[?(@.type=="Accepted")].status}' 2>/dev/null)
142- echo "GatewayClass accepted: ${GC_STATUS}"
143- if [[ "${GC_STATUS}" != "True" ]]; then
144- echo "::error::GatewayClass 'kgateway' not accepted"
145- kubectl --context="kind-${KIND_CLUSTER_NAME}" get gatewayclass -o yaml 2>/dev/null || true
146- exit 1
147- fi
148-
149- echo "=== Gateway ==="
150- GW_STATUS=$(kubectl --context="kind-${KIND_CLUSTER_NAME}" \
151- get gateway inference-gateway -n kgateway-system \
152- -o jsonpath='{.status.conditions[?(@.type=="Programmed")].status}' 2>/dev/null)
153- echo "Gateway programmed: ${GW_STATUS}"
154- if [[ "${GW_STATUS}" != "True" ]]; then
155- echo "::error::Gateway 'inference-gateway' not programmed"
156- kubectl --context="kind-${KIND_CLUSTER_NAME}" \
157- get gateway inference-gateway -n kgateway-system -o yaml 2>/dev/null || true
158- exit 1
159- fi
160-
161- echo "=== Gateway API CRDs ==="
162- kubectl --context="kind-${KIND_CLUSTER_NAME}" get crds 2>/dev/null | \
163- grep -E "gateway\.networking\.k8s\.io" || true
164-
165- echo "=== Inference extension CRDs ==="
166- kubectl --context="kind-${KIND_CLUSTER_NAME}" get crds 2>/dev/null | \
167- grep -E "inference\.networking" || true
168-
169- echo "Inference gateway validation passed."
170-
171169 # --- Dynamo vLLM inference smoke test ---
172170
173171 - name : Deploy Dynamo vLLM smoke test
@@ -255,209 +253,10 @@ jobs:
255253 fi
256254 echo "Dynamo vLLM inference smoke test passed."
257255
258- # --- Accelerator & AI Service Metrics validation (CNCF AI Conformance #4/#5) ---
259-
260- - name : Validate accelerator metrics
261-
262- run : |
263- echo "=== DCGM Exporter pod ==="
264- kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator \
265- get pods -l app=nvidia-dcgm-exporter -o wide
266- DCGM_POD=$(kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator \
267- get pods -l app=nvidia-dcgm-exporter -o jsonpath='{.items[0].metadata.name}' 2>/dev/null)
268- if [[ -z "${DCGM_POD}" ]]; then
269- echo "::error::DCGM Exporter pod not found"
270- exit 1
271- fi
272- echo "DCGM Exporter pod: ${DCGM_POD}"
273-
274- echo "=== Query DCGM metrics endpoint ==="
275- METRICS=$(kubectl --context="kind-${KIND_CLUSTER_NAME}" run dcgm-probe \
276- --rm -i --restart=Never --image=curlimages/curl \
277- -- curl -sf http://nvidia-dcgm-exporter.gpu-operator.svc:9400/metrics 2>/dev/null)
278-
279- for METRIC in DCGM_FI_DEV_GPU_UTIL DCGM_FI_DEV_FB_USED DCGM_FI_DEV_GPU_TEMP DCGM_FI_DEV_POWER_USAGE; do
280- if echo "${METRICS}" | grep -q "^${METRIC}"; then
281- echo "${METRIC}: $(echo "${METRICS}" | grep "^${METRIC}" | head -1)"
282- else
283- echo "::warning::Metric ${METRIC} not found in DCGM output"
284- fi
285- done
286-
287- echo "=== Prometheus scraping GPU metrics ==="
288- kubectl --context="kind-${KIND_CLUSTER_NAME}" -n monitoring \
289- port-forward svc/kube-prometheus-prometheus 9090:9090 &
290- PF_PID=$!
291- sleep 3
292-
293- cleanup_pf() { kill "${PF_PID}" 2>/dev/null || true; }
294- trap cleanup_pf EXIT
295-
296- for METRIC in DCGM_FI_DEV_GPU_UTIL DCGM_FI_DEV_FB_USED DCGM_FI_DEV_GPU_TEMP DCGM_FI_DEV_POWER_USAGE; do
297- RESULT=$(curl -sf "http://localhost:9090/api/v1/query?query=${METRIC}" 2>/dev/null)
298- COUNT=$(echo "${RESULT}" | jq -r '.data.result | length' 2>/dev/null)
299- if [[ "${COUNT}" -gt 0 ]]; then
300- echo "${METRIC}: ${COUNT} time series in Prometheus"
301- else
302- echo "::warning::${METRIC} not found in Prometheus (may need more scrape time)"
303- fi
304- done
305-
306- kill "${PF_PID}" 2>/dev/null || true
307- trap - EXIT
308-
309- echo "=== Custom Metrics API ==="
310- CUSTOM_METRICS=$(kubectl --context="kind-${KIND_CLUSTER_NAME}" \
311- get --raw /apis/custom.metrics.k8s.io/v1beta1 2>/dev/null)
312- if [[ -n "${CUSTOM_METRICS}" ]]; then
313- echo "Custom metrics API is available"
314- echo "${CUSTOM_METRICS}" | jq -r '.resources[].name' 2>/dev/null | head -20 || true
315- else
316- echo "::warning::Custom metrics API not available (prometheus-adapter may need time)"
317- fi
318-
319- echo "Accelerator metrics validation passed."
320-
321- # --- Pod Autoscaling readiness validation (CNCF AI Conformance #8b) ---
322- # Validates the custom metrics pipeline (DCGM → Prometheus → prometheus-adapter
323- # → custom metrics API) that HPA consumes. Dynamo uses PodCliqueSets (not
324- # Deployments), so we validate the API directly rather than creating an HPA.
325- #
326- # DCGM exporter pod-mapping relabels metrics with the GPU workload's
327- # namespace/pod when a GPU is in use. Metrics may appear in gpu-operator
328- # (idle GPU) or dynamo-system (active workload). prometheus-adapter also
329- # needs relist cycles (30s each) to discover new label combinations, so
330- # we poll with retries.
331-
332- - name : Validate custom metrics for pod autoscaling
333-
334- run : |
335- echo "=== Custom metrics API availability ==="
336- RESOURCES=$(kubectl --context="kind-${KIND_CLUSTER_NAME}" \
337- get --raw /apis/custom.metrics.k8s.io/v1beta1 2>/dev/null)
338- if [[ -z "${RESOURCES}" ]]; then
339- echo "::error::Custom metrics API not available"
340- exit 1
341- fi
342- echo "Custom metrics API is available"
343- echo "${RESOURCES}" | jq -r '.resources[].name' 2>/dev/null | head -20
344-
345- NAMESPACES="gpu-operator dynamo-system"
346- METRICS="gpu_utilization gpu_memory_used gpu_power_usage"
347-
348- # Poll for up to 3 minutes — prometheus-adapter relists every 30s and
349- # avg_over_time(...[2m]) queries need sufficient data points.
350- HAS_METRICS=false
351- for ATTEMPT in $(seq 1 18); do
352- for METRIC in ${METRICS}; do
353- for NS in ${NAMESPACES}; do
354- RESULT=$(kubectl --context="kind-${KIND_CLUSTER_NAME}" get --raw \
355- "/apis/custom.metrics.k8s.io/v1beta1/namespaces/${NS}/pods/*/${METRIC}" 2>/dev/null || true)
356- if [[ -n "${RESULT}" ]] && echo "${RESULT}" | jq -e '.items | length > 0' >/dev/null 2>&1; then
357- echo "${METRIC} metrics available in ${NS}:"
358- echo "${RESULT}" | jq '.items[] | {pod: .describedObject.name, value: .value}' 2>/dev/null
359- HAS_METRICS=true
360- break 3
361- fi
362- done
363- done
364- echo "Waiting for custom metrics to appear... (${ATTEMPT}/18)"
365- sleep 10
366- done
367-
368- if [[ "${HAS_METRICS}" != "true" ]]; then
369- echo "::error::No GPU custom metrics available via custom metrics API (prometheus-adapter pipeline broken)"
370- exit 1
371- fi
372-
373- echo "Custom metrics pipeline validated — GPU metrics available for HPA consumption."
374-
375256 # --- Cluster Autoscaling validation ---
376257
377258 - name : Cluster Autoscaling (Karpenter + KWOK)
378-
379- run : bash kwok/scripts/validate-cluster-autoscaling.sh
380-
381- # --- DRA GPU allocation test ---
382-
383- - name : Deploy DRA GPU test
384-
385- run : |
386- kubectl --context="kind-${KIND_CLUSTER_NAME}" apply \
387- -f docs/conformance/cncf/manifests/dra-gpu-test.yaml
388-
389- echo "Waiting for DRA GPU test pod to complete..."
390- if kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dra-test \
391- wait --for=jsonpath='{.status.phase}'=Succeeded pod/dra-gpu-test --timeout=120s; then
392- echo "DRA GPU allocation test passed."
393- else
394- echo "::error::DRA GPU test pod did not succeed"
395- kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dra-test \
396- logs pod/dra-gpu-test 2>/dev/null || true
397- kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dra-test \
398- get pod/dra-gpu-test -o yaml 2>/dev/null || true
399- exit 1
400- fi
401-
402- echo "=== DRA GPU test logs ==="
403- kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dra-test \
404- logs pod/dra-gpu-test
405-
406- # --- Secure Accelerator Access validation (CNCF AI Conformance #3) ---
407-
408- - name : Validate secure accelerator access
409-
410- run : |
411- echo "=== Verify DRA-mediated access (no hostPath, no device plugin) ==="
412-
413- # Check pod uses resourceClaims (DRA), not resources.limits (device plugin)
414- RESOURCE_CLAIMS=$(kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dra-test \
415- get pod/dra-gpu-test -o jsonpath='{.spec.resourceClaims}' 2>/dev/null)
416- if [[ -z "${RESOURCE_CLAIMS}" || "${RESOURCE_CLAIMS}" == "null" ]]; then
417- echo "::error::Pod does not use DRA resourceClaims"
418- exit 1
419- fi
420- echo "Pod uses DRA resourceClaims: ${RESOURCE_CLAIMS}"
421-
422- # Verify no nvidia.com/gpu in resources.limits (device plugin pattern)
423- GPU_LIMITS=$(kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dra-test \
424- get pod/dra-gpu-test \
425- -o jsonpath='{.spec.containers[0].resources.limits.nvidia\.com/gpu}' 2>/dev/null)
426- if [[ -n "${GPU_LIMITS}" && "${GPU_LIMITS}" != "null" ]]; then
427- echo "::error::Pod uses device plugin (nvidia.com/gpu limits) instead of DRA"
428- exit 1
429- fi
430- echo "No device plugin resources.limits — GPU access via DRA only"
431-
432- # Verify no hostPath volumes to /dev/nvidia*
433- VOLUMES=$(kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dra-test \
434- get pod/dra-gpu-test -o jsonpath='{.spec.volumes}' 2>/dev/null)
435- if echo "${VOLUMES}" | grep -q "hostPath" && echo "${VOLUMES}" | grep -q "/dev/nvidia"; then
436- echo "::error::Pod has hostPath volume mount to /dev/nvidia*"
437- exit 1
438- fi
439- echo "No hostPath volumes to /dev/nvidia* — access is DRA-mediated"
440-
441- # Verify container security (no privilege escalation)
442- PRIV_ESC=$(kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dra-test \
443- get pod/dra-gpu-test \
444- -o jsonpath='{.spec.containers[0].securityContext.allowPrivilegeEscalation}' 2>/dev/null)
445- echo "allowPrivilegeEscalation: ${PRIV_ESC}"
446-
447- # Verify only 1 GPU visible (allocated count matches)
448- GPU_COUNT=$(kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dra-test \
449- logs pod/dra-gpu-test 2>/dev/null | grep -c "/dev/nvidia[0-9]" || echo "0")
450- echo "GPU devices visible in container: ${GPU_COUNT}"
451- if [[ "${GPU_COUNT}" -lt 1 ]]; then
452- echo "::error::No GPU devices visible in container"
453- exit 1
454- fi
455-
456- echo "=== ResourceClaim allocation ==="
457- kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dra-test \
458- get resourceclaim gpu-claim -o wide
459-
460- echo "Secure accelerator access validation passed."
259+ run : bash kwok/scripts/validate-cluster-autoscaling.sh --exercise
461260
462261 - name : DRA GPU test cleanup
463262 if : always()