3636 - ' recipes/overlays/kind-inference.yaml'
3737 - ' recipes/overlays/h100-kind-inference.yaml'
3838 - ' recipes/overlays/h100-kind-inference-dynamo.yaml'
39+ - ' kwok/manifests/karpenter/**'
40+ - ' kwok/scripts/install-karpenter-kwok.sh'
41+ - ' kwok/scripts/validate-cluster-autoscaling.sh'
3942 workflow_dispatch : {} # Allow manual runs
4043
4144permissions :
@@ -169,6 +172,31 @@ jobs:
169172
170173 - name : Deploy Dynamo vLLM smoke test
171174 run : |
175+ # Create kai-scheduler queue for Dynamo (grove-operator sets kai.scheduler/queue=dynamo).
176+ # The kai-scheduler chart creates default-parent-queue + default-queue on install,
177+ # but Dynamo needs its own queue as a child of the parent.
178+ kubectl --context="kind-${KIND_CLUSTER_NAME}" apply -f - <<'EOF'
179+ apiVersion: scheduling.run.ai/v2
180+ kind: Queue
181+ metadata:
182+ name: dynamo
183+ spec:
184+ parentQueue: default-parent-queue
185+ resources:
186+ gpu:
187+ quota: 0
188+ limit: -1
189+ overQuotaWeight: 1
190+ cpu:
191+ quota: 0
192+ limit: -1
193+ overQuotaWeight: 1
194+ memory:
195+ quota: 0
196+ limit: -1
197+ overQuotaWeight: 1
198+ EOF
199+
172200 kubectl --context="kind-${KIND_CLUSTER_NAME}" apply \
173201 -f tests/manifests/dynamo-vllm-smoke-test.yaml -n dynamo-system
174202
@@ -230,6 +258,7 @@ jobs:
230258 # --- Accelerator & AI Service Metrics validation (CNCF AI Conformance #4/#5) ---
231259
232260 - name : Validate accelerator metrics
261+
233262 run : |
234263 echo "=== DCGM Exporter pod ==="
235264 kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator \
@@ -294,11 +323,14 @@ jobs:
294323 # → custom metrics API) that HPA consumes. Dynamo uses PodCliqueSets (not
295324 # Deployments), so we validate the API directly rather than creating an HPA.
296325 #
297- # Note: DCGM exporter runs as a DaemonSet in gpu-operator namespace, so
298- # Prometheus labels GPU metrics with namespace=gpu-operator. We query that
299- # namespace to validate the full metrics pipeline.
326+ # DCGM exporter pod-mapping relabels metrics with the GPU workload's
327+ # namespace/pod when a GPU is in use. Metrics may appear in gpu-operator
328+ # (idle GPU) or dynamo-system (active workload). prometheus-adapter also
329+ # needs relist cycles (30s each) to discover new label combinations, so
330+ # we poll with retries.
300331
301332 - name : Validate custom metrics for pod autoscaling
333+
302334 run : |
303335 echo "=== Custom metrics API availability ==="
304336 RESOURCES=$(kubectl --context="kind-${KIND_CLUSTER_NAME}" \
@@ -310,23 +342,27 @@ jobs:
310342 echo "Custom metrics API is available"
311343 echo "${RESOURCES}" | jq -r '.resources[].name' 2>/dev/null | head -20
312344
313- # DCGM exporter runs in gpu-operator namespace, so custom metrics are
314- # attributed to the DCGM exporter pod there (not workload pods).
315- METRICS_NS="gpu-operator"
345+ NAMESPACES="gpu-operator dynamo-system"
346+ METRICS="gpu_utilization gpu_memory_used gpu_power_usage"
316347
317- # At least one GPU metric must be available via the custom metrics API
348+ # Poll for up to 3 minutes — prometheus-adapter relists every 30s and
349+ # avg_over_time(...[2m]) queries need sufficient data points.
318350 HAS_METRICS=false
319- for METRIC in gpu_utilization gpu_memory_used gpu_power_usage; do
320- echo "=== Query ${METRIC} ==="
321- RESULT=$(kubectl --context="kind-${KIND_CLUSTER_NAME}" get --raw \
322- "/apis/custom.metrics.k8s.io/v1beta1/namespaces/${METRICS_NS}/pods/*/${METRIC}" 2>/dev/null)
323- if [[ -n "${RESULT}" ]] && echo "${RESULT}" | jq -e '.items | length > 0' >/dev/null 2>&1; then
324- echo "${METRIC} metrics available:"
325- echo "${RESULT}" | jq '.items[] | {pod: .describedObject.name, value: .value}' 2>/dev/null
326- HAS_METRICS=true
327- else
328- echo "::warning::${METRIC} not available in ${METRICS_NS} namespace"
329- fi
351+ for ATTEMPT in $(seq 1 18); do
352+ for METRIC in ${METRICS}; do
353+ for NS in ${NAMESPACES}; do
354+ RESULT=$(kubectl --context="kind-${KIND_CLUSTER_NAME}" get --raw \
355+ "/apis/custom.metrics.k8s.io/v1beta1/namespaces/${NS}/pods/*/${METRIC}" 2>/dev/null || true)
356+ if [[ -n "${RESULT}" ]] && echo "${RESULT}" | jq -e '.items | length > 0' >/dev/null 2>&1; then
357+ echo "${METRIC} metrics available in ${NS}:"
358+ echo "${RESULT}" | jq '.items[] | {pod: .describedObject.name, value: .value}' 2>/dev/null
359+ HAS_METRICS=true
360+ break 3
361+ fi
362+ done
363+ done
364+ echo "Waiting for custom metrics to appear... (${ATTEMPT}/18)"
365+ sleep 10
330366 done
331367
332368 if [[ "${HAS_METRICS}" != "true" ]]; then
@@ -336,9 +372,16 @@ jobs:
336372
337373 echo "Custom metrics pipeline validated — GPU metrics available for HPA consumption."
338374
375+ # --- Cluster Autoscaling validation ---
376+
377+ - name : Cluster Autoscaling (Karpenter + KWOK)
378+
379+ run : bash kwok/scripts/validate-cluster-autoscaling.sh
380+
339381 # --- DRA GPU allocation test ---
340382
341383 - name : Deploy DRA GPU test
384+
342385 run : |
343386 kubectl --context="kind-${KIND_CLUSTER_NAME}" apply \
344387 -f docs/conformance/cncf/manifests/dra-gpu-test.yaml
@@ -363,6 +406,7 @@ jobs:
363406 # --- Secure Accelerator Access validation (CNCF AI Conformance #3) ---
364407
365408 - name : Validate secure accelerator access
409+
366410 run : |
367411 echo "=== Verify DRA-mediated access (no hostPath, no device plugin) ==="
368412
@@ -474,8 +518,10 @@ jobs:
474518 echo "=== Custom metrics API ==="
475519 for METRIC in gpu_utilization gpu_memory_used gpu_power_usage; do
476520 echo "--- ${METRIC} ---"
477- kubectl --context="kind-${KIND_CLUSTER_NAME}" get --raw \
478- "/apis/custom.metrics.k8s.io/v1beta1/namespaces/gpu-operator/pods/*/${METRIC}" 2>/dev/null | jq . || true
521+ for NS in gpu-operator dynamo-system; do
522+ kubectl --context="kind-${KIND_CLUSTER_NAME}" get --raw \
523+ "/apis/custom.metrics.k8s.io/v1beta1/namespaces/${NS}/pods/*/${METRIC}" 2>/dev/null | jq . || true
524+ done
479525 done
480526 echo "=== prometheus-adapter pods ==="
481527 kubectl --context="kind-${KIND_CLUSTER_NAME}" -n monitoring get pods -l app.kubernetes.io/name=prometheus-adapter -o wide 2>/dev/null || true
0 commit comments