Skip to content

Commit 3e408d3

Browse files
committed
feat(ci): add metrics-driven cluster autoscaling validation with Karpenter + KWOK
Add CNCF AI Conformance #8a (cluster_autoscaling) validation to both H100 GPU workflows.

This tests the full metrics-driven autoscaling chain: DCGM metrics → Prometheus → prometheus-adapter external metric → HPA scales Deployment → pending GPU pods → Karpenter provisions KWOK nodes → pods schedule → consolidation on cleanup.

Key changes:
- Add Karpenter KWOK provider install script (build from source via ko, side-load into kind, deploy via Helm)
- Add GPU instance types, NodePool, and KWOKNodeClass manifests
- Add Deployment + HPA manifest using external dcgm_gpu_memory_used metric
- Add external metrics rules and workload-attributed pod metrics to prometheus-adapter for HPA consumption
- Add cluster autoscaling step to both inference and training workflows

Signed-off-by: Davanum Srinivas <dsrinivas@nvidia.com>
1 parent e55f9a2 commit 3e408d3

File tree

9 files changed

+877
-0
lines changed

9 files changed

+877
-0
lines changed

.github/workflows/gpu-h100-inference-test.yaml

Lines changed: 142 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,9 @@ on:
3636
- 'recipes/overlays/kind-inference.yaml'
3737
- 'recipes/overlays/h100-kind-inference.yaml'
3838
- 'recipes/overlays/h100-kind-inference-dynamo.yaml'
39+
- 'kwok/manifests/karpenter/**'
40+
- 'kwok/scripts/install-karpenter-kwok.sh'
41+
- 'recipes/components/prometheus-adapter/**'
3942
workflow_dispatch: {} # Allow manual runs
4043

4144
permissions:
@@ -336,6 +339,145 @@ jobs:
336339
337340
echo "Custom metrics pipeline validated — GPU metrics available for HPA consumption."
338341
342+
# --- Cluster Autoscaling validation (CNCF AI Conformance #8a) ---
# Validates the full metrics-driven autoscaling chain:
#   GPU workload → DCGM metrics → Prometheus → prometheus-adapter (external metric)
#   → HPA scales Deployment → pending pods → Karpenter → KWOK nodes provisioned
#
# Uses dcgm_gpu_memory_used external metric (always > 0 when a GPU exists)
# to trigger HPA scaling, which overflows onto Karpenter-provisioned KWOK nodes.
- name: "CNCF AI Conformance #8a - Cluster Autoscaling (Karpenter + KWOK)"
  run: |
    set -euo pipefail

    echo "=== Installing Karpenter with KWOK provider ==="
    # Ensure the cluster name is exported for the install script even if it
    # was only a plain shell variable in this step.
    export KIND_CLUSTER_NAME="${KIND_CLUSTER_NAME}"
    # Split assignment from export so a yq failure is not masked (SC2155),
    # and fail fast if the version key is missing from .settings.yaml
    # (yq prints the literal string "null" for absent keys).
    KARPENTER_VERSION=$(yq eval '.testing_tools.karpenter' .settings.yaml)
    if [[ -z "${KARPENTER_VERSION}" || "${KARPENTER_VERSION}" == "null" ]]; then
      echo "::error::testing_tools.karpenter is not set in .settings.yaml"
      exit 1
    fi
    export KARPENTER_VERSION
    bash kwok/scripts/install-karpenter-kwok.sh

    echo "=== Creating NodePool and KWOKNodeClass ==="
    kubectl --context="kind-${KIND_CLUSTER_NAME}" apply \
      -f kwok/manifests/karpenter/nodepool.yaml

    echo "=== Verifying external metrics API has GPU metrics ==="
    EXT_AVAILABLE=false
    for i in $(seq 1 12); do
      # '|| EXT_METRICS=""' keeps a transient apiserver error from aborting
      # the retry loop: under 'set -e' a failing command substitution in a
      # plain assignment exits the whole script.
      EXT_METRICS=$(kubectl --context="kind-${KIND_CLUSTER_NAME}" get --raw \
        /apis/external.metrics.k8s.io/v1beta1 2>/dev/null) || EXT_METRICS=""
      if [[ -n "${EXT_METRICS}" ]] && echo "${EXT_METRICS}" | jq -e '.resources[]? | select(.name=="dcgm_gpu_memory_used")' >/dev/null 2>&1; then
        echo "External metric dcgm_gpu_memory_used is available"
        EXT_AVAILABLE=true
        break
      fi
      echo "Waiting for external metrics API... (${i}/12)"
      sleep 10
    done
    if [[ "${EXT_AVAILABLE}" != "true" ]]; then
      echo "::error::External metric dcgm_gpu_memory_used not available"
      kubectl --context="kind-${KIND_CLUSTER_NAME}" get --raw /apis/external.metrics.k8s.io/v1beta1 2>/dev/null | jq . || true
      exit 1
    fi

    # Query the metric value to confirm it's non-zero (informational only).
    EXT_VALUE=$(kubectl --context="kind-${KIND_CLUSTER_NAME}" get --raw \
      "/apis/external.metrics.k8s.io/v1beta1/namespaces/default/dcgm_gpu_memory_used" 2>/dev/null) || EXT_VALUE=""
    echo "External metric value: $(echo "${EXT_VALUE}" | jq -r '.items[0].value // "N/A"' 2>/dev/null)"

    echo "=== Deploying HPA-driven autoscaling test ==="
    kubectl --context="kind-${KIND_CLUSTER_NAME}" create namespace autoscaling-test
    kubectl --context="kind-${KIND_CLUSTER_NAME}" apply \
      -f kwok/manifests/karpenter/hpa-gpu-scale-test.yaml

    echo "=== Waiting for HPA to read metrics and scale ==="
    HPA_SCALED=false
    for i in $(seq 1 20); do
      # Guarded assignments: see note above about 'set -e' and retry loops.
      DESIRED=$(kubectl --context="kind-${KIND_CLUSTER_NAME}" -n autoscaling-test \
        get hpa gpu-overflow-hpa -o jsonpath='{.status.desiredReplicas}' 2>/dev/null) || DESIRED=""
      CURRENT=$(kubectl --context="kind-${KIND_CLUSTER_NAME}" -n autoscaling-test \
        get hpa gpu-overflow-hpa -o jsonpath='{.status.currentReplicas}' 2>/dev/null) || CURRENT=""
      METRICS=$(kubectl --context="kind-${KIND_CLUSTER_NAME}" -n autoscaling-test \
        get hpa gpu-overflow-hpa -o jsonpath='{.status.currentMetrics}' 2>/dev/null) || METRICS=""

      # Require a numeric value before the arithmetic test: a non-numeric
      # operand in '[[ ... -gt ... ]]' is an error that would abort the
      # script under 'set -e'.
      if [[ "${DESIRED}" =~ ^[0-9]+$ && "${DESIRED}" -gt 1 ]]; then
        echo "HPA scaled: desired=${DESIRED} current=${CURRENT}"
        echo "HPA metrics: ${METRICS}"
        HPA_SCALED=true
        break
      fi
      echo "Waiting for HPA to compute scaling decision... desired=${DESIRED:-?} (${i}/20)"
      sleep 15
    done
    if [[ "${HPA_SCALED}" != "true" ]]; then
      echo "::error::HPA did not scale beyond 1 replica"
      kubectl --context="kind-${KIND_CLUSTER_NAME}" -n autoscaling-test describe hpa gpu-overflow-hpa 2>/dev/null || true
      exit 1
    fi

    echo "=== Waiting for Karpenter to provision KWOK nodes ==="
    KWOK_NODES=0
    for i in $(seq 1 30); do
      KWOK_NODES=$(kubectl --context="kind-${KIND_CLUSTER_NAME}" get nodes \
        -l karpenter.sh/nodepool=gpu-autoscaling-test --no-headers 2>/dev/null | wc -l | tr -d ' ') || KWOK_NODES=0
      if [[ "$KWOK_NODES" -gt 0 ]]; then
        echo "Karpenter provisioned ${KWOK_NODES} KWOK GPU node(s)"
        break
      fi
      echo "Waiting for Karpenter to provision nodes... (${i}/30)"
      sleep 10
    done
    if [[ "$KWOK_NODES" -eq 0 ]]; then
      echo "::error::Karpenter did not provision GPU nodes"
      kubectl --context="kind-${KIND_CLUSTER_NAME}" -n karpenter logs deployment/karpenter --tail=50 2>/dev/null || true
      exit 1
    fi

    echo "=== Verifying nodes have GPU capacity ==="
    kubectl --context="kind-${KIND_CLUSTER_NAME}" get nodes \
      -l karpenter.sh/nodepool=gpu-autoscaling-test \
      -o jsonpath='{range .items[*]}{.metadata.name}: nvidia.com/gpu={.status.capacity.nvidia\.com/gpu}{"\n"}{end}'

    echo "=== Verifying pods scheduled onto KWOK nodes ==="
    SCHEDULED=0
    TOTAL=0
    for i in $(seq 1 20); do
      SCHEDULED=$(kubectl --context="kind-${KIND_CLUSTER_NAME}" get pods -n autoscaling-test \
        --field-selector=status.phase!=Pending --no-headers 2>/dev/null | wc -l | tr -d ' ') || SCHEDULED=0
      TOTAL=$(kubectl --context="kind-${KIND_CLUSTER_NAME}" get pods -n autoscaling-test \
        --no-headers 2>/dev/null | wc -l | tr -d ' ') || TOTAL=0
      if [[ "$SCHEDULED" -eq "$TOTAL" && "$TOTAL" -gt 1 ]]; then
        echo "All ${TOTAL} GPU pods scheduled successfully (HPA-driven)"
        break
      fi
      echo "Waiting for pods to schedule... (${SCHEDULED}/${TOTAL}, attempt ${i}/20)"
      sleep 5
    done
    if [[ "$TOTAL" -le 1 ]]; then
      echo "::error::HPA did not create additional replicas"
      kubectl --context="kind-${KIND_CLUSTER_NAME}" -n autoscaling-test describe hpa gpu-overflow-hpa 2>/dev/null || true
      exit 1
    fi

    echo "=== Full chain verified ==="
    echo " GPU metrics → Prometheus → external metrics API → HPA → Deployment scaled"
    echo " → pending pods → Karpenter → ${KWOK_NODES} KWOK node(s) → ${TOTAL} pods scheduled"

    echo "=== Testing scale-down (consolidation) ==="
    kubectl --context="kind-${KIND_CLUSTER_NAME}" delete namespace autoscaling-test --wait=false
    sleep 15
    for i in $(seq 1 12); do
      KWOK_NODES=$(kubectl --context="kind-${KIND_CLUSTER_NAME}" get nodes \
        -l karpenter.sh/nodepool=gpu-autoscaling-test --no-headers 2>/dev/null | wc -l | tr -d ' ') || KWOK_NODES=0
      if [[ "$KWOK_NODES" -eq 0 ]]; then
        echo "Karpenter consolidated all KWOK nodes (scale to zero)"
        break
      fi
      echo "Waiting for consolidation... (${KWOK_NODES} nodes remaining, ${i}/12)"
      sleep 10
    done
    # Consolidation is deliberately best-effort: surface leftover nodes as a
    # warning annotation without failing the step (matches prior behavior).
    if [[ "$KWOK_NODES" -ne 0 ]]; then
      echo "::warning::${KWOK_NODES} KWOK node(s) still present after consolidation window"
    fi

    echo "=== Cluster autoscaling validation PASSED ==="
480+
339481
# --- DRA GPU allocation test ---
340482

341483
- name: Deploy DRA GPU test

.github/workflows/gpu-h100-training-test.yaml

Lines changed: 142 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,9 @@ on:
3232
- 'recipes/components/dynamo-platform/**'
3333
- 'recipes/overlays/kind.yaml'
3434
- 'recipes/overlays/h100-kind-training.yaml'
35+
- 'kwok/manifests/karpenter/**'
36+
- 'kwok/scripts/install-karpenter-kwok.sh'
37+
- 'recipes/components/prometheus-adapter/**'
3538
workflow_dispatch: {} # Allow manual runs
3639

3740
permissions:
@@ -172,6 +175,145 @@ jobs:
172175
kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gang-scheduling-test \
173176
logs gang-worker-1 2>/dev/null || true
174177
178+
# --- Cluster Autoscaling validation (CNCF AI Conformance #8a) ---
# Validates the full metrics-driven autoscaling chain:
#   GPU workload → DCGM metrics → Prometheus → prometheus-adapter (external metric)
#   → HPA scales Deployment → pending pods → Karpenter → KWOK nodes provisioned
#
# Uses dcgm_gpu_memory_used external metric (always > 0 when a GPU exists)
# to trigger HPA scaling, which overflows onto Karpenter-provisioned KWOK nodes.
- name: "CNCF AI Conformance #8a - Cluster Autoscaling (Karpenter + KWOK)"
  run: |
    set -euo pipefail

    echo "=== Installing Karpenter with KWOK provider ==="
    # Ensure the cluster name is exported for the install script even if it
    # was only a plain shell variable in this step.
    export KIND_CLUSTER_NAME="${KIND_CLUSTER_NAME}"
    # Split assignment from export so a yq failure is not masked (SC2155),
    # and fail fast if the version key is missing from .settings.yaml
    # (yq prints the literal string "null" for absent keys).
    KARPENTER_VERSION=$(yq eval '.testing_tools.karpenter' .settings.yaml)
    if [[ -z "${KARPENTER_VERSION}" || "${KARPENTER_VERSION}" == "null" ]]; then
      echo "::error::testing_tools.karpenter is not set in .settings.yaml"
      exit 1
    fi
    export KARPENTER_VERSION
    bash kwok/scripts/install-karpenter-kwok.sh

    echo "=== Creating NodePool and KWOKNodeClass ==="
    kubectl --context="kind-${KIND_CLUSTER_NAME}" apply \
      -f kwok/manifests/karpenter/nodepool.yaml

    echo "=== Verifying external metrics API has GPU metrics ==="
    EXT_AVAILABLE=false
    for i in $(seq 1 12); do
      # '|| EXT_METRICS=""' keeps a transient apiserver error from aborting
      # the retry loop: under 'set -e' a failing command substitution in a
      # plain assignment exits the whole script.
      EXT_METRICS=$(kubectl --context="kind-${KIND_CLUSTER_NAME}" get --raw \
        /apis/external.metrics.k8s.io/v1beta1 2>/dev/null) || EXT_METRICS=""
      if [[ -n "${EXT_METRICS}" ]] && echo "${EXT_METRICS}" | jq -e '.resources[]? | select(.name=="dcgm_gpu_memory_used")' >/dev/null 2>&1; then
        echo "External metric dcgm_gpu_memory_used is available"
        EXT_AVAILABLE=true
        break
      fi
      echo "Waiting for external metrics API... (${i}/12)"
      sleep 10
    done
    if [[ "${EXT_AVAILABLE}" != "true" ]]; then
      echo "::error::External metric dcgm_gpu_memory_used not available"
      kubectl --context="kind-${KIND_CLUSTER_NAME}" get --raw /apis/external.metrics.k8s.io/v1beta1 2>/dev/null | jq . || true
      exit 1
    fi

    # Query the metric value to confirm it's non-zero (informational only).
    EXT_VALUE=$(kubectl --context="kind-${KIND_CLUSTER_NAME}" get --raw \
      "/apis/external.metrics.k8s.io/v1beta1/namespaces/default/dcgm_gpu_memory_used" 2>/dev/null) || EXT_VALUE=""
    echo "External metric value: $(echo "${EXT_VALUE}" | jq -r '.items[0].value // "N/A"' 2>/dev/null)"

    echo "=== Deploying HPA-driven autoscaling test ==="
    kubectl --context="kind-${KIND_CLUSTER_NAME}" create namespace autoscaling-test
    kubectl --context="kind-${KIND_CLUSTER_NAME}" apply \
      -f kwok/manifests/karpenter/hpa-gpu-scale-test.yaml

    echo "=== Waiting for HPA to read metrics and scale ==="
    HPA_SCALED=false
    for i in $(seq 1 20); do
      # Guarded assignments: see note above about 'set -e' and retry loops.
      DESIRED=$(kubectl --context="kind-${KIND_CLUSTER_NAME}" -n autoscaling-test \
        get hpa gpu-overflow-hpa -o jsonpath='{.status.desiredReplicas}' 2>/dev/null) || DESIRED=""
      CURRENT=$(kubectl --context="kind-${KIND_CLUSTER_NAME}" -n autoscaling-test \
        get hpa gpu-overflow-hpa -o jsonpath='{.status.currentReplicas}' 2>/dev/null) || CURRENT=""
      METRICS=$(kubectl --context="kind-${KIND_CLUSTER_NAME}" -n autoscaling-test \
        get hpa gpu-overflow-hpa -o jsonpath='{.status.currentMetrics}' 2>/dev/null) || METRICS=""

      # Require a numeric value before the arithmetic test: a non-numeric
      # operand in '[[ ... -gt ... ]]' is an error that would abort the
      # script under 'set -e'.
      if [[ "${DESIRED}" =~ ^[0-9]+$ && "${DESIRED}" -gt 1 ]]; then
        echo "HPA scaled: desired=${DESIRED} current=${CURRENT}"
        echo "HPA metrics: ${METRICS}"
        HPA_SCALED=true
        break
      fi
      echo "Waiting for HPA to compute scaling decision... desired=${DESIRED:-?} (${i}/20)"
      sleep 15
    done
    if [[ "${HPA_SCALED}" != "true" ]]; then
      echo "::error::HPA did not scale beyond 1 replica"
      kubectl --context="kind-${KIND_CLUSTER_NAME}" -n autoscaling-test describe hpa gpu-overflow-hpa 2>/dev/null || true
      exit 1
    fi

    echo "=== Waiting for Karpenter to provision KWOK nodes ==="
    KWOK_NODES=0
    for i in $(seq 1 30); do
      KWOK_NODES=$(kubectl --context="kind-${KIND_CLUSTER_NAME}" get nodes \
        -l karpenter.sh/nodepool=gpu-autoscaling-test --no-headers 2>/dev/null | wc -l | tr -d ' ') || KWOK_NODES=0
      if [[ "$KWOK_NODES" -gt 0 ]]; then
        echo "Karpenter provisioned ${KWOK_NODES} KWOK GPU node(s)"
        break
      fi
      echo "Waiting for Karpenter to provision nodes... (${i}/30)"
      sleep 10
    done
    if [[ "$KWOK_NODES" -eq 0 ]]; then
      echo "::error::Karpenter did not provision GPU nodes"
      kubectl --context="kind-${KIND_CLUSTER_NAME}" -n karpenter logs deployment/karpenter --tail=50 2>/dev/null || true
      exit 1
    fi

    echo "=== Verifying nodes have GPU capacity ==="
    kubectl --context="kind-${KIND_CLUSTER_NAME}" get nodes \
      -l karpenter.sh/nodepool=gpu-autoscaling-test \
      -o jsonpath='{range .items[*]}{.metadata.name}: nvidia.com/gpu={.status.capacity.nvidia\.com/gpu}{"\n"}{end}'

    echo "=== Verifying pods scheduled onto KWOK nodes ==="
    SCHEDULED=0
    TOTAL=0
    for i in $(seq 1 20); do
      SCHEDULED=$(kubectl --context="kind-${KIND_CLUSTER_NAME}" get pods -n autoscaling-test \
        --field-selector=status.phase!=Pending --no-headers 2>/dev/null | wc -l | tr -d ' ') || SCHEDULED=0
      TOTAL=$(kubectl --context="kind-${KIND_CLUSTER_NAME}" get pods -n autoscaling-test \
        --no-headers 2>/dev/null | wc -l | tr -d ' ') || TOTAL=0
      if [[ "$SCHEDULED" -eq "$TOTAL" && "$TOTAL" -gt 1 ]]; then
        echo "All ${TOTAL} GPU pods scheduled successfully (HPA-driven)"
        break
      fi
      echo "Waiting for pods to schedule... (${SCHEDULED}/${TOTAL}, attempt ${i}/20)"
      sleep 5
    done
    if [[ "$TOTAL" -le 1 ]]; then
      echo "::error::HPA did not create additional replicas"
      kubectl --context="kind-${KIND_CLUSTER_NAME}" -n autoscaling-test describe hpa gpu-overflow-hpa 2>/dev/null || true
      exit 1
    fi

    echo "=== Full chain verified ==="
    echo " GPU metrics → Prometheus → external metrics API → HPA → Deployment scaled"
    echo " → pending pods → Karpenter → ${KWOK_NODES} KWOK node(s) → ${TOTAL} pods scheduled"

    echo "=== Testing scale-down (consolidation) ==="
    kubectl --context="kind-${KIND_CLUSTER_NAME}" delete namespace autoscaling-test --wait=false
    sleep 15
    for i in $(seq 1 12); do
      KWOK_NODES=$(kubectl --context="kind-${KIND_CLUSTER_NAME}" get nodes \
        -l karpenter.sh/nodepool=gpu-autoscaling-test --no-headers 2>/dev/null | wc -l | tr -d ' ') || KWOK_NODES=0
      if [[ "$KWOK_NODES" -eq 0 ]]; then
        echo "Karpenter consolidated all KWOK nodes (scale to zero)"
        break
      fi
      echo "Waiting for consolidation... (${KWOK_NODES} nodes remaining, ${i}/12)"
      sleep 10
    done
    # Consolidation is deliberately best-effort: surface leftover nodes as a
    # warning annotation without failing the step (matches prior behavior).
    if [[ "$KWOK_NODES" -ne 0 ]]; then
      echo "::warning::${KWOK_NODES} KWOK node(s) still present after consolidation window"
    fi

    echo "=== Cluster autoscaling validation PASSED ==="
316+
175317
# --- Evidence collection ---
176318

177319
- name: Collect AI conformance evidence

.settings.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ testing_tools:
4444
kwok: 'v0.7.0'
4545
chainsaw: 'v0.2.14'
4646
yq: 'v4.52.4'
47+
karpenter: 'v1.8.0'
4748

4849
# Quality Thresholds
4950
quality:

0 commit comments

Comments
 (0)