Skip to content

Commit 503ad00

Browse files
committed
feat(ci): add metrics-driven cluster autoscaling validation with Karpenter + KWOK
Add cluster autoscaling validation to both H100 GPU workflows (inference and training). The test validates the full metrics-driven autoscaling chain: DCGM metrics → Prometheus → prometheus-adapter (external metric) → HPA scales Deployment → pending pods → Karpenter → KWOK nodes. New files: - kwok/scripts/install-karpenter-kwok.sh: builds Karpenter KWOK provider via ko and deploys with Helm into kind clusters - kwok/scripts/validate-cluster-autoscaling.sh: reusable E2E script that verifies external metrics, HPA scaling, node provisioning, pod scheduling, and scale-down consolidation - kwok/manifests/karpenter/: NodePool, KWOKNodeClass, HPA test workload, and GPU instance type definitions. Changed files: - recipes/components/prometheus-adapter/values.yaml: add workload-attributed custom metrics, external metrics rules for cluster-wide GPU metrics (power_usage, memory_used, utilization) with namespaced: false, and 30s metrics relist interval - .github/workflows/gpu-h100-{inference,training}-test.yaml: add cluster autoscaling step and trigger paths for karpenter manifests - .settings.yaml: add karpenter v1.8.0 to testing_tools
1 parent 225d551 commit 503ad00

File tree

9 files changed

+914
-20
lines changed

9 files changed

+914
-20
lines changed

.github/workflows/gpu-h100-inference-test.yaml

Lines changed: 67 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,9 @@ on:
3636
- 'recipes/overlays/kind-inference.yaml'
3737
- 'recipes/overlays/h100-kind-inference.yaml'
3838
- 'recipes/overlays/h100-kind-inference-dynamo.yaml'
39+
- 'kwok/manifests/karpenter/**'
40+
- 'kwok/scripts/install-karpenter-kwok.sh'
41+
- 'kwok/scripts/validate-cluster-autoscaling.sh'
3942
workflow_dispatch: {} # Allow manual runs
4043

4144
permissions:
@@ -169,6 +172,31 @@ jobs:
169172

170173
- name: Deploy Dynamo vLLM smoke test
171174
run: |
175+
# Create kai-scheduler queue for Dynamo (grove-operator sets kai.scheduler/queue=dynamo).
176+
# The kai-scheduler chart creates default-parent-queue + default-queue on install,
177+
# but Dynamo needs its own queue as a child of the parent.
178+
kubectl --context="kind-${KIND_CLUSTER_NAME}" apply -f - <<'EOF'
179+
apiVersion: scheduling.run.ai/v2
180+
kind: Queue
181+
metadata:
182+
name: dynamo
183+
spec:
184+
parentQueue: default-parent-queue
185+
resources:
186+
gpu:
187+
quota: 0
188+
limit: -1
189+
overQuotaWeight: 1
190+
cpu:
191+
quota: 0
192+
limit: -1
193+
overQuotaWeight: 1
194+
memory:
195+
quota: 0
196+
limit: -1
197+
overQuotaWeight: 1
198+
EOF
199+
172200
kubectl --context="kind-${KIND_CLUSTER_NAME}" apply \
173201
-f tests/manifests/dynamo-vllm-smoke-test.yaml -n dynamo-system
174202
@@ -228,8 +256,10 @@ jobs:
228256
echo "Dynamo vLLM inference smoke test passed."
229257
230258
# --- Accelerator & AI Service Metrics validation (CNCF AI Conformance #4/#5) ---
259+
# Independent of Dynamo — run even if Dynamo deployment fails.
231260

232261
- name: Validate accelerator metrics
262+
if: success() || failure()
233263
run: |
234264
echo "=== DCGM Exporter pod ==="
235265
kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator \
@@ -294,11 +324,14 @@ jobs:
294324
# → custom metrics API) that HPA consumes. Dynamo uses PodCliqueSets (not
295325
# Deployments), so we validate the API directly rather than creating an HPA.
296326
#
297-
# Note: DCGM exporter runs as a DaemonSet in gpu-operator namespace, so
298-
# Prometheus labels GPU metrics with namespace=gpu-operator. We query that
299-
# namespace to validate the full metrics pipeline.
327+
# DCGM exporter pod-mapping relabels metrics with the GPU workload's
328+
# namespace/pod when a GPU is in use. Metrics may appear in gpu-operator
329+
# (idle GPU) or dynamo-system (active workload). prometheus-adapter also
330+
# needs relist cycles (30s each) to discover new label combinations, so
331+
# we poll with retries.
300332

301333
- name: Validate custom metrics for pod autoscaling
334+
if: success() || failure()
302335
run: |
303336
echo "=== Custom metrics API availability ==="
304337
RESOURCES=$(kubectl --context="kind-${KIND_CLUSTER_NAME}" \
@@ -310,23 +343,27 @@ jobs:
310343
echo "Custom metrics API is available"
311344
echo "${RESOURCES}" | jq -r '.resources[].name' 2>/dev/null | head -20
312345
313-
# DCGM exporter runs in gpu-operator namespace, so custom metrics are
314-
# attributed to the DCGM exporter pod there (not workload pods).
315-
METRICS_NS="gpu-operator"
346+
NAMESPACES="gpu-operator dynamo-system"
347+
METRICS="gpu_utilization gpu_memory_used gpu_power_usage"
316348
317-
# At least one GPU metric must be available via the custom metrics API
349+
# Poll for up to 3 minutes — prometheus-adapter relists every 30s and
350+
# avg_over_time(...[2m]) queries need sufficient data points.
318351
HAS_METRICS=false
319-
for METRIC in gpu_utilization gpu_memory_used gpu_power_usage; do
320-
echo "=== Query ${METRIC} ==="
321-
RESULT=$(kubectl --context="kind-${KIND_CLUSTER_NAME}" get --raw \
322-
"/apis/custom.metrics.k8s.io/v1beta1/namespaces/${METRICS_NS}/pods/*/${METRIC}" 2>/dev/null)
323-
if [[ -n "${RESULT}" ]] && echo "${RESULT}" | jq -e '.items | length > 0' >/dev/null 2>&1; then
324-
echo "${METRIC} metrics available:"
325-
echo "${RESULT}" | jq '.items[] | {pod: .describedObject.name, value: .value}' 2>/dev/null
326-
HAS_METRICS=true
327-
else
328-
echo "::warning::${METRIC} not available in ${METRICS_NS} namespace"
329-
fi
352+
for ATTEMPT in $(seq 1 18); do
353+
for METRIC in ${METRICS}; do
354+
for NS in ${NAMESPACES}; do
355+
RESULT=$(kubectl --context="kind-${KIND_CLUSTER_NAME}" get --raw \
356+
"/apis/custom.metrics.k8s.io/v1beta1/namespaces/${NS}/pods/*/${METRIC}" 2>/dev/null || true)
357+
if [[ -n "${RESULT}" ]] && echo "${RESULT}" | jq -e '.items | length > 0' >/dev/null 2>&1; then
358+
echo "${METRIC} metrics available in ${NS}:"
359+
echo "${RESULT}" | jq '.items[] | {pod: .describedObject.name, value: .value}' 2>/dev/null
360+
HAS_METRICS=true
361+
break 3
362+
fi
363+
done
364+
done
365+
echo "Waiting for custom metrics to appear... (${ATTEMPT}/18)"
366+
sleep 10
330367
done
331368
332369
if [[ "${HAS_METRICS}" != "true" ]]; then
@@ -336,9 +373,16 @@ jobs:
336373
337374
echo "Custom metrics pipeline validated — GPU metrics available for HPA consumption."
338375
376+
# --- Cluster Autoscaling validation ---
377+
378+
- name: Cluster Autoscaling (Karpenter + KWOK)
379+
if: success() || failure()
380+
run: bash kwok/scripts/validate-cluster-autoscaling.sh
381+
339382
# --- DRA GPU allocation test ---
340383

341384
- name: Deploy DRA GPU test
385+
if: success() || failure()
342386
run: |
343387
kubectl --context="kind-${KIND_CLUSTER_NAME}" apply \
344388
-f docs/conformance/cncf/manifests/dra-gpu-test.yaml
@@ -363,6 +407,7 @@ jobs:
363407
# --- Secure Accelerator Access validation (CNCF AI Conformance #3) ---
364408

365409
- name: Validate secure accelerator access
410+
if: success() || failure()
366411
run: |
367412
echo "=== Verify DRA-mediated access (no hostPath, no device plugin) ==="
368413
@@ -474,8 +519,10 @@ jobs:
474519
echo "=== Custom metrics API ==="
475520
for METRIC in gpu_utilization gpu_memory_used gpu_power_usage; do
476521
echo "--- ${METRIC} ---"
477-
kubectl --context="kind-${KIND_CLUSTER_NAME}" get --raw \
478-
"/apis/custom.metrics.k8s.io/v1beta1/namespaces/gpu-operator/pods/*/${METRIC}" 2>/dev/null | jq . || true
522+
for NS in gpu-operator dynamo-system; do
523+
kubectl --context="kind-${KIND_CLUSTER_NAME}" get --raw \
524+
"/apis/custom.metrics.k8s.io/v1beta1/namespaces/${NS}/pods/*/${METRIC}" 2>/dev/null | jq . || true
525+
done
479526
done
480527
echo "=== prometheus-adapter pods ==="
481528
kubectl --context="kind-${KIND_CLUSTER_NAME}" -n monitoring get pods -l app.kubernetes.io/name=prometheus-adapter -o wide 2>/dev/null || true

.github/workflows/gpu-h100-training-test.yaml

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,10 @@ on:
3232
- 'recipes/components/dynamo-platform/**'
3333
- 'recipes/overlays/kind.yaml'
3434
- 'recipes/overlays/h100-kind-training.yaml'
35+
- 'kwok/manifests/karpenter/**'
36+
- 'kwok/scripts/install-karpenter-kwok.sh'
37+
- 'kwok/scripts/validate-cluster-autoscaling.sh'
38+
- 'recipes/components/prometheus-adapter/**'
3539
workflow_dispatch: {} # Allow manual runs
3640

3741
permissions:
@@ -172,6 +176,12 @@ jobs:
172176
kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gang-scheduling-test \
173177
logs gang-worker-1 2>/dev/null || true
174178
179+
# --- Cluster Autoscaling validation ---
180+
181+
- name: Cluster Autoscaling (Karpenter + KWOK)
182+
if: success() || failure()
183+
run: bash kwok/scripts/validate-cluster-autoscaling.sh
184+
175185
# --- Evidence collection ---
176186

177187
- name: Collect AI conformance evidence

.settings.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ testing_tools:
4444
kwok: 'v0.7.0'
4545
chainsaw: 'v0.2.14'
4646
yq: 'v4.52.4'
47+
karpenter: 'v1.8.0'
4748

4849
# Quality Thresholds
4950
quality:
Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# HPA-driven GPU autoscaling test for CNCF AI Conformance cluster_autoscaling.
#
# This validates the full metrics-driven autoscaling chain:
#   GPU workload → DCGM metrics → Prometheus → prometheus-adapter → HPA
#   → Deployment scales → pending pods → Karpenter → KWOK nodes provisioned
#
# The HPA uses an external GPU metric (dcgm_gpu_power_usage) which is always > 0
# when a GPU exists (idle power draw). When the metric exceeds the low
# threshold, HPA scales the Deployment beyond what the real GPU node can serve,
# causing overflow pods to trigger Karpenter KWOK node provisioning.
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: gpu-overflow-workers
  namespace: autoscaling-test
  labels:
    app: gpu-overflow-workers
    test: cluster-autoscaling
spec:
  replicas: 1
  selector:
    matchLabels:
      app: gpu-overflow-workers
  template:
    metadata:
      labels:
        app: gpu-overflow-workers
        test: cluster-autoscaling
    spec:
      # Tolerate both the real GPU node taint and the KWOK node taint so
      # overflow replicas can land on Karpenter-provisioned KWOK nodes.
      tolerations:
        - key: nvidia.com/gpu
          operator: Equal
          value: "present"
          effect: NoSchedule
        - key: kwok.x-k8s.io/node
          operator: Exists
          effect: NoSchedule
      # Pin pods to the test NodePool so scale-up pressure is attributed to it.
      nodeSelector:
        karpenter.sh/nodepool: gpu-autoscaling-test
      securityContext:
        runAsNonRoot: true
        runAsUser: 65534
        seccompProfile:
          type: RuntimeDefault
      containers:
        - name: gpu-workload
          image: ubuntu:22.04
          # Short-lived sleep keeps the pod schedulable without doing real work;
          # the GPU request alone is what drives Karpenter provisioning.
          command: ["sleep", "120"]
          resources:
            limits:
              nvidia.com/gpu: "1"
            requests:
              nvidia.com/gpu: "1"
          securityContext:
            allowPrivilegeEscalation: false
            readOnlyRootFilesystem: true
      restartPolicy: Always
---
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: gpu-overflow-hpa
  namespace: autoscaling-test
  labels:
    test: cluster-autoscaling
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: gpu-overflow-workers
  minReplicas: 1
  maxReplicas: 4
  metrics:
    # External metric: cluster-wide average GPU power draw (watts).
    # Power usage is always > 0 when a GPU exists (idle H100 draws ~46W).
    # With a low threshold, this reliably triggers scale-up.
    - type: External
      external:
        metric:
          name: dcgm_gpu_power_usage
        target:
          type: AverageValue
          averageValue: "10"
Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
[
2+
{
3+
"name": "p5.48xlarge",
4+
"offerings": [
5+
{
6+
"Price": 98.32,
7+
"Available": true,
8+
"Requirements": [
9+
{ "key": "karpenter.sh/capacity-type", "operator": "In", "values": ["on-demand"] },
10+
{ "key": "topology.kubernetes.io/zone", "operator": "In", "values": ["us-east-1a"] }
11+
]
12+
},
13+
{
14+
"Price": 98.32,
15+
"Available": true,
16+
"Requirements": [
17+
{ "key": "karpenter.sh/capacity-type", "operator": "In", "values": ["on-demand"] },
18+
{ "key": "topology.kubernetes.io/zone", "operator": "In", "values": ["us-east-1b"] }
19+
]
20+
}
21+
],
22+
"architecture": "amd64",
23+
"operatingSystems": ["linux"],
24+
"resources": {
25+
"cpu": "192",
26+
"memory": "2048Gi",
27+
"ephemeral-storage": "3800Gi",
28+
"nvidia.com/gpu": "8"
29+
}
30+
},
31+
{
32+
"name": "g5.xlarge",
33+
"offerings": [
34+
{
35+
"Price": 1.006,
36+
"Available": true,
37+
"Requirements": [
38+
{ "key": "karpenter.sh/capacity-type", "operator": "In", "values": ["on-demand"] },
39+
{ "key": "topology.kubernetes.io/zone", "operator": "In", "values": ["us-east-1a"] }
40+
]
41+
},
42+
{
43+
"Price": 1.006,
44+
"Available": true,
45+
"Requirements": [
46+
{ "key": "karpenter.sh/capacity-type", "operator": "In", "values": ["on-demand"] },
47+
{ "key": "topology.kubernetes.io/zone", "operator": "In", "values": ["us-east-1b"] }
48+
]
49+
}
50+
],
51+
"architecture": "amd64",
52+
"operatingSystems": ["linux"],
53+
"resources": {
54+
"cpu": "4",
55+
"memory": "16Gi",
56+
"ephemeral-storage": "250Gi",
57+
"nvidia.com/gpu": "1"
58+
}
59+
},
60+
{
61+
"name": "g5.2xlarge",
62+
"offerings": [
63+
{
64+
"Price": 1.212,
65+
"Available": true,
66+
"Requirements": [
67+
{ "key": "karpenter.sh/capacity-type", "operator": "In", "values": ["on-demand"] },
68+
{ "key": "topology.kubernetes.io/zone", "operator": "In", "values": ["us-east-1a"] }
69+
]
70+
},
71+
{
72+
"Price": 1.212,
73+
"Available": true,
74+
"Requirements": [
75+
{ "key": "karpenter.sh/capacity-type", "operator": "In", "values": ["on-demand"] },
76+
{ "key": "topology.kubernetes.io/zone", "operator": "In", "values": ["us-east-1b"] }
77+
]
78+
}
79+
],
80+
"architecture": "amd64",
81+
"operatingSystems": ["linux"],
82+
"resources": {
83+
"cpu": "8",
84+
"memory": "32Gi",
85+
"ephemeral-storage": "450Gi",
86+
"nvidia.com/gpu": "1"
87+
}
88+
}
89+
]

0 commit comments

Comments
 (0)