Skip to content

Commit 6613acd

Browse files
committed
feat(ci): add metrics-driven cluster autoscaling validation with Karpenter + KWOK
Add cluster autoscaling validation to both H100 GPU workflows (inference and training).

The test validates the full metrics-driven autoscaling chain: DCGM metrics → Prometheus → prometheus-adapter (external metric) → HPA scales Deployment → pending pods → Karpenter → KWOK nodes.

New files:
- kwok/scripts/install-karpenter-kwok.sh: builds Karpenter KWOK provider via ko and deploys with Helm into kind clusters
- kwok/scripts/validate-cluster-autoscaling.sh: reusable E2E script that verifies external metrics, HPA scaling, node provisioning, pod scheduling, and scale-down consolidation
- kwok/manifests/karpenter/: NodePool, KWOKNodeClass, HPA test workload, and GPU instance type definitions

Changed files:
- recipes/components/prometheus-adapter/values.yaml: add workload-attributed custom metrics, external metrics rules for cluster-wide GPU metrics (power_usage, memory_used, utilization) with namespaced: false, and 30s metrics relist interval
- .github/workflows/gpu-h100-{inference,training}-test.yaml: add cluster autoscaling step and trigger paths for karpenter manifests
- .settings.yaml: add karpenter v1.8.0 to testing_tools
1 parent 225d551 commit 6613acd

File tree

9 files changed

+901
-11
lines changed

9 files changed

+901
-11
lines changed

.github/workflows/gpu-h100-inference-test.yaml

Lines changed: 55 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,9 @@ on:
3636
- 'recipes/overlays/kind-inference.yaml'
3737
- 'recipes/overlays/h100-kind-inference.yaml'
3838
- 'recipes/overlays/h100-kind-inference-dynamo.yaml'
39+
- 'kwok/manifests/karpenter/**'
40+
- 'kwok/scripts/install-karpenter-kwok.sh'
41+
- 'kwok/scripts/validate-cluster-autoscaling.sh'
3942
workflow_dispatch: {} # Allow manual runs
4043

4144
permissions:
@@ -169,6 +172,31 @@ jobs:
169172

170173
- name: Deploy Dynamo vLLM smoke test
171174
run: |
175+
# Create kai-scheduler queue for Dynamo (grove-operator sets kai.scheduler/queue=dynamo).
176+
# The kai-scheduler chart creates default-parent-queue + default-queue on install,
177+
# but Dynamo needs its own queue as a child of the parent.
178+
kubectl --context="kind-${KIND_CLUSTER_NAME}" apply -f - <<'EOF'
179+
apiVersion: scheduling.run.ai/v2
180+
kind: Queue
181+
metadata:
182+
name: dynamo
183+
spec:
184+
parentQueue: default-parent-queue
185+
resources:
186+
gpu:
187+
quota: 0
188+
limit: -1
189+
overQuotaWeight: 1
190+
cpu:
191+
quota: 0
192+
limit: -1
193+
overQuotaWeight: 1
194+
memory:
195+
quota: 0
196+
limit: -1
197+
overQuotaWeight: 1
198+
EOF
199+
172200
kubectl --context="kind-${KIND_CLUSTER_NAME}" apply \
173201
-f tests/manifests/dynamo-vllm-smoke-test.yaml -n dynamo-system
174202
@@ -228,8 +256,10 @@ jobs:
228256
echo "Dynamo vLLM inference smoke test passed."
229257
230258
# --- Accelerator & AI Service Metrics validation (CNCF AI Conformance #4/#5) ---
259+
# Independent of Dynamo — run even if Dynamo deployment fails.
231260

232261
- name: Validate accelerator metrics
262+
if: success() || failure()
233263
run: |
234264
echo "=== DCGM Exporter pod ==="
235265
kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator \
@@ -299,6 +329,7 @@ jobs:
299329
# namespace to validate the full metrics pipeline.
300330

301331
- name: Validate custom metrics for pod autoscaling
332+
if: success() || failure()
302333
run: |
303334
echo "=== Custom metrics API availability ==="
304335
RESOURCES=$(kubectl --context="kind-${KIND_CLUSTER_NAME}" \
@@ -310,22 +341,27 @@ jobs:
310341
echo "Custom metrics API is available"
311342
echo "${RESOURCES}" | jq -r '.resources[].name' 2>/dev/null | head -20
312343
313-
# DCGM exporter runs in gpu-operator namespace, so custom metrics are
314-
# attributed to the DCGM exporter pod there (not workload pods).
315-
METRICS_NS="gpu-operator"
344+
# DCGM exporter pod-mapping relabels metrics with the GPU workload's
345+
# namespace/pod (not the exporter's). Check both gpu-operator (idle GPU)
346+
# and dynamo-system (active workload) namespaces.
347+
NAMESPACES="gpu-operator dynamo-system"
316348
317349
# At least one GPU metric must be available via the custom metrics API
318350
HAS_METRICS=false
319351
for METRIC in gpu_utilization gpu_memory_used gpu_power_usage; do
320352
echo "=== Query ${METRIC} ==="
321-
RESULT=$(kubectl --context="kind-${KIND_CLUSTER_NAME}" get --raw \
322-
"/apis/custom.metrics.k8s.io/v1beta1/namespaces/${METRICS_NS}/pods/*/${METRIC}" 2>/dev/null)
323-
if [[ -n "${RESULT}" ]] && echo "${RESULT}" | jq -e '.items | length > 0' >/dev/null 2>&1; then
324-
echo "${METRIC} metrics available:"
325-
echo "${RESULT}" | jq '.items[] | {pod: .describedObject.name, value: .value}' 2>/dev/null
326-
HAS_METRICS=true
327-
else
328-
echo "::warning::${METRIC} not available in ${METRICS_NS} namespace"
353+
for NS in ${NAMESPACES}; do
354+
RESULT=$(kubectl --context="kind-${KIND_CLUSTER_NAME}" get --raw \
355+
"/apis/custom.metrics.k8s.io/v1beta1/namespaces/${NS}/pods/*/${METRIC}" 2>/dev/null || true)
356+
if [[ -n "${RESULT}" ]] && echo "${RESULT}" | jq -e '.items | length > 0' >/dev/null 2>&1; then
357+
echo "${METRIC} metrics available in ${NS}:"
358+
echo "${RESULT}" | jq '.items[] | {pod: .describedObject.name, value: .value}' 2>/dev/null
359+
HAS_METRICS=true
360+
break
361+
fi
362+
done
363+
if [[ "${HAS_METRICS}" != "true" ]]; then
364+
echo "::warning::${METRIC} not available in any namespace (${NAMESPACES})"
329365
fi
330366
done
331367
@@ -336,9 +372,16 @@ jobs:
336372
337373
echo "Custom metrics pipeline validated — GPU metrics available for HPA consumption."
338374
375+
# --- Cluster Autoscaling validation ---
376+
377+
- name: Cluster Autoscaling (Karpenter + KWOK)
378+
if: success() || failure()
379+
run: bash kwok/scripts/validate-cluster-autoscaling.sh
380+
339381
# --- DRA GPU allocation test ---
340382

341383
- name: Deploy DRA GPU test
384+
if: success() || failure()
342385
run: |
343386
kubectl --context="kind-${KIND_CLUSTER_NAME}" apply \
344387
-f docs/conformance/cncf/manifests/dra-gpu-test.yaml
@@ -363,6 +406,7 @@ jobs:
363406
# --- Secure Accelerator Access validation (CNCF AI Conformance #3) ---
364407

365408
- name: Validate secure accelerator access
409+
if: success() || failure()
366410
run: |
367411
echo "=== Verify DRA-mediated access (no hostPath, no device plugin) ==="
368412

.github/workflows/gpu-h100-training-test.yaml

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,10 @@ on:
3232
- 'recipes/components/dynamo-platform/**'
3333
- 'recipes/overlays/kind.yaml'
3434
- 'recipes/overlays/h100-kind-training.yaml'
35+
- 'kwok/manifests/karpenter/**'
36+
- 'kwok/scripts/install-karpenter-kwok.sh'
37+
- 'kwok/scripts/validate-cluster-autoscaling.sh'
38+
- 'recipes/components/prometheus-adapter/**'
3539
workflow_dispatch: {} # Allow manual runs
3640

3741
permissions:
@@ -172,6 +176,11 @@ jobs:
172176
kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gang-scheduling-test \
173177
logs gang-worker-1 2>/dev/null || true
174178
179+
# --- Cluster Autoscaling validation ---
180+
181+
- name: Cluster Autoscaling (Karpenter + KWOK)
182+
run: bash kwok/scripts/validate-cluster-autoscaling.sh
183+
175184
# --- Evidence collection ---
176185

177186
- name: Collect AI conformance evidence

.settings.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ testing_tools:
4444
kwok: 'v0.7.0'
4545
chainsaw: 'v0.2.14'
4646
yq: 'v4.52.4'
47+
karpenter: 'v1.8.0'
4748

4849
# Quality Thresholds
4950
quality:
Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
# HPA-driven GPU autoscaling test for CNCF AI Conformance cluster_autoscaling.
16+
#
17+
# This validates the full metrics-driven autoscaling chain:
18+
# GPU workload → DCGM metrics → Prometheus → prometheus-adapter → HPA
19+
# → Deployment scales → pending pods → Karpenter → KWOK nodes provisioned
20+
#
21+
# The HPA uses an external GPU metric (dcgm_gpu_power_usage) which is always > 0
22+
# when a GPU exists (idle power draw). When the metric exceeds the low
23+
# threshold, HPA scales the Deployment beyond what the real GPU node can serve,
24+
# causing overflow pods to trigger Karpenter KWOK node provisioning.
25+
26+
---
27+
apiVersion: apps/v1
28+
kind: Deployment
29+
metadata:
30+
name: gpu-overflow-workers
31+
namespace: autoscaling-test
32+
labels:
33+
app: gpu-overflow-workers
34+
test: cluster-autoscaling
35+
spec:
36+
replicas: 1
37+
selector:
38+
matchLabels:
39+
app: gpu-overflow-workers
40+
template:
41+
metadata:
42+
labels:
43+
app: gpu-overflow-workers
44+
test: cluster-autoscaling
45+
spec:
46+
tolerations:
47+
- key: nvidia.com/gpu
48+
operator: Equal
49+
value: "present"
50+
effect: NoSchedule
51+
- key: kwok.x-k8s.io/node
52+
operator: Exists
53+
effect: NoSchedule
54+
nodeSelector:
55+
karpenter.sh/nodepool: gpu-autoscaling-test
56+
securityContext:
57+
runAsNonRoot: true
58+
runAsUser: 65534
59+
seccompProfile:
60+
type: RuntimeDefault
61+
containers:
62+
- name: gpu-workload
63+
image: ubuntu:22.04
64+
command: ["sleep", "120"]
65+
resources:
66+
limits:
67+
nvidia.com/gpu: "1"
68+
requests:
69+
nvidia.com/gpu: "1"
70+
securityContext:
71+
allowPrivilegeEscalation: false
72+
readOnlyRootFilesystem: true
73+
restartPolicy: Always
74+
---
75+
apiVersion: autoscaling/v2
76+
kind: HorizontalPodAutoscaler
77+
metadata:
78+
name: gpu-overflow-hpa
79+
namespace: autoscaling-test
80+
labels:
81+
test: cluster-autoscaling
82+
spec:
83+
scaleTargetRef:
84+
apiVersion: apps/v1
85+
kind: Deployment
86+
name: gpu-overflow-workers
87+
minReplicas: 1
88+
maxReplicas: 4
89+
metrics:
90+
# External metric: cluster-wide average GPU power draw (watts).
91+
# Power usage is always > 0 when a GPU exists (idle H100 draws ~46W).
92+
# With a low threshold, this reliably triggers scale-up.
93+
- type: External
94+
external:
95+
metric:
96+
name: dcgm_gpu_power_usage
97+
target:
98+
type: AverageValue
99+
averageValue: "10"
Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
[
2+
{
3+
"name": "p5.48xlarge",
4+
"offerings": [
5+
{
6+
"Price": 98.32,
7+
"Available": true,
8+
"Requirements": [
9+
{ "key": "karpenter.sh/capacity-type", "operator": "In", "values": ["on-demand"] },
10+
{ "key": "topology.kubernetes.io/zone", "operator": "In", "values": ["us-east-1a"] }
11+
]
12+
},
13+
{
14+
"Price": 98.32,
15+
"Available": true,
16+
"Requirements": [
17+
{ "key": "karpenter.sh/capacity-type", "operator": "In", "values": ["on-demand"] },
18+
{ "key": "topology.kubernetes.io/zone", "operator": "In", "values": ["us-east-1b"] }
19+
]
20+
}
21+
],
22+
"architecture": "amd64",
23+
"operatingSystems": ["linux"],
24+
"resources": {
25+
"cpu": "192",
26+
"memory": "2048Gi",
27+
"ephemeral-storage": "3800Gi",
28+
"nvidia.com/gpu": "8"
29+
}
30+
},
31+
{
32+
"name": "g5.xlarge",
33+
"offerings": [
34+
{
35+
"Price": 1.006,
36+
"Available": true,
37+
"Requirements": [
38+
{ "key": "karpenter.sh/capacity-type", "operator": "In", "values": ["on-demand"] },
39+
{ "key": "topology.kubernetes.io/zone", "operator": "In", "values": ["us-east-1a"] }
40+
]
41+
},
42+
{
43+
"Price": 1.006,
44+
"Available": true,
45+
"Requirements": [
46+
{ "key": "karpenter.sh/capacity-type", "operator": "In", "values": ["on-demand"] },
47+
{ "key": "topology.kubernetes.io/zone", "operator": "In", "values": ["us-east-1b"] }
48+
]
49+
}
50+
],
51+
"architecture": "amd64",
52+
"operatingSystems": ["linux"],
53+
"resources": {
54+
"cpu": "4",
55+
"memory": "16Gi",
56+
"ephemeral-storage": "250Gi",
57+
"nvidia.com/gpu": "1"
58+
}
59+
},
60+
{
61+
"name": "g5.2xlarge",
62+
"offerings": [
63+
{
64+
"Price": 1.212,
65+
"Available": true,
66+
"Requirements": [
67+
{ "key": "karpenter.sh/capacity-type", "operator": "In", "values": ["on-demand"] },
68+
{ "key": "topology.kubernetes.io/zone", "operator": "In", "values": ["us-east-1a"] }
69+
]
70+
},
71+
{
72+
"Price": 1.212,
73+
"Available": true,
74+
"Requirements": [
75+
{ "key": "karpenter.sh/capacity-type", "operator": "In", "values": ["on-demand"] },
76+
{ "key": "topology.kubernetes.io/zone", "operator": "In", "values": ["us-east-1b"] }
77+
]
78+
}
79+
],
80+
"architecture": "amd64",
81+
"operatingSystems": ["linux"],
82+
"resources": {
83+
"cpu": "8",
84+
"memory": "32Gi",
85+
"ephemeral-storage": "450Gi",
86+
"nvidia.com/gpu": "1"
87+
}
88+
}
89+
]

0 commit comments

Comments
 (0)