Skip to content

Commit 7197b83

Browse files
committed
feat(conformance): capture observed cluster autoscaling evidence
1 parent 3b9c189 commit 7197b83

File tree

2 files changed

+208
-78
lines changed

2 files changed

+208
-78
lines changed

pkg/validator/checks/conformance/cluster_autoscaling_check.go

Lines changed: 144 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -43,13 +43,16 @@ const (
4343
)
4444

4545
// clusterAutoscalingReport captures the evidence gathered by one cluster
// autoscaling behavioral test run: the NodePool exercised, the test resources
// created for it, and the HPA / node / pod counts observed along the way.
type clusterAutoscalingReport struct {
	NodePoolName       string // GPU NodePool the behavioral test ran against
	Namespace          string // unique per-run test namespace
	DeploymentName     string // test deployment created in Namespace
	HPAName            string // test HPA created in Namespace
	HPADesiredReplicas int32  // HPA desired replicas when scaling intent was detected
	HPACurrentReplicas int32  // HPA current replicas at that same moment
	BaselineNodeCount  int    // Karpenter nodes for the pool before the test workload
	ObservedNodeCount  int    // Karpenter nodes for the pool after scale-up
	ScheduledPodCount  int    // test pods observed Running or Succeeded
	ObservedPodCount   int    // total test pods observed in Namespace
}
5457

5558
func init() {
@@ -88,11 +91,24 @@ func CheckClusterAutoscaling(ctx *checks.ValidationContext) error {
8891
if deploy.Spec.Replicas != nil {
8992
expected = *deploy.Spec.Replicas
9093
}
91-
recordArtifact(ctx, "Karpenter Controller",
94+
recordRawTextArtifact(ctx, "Karpenter Controller",
95+
"kubectl get deploy -n karpenter",
9296
fmt.Sprintf("Name: %s/%s\nReplicas: %d/%d available\nImage: %s",
9397
deploy.Namespace, deploy.Name,
9498
deploy.Status.AvailableReplicas, expected,
9599
firstContainerImage(deploy.Spec.Template.Spec.Containers)))
100+
karpenterPods, podErr := ctx.Clientset.CoreV1().Pods("karpenter").List(ctx.Context, metav1.ListOptions{})
101+
if podErr != nil {
102+
recordRawTextArtifact(ctx, "Karpenter pods", "kubectl get pods -n karpenter -o wide",
103+
fmt.Sprintf("failed to list karpenter pods: %v", podErr))
104+
} else {
105+
var podSummary strings.Builder
106+
for _, pod := range karpenterPods.Items {
107+
fmt.Fprintf(&podSummary, "%-44s ready=%s phase=%s node=%s\n",
108+
pod.Name, podReadyCount(pod), pod.Status.Phase, valueOrUnknown(pod.Spec.NodeName))
109+
}
110+
recordRawTextArtifact(ctx, "Karpenter pods", "kubectl get pods -n karpenter -o wide", podSummary.String())
111+
}
96112

97113
// 2. GPU NodePool exists with nvidia.com/gpu limits
98114
dynClient, err := getDynamicClient(ctx)
@@ -108,43 +124,77 @@ func CheckClusterAutoscaling(ctx *checks.ValidationContext) error {
108124
}
109125

110126
var gpuNodePoolNames []string
127+
var poolSummary strings.Builder
111128
for _, np := range nps.Items {
112129
limits, found, _ := unstructured.NestedMap(np.Object, "spec", "limits")
130+
limitGPU := "none"
113131
if found {
114-
if _, hasGPU := limits["nvidia.com/gpu"]; hasGPU {
132+
if raw, hasGPU := limits["nvidia.com/gpu"]; hasGPU {
115133
gpuNodePoolNames = append(gpuNodePoolNames, np.GetName())
134+
limitGPU = fmt.Sprintf("%v", raw)
116135
}
117136
}
137+
fmt.Fprintf(&poolSummary, "%-32s gpuLimit=%s\n", np.GetName(), limitGPU)
118138
}
139+
recordRawTextArtifact(ctx, "GPU NodePools",
140+
"kubectl get nodepools.karpenter.sh -o yaml", poolSummary.String())
119141
if len(gpuNodePoolNames) == 0 {
120142
return errors.New(errors.ErrCodeNotFound,
121143
"no NodePool with nvidia.com/gpu limits found")
122144
}
123145

124-
recordArtifact(ctx, "GPU NodePools",
146+
recordRawTextArtifact(ctx, "GPU NodePools (filtered)",
147+
"kubectl get nodepools.karpenter.sh",
125148
fmt.Sprintf("Count: %d\nNames: %s", len(gpuNodePoolNames),
126149
strings.Join(gpuNodePoolNames, ", ")))
127150
slog.Info("discovered GPU NodePools", "pools", gpuNodePoolNames)
128151

152+
gpuNodes, nodeErr := ctx.Clientset.CoreV1().Nodes().List(ctx.Context, metav1.ListOptions{
153+
LabelSelector: "nvidia.com/gpu.present=true",
154+
})
155+
if nodeErr != nil {
156+
recordRawTextArtifact(ctx, "GPU nodes",
157+
"kubectl get nodes -o custom-columns='NAME:.metadata.name,GPU:.status.capacity.nvidia.com/gpu'",
158+
fmt.Sprintf("failed to list GPU nodes: %v", nodeErr))
159+
} else {
160+
var nodeSummary strings.Builder
161+
for _, n := range gpuNodes.Items {
162+
gpuCap := n.Status.Capacity["nvidia.com/gpu"]
163+
instanceType := n.Labels["node.kubernetes.io/instance-type"]
164+
fmt.Fprintf(&nodeSummary, "%-44s gpu=%s instance=%s\n",
165+
n.Name, gpuCap.String(), valueOrUnknown(instanceType))
166+
}
167+
recordRawTextArtifact(ctx, "GPU nodes",
168+
"kubectl get nodes -o custom-columns='NAME:.metadata.name,GPU:.status.capacity.nvidia.com/gpu,INSTANCE-TYPE:.metadata.labels.node.kubernetes.io/instance-type'",
169+
nodeSummary.String())
170+
}
171+
129172
// 3. Behavioral validation: try each discovered GPU NodePool until one succeeds.
130173
// Multiple pools may exist (e.g. different GPU types) and not all may be viable
131174
// for this test workload.
132175
var lastErr error
133176
for _, poolName := range gpuNodePoolNames {
134177
slog.Info("attempting behavioral validation with NodePool", "nodePool", poolName)
135-
report, runErr := validateClusterAutoscaling(ctx.Context, ctx.Clientset, poolName)
136-
if runErr == nil {
137-
recordArtifact(ctx, "Cluster Autoscaling Behavioral Test",
138-
fmt.Sprintf("NodePool: %s\nHPA desired/current: %d/%d\nKarpenter nodes: baseline=%d observed=%d new=%d\nPods scheduled: %d/%d",
139-
report.NodePool,
140-
report.HPADesired, report.HPACurrent,
141-
report.BaselineNodes, report.ObservedNodes, report.ObservedNodes-report.BaselineNodes,
142-
report.ScheduledPods, report.TotalPods))
178+
report, validateErr := validateClusterAutoscaling(ctx.Context, ctx.Clientset, poolName)
179+
lastErr = validateErr
180+
if lastErr == nil {
181+
recordRawTextArtifact(ctx, "Apply test manifest",
182+
"kubectl apply -f docs/conformance/cncf/manifests/hpa-gpu-scale-test.yaml",
183+
fmt.Sprintf("Created namespace=%s deployment=%s hpa=%s for nodePool=%s",
184+
report.Namespace, report.DeploymentName, report.HPAName, report.NodePoolName))
185+
recordRawTextArtifact(ctx, "Cluster Autoscaling Behavioral Test",
186+
"kubectl get hpa && kubectl get nodes && kubectl get pods",
187+
fmt.Sprintf("NodePool: %s\nNamespace: %s\nHPA desired/current: %d/%d\nKarpenter nodes: baseline=%d observed=%d\nScheduled pods: %d/%d",
188+
report.NodePoolName, report.Namespace, report.HPADesiredReplicas,
189+
report.HPACurrentReplicas, report.BaselineNodeCount, report.ObservedNodeCount,
190+
report.ScheduledPodCount, report.ObservedPodCount))
191+
recordRawTextArtifact(ctx, "Delete test namespace",
192+
"kubectl delete namespace cluster-auto-test-<id> --ignore-not-found",
193+
fmt.Sprintf("Deleted namespace %s after cluster autoscaling test.", report.Namespace))
143194
return nil
144195
}
145-
lastErr = runErr
146196
slog.Debug("behavioral validation failed for NodePool",
147-
"nodePool", poolName, "error", runErr)
197+
"nodePool", poolName, "error", lastErr)
148198
}
149199
return lastErr
150200
}
@@ -154,10 +204,6 @@ func CheckClusterAutoscaling(ctx *checks.ValidationContext) error {
154204
// KWOK nodes → pods are scheduled. This proves the chain works end-to-end.
155205
// nodePoolName is the discovered GPU NodePool name from the precheck.
156206
func validateClusterAutoscaling(ctx context.Context, clientset kubernetes.Interface, nodePoolName string) (*clusterAutoscalingReport, error) {
157-
report := &clusterAutoscalingReport{
158-
NodePool: nodePoolName,
159-
}
160-
161207
// Generate unique test resource names and namespace (prevents cross-run interference).
162208
b := make([]byte, 4)
163209
if _, err := rand.Read(b); err != nil {
@@ -167,6 +213,12 @@ func validateClusterAutoscaling(ctx context.Context, clientset kubernetes.Interf
167213
nsName := clusterAutoTestPrefix + suffix
168214
deployName := clusterAutoTestPrefix + suffix
169215
hpaName := clusterAutoTestPrefix + suffix
216+
report := &clusterAutoscalingReport{
217+
NodePoolName: nodePoolName,
218+
Namespace: nsName,
219+
DeploymentName: deployName,
220+
HPAName: hpaName,
221+
}
170222

171223
// Create unique test namespace.
172224
ns := &corev1.Namespace{
@@ -197,47 +249,45 @@ func validateClusterAutoscaling(ctx context.Context, clientset kubernetes.Interf
197249
return nil, errors.Wrap(errors.ErrCodeInternal, "failed to count baseline Karpenter nodes", err)
198250
}
199251
baselineNodeCount := len(baselineNodes.Items)
200-
report.BaselineNodes = baselineNodeCount
252+
report.BaselineNodeCount = baselineNodeCount
201253
slog.Info("baseline Karpenter node count", "pool", nodePoolName, "count", baselineNodeCount)
202254

203255
// Create Deployment: GPU-requesting pods with Karpenter nodeSelector.
204256
deploy := buildClusterAutoTestDeployment(deployName, nsName, nodePoolName)
205-
_, createErr := clientset.AppsV1().Deployments(nsName).Create(
206-
ctx, deploy, metav1.CreateOptions{})
207-
if createErr != nil {
208-
return nil, errors.Wrap(errors.ErrCodeInternal, "failed to create cluster autoscaling test deployment", createErr)
257+
if _, err := clientset.AppsV1().Deployments(nsName).Create(
258+
ctx, deploy, metav1.CreateOptions{}); err != nil {
259+
return nil, errors.Wrap(errors.ErrCodeInternal, "failed to create cluster autoscaling test deployment", err)
209260
}
210261

211262
// Create HPA targeting external metric dcgm_gpu_power_usage.
212263
hpa := buildClusterAutoTestHPA(hpaName, deployName, nsName)
213-
_, createErr = clientset.AutoscalingV2().HorizontalPodAutoscalers(nsName).Create(
214-
ctx, hpa, metav1.CreateOptions{})
215-
if createErr != nil {
216-
return nil, errors.Wrap(errors.ErrCodeInternal, "failed to create cluster autoscaling test HPA", createErr)
264+
if _, err := clientset.AutoscalingV2().HorizontalPodAutoscalers(nsName).Create(
265+
ctx, hpa, metav1.CreateOptions{}); err != nil {
266+
return nil, errors.Wrap(errors.ErrCodeInternal, "failed to create cluster autoscaling test HPA", err)
217267
}
218268

219269
// Wait for HPA to report scaling intent.
220-
desired, current, err := waitForHPAScaleUp(ctx, clientset, nsName, hpaName, "cluster autoscaling")
270+
desired, current, err := waitForClusterAutoHPAScale(ctx, clientset, nsName, hpaName)
221271
if err != nil {
222272
return nil, err
223273
}
224-
report.HPADesired = desired
225-
report.HPACurrent = current
274+
report.HPADesiredReplicas = desired
275+
report.HPACurrentReplicas = current
226276

227277
// Wait for Karpenter to provision KWOK nodes (above baseline count).
228278
observedNodes, err := waitForKarpenterNodes(ctx, clientset, nodePoolName, baselineNodeCount)
229279
if err != nil {
230280
return nil, err
231281
}
232-
report.ObservedNodes = observedNodes
282+
report.ObservedNodeCount = observedNodes
233283

234284
// Verify pods are scheduled (not Pending) with poll loop.
235-
totalPods, scheduledPods, err := verifyPodsScheduled(ctx, clientset, nsName)
285+
scheduled, total, err := verifyPodsScheduled(ctx, clientset, nsName)
236286
if err != nil {
237287
return nil, err
238288
}
239-
report.TotalPods = totalPods
240-
report.ScheduledPods = scheduledPods
289+
report.ScheduledPodCount = scheduled
290+
report.ObservedPodCount = total
241291
return report, nil
242292
}
243293

@@ -350,12 +400,51 @@ func buildClusterAutoTestHPA(name, deployName, namespace string) *autoscalingv2.
350400
}
351401
}
352402

403+
// waitForClusterAutoHPAScale polls the HPA until desiredReplicas > currentReplicas.
404+
func waitForClusterAutoHPAScale(ctx context.Context, clientset kubernetes.Interface, namespace, hpaName string) (int32, int32, error) {
405+
waitCtx, cancel := context.WithTimeout(ctx, defaults.HPAScaleTimeout)
406+
defer cancel()
407+
var observedDesired, observedCurrent int32
408+
409+
err := wait.PollUntilContextCancel(waitCtx, defaults.HPAPollInterval, true,
410+
func(ctx context.Context) (bool, error) {
411+
hpa, getErr := clientset.AutoscalingV2().HorizontalPodAutoscalers(namespace).Get(
412+
ctx, hpaName, metav1.GetOptions{})
413+
if getErr != nil {
414+
slog.Debug("HPA not ready yet", "error", getErr)
415+
return false, nil
416+
}
417+
418+
desired := hpa.Status.DesiredReplicas
419+
current := hpa.Status.CurrentReplicas
420+
observedDesired = desired
421+
observedCurrent = current
422+
slog.Debug("cluster autoscaling HPA status", "desired", desired, "current", current)
423+
424+
if desired > current {
425+
slog.Info("cluster autoscaling HPA scaling intent detected",
426+
"desiredReplicas", desired, "currentReplicas", current)
427+
return true, nil
428+
}
429+
return false, nil
430+
},
431+
)
432+
if err != nil {
433+
if ctx.Err() != nil || waitCtx.Err() != nil {
434+
return 0, 0, errors.Wrap(errors.ErrCodeTimeout,
435+
"HPA did not report scaling intent — external metrics pipeline may be broken", err)
436+
}
437+
return 0, 0, errors.Wrap(errors.ErrCodeInternal, "HPA scaling intent polling failed", err)
438+
}
439+
return observedDesired, observedCurrent, nil
440+
}
441+
353442
// waitForKarpenterNodes polls until nodes with the discovered NodePool label exceed the
354443
// baseline count. This proves Karpenter provisioned NEW nodes, not just pre-existing ones.
355444
func waitForKarpenterNodes(ctx context.Context, clientset kubernetes.Interface, nodePoolName string, baselineNodeCount int) (int, error) {
356-
var observedNodeCount int
357445
waitCtx, cancel := context.WithTimeout(ctx, defaults.KarpenterNodeTimeout)
358446
defer cancel()
447+
var observedNodeCount int
359448

360449
err := wait.PollUntilContextCancel(waitCtx, defaults.KarpenterPollInterval, true,
361450
func(ctx context.Context) (bool, error) {
@@ -367,11 +456,11 @@ func waitForKarpenterNodes(ctx context.Context, clientset kubernetes.Interface,
367456
return false, nil
368457
}
369458

370-
observedNodeCount = len(nodes.Items)
371-
if observedNodeCount > baselineNodeCount {
459+
if len(nodes.Items) > baselineNodeCount {
372460
slog.Info("Karpenter provisioned new KWOK GPU node(s)",
373-
"total", observedNodeCount, "baseline", baselineNodeCount,
374-
"new", observedNodeCount-baselineNodeCount)
461+
"total", len(nodes.Items), "baseline", baselineNodeCount,
462+
"new", len(nodes.Items)-baselineNodeCount)
463+
observedNodeCount = len(nodes.Items)
375464
return true, nil
376465
}
377466
return false, nil
@@ -391,10 +480,10 @@ func waitForKarpenterNodes(ctx context.Context, clientset kubernetes.Interface,
391480
// This proves the full chain: HPA → scale → Karpenter → nodes → pods scheduled.
392481
// The namespace is unique per run, so all pods belong to this test — no stale pod interference.
393482
func verifyPodsScheduled(ctx context.Context, clientset kubernetes.Interface, namespace string) (int, int, error) {
394-
var observedTotal int
395-
var observedScheduled int
396483
waitCtx, cancel := context.WithTimeout(ctx, defaults.PodScheduleTimeout)
397484
defer cancel()
485+
var scheduledOut int
486+
var totalOut int
398487

399488
err := wait.PollUntilContextCancel(waitCtx, defaults.KarpenterPollInterval, true,
400489
func(ctx context.Context) (bool, error) {
@@ -404,26 +493,26 @@ func verifyPodsScheduled(ctx context.Context, clientset kubernetes.Interface, na
404493
return false, nil
405494
}
406495

407-
observedTotal = len(pods.Items)
408-
if observedTotal < 2 {
409-
slog.Debug("waiting for HPA-scaled pods", "count", observedTotal)
496+
if len(pods.Items) < 2 {
497+
slog.Debug("waiting for HPA-scaled pods", "count", len(pods.Items))
410498
return false, nil
411499
}
500+
totalOut = len(pods.Items)
412501

413502
var scheduled int
414503
for _, pod := range pods.Items {
415504
if pod.Status.Phase == corev1.PodRunning || pod.Status.Phase == corev1.PodSucceeded {
416505
scheduled++
417506
}
418507
}
508+
scheduledOut = scheduled
419509

420-
observedScheduled = scheduled
421510
slog.Debug("cluster autoscaling pod status",
422-
"total", observedTotal, "scheduled", observedScheduled)
511+
"total", len(pods.Items), "scheduled", scheduled)
423512

424-
if observedScheduled >= 2 {
513+
if scheduled >= 2 {
425514
slog.Info("cluster autoscaling pods verified",
426-
"total", observedTotal, "scheduled", observedScheduled)
515+
"total", len(pods.Items), "scheduled", scheduled)
427516
return true, nil
428517
}
429518
return false, nil
@@ -436,5 +525,5 @@ func verifyPodsScheduled(ctx context.Context, clientset kubernetes.Interface, na
436525
}
437526
return 0, 0, errors.Wrap(errors.ErrCodeInternal, "pod scheduling verification failed", err)
438527
}
439-
return observedTotal, observedScheduled, nil
528+
return scheduledOut, totalOut, nil
440529
}

0 commit comments

Comments
 (0)