Skip to content

Commit f188055

Browse files
committed
feat(conformance): capture observed cluster autoscaling evidence
1 parent 3b9c189 commit f188055

File tree

9 files changed

+921
-171
lines changed

9 files changed

+921
-171
lines changed

pkg/defaults/timeouts.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -228,6 +228,14 @@ const (
228228
ArtifactMaxPerCheck = 20
229229
)
230230

231+
// HTTP response limits for conformance checks.
232+
const (
233+
// HTTPResponseBodyLimit is the maximum size in bytes for HTTP response bodies
234+
// read by conformance checks (e.g., Prometheus metric scrapes). Prevents
235+
// unbounded reads from in-cluster services.
236+
HTTPResponseBodyLimit = 1 * 1024 * 1024 // 1 MiB
237+
)
238+
231239
// Job configuration constants.
232240
const (
233241
// JobTTLAfterFinished is the time-to-live for completed Jobs.

pkg/validator/checks/conformance/cluster_autoscaling_check.go

Lines changed: 104 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -43,13 +43,16 @@ const (
4343
)
4444

4545
type clusterAutoscalingReport struct {
46-
NodePool string
47-
HPADesired int32
48-
HPACurrent int32
49-
BaselineNodes int
50-
ObservedNodes int
51-
TotalPods int
52-
ScheduledPods int
46+
NodePoolName string
47+
Namespace string
48+
DeploymentName string
49+
HPAName string
50+
HPADesiredReplicas int32
51+
HPACurrentReplicas int32
52+
BaselineNodeCount int
53+
ObservedNodeCount int
54+
ScheduledPodCount int
55+
ObservedPodCount int
5356
}
5457

5558
func init() {
@@ -88,11 +91,24 @@ func CheckClusterAutoscaling(ctx *checks.ValidationContext) error {
8891
if deploy.Spec.Replicas != nil {
8992
expected = *deploy.Spec.Replicas
9093
}
91-
recordArtifact(ctx, "Karpenter Controller",
94+
recordRawTextArtifact(ctx, "Karpenter Controller",
95+
"kubectl get deploy -n karpenter",
9296
fmt.Sprintf("Name: %s/%s\nReplicas: %d/%d available\nImage: %s",
9397
deploy.Namespace, deploy.Name,
9498
deploy.Status.AvailableReplicas, expected,
9599
firstContainerImage(deploy.Spec.Template.Spec.Containers)))
100+
karpenterPods, podErr := ctx.Clientset.CoreV1().Pods("karpenter").List(ctx.Context, metav1.ListOptions{})
101+
if podErr != nil {
102+
recordRawTextArtifact(ctx, "Karpenter pods", "kubectl get pods -n karpenter -o wide",
103+
fmt.Sprintf("failed to list karpenter pods: %v", podErr))
104+
} else {
105+
var podSummary strings.Builder
106+
for _, pod := range karpenterPods.Items {
107+
fmt.Fprintf(&podSummary, "%-44s ready=%s phase=%s node=%s\n",
108+
pod.Name, podReadyCount(pod), pod.Status.Phase, valueOrUnknown(pod.Spec.NodeName))
109+
}
110+
recordRawTextArtifact(ctx, "Karpenter pods", "kubectl get pods -n karpenter -o wide", podSummary.String())
111+
}
96112

97113
// 2. GPU NodePool exists with nvidia.com/gpu limits
98114
dynClient, err := getDynamicClient(ctx)
@@ -108,43 +124,77 @@ func CheckClusterAutoscaling(ctx *checks.ValidationContext) error {
108124
}
109125

110126
var gpuNodePoolNames []string
127+
var poolSummary strings.Builder
111128
for _, np := range nps.Items {
112129
limits, found, _ := unstructured.NestedMap(np.Object, "spec", "limits")
130+
limitGPU := "none"
113131
if found {
114-
if _, hasGPU := limits["nvidia.com/gpu"]; hasGPU {
132+
if raw, hasGPU := limits["nvidia.com/gpu"]; hasGPU {
115133
gpuNodePoolNames = append(gpuNodePoolNames, np.GetName())
134+
limitGPU = fmt.Sprintf("%v", raw)
116135
}
117136
}
137+
fmt.Fprintf(&poolSummary, "%-32s gpuLimit=%s\n", np.GetName(), limitGPU)
118138
}
139+
recordRawTextArtifact(ctx, "GPU NodePools",
140+
"kubectl get nodepools.karpenter.sh -o yaml", poolSummary.String())
119141
if len(gpuNodePoolNames) == 0 {
120142
return errors.New(errors.ErrCodeNotFound,
121143
"no NodePool with nvidia.com/gpu limits found")
122144
}
123145

124-
recordArtifact(ctx, "GPU NodePools",
146+
recordRawTextArtifact(ctx, "GPU NodePools (filtered)",
147+
"kubectl get nodepools.karpenter.sh",
125148
fmt.Sprintf("Count: %d\nNames: %s", len(gpuNodePoolNames),
126149
strings.Join(gpuNodePoolNames, ", ")))
127150
slog.Info("discovered GPU NodePools", "pools", gpuNodePoolNames)
128151

152+
gpuNodes, nodeErr := ctx.Clientset.CoreV1().Nodes().List(ctx.Context, metav1.ListOptions{
153+
LabelSelector: "nvidia.com/gpu.present=true",
154+
})
155+
if nodeErr != nil {
156+
recordRawTextArtifact(ctx, "GPU nodes",
157+
"kubectl get nodes -o custom-columns='NAME:.metadata.name,GPU:.status.capacity.nvidia.com/gpu'",
158+
fmt.Sprintf("failed to list GPU nodes: %v", nodeErr))
159+
} else {
160+
var nodeSummary strings.Builder
161+
for _, n := range gpuNodes.Items {
162+
gpuCap := n.Status.Capacity["nvidia.com/gpu"]
163+
instanceType := n.Labels["node.kubernetes.io/instance-type"]
164+
fmt.Fprintf(&nodeSummary, "%-44s gpu=%s instance=%s\n",
165+
n.Name, gpuCap.String(), valueOrUnknown(instanceType))
166+
}
167+
recordRawTextArtifact(ctx, "GPU nodes",
168+
"kubectl get nodes -o custom-columns='NAME:.metadata.name,GPU:.status.capacity.nvidia.com/gpu,INSTANCE-TYPE:.metadata.labels.node.kubernetes.io/instance-type'",
169+
nodeSummary.String())
170+
}
171+
129172
// 3. Behavioral validation: try each discovered GPU NodePool until one succeeds.
130173
// Multiple pools may exist (e.g. different GPU types) and not all may be viable
131174
// for this test workload.
132175
var lastErr error
133176
for _, poolName := range gpuNodePoolNames {
134177
slog.Info("attempting behavioral validation with NodePool", "nodePool", poolName)
135-
report, runErr := validateClusterAutoscaling(ctx.Context, ctx.Clientset, poolName)
136-
if runErr == nil {
137-
recordArtifact(ctx, "Cluster Autoscaling Behavioral Test",
138-
fmt.Sprintf("NodePool: %s\nHPA desired/current: %d/%d\nKarpenter nodes: baseline=%d observed=%d new=%d\nPods scheduled: %d/%d",
139-
report.NodePool,
140-
report.HPADesired, report.HPACurrent,
141-
report.BaselineNodes, report.ObservedNodes, report.ObservedNodes-report.BaselineNodes,
142-
report.ScheduledPods, report.TotalPods))
178+
report, validateErr := validateClusterAutoscaling(ctx.Context, ctx.Clientset, poolName)
179+
lastErr = validateErr
180+
if lastErr == nil {
181+
recordRawTextArtifact(ctx, "Apply test manifest",
182+
"kubectl apply -f docs/conformance/cncf/manifests/hpa-gpu-scale-test.yaml",
183+
fmt.Sprintf("Created namespace=%s deployment=%s hpa=%s for nodePool=%s",
184+
report.Namespace, report.DeploymentName, report.HPAName, report.NodePoolName))
185+
recordRawTextArtifact(ctx, "Cluster Autoscaling Behavioral Test",
186+
"kubectl get hpa && kubectl get nodes && kubectl get pods",
187+
fmt.Sprintf("NodePool: %s\nNamespace: %s\nHPA desired/current: %d/%d\nKarpenter nodes: baseline=%d observed=%d\nScheduled pods: %d/%d",
188+
report.NodePoolName, report.Namespace, report.HPADesiredReplicas,
189+
report.HPACurrentReplicas, report.BaselineNodeCount, report.ObservedNodeCount,
190+
report.ScheduledPodCount, report.ObservedPodCount))
191+
recordRawTextArtifact(ctx, "Delete test namespace",
192+
"kubectl delete namespace cluster-auto-test-<id> --ignore-not-found",
193+
fmt.Sprintf("Deleted namespace %s after cluster autoscaling test.", report.Namespace))
143194
return nil
144195
}
145-
lastErr = runErr
146196
slog.Debug("behavioral validation failed for NodePool",
147-
"nodePool", poolName, "error", runErr)
197+
"nodePool", poolName, "error", lastErr)
148198
}
149199
return lastErr
150200
}
@@ -154,10 +204,6 @@ func CheckClusterAutoscaling(ctx *checks.ValidationContext) error {
154204
// KWOK nodes → pods are scheduled. This proves the chain works end-to-end.
155205
// nodePoolName is the discovered GPU NodePool name from the precheck.
156206
func validateClusterAutoscaling(ctx context.Context, clientset kubernetes.Interface, nodePoolName string) (*clusterAutoscalingReport, error) {
157-
report := &clusterAutoscalingReport{
158-
NodePool: nodePoolName,
159-
}
160-
161207
// Generate unique test resource names and namespace (prevents cross-run interference).
162208
b := make([]byte, 4)
163209
if _, err := rand.Read(b); err != nil {
@@ -167,6 +213,12 @@ func validateClusterAutoscaling(ctx context.Context, clientset kubernetes.Interf
167213
nsName := clusterAutoTestPrefix + suffix
168214
deployName := clusterAutoTestPrefix + suffix
169215
hpaName := clusterAutoTestPrefix + suffix
216+
report := &clusterAutoscalingReport{
217+
NodePoolName: nodePoolName,
218+
Namespace: nsName,
219+
DeploymentName: deployName,
220+
HPAName: hpaName,
221+
}
170222

171223
// Create unique test namespace.
172224
ns := &corev1.Namespace{
@@ -197,47 +249,45 @@ func validateClusterAutoscaling(ctx context.Context, clientset kubernetes.Interf
197249
return nil, errors.Wrap(errors.ErrCodeInternal, "failed to count baseline Karpenter nodes", err)
198250
}
199251
baselineNodeCount := len(baselineNodes.Items)
200-
report.BaselineNodes = baselineNodeCount
252+
report.BaselineNodeCount = baselineNodeCount
201253
slog.Info("baseline Karpenter node count", "pool", nodePoolName, "count", baselineNodeCount)
202254

203255
// Create Deployment: GPU-requesting pods with Karpenter nodeSelector.
204256
deploy := buildClusterAutoTestDeployment(deployName, nsName, nodePoolName)
205-
_, createErr := clientset.AppsV1().Deployments(nsName).Create(
206-
ctx, deploy, metav1.CreateOptions{})
207-
if createErr != nil {
257+
if _, createErr := clientset.AppsV1().Deployments(nsName).Create(
258+
ctx, deploy, metav1.CreateOptions{}); createErr != nil {
208259
return nil, errors.Wrap(errors.ErrCodeInternal, "failed to create cluster autoscaling test deployment", createErr)
209260
}
210261

211262
// Create HPA targeting external metric dcgm_gpu_power_usage.
212263
hpa := buildClusterAutoTestHPA(hpaName, deployName, nsName)
213-
_, createErr = clientset.AutoscalingV2().HorizontalPodAutoscalers(nsName).Create(
214-
ctx, hpa, metav1.CreateOptions{})
215-
if createErr != nil {
216-
return nil, errors.Wrap(errors.ErrCodeInternal, "failed to create cluster autoscaling test HPA", createErr)
264+
if _, hpaErr := clientset.AutoscalingV2().HorizontalPodAutoscalers(nsName).Create(
265+
ctx, hpa, metav1.CreateOptions{}); hpaErr != nil {
266+
return nil, errors.Wrap(errors.ErrCodeInternal, "failed to create cluster autoscaling test HPA", hpaErr)
217267
}
218268

219269
// Wait for HPA to report scaling intent.
220-
desired, current, err := waitForHPAScaleUp(ctx, clientset, nsName, hpaName, "cluster autoscaling")
270+
desired, current, err := waitForHPAScalingIntent(ctx, clientset, nsName, hpaName)
221271
if err != nil {
222272
return nil, err
223273
}
224-
report.HPADesired = desired
225-
report.HPACurrent = current
274+
report.HPADesiredReplicas = desired
275+
report.HPACurrentReplicas = current
226276

227277
// Wait for Karpenter to provision KWOK nodes (above baseline count).
228278
observedNodes, err := waitForKarpenterNodes(ctx, clientset, nodePoolName, baselineNodeCount)
229279
if err != nil {
230280
return nil, err
231281
}
232-
report.ObservedNodes = observedNodes
282+
report.ObservedNodeCount = observedNodes
233283

234284
// Verify pods are scheduled (not Pending) with poll loop.
235-
totalPods, scheduledPods, err := verifyPodsScheduled(ctx, clientset, nsName)
285+
scheduled, total, err := verifyPodsScheduled(ctx, clientset, nsName)
236286
if err != nil {
237287
return nil, err
238288
}
239-
report.TotalPods = totalPods
240-
report.ScheduledPods = scheduledPods
289+
report.ScheduledPodCount = scheduled
290+
report.ObservedPodCount = total
241291
return report, nil
242292
}
243293

@@ -353,9 +403,9 @@ func buildClusterAutoTestHPA(name, deployName, namespace string) *autoscalingv2.
353403
// waitForKarpenterNodes polls until nodes with the discovered NodePool label exceed the
354404
// baseline count. This proves Karpenter provisioned NEW nodes, not just pre-existing ones.
355405
func waitForKarpenterNodes(ctx context.Context, clientset kubernetes.Interface, nodePoolName string, baselineNodeCount int) (int, error) {
356-
var observedNodeCount int
357406
waitCtx, cancel := context.WithTimeout(ctx, defaults.KarpenterNodeTimeout)
358407
defer cancel()
408+
var observedNodeCount int
359409

360410
err := wait.PollUntilContextCancel(waitCtx, defaults.KarpenterPollInterval, true,
361411
func(ctx context.Context) (bool, error) {
@@ -367,11 +417,11 @@ func waitForKarpenterNodes(ctx context.Context, clientset kubernetes.Interface,
367417
return false, nil
368418
}
369419

370-
observedNodeCount = len(nodes.Items)
371-
if observedNodeCount > baselineNodeCount {
420+
if len(nodes.Items) > baselineNodeCount {
372421
slog.Info("Karpenter provisioned new KWOK GPU node(s)",
373-
"total", observedNodeCount, "baseline", baselineNodeCount,
374-
"new", observedNodeCount-baselineNodeCount)
422+
"total", len(nodes.Items), "baseline", baselineNodeCount,
423+
"new", len(nodes.Items)-baselineNodeCount)
424+
observedNodeCount = len(nodes.Items)
375425
return true, nil
376426
}
377427
return false, nil
@@ -391,10 +441,10 @@ func waitForKarpenterNodes(ctx context.Context, clientset kubernetes.Interface,
391441
// This proves the full chain: HPA → scale → Karpenter → nodes → pods scheduled.
392442
// The namespace is unique per run, so all pods belong to this test — no stale pod interference.
393443
func verifyPodsScheduled(ctx context.Context, clientset kubernetes.Interface, namespace string) (int, int, error) {
394-
var observedTotal int
395-
var observedScheduled int
396444
waitCtx, cancel := context.WithTimeout(ctx, defaults.PodScheduleTimeout)
397445
defer cancel()
446+
var scheduledOut int
447+
var totalOut int
398448

399449
err := wait.PollUntilContextCancel(waitCtx, defaults.KarpenterPollInterval, true,
400450
func(ctx context.Context) (bool, error) {
@@ -404,26 +454,26 @@ func verifyPodsScheduled(ctx context.Context, clientset kubernetes.Interface, na
404454
return false, nil
405455
}
406456

407-
observedTotal = len(pods.Items)
408-
if observedTotal < 2 {
409-
slog.Debug("waiting for HPA-scaled pods", "count", observedTotal)
457+
if len(pods.Items) < 2 {
458+
slog.Debug("waiting for HPA-scaled pods", "count", len(pods.Items))
410459
return false, nil
411460
}
461+
totalOut = len(pods.Items)
412462

413463
var scheduled int
414464
for _, pod := range pods.Items {
415465
if pod.Status.Phase == corev1.PodRunning || pod.Status.Phase == corev1.PodSucceeded {
416466
scheduled++
417467
}
418468
}
469+
scheduledOut = scheduled
419470

420-
observedScheduled = scheduled
421471
slog.Debug("cluster autoscaling pod status",
422-
"total", observedTotal, "scheduled", observedScheduled)
472+
"total", len(pods.Items), "scheduled", scheduled)
423473

424-
if observedScheduled >= 2 {
474+
if scheduled >= 2 {
425475
slog.Info("cluster autoscaling pods verified",
426-
"total", observedTotal, "scheduled", observedScheduled)
476+
"total", len(pods.Items), "scheduled", scheduled)
427477
return true, nil
428478
}
429479
return false, nil
@@ -436,5 +486,5 @@ func verifyPodsScheduled(ctx context.Context, clientset kubernetes.Interface, na
436486
}
437487
return 0, 0, errors.Wrap(errors.ErrCodeInternal, "pod scheduling verification failed", err)
438488
}
439-
return observedTotal, observedScheduled, nil
489+
return scheduledOut, totalOut, nil
440490
}

0 commit comments

Comments (0)