@@ -42,6 +42,16 @@ const (
4242 karpenterNodePoolLabel = "karpenter.sh/nodepool"
4343)
4444
45+ type clusterAutoscalingReport struct {
46+ NodePool string
47+ HPADesired int32
48+ HPACurrent int32
49+ BaselineNodes int
50+ ObservedNodes int
51+ TotalPods int
52+ ScheduledPods int
53+ }
54+
4555func init () {
4656 checks .RegisterCheck (& checks.Check {
4757 Name : "cluster-autoscaling" ,
@@ -122,14 +132,19 @@ func CheckClusterAutoscaling(ctx *checks.ValidationContext) error {
122132 var lastErr error
123133 for _ , poolName := range gpuNodePoolNames {
124134 slog .Info ("attempting behavioral validation with NodePool" , "nodePool" , poolName )
125- lastErr = validateClusterAutoscaling (ctx .Context , ctx .Clientset , poolName )
126- if lastErr == nil {
135+ report , runErr := validateClusterAutoscaling (ctx .Context , ctx .Clientset , poolName )
136+ if runErr == nil {
127137 recordArtifact (ctx , "Cluster Autoscaling Behavioral Test" ,
128- fmt .Sprintf ("NodePool: %s\n HPA: scaling intent detected\n Karpenter: new node(s) provisioned\n Pods: scheduled on new nodes" , poolName ))
138+ fmt .Sprintf ("NodePool: %s\n HPA desired/current: %d/%d\n Karpenter nodes: baseline=%d observed=%d new=%d\n Pods scheduled: %d/%d" ,
139+ report .NodePool ,
140+ report .HPADesired , report .HPACurrent ,
141+ report .BaselineNodes , report .ObservedNodes , report .ObservedNodes - report .BaselineNodes ,
142+ report .ScheduledPods , report .TotalPods ))
129143 return nil
130144 }
145+ lastErr = runErr
131146 slog .Debug ("behavioral validation failed for NodePool" ,
132- "nodePool" , poolName , "error" , lastErr )
147+ "nodePool" , poolName , "error" , runErr )
133148 }
134149 return lastErr
135150}
@@ -138,11 +153,15 @@ func CheckClusterAutoscaling(ctx *checks.ValidationContext) error {
138153// Deployment + HPA (external metric) → HPA computes scale-up → Karpenter provisions
139154// KWOK nodes → pods are scheduled. This proves the chain works end-to-end.
140155// nodePoolName is the discovered GPU NodePool name from the precheck.
141- func validateClusterAutoscaling (ctx context.Context , clientset kubernetes.Interface , nodePoolName string ) error {
156+ func validateClusterAutoscaling (ctx context.Context , clientset kubernetes.Interface , nodePoolName string ) (* clusterAutoscalingReport , error ) {
157+ report := & clusterAutoscalingReport {
158+ NodePool : nodePoolName ,
159+ }
160+
142161 // Generate unique test resource names and namespace (prevents cross-run interference).
143162 b := make ([]byte , 4 )
144163 if _ , err := rand .Read (b ); err != nil {
145- return errors .Wrap (errors .ErrCodeInternal , "failed to generate random suffix" , err )
164+ return nil , errors .Wrap (errors .ErrCodeInternal , "failed to generate random suffix" , err )
146165 }
147166 suffix := hex .EncodeToString (b )
148167 nsName := clusterAutoTestPrefix + suffix
@@ -154,7 +173,7 @@ func validateClusterAutoscaling(ctx context.Context, clientset kubernetes.Interf
154173 ObjectMeta : metav1.ObjectMeta {Name : nsName },
155174 }
156175 if _ , err := clientset .CoreV1 ().Namespaces ().Create (ctx , ns , metav1.CreateOptions {}); k8s .IgnoreAlreadyExists (err ) != nil {
157- return errors .Wrap (errors .ErrCodeInternal , "failed to create cluster autoscaling test namespace" , err )
176+ return nil , errors .Wrap (errors .ErrCodeInternal , "failed to create cluster autoscaling test namespace" , err )
158177 }
159178
160179 // Cleanup: delete namespace (cascades all resources, triggers Karpenter consolidation).
@@ -175,37 +194,51 @@ func validateClusterAutoscaling(ctx context.Context, clientset kubernetes.Interf
175194 LabelSelector : fmt .Sprintf ("%s=%s" , karpenterNodePoolLabel , nodePoolName ),
176195 })
177196 if err != nil {
178- return errors .Wrap (errors .ErrCodeInternal , "failed to count baseline Karpenter nodes" , err )
197+ return nil , errors .Wrap (errors .ErrCodeInternal , "failed to count baseline Karpenter nodes" , err )
179198 }
180199 baselineNodeCount := len (baselineNodes .Items )
200+ report .BaselineNodes = baselineNodeCount
181201 slog .Info ("baseline Karpenter node count" , "pool" , nodePoolName , "count" , baselineNodeCount )
182202
183203 // Create Deployment: GPU-requesting pods with Karpenter nodeSelector.
184204 deploy := buildClusterAutoTestDeployment (deployName , nsName , nodePoolName )
185- if _ , err := clientset .AppsV1 ().Deployments (nsName ).Create (
186- ctx , deploy , metav1.CreateOptions {}); err != nil {
187- return errors .Wrap (errors .ErrCodeInternal , "failed to create cluster autoscaling test deployment" , err )
205+ _ , createErr := clientset .AppsV1 ().Deployments (nsName ).Create (
206+ ctx , deploy , metav1.CreateOptions {})
207+ if createErr != nil {
208+ return nil , errors .Wrap (errors .ErrCodeInternal , "failed to create cluster autoscaling test deployment" , createErr )
188209 }
189210
190211 // Create HPA targeting external metric dcgm_gpu_power_usage.
191212 hpa := buildClusterAutoTestHPA (hpaName , deployName , nsName )
192- if _ , err := clientset .AutoscalingV2 ().HorizontalPodAutoscalers (nsName ).Create (
193- ctx , hpa , metav1.CreateOptions {}); err != nil {
194- return errors .Wrap (errors .ErrCodeInternal , "failed to create cluster autoscaling test HPA" , err )
213+ _ , createErr = clientset .AutoscalingV2 ().HorizontalPodAutoscalers (nsName ).Create (
214+ ctx , hpa , metav1.CreateOptions {})
215+ if createErr != nil {
216+ return nil , errors .Wrap (errors .ErrCodeInternal , "failed to create cluster autoscaling test HPA" , createErr )
195217 }
196218
197219 // Wait for HPA to report scaling intent.
198- if err := waitForClusterAutoHPAScale (ctx , clientset , nsName , hpaName ); err != nil {
199- return err
220+ desired , current , err := waitForHPAScaleUp (ctx , clientset , nsName , hpaName , "cluster autoscaling" )
221+ if err != nil {
222+ return nil , err
200223 }
224+ report .HPADesired = desired
225+ report .HPACurrent = current
201226
202227 // Wait for Karpenter to provision KWOK nodes (above baseline count).
203- if err := waitForKarpenterNodes (ctx , clientset , nodePoolName , baselineNodeCount ); err != nil {
204- return err
228+ observedNodes , err := waitForKarpenterNodes (ctx , clientset , nodePoolName , baselineNodeCount )
229+ if err != nil {
230+ return nil , err
205231 }
232+ report .ObservedNodes = observedNodes
206233
207234 // Verify pods are scheduled (not Pending) with poll loop.
208- return verifyPodsScheduled (ctx , clientset , nsName )
235+ totalPods , scheduledPods , err := verifyPodsScheduled (ctx , clientset , nsName )
236+ if err != nil {
237+ return nil , err
238+ }
239+ report .TotalPods = totalPods
240+ report .ScheduledPods = scheduledPods
241+ return report , nil
209242}
210243
211244// buildClusterAutoTestDeployment creates a Deployment that requests GPU resources
@@ -317,45 +350,10 @@ func buildClusterAutoTestHPA(name, deployName, namespace string) *autoscalingv2.
317350 }
318351}
319352
320- // waitForClusterAutoHPAScale polls the HPA until desiredReplicas > currentReplicas.
321- func waitForClusterAutoHPAScale (ctx context.Context , clientset kubernetes.Interface , namespace , hpaName string ) error {
322- waitCtx , cancel := context .WithTimeout (ctx , defaults .HPAScaleTimeout )
323- defer cancel ()
324-
325- err := wait .PollUntilContextCancel (waitCtx , defaults .HPAPollInterval , true ,
326- func (ctx context.Context ) (bool , error ) {
327- hpa , getErr := clientset .AutoscalingV2 ().HorizontalPodAutoscalers (namespace ).Get (
328- ctx , hpaName , metav1.GetOptions {})
329- if getErr != nil {
330- slog .Debug ("HPA not ready yet" , "error" , getErr )
331- return false , nil
332- }
333-
334- desired := hpa .Status .DesiredReplicas
335- current := hpa .Status .CurrentReplicas
336- slog .Debug ("cluster autoscaling HPA status" , "desired" , desired , "current" , current )
337-
338- if desired > current {
339- slog .Info ("cluster autoscaling HPA scaling intent detected" ,
340- "desiredReplicas" , desired , "currentReplicas" , current )
341- return true , nil
342- }
343- return false , nil
344- },
345- )
346- if err != nil {
347- if ctx .Err () != nil || waitCtx .Err () != nil {
348- return errors .Wrap (errors .ErrCodeTimeout ,
349- "HPA did not report scaling intent — external metrics pipeline may be broken" , err )
350- }
351- return errors .Wrap (errors .ErrCodeInternal , "HPA scaling intent polling failed" , err )
352- }
353- return nil
354- }
355-
356353// waitForKarpenterNodes polls until nodes with the discovered NodePool label exceed the
357354// baseline count. This proves Karpenter provisioned NEW nodes, not just pre-existing ones.
358- func waitForKarpenterNodes (ctx context.Context , clientset kubernetes.Interface , nodePoolName string , baselineNodeCount int ) error {
355+ func waitForKarpenterNodes (ctx context.Context , clientset kubernetes.Interface , nodePoolName string , baselineNodeCount int ) (int , error ) {
356+ var observedNodeCount int
359357 waitCtx , cancel := context .WithTimeout (ctx , defaults .KarpenterNodeTimeout )
360358 defer cancel ()
361359
@@ -369,29 +367,32 @@ func waitForKarpenterNodes(ctx context.Context, clientset kubernetes.Interface,
369367 return false , nil
370368 }
371369
372- if len (nodes .Items ) > baselineNodeCount {
370+ observedNodeCount = len (nodes .Items )
371+ if observedNodeCount > baselineNodeCount {
373372 slog .Info ("Karpenter provisioned new KWOK GPU node(s)" ,
374- "total" , len ( nodes . Items ) , "baseline" , baselineNodeCount ,
375- "new" , len ( nodes . Items ) - baselineNodeCount )
373+ "total" , observedNodeCount , "baseline" , baselineNodeCount ,
374+ "new" , observedNodeCount - baselineNodeCount )
376375 return true , nil
377376 }
378377 return false , nil
379378 },
380379 )
381380 if err != nil {
382381 if ctx .Err () != nil || waitCtx .Err () != nil {
383- return errors .Wrap (errors .ErrCodeTimeout ,
382+ return 0 , errors .Wrap (errors .ErrCodeTimeout ,
384383 "Karpenter did not provision GPU nodes within timeout" , err )
385384 }
386- return errors .Wrap (errors .ErrCodeInternal , "Karpenter node polling failed" , err )
385+ return 0 , errors .Wrap (errors .ErrCodeInternal , "Karpenter node polling failed" , err )
387386 }
388- return nil
387+ return observedNodeCount , nil
389388}
390389
391390// verifyPodsScheduled polls until pods in the unique test namespace are scheduled (not Pending).
392391// This proves the full chain: HPA → scale → Karpenter → nodes → pods scheduled.
393392// The namespace is unique per run, so all pods belong to this test — no stale pod interference.
394- func verifyPodsScheduled (ctx context.Context , clientset kubernetes.Interface , namespace string ) error {
393+ func verifyPodsScheduled (ctx context.Context , clientset kubernetes.Interface , namespace string ) (int , int , error ) {
394+ var observedTotal int
395+ var observedScheduled int
395396 waitCtx , cancel := context .WithTimeout (ctx , defaults .PodScheduleTimeout )
396397 defer cancel ()
397398
@@ -403,8 +404,9 @@ func verifyPodsScheduled(ctx context.Context, clientset kubernetes.Interface, na
403404 return false , nil
404405 }
405406
406- if len (pods .Items ) < 2 {
407- slog .Debug ("waiting for HPA-scaled pods" , "count" , len (pods .Items ))
407+ observedTotal = len (pods .Items )
408+ if observedTotal < 2 {
409+ slog .Debug ("waiting for HPA-scaled pods" , "count" , observedTotal )
408410 return false , nil
409411 }
410412
@@ -415,23 +417,24 @@ func verifyPodsScheduled(ctx context.Context, clientset kubernetes.Interface, na
415417 }
416418 }
417419
420+ observedScheduled = scheduled
418421 slog .Debug ("cluster autoscaling pod status" ,
419- "total" , len ( pods . Items ) , "scheduled" , scheduled )
422+ "total" , observedTotal , "scheduled" , observedScheduled )
420423
421- if scheduled >= 2 {
424+ if observedScheduled >= 2 {
422425 slog .Info ("cluster autoscaling pods verified" ,
423- "total" , len ( pods . Items ) , "scheduled" , scheduled )
426+ "total" , observedTotal , "scheduled" , observedScheduled )
424427 return true , nil
425428 }
426429 return false , nil
427430 },
428431 )
429432 if err != nil {
430433 if ctx .Err () != nil || waitCtx .Err () != nil {
431- return errors .Wrap (errors .ErrCodeTimeout ,
434+ return 0 , 0 , errors .Wrap (errors .ErrCodeTimeout ,
432435 "test pods not scheduled within timeout — Karpenter nodes may not be ready" , err )
433436 }
434- return errors .Wrap (errors .ErrCodeInternal , "pod scheduling verification failed" , err )
437+ return 0 , 0 , errors .Wrap (errors .ErrCodeInternal , "pod scheduling verification failed" , err )
435438 }
436- return nil
439+ return observedTotal , observedScheduled , nil
437440}
0 commit comments