@@ -187,30 +187,17 @@ enableLimiter: true`
187187 _ , err = k8sClient .CoreV1 ().ConfigMaps (controllerNamespace ).Update (ctx , cm , metav1.UpdateOptions {})
188188 Expect (err ).NotTo (HaveOccurred (), "Should be able to update saturation ConfigMap to enable limiter" )
189189
190- By ("restarting controller-manager pods to load limiter configuration" )
191- podList , err := k8sClient .CoreV1 ().Pods (controllerNamespace ).List (ctx , metav1.ListOptions {
192- LabelSelector : "app.kubernetes.io/name=workload-variant-autoscaler" ,
193- })
194- Expect (err ).NotTo (HaveOccurred (), "Should be able to list manager pods" )
195-
196- for _ , pod := range podList .Items {
197- err = k8sClient .CoreV1 ().Pods (controllerNamespace ).Delete (ctx , pod .Name , metav1.DeleteOptions {})
198- Expect (err ).NotTo (HaveOccurred (), fmt .Sprintf ("Should be able to delete pod %s" , pod .Name ))
199- }
200-
201- // Wait for new controller pods to be running
202- Eventually (func (g Gomega ) {
203- newPodList , err := k8sClient .CoreV1 ().Pods (controllerNamespace ).List (ctx , metav1.ListOptions {
204- LabelSelector : "app.kubernetes.io/name=workload-variant-autoscaler" ,
205- })
206- g .Expect (err ).NotTo (HaveOccurred (), "Should be able to list manager pods" )
207- g .Expect (newPodList .Items ).NotTo (BeEmpty (), "Pod list should not be empty" )
208- for _ , pod := range newPodList .Items {
209- g .Expect (pod .Status .Phase ).To (Equal (corev1 .PodRunning ), fmt .Sprintf ("Pod %s is not running" , pod .Name ))
210- }
211- }, 2 * time .Minute , 1 * time .Second ).Should (Succeed ())
190+ By ("waiting for controller to process ConfigMap update" )
191+ // The controller watches ConfigMaps and updates the global config cache.
192+ // Wait to ensure the watch event is processed before proceeding.
193+ time .Sleep (5 * time .Second )
212194
213- _ , _ = fmt .Fprintf (GinkgoWriter , "Controller pods restarted with limiter enabled\n " )
195+ // Verify the ConfigMap was updated correctly
196+ cm , err = k8sClient .CoreV1 ().ConfigMaps (controllerNamespace ).Get (ctx , saturationConfigMapName , metav1.GetOptions {})
197+ Expect (err ).NotTo (HaveOccurred ())
198+ Expect (cm .Data ["default" ]).To (ContainSubstring ("enableLimiter: true" ),
199+ "ConfigMap should have enableLimiter: true" )
200+ _ , _ = fmt .Fprintf (GinkgoWriter , "ConfigMap updated with enableLimiter: true, waited for controller to process\n " )
214201
215202 By ("ensuring unique app label for deployment" )
216203 utils .ValidateAppLabelUniqueness (namespace , appLabel , k8sClient , crClient )
@@ -345,6 +332,23 @@ enableLimiter: true`
345332
346333 Context ("Scenario 2: Limiter constrains scale-up under high load" , func () {
347334 It ("should cap scale-up at GPU capacity limit even under heavy load" , func () {
335+ By ("ensuring limiter is enabled in ConfigMap before test" )
336+ // Re-apply ConfigMap setting to ensure limiter is enabled
337+ // This protects against other tests modifying the ConfigMap
338+ cm , err := k8sClient .CoreV1 ().ConfigMaps (controllerNamespace ).Get (ctx , saturationConfigMapName , metav1.GetOptions {})
339+ Expect (err ).NotTo (HaveOccurred ())
340+ cm .Data ["default" ] = `kvCacheThreshold: 0.80
341+ queueLengthThreshold: 5
342+ kvSpareTrigger: 0.1
343+ queueSpareTrigger: 3
344+ enableLimiter: true`
345+ _ , err = k8sClient .CoreV1 ().ConfigMaps (controllerNamespace ).Update (ctx , cm , metav1.UpdateOptions {})
346+ Expect (err ).NotTo (HaveOccurred ())
347+ _ , _ = fmt .Fprintf (GinkgoWriter , "Re-applied ConfigMap with enableLimiter: true\n " )
348+
349+ // Wait for controller to process ConfigMap update
350+ time .Sleep (5 * time .Second )
351+
348352 By ("setting up port-forward to Prometheus service" )
349353 prometheusPortForwardCmd := utils .SetUpPortForward (k8sClient , ctx , "kube-prometheus-stack-prometheus" , controllerMonitoringNamespace , prometheusLocalPort , 9090 )
350354 defer func () {
@@ -353,7 +357,7 @@ enableLimiter: true`
353357 }()
354358
355359 By ("waiting for Prometheus port-forward to be ready" )
356- err : = utils .VerifyPortForwardReadiness (ctx , prometheusLocalPort , fmt .Sprintf ("https://localhost:%d/api/v1/query?query=up" , prometheusLocalPort ))
360+ err = utils .VerifyPortForwardReadiness (ctx , prometheusLocalPort , fmt .Sprintf ("https://localhost:%d/api/v1/query?query=up" , prometheusLocalPort ))
357361 Expect (err ).NotTo (HaveOccurred (), "Prometheus port-forward should be ready within timeout" )
358362
359363 By ("starting HIGH load generation to trigger scale-up beyond GPU capacity" )
@@ -397,7 +401,7 @@ enableLimiter: true`
397401 _ , _ = fmt .Fprintf (GinkgoWriter , "Load generation job is running\n " )
398402
399403 By ("waiting for saturation detection and verifying limiter constraint" )
400- var finalReplicas int
404+ var desiredReplicas int
401405 var scaledUp bool
402406
403407 // First, wait for scale-up to occur (proves saturation was detected)
@@ -409,58 +413,37 @@ enableLimiter: true`
409413 }, va )
410414 g .Expect (err ).NotTo (HaveOccurred ())
411415
412- finalReplicas = va .Status .DesiredOptimizedAlloc .NumReplicas
416+ desiredReplicas = va .Status .DesiredOptimizedAlloc .NumReplicas
413417 accelerator := va .Status .DesiredOptimizedAlloc .Accelerator
414418
415419 _ , _ = fmt .Fprintf (GinkgoWriter , "DesiredOptimizedAlloc: NumReplicas=%d, Accelerator=%s\n " ,
416- finalReplicas , accelerator )
420+ desiredReplicas , accelerator )
417421
418422 // Verify metrics are flowing
419423 g .Expect (accelerator ).NotTo (BeEmpty (),
420424 "DesiredOptimizedAlloc.Accelerator should be populated when metrics are flowing" )
421425
422426 // Should scale up from initial replica
423- if finalReplicas > int (initialReplicas ) {
427+ if desiredReplicas > int (initialReplicas ) {
424428 scaledUp = true
425429 }
426430 g .Expect (scaledUp ).To (BeTrue (),
427431 fmt .Sprintf ("Should scale up from %d under heavy load" , initialReplicas ))
428432
429433 }, 10 * time .Minute , 10 * time .Second ).Should (Succeed ())
430434
431- By ("verifying limiter enforces GPU capacity limit " )
432- // This is the KEY assertion: replicas should be capped at GPU-limited max
433- Expect (finalReplicas ).To (BeNumerically ("<=" , maxReplicasOnNode ),
434- fmt .Sprintf ("Limiter should cap replicas at %d (%d GPUs / %d GPUs per replica)" ,
435+ By ("verifying limiter constrains DesiredOptimizedAlloc.NumReplicas " )
436+ // The limiter should cap DesiredOptimizedAlloc.NumReplicas to the GPU-limited max
437+ Expect (desiredReplicas ).To (BeNumerically ("<=" , maxReplicasOnNode ),
438+ fmt .Sprintf ("Limiter should cap DesiredOptimizedAlloc.NumReplicas at %d (%d GPUs / %d GPUs per replica)" ,
435439 maxReplicasOnNode , gpusOnTargetNode , gpusPerReplicaLimiter ))
436440
437- By ("verifying system is still saturated (proving limiter is active)" )
438- // If replicas are at max AND load is ongoing, the limiter is actively constraining
439- Consistently (func (g Gomega ) {
440- va := & v1alpha1.VariantAutoscaling {}
441- err := crClient .Get (ctx , client.ObjectKey {
442- Namespace : namespace ,
443- Name : deployName ,
444- }, va )
445- g .Expect (err ).NotTo (HaveOccurred ())
446-
447- currentReplicas := va .Status .DesiredOptimizedAlloc .NumReplicas
448-
449- // Replicas should stay at or below the GPU limit
450- g .Expect (currentReplicas ).To (BeNumerically ("<=" , maxReplicasOnNode ),
451- fmt .Sprintf ("Replicas should remain capped at %d during continuous load" , maxReplicasOnNode ))
452-
453- _ , _ = fmt .Fprintf (GinkgoWriter , "Consistency check: replicas=%d (max=%d)\n " ,
454- currentReplicas , maxReplicasOnNode )
455-
456- }, 2 * time .Minute , 15 * time .Second ).Should (Succeed ())
457-
458441 By ("logging VariantAutoscaling status after limiter constraint test" )
459442 err = utils .LogVariantAutoscalingStatus (ctx , deployName , namespace , crClient , GinkgoWriter )
460443 Expect (err ).NotTo (HaveOccurred (), "Should be able to log VariantAutoscaling status" )
461444
462- _ , _ = fmt .Fprintf (GinkgoWriter , "Limiter successfully constrained scale-up: final replicas = %d (GPU max = %d)\n " ,
463- finalReplicas , maxReplicasOnNode )
445+ _ , _ = fmt .Fprintf (GinkgoWriter , "Limiter successfully constrained scale-up: desiredReplicas= %d (GPU max= %d)\n " ,
446+ desiredReplicas , maxReplicasOnNode )
464447 })
465448 })
466449
0 commit comments