@@ -357,7 +357,7 @@ enableLimiter: true`
357357 }()
358358
359359 By ("waiting for Prometheus port-forward to be ready" )
360- err = utils .VerifyPortForwardReadiness (ctx , prometheusLocalPort , fmt .Sprintf ("https://localhost:%d/api/v1/query?query=up" , prometheusLocalPort ))
360+ err : = utils .VerifyPortForwardReadiness (ctx , prometheusLocalPort , fmt .Sprintf ("https://localhost:%d/api/v1/query?query=up" , prometheusLocalPort ))
361361 Expect (err ).NotTo (HaveOccurred (), "Prometheus port-forward should be ready within timeout" )
362362
363363 By ("starting HIGH load generation to trigger scale-up beyond GPU capacity" )
@@ -402,7 +402,6 @@ enableLimiter: true`
402402
403403 By ("waiting for saturation detection and verifying limiter constraint" )
404404 var desiredReplicas int
405- var scaledUp bool
406405
407406 // First, wait for scale-up to occur (proves saturation was detected)
408407 Eventually (func (g Gomega ) {
@@ -423,12 +422,9 @@ enableLimiter: true`
423422 g .Expect (accelerator ).NotTo (BeEmpty (),
424423 "DesiredOptimizedAlloc.Accelerator should be populated when metrics are flowing" )
425424
426- // Should scale up from initial replica
427- if desiredReplicas > int (initialReplicas ) {
428- scaledUp = true
429- }
430- g .Expect (scaledUp ).To (BeTrue (),
431- fmt .Sprintf ("Should scale up from %d under heavy load" , initialReplicas ))
425+ // Should scale up from initial 1 replica due to saturation
426+ g .Expect (desiredReplicas ).To (BeNumerically (">" , int (initialReplicas )),
427+ fmt .Sprintf ("Should scale up from %d under load" , initialReplicas ))
432428
433429 }, 10 * time .Minute , 10 * time .Second ).Should (Succeed ())
434430
@@ -438,8 +434,19 @@ enableLimiter: true`
438434 fmt .Sprintf ("Limiter should cap DesiredOptimizedAlloc.NumReplicas at %d (%d GPUs / %d GPUs per replica)" ,
439435 maxReplicasOnNode , gpusOnTargetNode , gpusPerReplicaLimiter ))
440436
441- By ("logging VariantAutoscaling status after limiter constraint test" )
442- err = utils .LogVariantAutoscalingStatus (ctx , deployName , namespace , crClient , GinkgoWriter )
437+ desiredReplicas = va .Status .DesiredOptimizedAlloc .NumReplicas
438+ _ , _ = fmt .Fprintf (GinkgoWriter , "Checking DesiredOptimizedAlloc.NumReplicas=%d against max=%d\n " ,
439+ desiredReplicas , maxReplicasOnNode )
440+
441+ // Final replicas should not exceed max allowed by GPU capacity
442+ g .Expect (desiredReplicas ).To (BeNumerically ("<=" , maxReplicasOnNode ),
443+ fmt .Sprintf ("Final replicas %d should be less than or equal to max %d due to GPU limiter" ,
444+ desiredReplicas , maxReplicasOnNode ))
445+
446+ }, 2 * time .Minute , 10 * time .Second ).Should (Succeed ())
447+
448+ By ("logging VariantAutoscaling status after scale-up" )
449+ err = utils .LogVariantAutoscalingStatus (ctx , name , namespace , crClient , GinkgoWriter )
443450 Expect (err ).NotTo (HaveOccurred (), "Should be able to log VariantAutoscaling status" )
444451
445452 _ , _ = fmt .Fprintf (GinkgoWriter , "Limiter successfully constrained scale-up: desiredReplicas=%d (GPU max=%d)\n " ,
0 commit comments