Skip to content

Commit 1de48e3

Browse files
authored
Feat/min max replica bounds (#858)
* feat: add min/max replicas as VA annotations with optimizer integration Add per-variant min/max replica bounds via VA annotations (wva.llmd.ai/min-replicas, wva.llmd.ai/max-replicas) and integrate them into both V1 and V2 scaling paths: - Parse bounds from VA annotations in BuildVariantStates - Respect maxReplicas in CostAwareOptimizer (spillover to next variant) - Respect minReplicas in costAwareScaleDown (hard floor per variant) - Respect maxReplicas in GreedyBySaturationOptimizer allocateForModel - Respect min/max in V1 limiter allocateForDecision - Clamp targets in V1 CalculateSaturationTargets - Disable scale-to-zero enforcement when any variant has minReplicas > 0 - Propagate bounds through VariantDecision for observability * refactor: use VA spec fields for min/max replicas instead of annotations Remove annotation-based min/max replica bounds (wva.llmd.ai/min-replicas, wva.llmd.ai/max-replicas) and read directly from VA spec.MinReplicas and spec.MaxReplicas fields added in #864. This eliminates the annotation parsing layer and aligns with the CRD as the single source of truth. 
* fix(e2e): align VA min/max replicas with test expectations - Set explicit MinReplicas=1 and MaxReplicas=10 in VA builder defaults (was implicit MinReplicas via kubebuilder default and MaxReplicas=2) - Add VAOption functional options (WithMinReplicas, WithMaxReplicas) for tests that need custom replica bounds - Scale-to-zero and scale-from-zero tests now create VAs with MinReplicas=0 so the engine allows scaling to zero replicas - MaxReplicas raised from 2 to 10 to match HPA maxReplicas and avoid artificially capping scale-up in load tests * fix: address PR review — enforce minReplicas in GreedyByScore scale-down - Pass stateMap to costAwareScaleDown in GreedyByScoreOptimizer so minReplicas is respected during scale-down (was missing) - Update doc comments: "VA annotation" → "VA spec field" in VariantDecision, VariantReplicaState, and saturation analyzer - Add tests verifying mixed-minReplicas behavior: variant with minReplicas=0 scales to zero while sibling with minReplicas>0 is preserved (CostAware and GreedyByScore)
1 parent 104073d commit 1de48e3

12 files changed

Lines changed: 437 additions & 23 deletions

internal/engines/pipeline/cost_aware_optimizer.go

Lines changed: 37 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -54,9 +54,9 @@ func (o *CostAwareOptimizer) Optimize(
5454
targets := initTargets(req.VariantStates)
5555

5656
if req.Result.RequiredCapacity > 0 {
57-
costAwareScaleUp(ctx, req.Result, targets)
57+
costAwareScaleUp(ctx, req.Result, targets, stateMap)
5858
} else if req.Result.SpareCapacity > 0 {
59-
costAwareScaleDown(ctx, req.Result, targets)
59+
costAwareScaleDown(ctx, req.Result, targets, stateMap)
6060
}
6161

6262
decisions := buildDecisionsWithOptimizer(req, stateMap, vcMap, targets, "cost-aware")
@@ -71,13 +71,13 @@ func (o *CostAwareOptimizer) Optimize(
7171

7272
// costAwareScaleUp adds replicas to the most cost-efficient variant.
7373
// Sorts by cost-efficiency (cost/perReplicaCapacity) ascending, picks first eligible.
74-
// Pending replicas are not skipped because the analyzer already accounts for their
75-
// capacity in the supply calculation — if RequiredCapacity > 0, demand exceeds total
76-
// supply including pending.
74+
// Respects maxReplicas per variant — if a variant hits its cap, remaining capacity
75+
// spills over to the next variant.
7776
func costAwareScaleUp(
7877
ctx context.Context,
7978
result *interfaces.AnalyzerResult,
8079
targets map[string]int,
80+
stateMap map[string]interfaces.VariantReplicaState,
8181
) {
8282
logger := ctrl.LoggerFrom(ctx)
8383

@@ -93,6 +93,19 @@ func costAwareScaleUp(
9393
}
9494

9595
replicasNeeded := int(math.Ceil(remaining / vc.PerReplicaCapacity))
96+
97+
// Cap by maxReplicas if set
98+
state := stateMap[vc.VariantName]
99+
if state.MaxReplicas != nil && *state.MaxReplicas > 0 {
100+
maxAdd := *state.MaxReplicas - targets[vc.VariantName]
101+
if maxAdd <= 0 {
102+
continue // already at max
103+
}
104+
if replicasNeeded > maxAdd {
105+
replicasNeeded = maxAdd
106+
}
107+
}
108+
96109
targets[vc.VariantName] = targets[vc.VariantName] + replicasNeeded
97110
remaining -= float64(replicasNeeded) * vc.PerReplicaCapacity
98111

@@ -105,20 +118,28 @@ func costAwareScaleUp(
105118

106119
// costAwareScaleDown removes replicas from the most expensive variant.
107120
// Sorts by absolute cost descending, removes from most expensive first.
121+
// Respects minReplicas per variant — will not scale below the spec-field floor.
108122
// The cheapest variant is protected at min 1 replica only when no other variant
109123
// has replicas — this prevents scale-down deadlocks where the expensive variant's
110124
// per-replica capacity exceeds spare but cheaper replicas could be removed.
111125
func costAwareScaleDown(
112126
ctx context.Context,
113127
result *interfaces.AnalyzerResult,
114128
targets map[string]int,
129+
stateMap ...map[string]interfaces.VariantReplicaState,
115130
) {
116131
logger := ctrl.LoggerFrom(ctx)
117132

118133
sorted := sortByCostDesc(result.VariantCapacities)
119134
cheapest := findCheapestVariant(result.VariantCapacities)
120135
remaining := result.SpareCapacity
121136

137+
// Build state lookup if provided
138+
var states map[string]interfaces.VariantReplicaState
139+
if len(stateMap) > 0 {
140+
states = stateMap[0]
141+
}
142+
122143
for _, vc := range sorted {
123144
if remaining <= 0 {
124145
break
@@ -128,17 +149,25 @@ func costAwareScaleDown(
128149
}
129150

130151
current := targets[vc.VariantName]
152+
153+
// Determine minReplicas: spec-field floor takes priority, then cheapest-variant logic
131154
minReplicas := 0
155+
if states != nil {
156+
if state, ok := states[vc.VariantName]; ok && state.MinReplicas != nil {
157+
minReplicas = *state.MinReplicas
158+
}
159+
}
132160
if vc.VariantName == cheapest {
133161
// Protect cheapest at 1 only if it's the last variant with replicas
162+
// and no higher spec-field min is set
134163
otherHasReplicas := false
135164
for name, t := range targets {
136165
if name != cheapest && t > 0 {
137166
otherHasReplicas = true
138167
break
139168
}
140169
}
141-
if !otherHasReplicas {
170+
if !otherHasReplicas && minReplicas < 1 {
142171
minReplicas = 1
143172
}
144173
}
@@ -273,6 +302,8 @@ func buildDecisionsWithOptimizer(
273302
TargetReplicas: target,
274303
Action: action,
275304
Reason: reason,
305+
MinReplicas: state.MinReplicas,
306+
MaxReplicas: state.MaxReplicas,
276307
})
277308
}
278309
return decisions

internal/engines/pipeline/cost_aware_optimizer_test.go

Lines changed: 115 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -373,6 +373,121 @@ var _ = Describe("CostAwareOptimizer", func() {
373373
})
374374
})
375375

376+
Context("MinReplicas/MaxReplicas Bounds", func() {
377+
intPtr := func(n int) *int { return &n }
378+
379+
It("should respect maxReplicas during scale-up (spillover to next variant)", func() {
380+
requests := []ModelScalingRequest{
381+
{
382+
ModelID: "model-1",
383+
Namespace: "default",
384+
Result: &interfaces.AnalyzerResult{
385+
RequiredCapacity: 30000,
386+
VariantCapacities: []interfaces.VariantCapacity{
387+
{VariantName: "cheap", AcceleratorName: "A100", Cost: 5.0, ReplicaCount: 1, PerReplicaCapacity: 10000},
388+
{VariantName: "expensive", AcceleratorName: "H100", Cost: 15.0, ReplicaCount: 1, PerReplicaCapacity: 20000},
389+
},
390+
},
391+
VariantStates: []interfaces.VariantReplicaState{
392+
{VariantName: "cheap", CurrentReplicas: 1, MaxReplicas: intPtr(3)},
393+
{VariantName: "expensive", CurrentReplicas: 1},
394+
},
395+
},
396+
}
397+
398+
decisions := optimizer.Optimize(ctx, requests, nil)
399+
dm := decisionMap(decisions)
400+
401+
// cheap: ceil(30000/10000)=3, but current=1 so target=1+3=4, capped by max=3 → add 2
402+
// remaining = 30000 - 2*10000 = 10000
403+
// expensive: ceil(10000/20000)=1 → target=1+1=2
404+
Expect(dm["cheap"].TargetReplicas).To(Equal(3))
405+
Expect(dm["expensive"].TargetReplicas).To(Equal(2))
406+
})
407+
408+
It("should respect minReplicas during scale-down", func() {
409+
requests := []ModelScalingRequest{
410+
{
411+
ModelID: "model-1",
412+
Namespace: "default",
413+
Result: &interfaces.AnalyzerResult{
414+
SpareCapacity: 50000,
415+
VariantCapacities: []interfaces.VariantCapacity{
416+
{VariantName: "expensive", Cost: 15.0, ReplicaCount: 3, PerReplicaCapacity: 20000},
417+
{VariantName: "cheap", Cost: 5.0, ReplicaCount: 3, PerReplicaCapacity: 10000},
418+
},
419+
},
420+
VariantStates: []interfaces.VariantReplicaState{
421+
{VariantName: "expensive", CurrentReplicas: 3, MinReplicas: intPtr(2)},
422+
{VariantName: "cheap", CurrentReplicas: 3},
423+
},
424+
},
425+
}
426+
427+
decisions := optimizer.Optimize(ctx, requests, nil)
428+
dm := decisionMap(decisions)
429+
430+
// expensive: cost DESC → tried first. min=2, removable=3-2=1. floor(50000/20000)=2 → capped to 1
431+
// remaining = 50000-20000=30000
432+
// cheap: not last variant → min=0. removable=3. floor(30000/10000)=3 → remove 3
433+
Expect(dm["expensive"].TargetReplicas).To(Equal(2))
434+
Expect(dm["cheap"].TargetReplicas).To(Equal(0))
435+
})
436+
437+
It("should scale minReplicas=0 variant to zero while keeping minReplicas>0 sibling", func() {
438+
requests := []ModelScalingRequest{
439+
{
440+
ModelID: "model-1",
441+
Namespace: "default",
442+
Result: &interfaces.AnalyzerResult{
443+
SpareCapacity: 80000, // enough to remove all
444+
VariantCapacities: []interfaces.VariantCapacity{
445+
{VariantName: "keep-alive", Cost: 15.0, ReplicaCount: 2, PerReplicaCapacity: 20000},
446+
{VariantName: "expendable", Cost: 5.0, ReplicaCount: 3, PerReplicaCapacity: 10000},
447+
},
448+
},
449+
VariantStates: []interfaces.VariantReplicaState{
450+
{VariantName: "keep-alive", CurrentReplicas: 2, MinReplicas: intPtr(1)},
451+
{VariantName: "expendable", CurrentReplicas: 3, MinReplicas: intPtr(0)},
452+
},
453+
},
454+
}
455+
456+
decisions := optimizer.Optimize(ctx, requests, nil)
457+
dm := decisionMap(decisions)
458+
459+
// keep-alive: minReplicas=1, so floor at 1
460+
Expect(dm["keep-alive"].TargetReplicas).To(Equal(1))
461+
// expendable: minReplicas=0 and other variant has replicas, so can go to 0
462+
Expect(dm["expendable"].TargetReplicas).To(Equal(0))
463+
})
464+
465+
It("should propagate MinReplicas/MaxReplicas to VariantDecision", func() {
466+
requests := []ModelScalingRequest{
467+
{
468+
ModelID: "model-1",
469+
Namespace: "default",
470+
Result: &interfaces.AnalyzerResult{
471+
RequiredCapacity: 0,
472+
SpareCapacity: 0,
473+
VariantCapacities: []interfaces.VariantCapacity{
474+
{VariantName: "v1", Cost: 5.0, ReplicaCount: 2, PerReplicaCapacity: 10000},
475+
},
476+
},
477+
VariantStates: []interfaces.VariantReplicaState{
478+
{VariantName: "v1", CurrentReplicas: 2, MinReplicas: intPtr(1), MaxReplicas: intPtr(10)},
479+
},
480+
},
481+
}
482+
483+
decisions := optimizer.Optimize(ctx, requests, nil)
484+
485+
Expect(decisions).To(HaveLen(1))
486+
Expect(decisions[0].MinReplicas).To(Equal(intPtr(1)))
487+
Expect(decisions[0].MaxReplicas).To(Equal(intPtr(10)))
488+
})
489+
})
490+
376491
Context("Helper Functions", func() {
377492

378493
It("sortByCostEfficiencyAsc should order by cost/capacity", func() {

internal/engines/pipeline/greedy_saturation_algorithm.go

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,12 +75,22 @@ func (g *GreedyBySaturation) sortByPriority(decisions []*interfaces.VariantDecis
7575

7676
// allocateForDecision attempts to allocate GPUs for a single decision.
7777
// If partial allocation, adjusts TargetReplicas accordingly.
78+
// Respects MaxReplicas (caps scale-up) and MinReplicas (floor even under GPU scarcity).
7879
func (g *GreedyBySaturation) allocateForDecision(d *interfaces.VariantDecision, allocator ResourceAllocator) {
7980
replicasNeeded := d.TargetReplicas - d.CurrentReplicas
8081
if replicasNeeded <= 0 {
8182
return
8283
}
8384

85+
// Cap by maxReplicas if set
86+
if d.MaxReplicas != nil && *d.MaxReplicas > 0 && d.TargetReplicas > *d.MaxReplicas {
87+
d.TargetReplicas = *d.MaxReplicas
88+
replicasNeeded = d.TargetReplicas - d.CurrentReplicas
89+
if replicasNeeded <= 0 {
90+
return
91+
}
92+
}
93+
8494
gpusPerReplica := d.GPUsPerReplica
8595
if gpusPerReplica <= 0 {
8696
gpusPerReplica = 1 // Default to 1 GPU per replica if not specified
@@ -99,6 +109,12 @@ func (g *GreedyBySaturation) allocateForDecision(d *interfaces.VariantDecision,
99109
d.GPUsAllocated = replicasAllocated * gpusPerReplica // Only count full replicas
100110
d.TargetReplicas = d.CurrentReplicas + replicasAllocated
101111

112+
// MinReplicas is a hard floor — even if GPU availability is insufficient,
113+
// set TargetReplicas to minReplicas (deployment may be unschedulable, but user intent is preserved).
114+
if d.MinReplicas != nil && d.TargetReplicas < *d.MinReplicas {
115+
d.TargetReplicas = *d.MinReplicas
116+
}
117+
102118
// Mark as limited if we couldn't allocate all requested
103119
if replicasAllocated < replicasNeeded {
104120
d.WasLimited = true

internal/engines/pipeline/greedy_saturation_algorithm_test.go

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -276,5 +276,52 @@ var _ = Describe("GreedyBySaturation", func() {
276276
Expect(err).NotTo(HaveOccurred())
277277
})
278278
})
279+
280+
Context("with MaxReplicas bound", func() {
281+
It("should cap scale-up at maxReplicas even when GPUs are available", func() {
282+
maxReplicas := 3
283+
allocator = &simpleAllocator{remaining: 100}
284+
decisions = []*interfaces.VariantDecision{
285+
{
286+
VariantName: "v1",
287+
CurrentReplicas: 1,
288+
TargetReplicas: 10, // wants to scale to 10
289+
GPUsPerReplica: 1,
290+
SpareCapacity: 0.0,
291+
MaxReplicas: &maxReplicas,
292+
},
293+
}
294+
295+
err := algorithm.Allocate(ctx, decisions, allocator)
296+
Expect(err).NotTo(HaveOccurred())
297+
298+
// Capped at maxReplicas=3
299+
Expect(decisions[0].TargetReplicas).To(Equal(3))
300+
})
301+
})
302+
303+
Context("with MinReplicas bound", func() {
304+
It("should enforce minReplicas floor even under GPU scarcity", func() {
305+
minReplicas := 3
306+
allocator = &simpleAllocator{remaining: 0} // no GPUs
307+
decisions = []*interfaces.VariantDecision{
308+
{
309+
VariantName: "v1",
310+
CurrentReplicas: 1,
311+
TargetReplicas: 5,
312+
GPUsPerReplica: 2,
313+
SpareCapacity: 0.0,
314+
MinReplicas: &minReplicas,
315+
},
316+
}
317+
318+
err := algorithm.Allocate(ctx, decisions, allocator)
319+
Expect(err).NotTo(HaveOccurred())
320+
321+
// MinReplicas is a hard floor
322+
Expect(decisions[0].TargetReplicas).To(Equal(3))
323+
Expect(decisions[0].WasLimited).To(BeTrue())
324+
})
325+
})
279326
})
280327
})

internal/engines/pipeline/greedy_score_optimizer.go

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -92,7 +92,7 @@ func (o *GreedyByScoreOptimizer) Optimize(
9292
targets := initTargets(req.VariantStates)
9393

9494
if req.Result.SpareCapacity > 0 {
95-
costAwareScaleDown(ctx, req.Result, targets)
95+
costAwareScaleDown(ctx, req.Result, targets, stateMap)
9696
}
9797

9898
decisions := buildDecisionsWithOptimizer(req, stateMap, vcMap, targets, "greedy-by-score")
@@ -327,6 +327,18 @@ func (o *GreedyByScoreOptimizer) allocateToVariants(
327327
if n > maxByGPU {
328328
n = maxByGPU
329329
}
330+
331+
// Cap by maxReplicas if set
332+
if state.MaxReplicas != nil && *state.MaxReplicas > 0 {
333+
maxAdd := *state.MaxReplicas - w.targets[vc.VariantName]
334+
if maxAdd <= 0 {
335+
continue // already at max
336+
}
337+
if n > maxAdd {
338+
n = maxAdd
339+
}
340+
}
341+
330342
if n <= 0 {
331343
continue
332344
}

0 commit comments

Comments
 (0)