Skip to content

Commit 1de48e3

Browse files
authored
Feat/min max replica bounds (#858)
* feat: add min/max replicas as VA annotations with optimizer integration Add per-variant min/max replica bounds via VA annotations (wva.llmd.ai/min-replicas, wva.llmd.ai/max-replicas) and integrate them into both V1 and V2 scaling paths: - Parse bounds from VA annotations in BuildVariantStates - Respect maxReplicas in CostAwareOptimizer (spillover to next variant) - Respect minReplicas in costAwareScaleDown (hard floor per variant) - Respect maxReplicas in GreedyBySaturationOptimizer allocateForModel - Respect min/max in V1 limiter allocateForDecision - Clamp targets in V1 CalculateSaturationTargets - Disable scale-to-zero enforcement when any variant has minReplicas > 0 - Propagate bounds through VariantDecision for observability * refactor: use VA spec fields for min/max replicas instead of annotations Remove annotation-based min/max replica bounds (wva.llmd.ai/min-replicas, wva.llmd.ai/max-replicas) and read directly from VA spec.MinReplicas and spec.MaxReplicas fields added in #864. This eliminates the annotation parsing layer and aligns with the CRD as the single source of truth. 
* fix(e2e): align VA min/max replicas with test expectations - Set explicit MinReplicas=1 and MaxReplicas=10 in VA builder defaults (was implicit MinReplicas via kubebuilder default and MaxReplicas=2) - Add VAOption functional options (WithMinReplicas, WithMaxReplicas) for tests that need custom replica bounds - Scale-to-zero and scale-from-zero tests now create VAs with MinReplicas=0 so the engine allows scaling to zero replicas - MaxReplicas raised from 2 to 10 to match HPA maxReplicas and avoid artificially capping scale-up in load tests * fix: address PR review — enforce minReplicas in GreedyByScore scale-down - Pass stateMap to costAwareScaleDown in GreedyByScoreOptimizer so minReplicas is respected during scale-down (was missing) - Update doc comments: "VA annotation" → "VA spec field" in VariantDecision, VariantReplicaState, and saturation analyzer - Add tests verifying mixed-minReplicas behavior: variant with minReplicas=0 scales to zero while sibling with minReplicas>0 is preserved (CostAware and GreedyByScore)
1 parent 104073d commit 1de48e3

12 files changed

Lines changed: 437 additions & 23 deletions

internal/engines/pipeline/cost_aware_optimizer.go

Lines changed: 37 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -54,9 +54,9 @@ func (o *CostAwareOptimizer) Optimize(
5454
targets := initTargets(req.VariantStates)
5555

5656
if req.Result.RequiredCapacity > 0 {
57-
costAwareScaleUp(ctx, req.Result, targets)
57+
costAwareScaleUp(ctx, req.Result, targets, stateMap)
5858
} else if req.Result.SpareCapacity > 0 {
59-
costAwareScaleDown(ctx, req.Result, targets)
59+
costAwareScaleDown(ctx, req.Result, targets, stateMap)
6060
}
6161

6262
decisions := buildDecisionsWithOptimizer(req, stateMap, vcMap, targets, "cost-aware")
@@ -71,13 +71,13 @@ func (o *CostAwareOptimizer) Optimize(
7171

7272
// costAwareScaleUp adds replicas to the most cost-efficient variant.
7373
// Sorts by cost-efficiency (cost/perReplicaCapacity) ascending, picks first eligible.
74-
// Pending replicas are not skipped because the analyzer already accounts for their
75-
// capacity in the supply calculation — if RequiredCapacity > 0, demand exceeds total
76-
// supply including pending.
74+
// Respects maxReplicas per variant — if a variant hits its cap, remaining capacity
75+
// spills over to the next variant.
7776
func costAwareScaleUp(
7877
ctx context.Context,
7978
result *interfaces.AnalyzerResult,
8079
targets map[string]int,
80+
stateMap map[string]interfaces.VariantReplicaState,
8181
) {
8282
logger := ctrl.LoggerFrom(ctx)
8383

@@ -93,6 +93,19 @@ func costAwareScaleUp(
9393
}
9494

9595
replicasNeeded := int(math.Ceil(remaining / vc.PerReplicaCapacity))
96+
97+
// Cap by maxReplicas if set
98+
state := stateMap[vc.VariantName]
99+
if state.MaxReplicas != nil && *state.MaxReplicas > 0 {
100+
maxAdd := *state.MaxReplicas - targets[vc.VariantName]
101+
if maxAdd <= 0 {
102+
continue // already at max
103+
}
104+
if replicasNeeded > maxAdd {
105+
replicasNeeded = maxAdd
106+
}
107+
}
108+
96109
targets[vc.VariantName] = targets[vc.VariantName] + replicasNeeded
97110
remaining -= float64(replicasNeeded) * vc.PerReplicaCapacity
98111

@@ -105,20 +118,28 @@ func costAwareScaleUp(
105118

106119
// costAwareScaleDown removes replicas from the most expensive variant.
107120
// Sorts by absolute cost descending, removes from most expensive first.
121+
// Respects minReplicas per variant — will not scale below the spec-field floor.
108122
// The cheapest variant is protected at min 1 replica only when no other variant
109123
// has replicas — this prevents scale-down deadlocks where the expensive variant's
110124
// per-replica capacity exceeds spare but cheaper replicas could be removed.
111125
func costAwareScaleDown(
112126
ctx context.Context,
113127
result *interfaces.AnalyzerResult,
114128
targets map[string]int,
129+
stateMap ...map[string]interfaces.VariantReplicaState,
115130
) {
116131
logger := ctrl.LoggerFrom(ctx)
117132

118133
sorted := sortByCostDesc(result.VariantCapacities)
119134
cheapest := findCheapestVariant(result.VariantCapacities)
120135
remaining := result.SpareCapacity
121136

137+
// Build state lookup if provided
138+
var states map[string]interfaces.VariantReplicaState
139+
if len(stateMap) > 0 {
140+
states = stateMap[0]
141+
}
142+
122143
for _, vc := range sorted {
123144
if remaining <= 0 {
124145
break
@@ -128,17 +149,25 @@ func costAwareScaleDown(
128149
}
129150

130151
current := targets[vc.VariantName]
152+
153+
// Determine minReplicas: spec-field floor takes priority, then cheapest-variant logic
131154
minReplicas := 0
155+
if states != nil {
156+
if state, ok := states[vc.VariantName]; ok && state.MinReplicas != nil {
157+
minReplicas = *state.MinReplicas
158+
}
159+
}
132160
if vc.VariantName == cheapest {
133161
// Protect cheapest at 1 only if it's the last variant with replicas
162+
// and no higher spec-field min is set
134163
otherHasReplicas := false
135164
for name, t := range targets {
136165
if name != cheapest && t > 0 {
137166
otherHasReplicas = true
138167
break
139168
}
140169
}
141-
if !otherHasReplicas {
170+
if !otherHasReplicas && minReplicas < 1 {
142171
minReplicas = 1
143172
}
144173
}
@@ -273,6 +302,8 @@ func buildDecisionsWithOptimizer(
273302
TargetReplicas: target,
274303
Action: action,
275304
Reason: reason,
305+
MinReplicas: state.MinReplicas,
306+
MaxReplicas: state.MaxReplicas,
276307
})
277308
}
278309
return decisions

internal/engines/pipeline/cost_aware_optimizer_test.go

Lines changed: 115 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -373,6 +373,121 @@ var _ = Describe("CostAwareOptimizer", func() {
373373
})
374374
})
375375

376+
Context("MinReplicas/MaxReplicas Bounds", func() {
377+
intPtr := func(n int) *int { return &n }
378+
379+
It("should respect maxReplicas during scale-up (spillover to next variant)", func() {
380+
requests := []ModelScalingRequest{
381+
{
382+
ModelID: "model-1",
383+
Namespace: "default",
384+
Result: &interfaces.AnalyzerResult{
385+
RequiredCapacity: 30000,
386+
VariantCapacities: []interfaces.VariantCapacity{
387+
{VariantName: "cheap", AcceleratorName: "A100", Cost: 5.0, ReplicaCount: 1, PerReplicaCapacity: 10000},
388+
{VariantName: "expensive", AcceleratorName: "H100", Cost: 15.0, ReplicaCount: 1, PerReplicaCapacity: 20000},
389+
},
390+
},
391+
VariantStates: []interfaces.VariantReplicaState{
392+
{VariantName: "cheap", CurrentReplicas: 1, MaxReplicas: intPtr(3)},
393+
{VariantName: "expensive", CurrentReplicas: 1},
394+
},
395+
},
396+
}
397+
398+
decisions := optimizer.Optimize(ctx, requests, nil)
399+
dm := decisionMap(decisions)
400+
401+
// cheap: ceil(30000/10000)=3, but current=1 so target=1+3=4, capped by max=3 → add 2
402+
// remaining = 30000 - 2*10000 = 10000
403+
// expensive: ceil(10000/20000)=1 → target=1+1=2
404+
Expect(dm["cheap"].TargetReplicas).To(Equal(3))
405+
Expect(dm["expensive"].TargetReplicas).To(Equal(2))
406+
})
407+
408+
It("should respect minReplicas during scale-down", func() {
409+
requests := []ModelScalingRequest{
410+
{
411+
ModelID: "model-1",
412+
Namespace: "default",
413+
Result: &interfaces.AnalyzerResult{
414+
SpareCapacity: 50000,
415+
VariantCapacities: []interfaces.VariantCapacity{
416+
{VariantName: "expensive", Cost: 15.0, ReplicaCount: 3, PerReplicaCapacity: 20000},
417+
{VariantName: "cheap", Cost: 5.0, ReplicaCount: 3, PerReplicaCapacity: 10000},
418+
},
419+
},
420+
VariantStates: []interfaces.VariantReplicaState{
421+
{VariantName: "expensive", CurrentReplicas: 3, MinReplicas: intPtr(2)},
422+
{VariantName: "cheap", CurrentReplicas: 3},
423+
},
424+
},
425+
}
426+
427+
decisions := optimizer.Optimize(ctx, requests, nil)
428+
dm := decisionMap(decisions)
429+
430+
// expensive: cost DESC → tried first. min=2, removable=3-2=1. floor(50000/20000)=2 → capped to 1
431+
// remaining = 50000-20000=30000
432+
// cheap: not last variant → min=0. removable=3. floor(30000/10000)=3 → remove 3
433+
Expect(dm["expensive"].TargetReplicas).To(Equal(2))
434+
Expect(dm["cheap"].TargetReplicas).To(Equal(0))
435+
})
436+
437+
It("should scale minReplicas=0 variant to zero while keeping minReplicas>0 sibling", func() {
438+
requests := []ModelScalingRequest{
439+
{
440+
ModelID: "model-1",
441+
Namespace: "default",
442+
Result: &interfaces.AnalyzerResult{
443+
SpareCapacity: 80000, // enough to remove all
444+
VariantCapacities: []interfaces.VariantCapacity{
445+
{VariantName: "keep-alive", Cost: 15.0, ReplicaCount: 2, PerReplicaCapacity: 20000},
446+
{VariantName: "expendable", Cost: 5.0, ReplicaCount: 3, PerReplicaCapacity: 10000},
447+
},
448+
},
449+
VariantStates: []interfaces.VariantReplicaState{
450+
{VariantName: "keep-alive", CurrentReplicas: 2, MinReplicas: intPtr(1)},
451+
{VariantName: "expendable", CurrentReplicas: 3, MinReplicas: intPtr(0)},
452+
},
453+
},
454+
}
455+
456+
decisions := optimizer.Optimize(ctx, requests, nil)
457+
dm := decisionMap(decisions)
458+
459+
// keep-alive: minReplicas=1, so floor at 1
460+
Expect(dm["keep-alive"].TargetReplicas).To(Equal(1))
461+
// expendable: minReplicas=0 and other variant has replicas, so can go to 0
462+
Expect(dm["expendable"].TargetReplicas).To(Equal(0))
463+
})
464+
465+
It("should propagate MinReplicas/MaxReplicas to VariantDecision", func() {
466+
requests := []ModelScalingRequest{
467+
{
468+
ModelID: "model-1",
469+
Namespace: "default",
470+
Result: &interfaces.AnalyzerResult{
471+
RequiredCapacity: 0,
472+
SpareCapacity: 0,
473+
VariantCapacities: []interfaces.VariantCapacity{
474+
{VariantName: "v1", Cost: 5.0, ReplicaCount: 2, PerReplicaCapacity: 10000},
475+
},
476+
},
477+
VariantStates: []interfaces.VariantReplicaState{
478+
{VariantName: "v1", CurrentReplicas: 2, MinReplicas: intPtr(1), MaxReplicas: intPtr(10)},
479+
},
480+
},
481+
}
482+
483+
decisions := optimizer.Optimize(ctx, requests, nil)
484+
485+
Expect(decisions).To(HaveLen(1))
486+
Expect(decisions[0].MinReplicas).To(Equal(intPtr(1)))
487+
Expect(decisions[0].MaxReplicas).To(Equal(intPtr(10)))
488+
})
489+
})
490+
376491
Context("Helper Functions", func() {
377492

378493
It("sortByCostEfficiencyAsc should order by cost/capacity", func() {

internal/engines/pipeline/greedy_saturation_algorithm.go

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,12 +75,22 @@ func (g *GreedyBySaturation) sortByPriority(decisions []*interfaces.VariantDecis
7575

7676
// allocateForDecision attempts to allocate GPUs for a single decision.
7777
// If partial allocation, adjusts TargetReplicas accordingly.
78+
// Respects MaxReplicas (caps scale-up) and MinReplicas (floor even under GPU scarcity).
7879
func (g *GreedyBySaturation) allocateForDecision(d *interfaces.VariantDecision, allocator ResourceAllocator) {
7980
replicasNeeded := d.TargetReplicas - d.CurrentReplicas
8081
if replicasNeeded <= 0 {
8182
return
8283
}
8384

85+
// Cap by maxReplicas if set
86+
if d.MaxReplicas != nil && *d.MaxReplicas > 0 && d.TargetReplicas > *d.MaxReplicas {
87+
d.TargetReplicas = *d.MaxReplicas
88+
replicasNeeded = d.TargetReplicas - d.CurrentReplicas
89+
if replicasNeeded <= 0 {
90+
return
91+
}
92+
}
93+
8494
gpusPerReplica := d.GPUsPerReplica
8595
if gpusPerReplica <= 0 {
8696
gpusPerReplica = 1 // Default to 1 GPU per replica if not specified
@@ -99,6 +109,12 @@ func (g *GreedyBySaturation) allocateForDecision(d *interfaces.VariantDecision,
99109
d.GPUsAllocated = replicasAllocated * gpusPerReplica // Only count full replicas
100110
d.TargetReplicas = d.CurrentReplicas + replicasAllocated
101111

112+
// MinReplicas is a hard floor — even if GPU availability is insufficient,
113+
// set TargetReplicas to minReplicas (deployment may be unschedulable, but user intent is preserved).
114+
if d.MinReplicas != nil && d.TargetReplicas < *d.MinReplicas {
115+
d.TargetReplicas = *d.MinReplicas
116+
}
117+
102118
// Mark as limited if we couldn't allocate all requested
103119
if replicasAllocated < replicasNeeded {
104120
d.WasLimited = true

internal/engines/pipeline/greedy_saturation_algorithm_test.go

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -276,5 +276,52 @@ var _ = Describe("GreedyBySaturation", func() {
276276
Expect(err).NotTo(HaveOccurred())
277277
})
278278
})
279+
280+
Context("with MaxReplicas bound", func() {
281+
It("should cap scale-up at maxReplicas even when GPUs are available", func() {
282+
maxReplicas := 3
283+
allocator = &simpleAllocator{remaining: 100}
284+
decisions = []*interfaces.VariantDecision{
285+
{
286+
VariantName: "v1",
287+
CurrentReplicas: 1,
288+
TargetReplicas: 10, // wants to scale to 10
289+
GPUsPerReplica: 1,
290+
SpareCapacity: 0.0,
291+
MaxReplicas: &maxReplicas,
292+
},
293+
}
294+
295+
err := algorithm.Allocate(ctx, decisions, allocator)
296+
Expect(err).NotTo(HaveOccurred())
297+
298+
// Capped at maxReplicas=3
299+
Expect(decisions[0].TargetReplicas).To(Equal(3))
300+
})
301+
})
302+
303+
Context("with MinReplicas bound", func() {
304+
It("should enforce minReplicas floor even under GPU scarcity", func() {
305+
minReplicas := 3
306+
allocator = &simpleAllocator{remaining: 0} // no GPUs
307+
decisions = []*interfaces.VariantDecision{
308+
{
309+
VariantName: "v1",
310+
CurrentReplicas: 1,
311+
TargetReplicas: 5,
312+
GPUsPerReplica: 2,
313+
SpareCapacity: 0.0,
314+
MinReplicas: &minReplicas,
315+
},
316+
}
317+
318+
err := algorithm.Allocate(ctx, decisions, allocator)
319+
Expect(err).NotTo(HaveOccurred())
320+
321+
// MinReplicas is a hard floor
322+
Expect(decisions[0].TargetReplicas).To(Equal(3))
323+
Expect(decisions[0].WasLimited).To(BeTrue())
324+
})
325+
})
279326
})
280327
})

internal/engines/pipeline/greedy_score_optimizer.go

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -92,7 +92,7 @@ func (o *GreedyByScoreOptimizer) Optimize(
9292
targets := initTargets(req.VariantStates)
9393

9494
if req.Result.SpareCapacity > 0 {
95-
costAwareScaleDown(ctx, req.Result, targets)
95+
costAwareScaleDown(ctx, req.Result, targets, stateMap)
9696
}
9797

9898
decisions := buildDecisionsWithOptimizer(req, stateMap, vcMap, targets, "greedy-by-score")
@@ -327,6 +327,18 @@ func (o *GreedyByScoreOptimizer) allocateToVariants(
327327
if n > maxByGPU {
328328
n = maxByGPU
329329
}
330+
331+
// Cap by maxReplicas if set
332+
if state.MaxReplicas != nil && *state.MaxReplicas > 0 {
333+
maxAdd := *state.MaxReplicas - w.targets[vc.VariantName]
334+
if maxAdd <= 0 {
335+
continue // already at max
336+
}
337+
if n > maxAdd {
338+
n = maxAdd
339+
}
340+
}
341+
330342
if n <= 0 {
331343
continue
332344
}

0 commit comments

Comments
 (0)