Skip to content

Commit 104073d

Browse files
authored
api: add minReplicas, maxReplicas and behavior fields to VA spec (#864)
* api: add minReplicas and maxReplicas to VariantAutoscalingSpec
  Signed-off-by: Vivek Karunai Kiri Ragavan <vkarunai@redhat.com>
* api: add behavior field to VariantAutoscalingConfigSpec for HPA scaling policies
  Signed-off-by: Vivek Karunai Kiri Ragavan <vkarunai@redhat.com>
* test: fix VA fixtures for maxReplicas validation and add CRD field tests for minReplicas, maxReplicas, and behavior
  Signed-off-by: Vivek Karunai Kiri Ragavan <vkarunai@redhat.com>
* refactor(api): change default maxReplicas from 10 to 2
  Signed-off-by: Vivek Karunai Kiri Ragavan <vkarunai@redhat.com>
* refactor(api): remove behavior field to align with release plan
  Signed-off-by: Vivek Karunai Kiri Ragavan <vkarunai@redhat.com>
---------
Signed-off-by: Vivek Karunai Kiri Ragavan <vkarunai@redhat.com>
1 parent 7196434 commit 104073d

14 files changed

Lines changed: 213 additions & 34 deletions

File tree

api/v1alpha1/variantautoscaling_types.go

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ type VariantAutoscalingConfigSpec struct {
1717
}
1818

1919
// VariantAutoscalingSpec defines the desired state for autoscaling a model variant.
20+
// +kubebuilder:validation:XValidation:rule="!has(self.minReplicas) || self.minReplicas <= self.maxReplicas",message="minReplicas must be less than or equal to maxReplicas"
2021
type VariantAutoscalingSpec struct {
2122
// ScaleTargetRef references the scalable resource to manage.
2223
// This follows the same pattern as HorizontalPodAutoscaler.
@@ -28,6 +29,20 @@ type VariantAutoscalingSpec struct {
2829
// +kubebuilder:validation:Required
2930
ModelID string `json:"modelID"`
3031

32+
// MinReplicas is the lower bound on the number of replicas for this variant.
33+
// A value of 0 enables scale-to-zero when the model is idle.
34+
// Defaults to 1, preserving existing behavior for VAs that omit this field.
35+
// +kubebuilder:validation:Minimum=0
36+
// +kubebuilder:default=1
37+
// +optional
38+
MinReplicas *int32 `json:"minReplicas,omitempty"`
39+
40+
// MaxReplicas is the upper bound on the number of replicas for this variant.
41+
// The autoscaler will never scale beyond this value regardless of load.
42+
// +kubebuilder:validation:Minimum=1
43+
// +kubebuilder:default=2
44+
MaxReplicas int32 `json:"maxReplicas"`
45+
3146
// VariantAutoscalingConfigSpec holds optional tuning fields that integrators can embed.
3247
VariantAutoscalingConfigSpec `json:",inline"`
3348
}
@@ -76,6 +91,8 @@ type ActuationStatus struct {
7691
// +kubebuilder:resource:shortName=va
7792
// +kubebuilder:printcolumn:name="Target",type=string,JSONPath=".spec.scaleTargetRef.name"
7893
// +kubebuilder:printcolumn:name="Model",type=string,JSONPath=".spec.modelID"
94+
// +kubebuilder:printcolumn:name="Min",type=integer,JSONPath=".spec.minReplicas"
95+
// +kubebuilder:printcolumn:name="Max",type=integer,JSONPath=".spec.maxReplicas"
7996
// +kubebuilder:printcolumn:name="Optimized",type=string,JSONPath=".status.desiredOptimizedAlloc.numReplicas"
8097
// +kubebuilder:printcolumn:name="MetricsReady",type=string,JSONPath=".status.conditions[?(@.type=='MetricsAvailable')].status"
8198
// +kubebuilder:printcolumn:name="Age",type=date,JSONPath=".metadata.creationTimestamp"

api/v1alpha1/variantautoscaling_types_test.go

Lines changed: 65 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,8 @@ func makeValidVA() *VariantAutoscaling {
3131
Kind: "Deployment",
3232
Name: "va-sample-deployment",
3333
},
34-
ModelID: "model-123",
34+
ModelID: "model-123",
35+
MaxReplicas: 2,
3536
},
3637
Status: VariantAutoscalingStatus{
3738
// CurrentAlloc: Allocation{...} -- Removed
@@ -218,3 +219,66 @@ func jsonContainsKey(b []byte, key string) bool {
218219
_, ok := m[key]
219220
return ok
220221
}
222+
223+
func TestMinMaxReplicasJSON(t *testing.T) {
224+
minVal := int32(2)
225+
va := &VariantAutoscaling{
226+
ObjectMeta: metav1.ObjectMeta{Name: "va-replicas", Namespace: "default"},
227+
Spec: VariantAutoscalingSpec{
228+
ScaleTargetRef: autoscalingv2.CrossVersionObjectReference{
229+
Kind: "Deployment",
230+
Name: "my-deploy",
231+
},
232+
ModelID: "model-x",
233+
MinReplicas: &minVal,
234+
MaxReplicas: 5,
235+
},
236+
}
237+
238+
b, err := json.Marshal(va)
239+
if err != nil {
240+
t.Fatalf("marshal failed: %v", err)
241+
}
242+
243+
var probe struct {
244+
Spec struct {
245+
MinReplicas *int32 `json:"minReplicas"`
246+
MaxReplicas int32 `json:"maxReplicas"`
247+
} `json:"spec"`
248+
}
249+
if err := json.Unmarshal(b, &probe); err != nil {
250+
t.Fatalf("unmarshal failed: %v", err)
251+
}
252+
if probe.Spec.MinReplicas == nil || *probe.Spec.MinReplicas != 2 {
253+
t.Errorf("expected minReplicas=2, got %v", probe.Spec.MinReplicas)
254+
}
255+
if probe.Spec.MaxReplicas != 5 {
256+
t.Errorf("expected maxReplicas=5, got %d", probe.Spec.MaxReplicas)
257+
}
258+
259+
// minReplicas must be absent from JSON when nil (omitempty)
260+
vaNoMin := &VariantAutoscaling{
261+
ObjectMeta: metav1.ObjectMeta{Name: "va-no-min", Namespace: "default"},
262+
Spec: VariantAutoscalingSpec{
263+
ScaleTargetRef: autoscalingv2.CrossVersionObjectReference{
264+
Kind: "Deployment",
265+
Name: "my-deploy",
266+
},
267+
ModelID: "model-x",
268+
MaxReplicas: 5,
269+
},
270+
}
271+
b2, err := json.Marshal(vaNoMin)
272+
if err != nil {
273+
t.Fatalf("marshal failed: %v", err)
274+
}
275+
var probeSpec struct {
276+
Spec map[string]any `json:"spec"`
277+
}
278+
if err := json.Unmarshal(b2, &probeSpec); err != nil {
279+
t.Fatalf("unmarshal failed: %v", err)
280+
}
281+
if _, ok := probeSpec.Spec["minReplicas"]; ok {
282+
t.Errorf("expected minReplicas to be absent when nil, but it was present")
283+
}
284+
}

api/v1alpha1/zz_generated.deepcopy.go

Lines changed: 6 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

charts/workload-variant-autoscaler/crds/llmd.ai_variantautoscalings.yaml

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,12 @@ spec:
2323
- jsonPath: .spec.modelID
2424
name: Model
2525
type: string
26+
- jsonPath: .spec.minReplicas
27+
name: Min
28+
type: integer
29+
- jsonPath: .spec.maxReplicas
30+
name: Max
31+
type: integer
2632
- jsonPath: .status.desiredOptimizedAlloc.numReplicas
2733
name: Optimized
2834
type: string
@@ -60,6 +66,23 @@ spec:
6066
description: Spec defines the desired state for autoscaling the model
6167
variant.
6268
properties:
69+
maxReplicas:
70+
default: 2
71+
description: |-
72+
MaxReplicas is the upper bound on the number of replicas for this variant.
73+
The autoscaler will never scale beyond this value regardless of load.
74+
format: int32
75+
minimum: 1
76+
type: integer
77+
minReplicas:
78+
default: 1
79+
description: |-
80+
MinReplicas is the lower bound on the number of replicas for this variant.
81+
A value of 0 enables scale-to-zero when the model is idle.
82+
Defaults to 1, preserving existing behavior for VAs that omit this field.
83+
format: int32
84+
minimum: 0
85+
type: integer
6386
modelID:
6487
description: ModelID specifies the unique identifier of the model
6588
to be autoscaled.
@@ -90,9 +113,13 @@ spec:
90113
pattern: ^\d+(\.\d+)?$
91114
type: string
92115
required:
116+
- maxReplicas
93117
- modelID
94118
- scaleTargetRef
95119
type: object
120+
x-kubernetes-validations:
121+
- message: minReplicas must be less than or equal to maxReplicas
122+
rule: '!has(self.minReplicas) || self.minReplicas <= self.maxReplicas'
96123
status:
97124
description: Status represents the current status of autoscaling for the
98125
model variant.

charts/workload-variant-autoscaler/values-dev.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,7 @@ hpa:
6868
enabled: true
6969
# minReplicas: 0 for scale-to-zero testing (requires HPAScaleToZero feature gate)
7070
minReplicas: 0
71-
maxReplicas: 10
71+
maxReplicas: 2
7272
targetAverageValue: "1"
7373
vllmService:
7474
enabled: true

charts/workload-variant-autoscaler/values.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -103,7 +103,7 @@ hpa:
103103
# minReplicas: 1 is the safe default that prevents scale-to-zero
104104
# Set to 0 when wva.scaleToZero is enabled
105105
minReplicas: 1
106-
maxReplicas: 10
106+
maxReplicas: 2
107107
targetAverageValue: "1"
108108
# HPA scaling behavior configuration
109109
behavior:

config/crd/bases/llmd.ai_variantautoscalings.yaml

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,12 @@ spec:
2323
- jsonPath: .spec.modelID
2424
name: Model
2525
type: string
26+
- jsonPath: .spec.minReplicas
27+
name: Min
28+
type: integer
29+
- jsonPath: .spec.maxReplicas
30+
name: Max
31+
type: integer
2632
- jsonPath: .status.desiredOptimizedAlloc.numReplicas
2733
name: Optimized
2834
type: string
@@ -60,6 +66,23 @@ spec:
6066
description: Spec defines the desired state for autoscaling the model
6167
variant.
6268
properties:
69+
maxReplicas:
70+
default: 2
71+
description: |-
72+
MaxReplicas is the upper bound on the number of replicas for this variant.
73+
The autoscaler will never scale beyond this value regardless of load.
74+
format: int32
75+
minimum: 1
76+
type: integer
77+
minReplicas:
78+
default: 1
79+
description: |-
80+
MinReplicas is the lower bound on the number of replicas for this variant.
81+
A value of 0 enables scale-to-zero when the model is idle.
82+
Defaults to 1, preserving existing behavior for VAs that omit this field.
83+
format: int32
84+
minimum: 0
85+
type: integer
6386
modelID:
6487
description: ModelID specifies the unique identifier of the model
6588
to be autoscaled.
@@ -90,9 +113,13 @@ spec:
90113
pattern: ^\d+(\.\d+)?$
91114
type: string
92115
required:
116+
- maxReplicas
93117
- modelID
94118
- scaleTargetRef
95119
type: object
120+
x-kubernetes-validations:
121+
- message: minReplicas must be less than or equal to maxReplicas
122+
rule: '!has(self.minReplicas) || self.minReplicas <= self.maxReplicas'
96123
status:
97124
description: Status represents the current status of autoscaling for the
98125
model variant.

config/samples/hpa-integration.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ spec:
99
kind: Deployment
1010
name: sample-deployment
1111
# minReplicas: 0 # scale to zero - alpha feature
12-
maxReplicas: 10
12+
maxReplicas: 2
1313
behavior:
1414
scaleUp:
1515
stabilizationWindowSeconds: 0

docs/user-guide/crd-reference.md

Lines changed: 27 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ _Appears in:_
4545
| --- | --- | --- | --- |
4646
| `lastRunTime` _[Time](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.32/#time-v1-meta)_ | LastRunTime is the timestamp of the last optimization run. | | |
4747
| `accelerator` _string_ | Accelerator is the type of accelerator for the optimized allocation. | | MinLength: 2 <br /> |
48-
| `numReplicas` _integer_ | NumReplicas is the number of replicas for the optimized allocation. | | Minimum: 1 <br /> |
48+
| `numReplicas` _integer_ | NumReplicas is the number of replicas for the optimized allocation. | | Minimum: 0 <br /> |
4949

5050

5151
#### VariantAutoscaling
@@ -64,13 +64,31 @@ _Appears in:_
6464
| --- | --- | --- | --- |
6565
| `apiVersion` _string_ | `llmd.ai/v1alpha1` | | |
6666
| `kind` _string_ | `VariantAutoscaling` | | |
67-
| `kind` _string_ | Kind is a string value representing the REST resource this object represents.<br />Servers may infer this from the endpoint the client submits requests to.<br />Cannot be updated.<br />In CamelCase.<br />More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds | | |
68-
| `apiVersion` _string_ | APIVersion defines the versioned schema of this representation of an object.<br />Servers should convert recognized schemas to the latest internal value, and<br />may reject unrecognized values.<br />More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources | | |
67+
| `kind` _string_ | Kind is a string value representing the REST resource this object represents.<br />Servers may infer this from the endpoint the client submits requests to.<br />Cannot be updated.<br />In CamelCase.<br />More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds | | Optional: \{\} <br /> |
68+
| `apiVersion` _string_ | APIVersion defines the versioned schema of this representation of an object.<br />Servers should convert recognized schemas to the latest internal value, and<br />may reject unrecognized values.<br />More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources | | Optional: \{\} <br /> |
6969
| `metadata` _[ObjectMeta](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.32/#objectmeta-v1-meta)_ | Refer to Kubernetes API documentation for fields of `metadata`. | | |
7070
| `spec` _[VariantAutoscalingSpec](#variantautoscalingspec)_ | Spec defines the desired state for autoscaling the model variant. | | |
7171
| `status` _[VariantAutoscalingStatus](#variantautoscalingstatus)_ | Status represents the current status of autoscaling for the model variant. | | |
7272

7373

74+
#### VariantAutoscalingConfigSpec
75+
76+
77+
78+
VariantAutoscalingConfigSpec holds the optional tuning fields for a VariantAutoscaling.
79+
It is extracted as a standalone embeddable type so that higher-level controllers
80+
(e.g. KServe) can inline it without duplicating field definitions.
81+
82+
83+
84+
_Appears in:_
85+
- [VariantAutoscalingSpec](#variantautoscalingspec)
86+
87+
| Field | Description | Default | Validation |
88+
| --- | --- | --- | --- |
89+
| `variantCost` _string_ | VariantCost specifies the cost per replica for this variant (used in saturation analysis). | 10.0 | Optional: \{\} <br />Pattern: `^\d+(\.\d+)?$` <br /> |
90+
91+
7492
#### VariantAutoscalingList
7593

7694

@@ -85,8 +103,8 @@ VariantAutoscalingList contains a list of VariantAutoscaling resources.
85103
| --- | --- | --- | --- |
86104
| `apiVersion` _string_ | `llmd.ai/v1alpha1` | | |
87105
| `kind` _string_ | `VariantAutoscalingList` | | |
88-
| `kind` _string_ | Kind is a string value representing the REST resource this object represents.<br />Servers may infer this from the endpoint the client submits requests to.<br />Cannot be updated.<br />In CamelCase.<br />More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds | | |
89-
| `apiVersion` _string_ | APIVersion defines the versioned schema of this representation of an object.<br />Servers should convert recognized schemas to the latest internal value, and<br />may reject unrecognized values.<br />More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources | | |
106+
| `kind` _string_ | Kind is a string value representing the REST resource this object represents.<br />Servers may infer this from the endpoint the client submits requests to.<br />Cannot be updated.<br />In CamelCase.<br />More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds | | Optional: \{\} <br /> |
107+
| `apiVersion` _string_ | APIVersion defines the versioned schema of this representation of an object.<br />Servers should convert recognized schemas to the latest internal value, and<br />may reject unrecognized values.<br />More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources | | Optional: \{\} <br /> |
90108
| `metadata` _[ListMeta](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.32/#listmeta-v1-meta)_ | Refer to Kubernetes API documentation for fields of `metadata`. | | |
91109
| `items` _[VariantAutoscaling](#variantautoscaling) array_ | Items is the list of VariantAutoscaling resources. | | |
92110

@@ -104,8 +122,10 @@ _Appears in:_
104122

105123
| Field | Description | Default | Validation |
106124
| --- | --- | --- | --- |
107-
| `scaleTargetRef` _[CrossVersionObjectReference](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.32/#crossversionobjectreference-v1-autoscaling)_ | ScaleTargetRef references the scalable resource to manage.<br />This follows the same pattern as HorizontalPodAutoscaler. | | Required: \{\} <br /> |
125+
| `scaleTargetRef` _[CrossVersionObjectReference](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.32/#crossversionobjectreference-v2-autoscaling)_ | ScaleTargetRef references the scalable resource to manage.<br />This follows the same pattern as HorizontalPodAutoscaler. | | Required: \{\} <br /> |
108126
| `modelID` _string_ | ModelID specifies the unique identifier of the model to be autoscaled. | | MinLength: 1 <br />Required: \{\} <br /> |
127+
| `minReplicas` _integer_ | MinReplicas is the lower bound on the number of replicas for this variant.<br />A value of 0 enables scale-to-zero when the model is idle.<br />Defaults to 1, preserving existing behavior for VAs that omit this field. | 1 | Minimum: 0 <br />Optional: \{\} <br /> |
128+
| `maxReplicas` _integer_ | MaxReplicas is the upper bound on the number of replicas for this variant.<br />The autoscaler will never scale beyond this value regardless of load. | 2 | Minimum: 1 <br /> |
109129
| `variantCost` _string_ | VariantCost specifies the cost per replica for this variant (used in saturation analysis). | 10.0 | Optional: \{\} <br />Pattern: `^\d+(\.\d+)?$` <br /> |
110130

111131

@@ -114,7 +134,7 @@ _Appears in:_
114134

115135

116136
VariantAutoscalingStatus represents the current status of autoscaling for a variant,
117-
including the desired optimized allocation and actuation status.
137+
including the current allocation, desired optimized allocation, and actuation status.
118138

119139

120140

internal/actuator/actuator_test.go

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -122,6 +122,7 @@ var _ = Describe("Actuator", func() {
122122
Kind: "Deployment",
123123
Name: resourceName,
124124
},
125+
MaxReplicas: 2,
125126
},
126127
}
127128

@@ -213,7 +214,8 @@ var _ = Describe("Actuator", func() {
213214
Kind: "Deployment",
214215
Name: contextResourceName,
215216
},
216-
ModelID: "test-model/variant-1",
217+
ModelID: "test-model/variant-1",
218+
MaxReplicas: 2,
217219
},
218220
Status: llmdVariantAutoscalingV1alpha1.VariantAutoscalingStatus{
219221
DesiredOptimizedAlloc: llmdVariantAutoscalingV1alpha1.OptimizedAlloc{
@@ -331,7 +333,8 @@ var _ = Describe("Actuator", func() {
331333
Kind: "Deployment",
332334
Name: contextResourceName,
333335
},
334-
ModelID: "test-model/metrics-test",
336+
ModelID: "test-model/metrics-test",
337+
MaxReplicas: 2,
335338
},
336339
Status: llmdVariantAutoscalingV1alpha1.VariantAutoscalingStatus{
337340
DesiredOptimizedAlloc: llmdVariantAutoscalingV1alpha1.OptimizedAlloc{
@@ -391,7 +394,8 @@ var _ = Describe("Actuator", func() {
391394
Kind: "Deployment",
392395
Name: "incomplete-va",
393396
},
394-
ModelID: "test-model/incomplete",
397+
ModelID: "test-model/incomplete",
398+
MaxReplicas: 2,
395399
},
396400
Status: llmdVariantAutoscalingV1alpha1.VariantAutoscalingStatus{
397401
// DesiredOptimizedAlloc.NumReplicas will be 0 by default
@@ -460,7 +464,8 @@ var _ = Describe("Actuator", func() {
460464
Kind: "Deployment",
461465
Name: contextResourceName,
462466
},
463-
ModelID: "test-model/validation-test",
467+
ModelID: "test-model/validation-test",
468+
MaxReplicas: 2,
464469
},
465470
Status: llmdVariantAutoscalingV1alpha1.VariantAutoscalingStatus{
466471
DesiredOptimizedAlloc: llmdVariantAutoscalingV1alpha1.OptimizedAlloc{

0 commit comments

Comments
 (0)