llm-d
diff --git a/api/v1alpha1/variantautoscaling_types.go b/api/v1alpha1/variantautoscaling_types.go
Lines changed: 17 additions & 0 deletions
diff --git a/api/v1alpha1/variantautoscaling_types_test.go b/api/v1alpha1/variantautoscaling_types_test.go
Lines changed: 65 additions & 1 deletion
diff --git a/api/v1alpha1/zz_generated.deepcopy.go b/api/v1alpha1/zz_generated.deepcopy.go
Lines changed: 6 additions & 1 deletion
diff --git a/charts/workload-variant-autoscaler/crds/llmd.ai_variantautoscalings.yaml b/charts/workload-variant-autoscaler/crds/llmd.ai_variantautoscalings.yaml
Lines changed: 27 additions & 0 deletions
diff --git a/charts/workload-variant-autoscaler/values-dev.yaml b/charts/workload-variant-autoscaler/values-dev.yaml
Lines changed: 1 addition & 1 deletion
diff --git a/charts/workload-variant-autoscaler/values.yaml b/charts/workload-variant-autoscaler/values.yaml
Lines changed: 1 addition & 1 deletion
diff --git a/config/crd/bases/llmd.ai_variantautoscalings.yaml b/config/crd/bases/llmd.ai_variantautoscalings.yaml
Lines changed: 27 additions & 0 deletions
diff --git a/config/samples/hpa-integration.yaml b/config/samples/hpa-integration.yaml
Lines changed: 1 addition & 1 deletion
diff --git a/docs/user-guide/crd-reference.md b/docs/user-guide/crd-reference.md
Lines changed: 27 additions & 7 deletions
diff --git a/internal/actuator/actuator_test.go b/internal/actuator/actuator_test.go
Lines changed: 9 additions & 4 deletions
@@ -17,6 +17,7 @@ type VariantAutoscalingConfigSpec struct {
 }
 
 // VariantAutoscalingSpec defines the desired state for autoscaling a model variant.
+// +kubebuilder:validation:XValidation:rule="!has(self.minReplicas) || self.minReplicas <= self.maxReplicas",message="minReplicas must be less than or equal to maxReplicas"
 type VariantAutoscalingSpec struct {
 	// ScaleTargetRef references the scalable resource to manage.
 	// This follows the same pattern as HorizontalPodAutoscaler.
@@ -28,6 +29,20 @@ type VariantAutoscalingSpec struct {
 	// +kubebuilder:validation:Required
 	ModelID string `json:"modelID"`
 
+	// MinReplicas is the lower bound on the number of replicas for this variant.
+	// A value of 0 enables scale-to-zero when the model is idle.
+	// Defaults to 1, preserving existing behavior for VAs that omit this field.
+	// +kubebuilder:validation:Minimum=0
+	// +kubebuilder:default=1
+	// +optional
+	MinReplicas *int32 `json:"minReplicas,omitempty"`
+
+	// MaxReplicas is the upper bound on the number of replicas for this variant.
+	// The autoscaler will never scale beyond this value regardless of load.
+	// +kubebuilder:validation:Minimum=1
+	// +kubebuilder:default=2
+	MaxReplicas int32 `json:"maxReplicas"`
+
 	// VariantAutoscalingConfigSpec holds optional tuning fields that integrators can embed.
 	VariantAutoscalingConfigSpec `json:",inline"`
 }
@@ -76,6 +91,8 @@ type ActuationStatus struct {
 // +kubebuilder:resource:shortName=va
 // +kubebuilder:printcolumn:name="Target",type=string,JSONPath=".spec.scaleTargetRef.name"
 // +kubebuilder:printcolumn:name="Model",type=string,JSONPath=".spec.modelID"
+// +kubebuilder:printcolumn:name="Min",type=integer,JSONPath=".spec.minReplicas"
+// +kubebuilder:printcolumn:name="Max",type=integer,JSONPath=".spec.maxReplicas"
 // +kubebuilder:printcolumn:name="Optimized",type=string,JSONPath=".status.desiredOptimizedAlloc.numReplicas"
 // +kubebuilder:printcolumn:name="MetricsReady",type=string,JSONPath=".status.conditions[?(@.type=='MetricsAvailable')].status"
 // +kubebuilder:printcolumn:name="Age",type=date,JSONPath=".metadata.creationTimestamp"
 
@@ -31,7 +31,8 @@ func makeValidVA() *VariantAutoscaling {
 				Kind: "Deployment",
 				Name: "va-sample-deployment",
 			},
-			ModelID: "model-123",
+			ModelID:     "model-123",
+			MaxReplicas: 2,
 		},
 		Status: VariantAutoscalingStatus{
 			// CurrentAlloc: Allocation{...} -- Removed
@@ -218,3 +219,66 @@ func jsonContainsKey(b []byte, key string) bool {
 	_, ok := m[key]
 	return ok
 }
+
+func TestMinMaxReplicasJSON(t *testing.T) {
+	minVal := int32(2)
+	va := &VariantAutoscaling{
+		ObjectMeta: metav1.ObjectMeta{Name: "va-replicas", Namespace: "default"},
+		Spec: VariantAutoscalingSpec{
+			ScaleTargetRef: autoscalingv2.CrossVersionObjectReference{
+				Kind: "Deployment",
+				Name: "my-deploy",
+			},
+			ModelID:     "model-x",
+			MinReplicas: &minVal,
+			MaxReplicas: 5,
+		},
+	}
+
+	b, err := json.Marshal(va)
+	if err != nil {
+		t.Fatalf("marshal failed: %v", err)
+	}
+
+	var probe struct {
+		Spec struct {
+			MinReplicas *int32 `json:"minReplicas"`
+			MaxReplicas int32  `json:"maxReplicas"`
+		} `json:"spec"`
+	}
+	if err := json.Unmarshal(b, &probe); err != nil {
+		t.Fatalf("unmarshal failed: %v", err)
+	}
+	if probe.Spec.MinReplicas == nil || *probe.Spec.MinReplicas != 2 {
+		t.Errorf("expected minReplicas=2, got %v", probe.Spec.MinReplicas)
+	}
+	if probe.Spec.MaxReplicas != 5 {
+		t.Errorf("expected maxReplicas=5, got %d", probe.Spec.MaxReplicas)
+	}
+
+	// A nil MinReplicas must be dropped from the JSON entirely (omitempty), not emitted as a null-valued key.
+	vaNoMin := &VariantAutoscaling{
+		ObjectMeta: metav1.ObjectMeta{Name: "va-no-min", Namespace: "default"},
+		Spec: VariantAutoscalingSpec{
+			ScaleTargetRef: autoscalingv2.CrossVersionObjectReference{
+				Kind: "Deployment",
+				Name: "my-deploy",
+			},
+			ModelID:     "model-x",
+			MaxReplicas: 5,
+		},
+	}
+	b2, err := json.Marshal(vaNoMin)
+	if err != nil {
+		t.Fatalf("marshal failed: %v", err)
+	}
+	var probeSpec struct {
+		Spec map[string]any `json:"spec"`
+	}
+	if err := json.Unmarshal(b2, &probeSpec); err != nil {
+		t.Fatalf("unmarshal failed: %v", err)
+	}
+	if _, ok := probeSpec.Spec["minReplicas"]; ok {
+		t.Errorf("expected minReplicas to be absent when nil, but it was present")
+	}
+}
@@ -23,6 +23,12 @@ spec:
     - jsonPath: .spec.modelID
       name: Model
       type: string
+    - jsonPath: .spec.minReplicas
+      name: Min
+      type: integer
+    - jsonPath: .spec.maxReplicas
+      name: Max
+      type: integer
     - jsonPath: .status.desiredOptimizedAlloc.numReplicas
       name: Optimized
       type: string
@@ -60,6 +66,23 @@ spec:
             description: Spec defines the desired state for autoscaling the model
               variant.
             properties:
+              maxReplicas:
+                default: 2
+                description: |-
+                  MaxReplicas is the upper bound on the number of replicas for this variant.
+                  The autoscaler will never scale beyond this value regardless of load.
+                format: int32
+                minimum: 1
+                type: integer
+              minReplicas:
+                default: 1
+                description: |-
+                  MinReplicas is the lower bound on the number of replicas for this variant.
+                  A value of 0 enables scale-to-zero when the model is idle.
+                  Defaults to 1, preserving existing behavior for VAs that omit this field.
+                format: int32
+                minimum: 0
+                type: integer
               modelID:
                 description: ModelID specifies the unique identifier of the model
                   to be autoscaled.
@@ -90,9 +113,13 @@ spec:
                 pattern: ^\d+(\.\d+)?$
                 type: string
             required:
+            - maxReplicas
             - modelID
             - scaleTargetRef
             type: object
+            x-kubernetes-validations:
+            - message: minReplicas must be less than or equal to maxReplicas
+              rule: '!has(self.minReplicas) || self.minReplicas <= self.maxReplicas'
           status:
             description: Status represents the current status of autoscaling for the
               model variant.
 
@@ -68,7 +68,7 @@ hpa:
   enabled: true
   # minReplicas: 0 for scale-to-zero testing (requires HPAScaleToZero feature gate)
   minReplicas: 0
-  maxReplicas: 10
+  maxReplicas: 2
   targetAverageValue: "1"
 vllmService:
   enabled: true
 
@@ -103,7 +103,7 @@ hpa:
   # minReplicas: 1 is the safe default that prevents scale-to-zero
   # Set to 0 when wva.scaleToZero is enabled
   minReplicas: 1
-  maxReplicas: 10
+  maxReplicas: 2
   targetAverageValue: "1"
   # HPA scaling behavior configuration
   behavior:
 
@@ -23,6 +23,12 @@ spec:
     - jsonPath: .spec.modelID
       name: Model
       type: string
+    - jsonPath: .spec.minReplicas
+      name: Min
+      type: integer
+    - jsonPath: .spec.maxReplicas
+      name: Max
+      type: integer
     - jsonPath: .status.desiredOptimizedAlloc.numReplicas
       name: Optimized
       type: string
@@ -60,6 +66,23 @@ spec:
             description: Spec defines the desired state for autoscaling the model
               variant.
             properties:
+              maxReplicas:
+                default: 2
+                description: |-
+                  MaxReplicas is the upper bound on the number of replicas for this variant.
+                  The autoscaler will never scale beyond this value regardless of load.
+                format: int32
+                minimum: 1
+                type: integer
+              minReplicas:
+                default: 1
+                description: |-
+                  MinReplicas is the lower bound on the number of replicas for this variant.
+                  A value of 0 enables scale-to-zero when the model is idle.
+                  Defaults to 1, preserving existing behavior for VAs that omit this field.
+                format: int32
+                minimum: 0
+                type: integer
               modelID:
                 description: ModelID specifies the unique identifier of the model
                   to be autoscaled.
@@ -90,9 +113,13 @@ spec:
                 pattern: ^\d+(\.\d+)?$
                 type: string
             required:
+            - maxReplicas
             - modelID
             - scaleTargetRef
             type: object
+            x-kubernetes-validations:
+            - message: minReplicas must be less than or equal to maxReplicas
+              rule: '!has(self.minReplicas) || self.minReplicas <= self.maxReplicas'
           status:
             description: Status represents the current status of autoscaling for the
               model variant.
 
@@ -9,7 +9,7 @@ spec:
     kind: Deployment
     name: sample-deployment
   # minReplicas: 0  # scale to zero - alpha feature
-  maxReplicas: 10
+  maxReplicas: 2
   behavior:
     scaleUp:
       stabilizationWindowSeconds: 0
 
@@ -45,7 +45,7 @@ _Appears in:_
 | --- | --- | --- | --- |
 | `lastRunTime` _[Time](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.32/#time-v1-meta)_ | LastRunTime is the timestamp of the last optimization run. |  |  |
 | `accelerator` _string_ | Accelerator is the type of accelerator for the optimized allocation. |  | MinLength: 2 <br /> |
-| `numReplicas` _integer_ | NumReplicas is the number of replicas for the optimized allocation. |  | Minimum: 1 <br /> |
+| `numReplicas` _integer_ | NumReplicas is the number of replicas for the optimized allocation. |  | Minimum: 0 <br /> |
 
 
 #### VariantAutoscaling
@@ -64,13 +64,31 @@ _Appears in:_
 | --- | --- | --- | --- |
 | `apiVersion` _string_ | `llmd.ai/v1alpha1` | | |
 | `kind` _string_ | `VariantAutoscaling` | | |
-| `kind` _string_ | Kind is a string value representing the REST resource this object represents.<br />Servers may infer this from the endpoint the client submits requests to.<br />Cannot be updated.<br />In CamelCase.<br />More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds |  |  |
-| `apiVersion` _string_ | APIVersion defines the versioned schema of this representation of an object.<br />Servers should convert recognized schemas to the latest internal value, and<br />may reject unrecognized values.<br />More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources |  |  |
+| `kind` _string_ | Kind is a string value representing the REST resource this object represents.<br />Servers may infer this from the endpoint the client submits requests to.<br />Cannot be updated.<br />In CamelCase.<br />More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds |  | Optional: \{\} <br /> |
+| `apiVersion` _string_ | APIVersion defines the versioned schema of this representation of an object.<br />Servers should convert recognized schemas to the latest internal value, and<br />may reject unrecognized values.<br />More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources |  | Optional: \{\} <br /> |
 | `metadata` _[ObjectMeta](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.32/#objectmeta-v1-meta)_ | Refer to Kubernetes API documentation for fields of `metadata`. |  |  |
 | `spec` _[VariantAutoscalingSpec](#variantautoscalingspec)_ | Spec defines the desired state for autoscaling the model variant. |  |  |
 | `status` _[VariantAutoscalingStatus](#variantautoscalingstatus)_ | Status represents the current status of autoscaling for the model variant. |  |  |
 
 
+#### VariantAutoscalingConfigSpec
+
+
+
+VariantAutoscalingConfigSpec holds the optional tuning fields for a VariantAutoscaling.
+It is extracted as a standalone embeddable type so that higher-level controllers
+(e.g. KServe) can inline it without duplicating field definitions.
+
+
+
+_Appears in:_
+- [VariantAutoscalingSpec](#variantautoscalingspec)
+
+| Field | Description | Default | Validation |
+| --- | --- | --- | --- |
+| `variantCost` _string_ | VariantCost specifies the cost per replica for this variant (used in saturation analysis). | 10.0 | Optional: \{\} <br />Pattern: `^\d+(\.\d+)?$` <br /> |
+
+
 #### VariantAutoscalingList
 
 
@@ -85,8 +103,8 @@ VariantAutoscalingList contains a list of VariantAutoscaling resources.
 | --- | --- | --- | --- |
 | `apiVersion` _string_ | `llmd.ai/v1alpha1` | | |
 | `kind` _string_ | `VariantAutoscalingList` | | |
-| `kind` _string_ | Kind is a string value representing the REST resource this object represents.<br />Servers may infer this from the endpoint the client submits requests to.<br />Cannot be updated.<br />In CamelCase.<br />More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds |  |  |
-| `apiVersion` _string_ | APIVersion defines the versioned schema of this representation of an object.<br />Servers should convert recognized schemas to the latest internal value, and<br />may reject unrecognized values.<br />More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources |  |  |
+| `kind` _string_ | Kind is a string value representing the REST resource this object represents.<br />Servers may infer this from the endpoint the client submits requests to.<br />Cannot be updated.<br />In CamelCase.<br />More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds |  | Optional: \{\} <br /> |
+| `apiVersion` _string_ | APIVersion defines the versioned schema of this representation of an object.<br />Servers should convert recognized schemas to the latest internal value, and<br />may reject unrecognized values.<br />More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources |  | Optional: \{\} <br /> |
 | `metadata` _[ListMeta](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.32/#listmeta-v1-meta)_ | Refer to Kubernetes API documentation for fields of `metadata`. |  |  |
 | `items` _[VariantAutoscaling](#variantautoscaling) array_ | Items is the list of VariantAutoscaling resources. |  |  |
 
@@ -104,8 +122,10 @@ _Appears in:_
 
 | Field | Description | Default | Validation |
 | --- | --- | --- | --- |
-| `scaleTargetRef` _[CrossVersionObjectReference](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.32/#crossversionobjectreference-v1-autoscaling)_ | ScaleTargetRef references the scalable resource to manage.<br />This follows the same pattern as HorizontalPodAutoscaler. |  | Required: \{\} <br /> |
+| `scaleTargetRef` _[CrossVersionObjectReference](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.32/#crossversionobjectreference-v2-autoscaling)_ | ScaleTargetRef references the scalable resource to manage.<br />This follows the same pattern as HorizontalPodAutoscaler. |  | Required: \{\} <br /> |
 | `modelID` _string_ | ModelID specifies the unique identifier of the model to be autoscaled. |  | MinLength: 1 <br />Required: \{\} <br /> |
+| `minReplicas` _integer_ | MinReplicas is the lower bound on the number of replicas for this variant.<br />A value of 0 enables scale-to-zero when the model is idle.<br />Defaults to 1, preserving existing behavior for VAs that omit this field. | 1 | Minimum: 0 <br />Optional: \{\} <br /> |
+| `maxReplicas` _integer_ | MaxReplicas is the upper bound on the number of replicas for this variant.<br />The autoscaler will never scale beyond this value regardless of load. | 2 | Minimum: 1 <br /> |
 | `variantCost` _string_ | VariantCost specifies the cost per replica for this variant (used in saturation analysis). | 10.0 | Optional: \{\} <br />Pattern: `^\d+(\.\d+)?$` <br /> |
 
 
@@ -114,7 +134,7 @@ _Appears in:_
 
 
 VariantAutoscalingStatus represents the current status of autoscaling for a variant,
-including the desired optimized allocation and actuation status.
+including the current allocation, desired optimized allocation, and actuation status.
 
 
 
 
@@ -122,6 +122,7 @@ var _ = Describe("Actuator", func() {
 						Kind: "Deployment",
 						Name: resourceName,
 					},
+					MaxReplicas: 2,
 				},
 			}
 
@@ -213,7 +214,8 @@ var _ = Describe("Actuator", func() {
 						Kind: "Deployment",
 						Name: contextResourceName,
 					},
-					ModelID: "test-model/variant-1",
+					ModelID:     "test-model/variant-1",
+					MaxReplicas: 2,
 				},
 				Status: llmdVariantAutoscalingV1alpha1.VariantAutoscalingStatus{
 					DesiredOptimizedAlloc: llmdVariantAutoscalingV1alpha1.OptimizedAlloc{
@@ -331,7 +333,8 @@ var _ = Describe("Actuator", func() {
 						Kind: "Deployment",
 						Name: contextResourceName,
 					},
-					ModelID: "test-model/metrics-test",
+					ModelID:     "test-model/metrics-test",
+					MaxReplicas: 2,
 				},
 				Status: llmdVariantAutoscalingV1alpha1.VariantAutoscalingStatus{
 					DesiredOptimizedAlloc: llmdVariantAutoscalingV1alpha1.OptimizedAlloc{
@@ -391,7 +394,8 @@ var _ = Describe("Actuator", func() {
 						Kind: "Deployment",
 						Name: "incomplete-va",
 					},
-					ModelID: "test-model/incomplete",
+					ModelID:     "test-model/incomplete",
+					MaxReplicas: 2,
 				},
 				Status: llmdVariantAutoscalingV1alpha1.VariantAutoscalingStatus{
 					// DesiredOptimizedAlloc.NumReplicas will be 0 by default
@@ -460,7 +464,8 @@ var _ = Describe("Actuator", func() {
 						Kind: "Deployment",
 						Name: contextResourceName,
 					},
-					ModelID: "test-model/validation-test",
+					ModelID:     "test-model/validation-test",
+					MaxReplicas: 2,
 				},
 				Status: llmdVariantAutoscalingV1alpha1.VariantAutoscalingStatus{
 					DesiredOptimizedAlloc: llmdVariantAutoscalingV1alpha1.OptimizedAlloc{