Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 0 additions & 46 deletions api/v1alpha1/variantautoscaling_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,6 @@ type VariantAutoscalingSpec struct {
// VariantAutoscalingStatus represents the current status of autoscaling for a variant,
// including the current allocation, desired optimized allocation, and actuation status.
type VariantAutoscalingStatus struct {
// CurrentAlloc specifies the current resource allocation for the variant.
// +kubebuilder:validation:Optional
CurrentAlloc Allocation `json:"currentAlloc,omitempty"`

// DesiredOptimizedAlloc indicates the target optimized allocation based on autoscaling logic.
DesiredOptimizedAlloc OptimizedAlloc `json:"desiredOptimizedAlloc,omitempty"`
Expand All @@ -46,47 +43,6 @@ type VariantAutoscalingStatus struct {
Conditions []metav1.Condition `json:"conditions,omitempty" patchStrategy:"merge" patchMergeKey:"type"`
}

// Allocation describes the current resource allocation for a model variant,
// i.e. what is actually running now (as opposed to the optimizer's target,
// which is carried by OptimizedAlloc).
type Allocation struct {
	// Accelerator is the type of accelerator currently allocated.
	// +kubebuilder:validation:MinLength=1
	Accelerator string `json:"accelerator"`

	// NumReplicas is the number of replicas currently allocated.
	// Zero is valid (e.g. the variant is scaled down).
	// +kubebuilder:validation:Minimum=0
	NumReplicas int `json:"numReplicas"`

	// MaxBatch is the maximum batch size currently allocated.
	// +kubebuilder:validation:Minimum=0
	MaxBatch int `json:"maxBatch"`

	// ITLAverage is the average inter-token latency for the current
	// allocation, encoded as a non-negative decimal string (the pattern
	// allows digits with an optional fractional part, e.g. "45.6").
	// NOTE(review): the unit is not stated anywhere in this type — confirm
	// (milliseconds?) and document it.
	// +kubebuilder:validation:Pattern=`^\d+(\.\d+)?$`
	ITLAverage string `json:"itlAverage"`

	// TTFTAverage is the average time to first token for the current
	// allocation, encoded as a non-negative decimal string.
	// NOTE(review): unit not stated here — confirm and document it.
	// +kubebuilder:validation:Pattern=`^\d+(\.\d+)?$`
	TTFTAverage string `json:"ttftAverage"`

	// Load describes the workload characteristics for the current allocation.
	Load LoadProfile `json:"load"`
}

// LoadProfile represents the configuration for workload characteristics:
// the rate of incoming requests (ArrivalRate) and the average number of
// input and output tokens per request (AvgInputTokens, AvgOutputTokens).
// All fields are specified as strings to allow flexible input formats.
type LoadProfile struct {
	// ArrivalRate is the rate of incoming requests in the inference server.
	ArrivalRate string `json:"arrivalRate"`

	// AvgInputTokens is the average number of input (prefill) tokens per
	// request in the inference server.
	AvgInputTokens string `json:"avgInputTokens"`

	// AvgOutputTokens is the average number of output (decode) tokens per
	// request in the inference server.
	AvgOutputTokens string `json:"avgOutputTokens"`
}

// OptimizedAlloc describes the target optimized allocation for a model variant.
type OptimizedAlloc struct {
// LastRunTime is the timestamp of the last optimization run.
Expand All @@ -112,8 +68,6 @@ type ActuationStatus struct {
// +kubebuilder:resource:shortName=va
// +kubebuilder:printcolumn:name="Target",type=string,JSONPath=".spec.scaleTargetRef.name"
// +kubebuilder:printcolumn:name="Model",type=string,JSONPath=".spec.modelID"
// +kubebuilder:printcolumn:name="Accelerator",type=string,JSONPath=".status.currentAlloc.accelerator"
// +kubebuilder:printcolumn:name="CurrentReplicas",type=integer,JSONPath=".status.currentAlloc.numReplicas"
// +kubebuilder:printcolumn:name="Optimized",type=string,JSONPath=".status.desiredOptimizedAlloc.numReplicas"
// +kubebuilder:printcolumn:name="MetricsReady",type=string,JSONPath=".status.conditions[?(@.type=='MetricsAvailable')].status"
// +kubebuilder:printcolumn:name="Age",type=date,JSONPath=".metadata.creationTimestamp"
Expand Down
28 changes: 7 additions & 21 deletions api/v1alpha1/variantautoscaling_types_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,18 +34,7 @@ func makeValidVA() *VariantAutoscaling {
ModelID: "model-123",
},
Status: VariantAutoscalingStatus{
CurrentAlloc: Allocation{
Accelerator: "nvidia.com/mig-1g.5gb",
NumReplicas: 1,
MaxBatch: 8,
ITLAverage: "45.6",
TTFTAverage: "3.2",
Load: LoadProfile{
ArrivalRate: "12 rps",
AvgOutputTokens: "2.5 s",
AvgInputTokens: "2.5 s",
},
},
// CurrentAlloc: Allocation{...} -- Removed
DesiredOptimizedAlloc: OptimizedAlloc{
LastRunTime: metav1.NewTime(time.Unix(1730000000, 0).UTC()),
Accelerator: "nvidia.com/mig-1g.5gb",
Expand Down Expand Up @@ -86,15 +75,16 @@ func TestDeepCopyIndependence(t *testing.T) {
cp := orig.DeepCopy()

cp.Spec.ModelID = "model-456"
cp.Status.CurrentAlloc.Load.ArrivalRate = "20 rps"
cp.Spec.ModelID = "model-456"
// cp.Status.CurrentAlloc.Load.ArrivalRate = "20 rps" -- Removed

if orig.Spec.ModelID == cp.Spec.ModelID {
t.Errorf("DeepCopy did not create independent copy for Spec.ModelID")
}

if orig.Status.CurrentAlloc.Load.ArrivalRate == cp.Status.CurrentAlloc.Load.ArrivalRate {
t.Errorf("DeepCopy did not create independent copy for nested Status.Load")
}
// if orig.Status.CurrentAlloc.Load.ArrivalRate == cp.Status.CurrentAlloc.Load.ArrivalRate {
// t.Errorf("DeepCopy did not create independent copy for nested Status.Load")
// }
}

func TestJSONRoundTrip(t *testing.T) {
Expand Down Expand Up @@ -170,9 +160,6 @@ func TestStatusOmitEmpty(t *testing.T) {
// Optional: sanity-check a couple of zero values inside status
var probe struct {
Status struct {
CurrentAlloc struct {
Accelerator string `json:"accelerator"`
} `json:"currentAlloc"`
DesiredOptimizedAlloc struct {
LastRunTime *string `json:"lastRunTime"`
NumReplicas int `json:"numReplicas"`
Expand All @@ -185,8 +172,7 @@ func TestStatusOmitEmpty(t *testing.T) {
if err := json.Unmarshal(b, &probe); err != nil {
t.Fatalf("unmarshal probe failed: %v", err)
}
if probe.Status.CurrentAlloc.Accelerator != "" ||
probe.Status.DesiredOptimizedAlloc.NumReplicas != 0 ||
if probe.Status.DesiredOptimizedAlloc.NumReplicas != 0 ||
probe.Status.Actuation.Applied != false {
t.Errorf("unexpected non-zero defaults in status: %+v", probe.Status)
}
Expand Down
32 changes: 0 additions & 32 deletions api/v1alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,6 @@ spec:
- jsonPath: .spec.modelID
name: Model
type: string
- jsonPath: .status.currentAlloc.accelerator
name: Accelerator
type: string
- jsonPath: .status.currentAlloc.numReplicas
name: CurrentReplicas
type: integer
- jsonPath: .status.desiredOptimizedAlloc.numReplicas
name: Optimized
type: string
Expand Down Expand Up @@ -71,67 +65,6 @@ spec:
to be autoscaled.
minLength: 1
type: string
modelProfile:
description: ModelProfile provides resource and performance characteristics
for the model variant.
properties:
accelerators:
description: Accelerators is a list of accelerator profiles for
the model variant.
items:
description: |-
AcceleratorProfile defines the configuration for an accelerator used in autoscaling.
It specifies the type and count of accelerator, as well as parameters for scaling behavior.
properties:
acc:
description: Acc specifies the type or name of the accelerator
(e.g., GPU type).
minLength: 1
type: string
accCount:
description: AccCount specifies the number of accelerator
units to be used.
minimum: 1
type: integer
maxBatchSize:
description: MaxBatchSize is the maximum batch size supported
by the accelerator.
minimum: 1
type: integer
perfParms:
description: PerfParms specifies the prefill and decode parameters
for the TTFT and ITL models
properties:
decodeParms:
additionalProperties:
type: string
description: |-
DecodeParms contains parameters for the decode phase (ITL calculation)
Expected keys: "alpha", "beta" for equation: itl = alpha + beta * maxBatchSize
minProperties: 1
type: object
prefillParms:
additionalProperties:
type: string
description: |-
PrefillParms contains parameters for the prefill phase (TTFT calculation)
Expected keys: "gamma", "delta" for equation: ttft = gamma + delta * tokens * maxBatchSize
minProperties: 1
type: object
required:
- decodeParms
- prefillParms
type: object
required:
- acc
- accCount
- maxBatchSize
type: object
minItems: 1
type: array
required:
- accelerators
type: object
scaleTargetRef:
description: |-
ScaleTargetRef references the scalable resource to manage.
Expand Down Expand Up @@ -237,62 +170,6 @@ spec:
x-kubernetes-list-map-keys:
- type
x-kubernetes-list-type: map
currentAlloc:
description: CurrentAlloc specifies the current resource allocation
for the variant.
properties:
accelerator:
description: Accelerator is the type of accelerator currently
allocated.
minLength: 1
type: string
itlAverage:
description: ITLAverage is the average inter token latency for
the current allocation.
pattern: ^\d+(\.\d+)?$
type: string
load:
description: Load describes the workload characteristics for the
current allocation.
properties:
arrivalRate:
description: ArrivalRate is the rate of incoming requests
in inference server.
type: string
avgInputTokens:
description: AvgInputTokens is the average number of input(prefill)
tokens per request in inference server.
type: string
avgOutputTokens:
description: AvgOutputTokens is the average number of output(decode)
tokens per request in inference server.
type: string
required:
- arrivalRate
- avgInputTokens
- avgOutputTokens
type: object
maxBatch:
description: MaxBatch is the maximum batch size currently allocated.
minimum: 0
type: integer
numReplicas:
description: NumReplicas is the number of replicas currently allocated.
minimum: 0
type: integer
ttftAverage:
description: TTFTAverage is the average time to first token for
the current allocation
pattern: ^\d+(\.\d+)?$
type: string
required:
- accelerator
- itlAverage
- load
- maxBatch
- numReplicas
- ttftAverage
type: object
desiredOptimizedAlloc:
description: DesiredOptimizedAlloc indicates the target optimized
allocation based on autoscaling logic.
Expand Down
62 changes: 0 additions & 62 deletions config/crd/bases/llmd.ai_variantautoscalings.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,6 @@ spec:
- jsonPath: .spec.modelID
name: Model
type: string
- jsonPath: .status.currentAlloc.accelerator
name: Accelerator
type: string
- jsonPath: .status.currentAlloc.numReplicas
name: CurrentReplicas
type: integer
- jsonPath: .status.desiredOptimizedAlloc.numReplicas
name: Optimized
type: string
Expand Down Expand Up @@ -176,62 +170,6 @@ spec:
x-kubernetes-list-map-keys:
- type
x-kubernetes-list-type: map
currentAlloc:
description: CurrentAlloc specifies the current resource allocation
for the variant.
properties:
accelerator:
description: Accelerator is the type of accelerator currently
allocated.
minLength: 1
type: string
itlAverage:
description: ITLAverage is the average inter token latency for
the current allocation.
pattern: ^\d+(\.\d+)?$
type: string
load:
description: Load describes the workload characteristics for the
current allocation.
properties:
arrivalRate:
description: ArrivalRate is the rate of incoming requests
in inference server.
type: string
avgInputTokens:
description: AvgInputTokens is the average number of input(prefill)
tokens per request in inference server.
type: string
avgOutputTokens:
description: AvgOutputTokens is the average number of output(decode)
tokens per request in inference server.
type: string
required:
- arrivalRate
- avgInputTokens
- avgOutputTokens
type: object
maxBatch:
description: MaxBatch is the maximum batch size currently allocated.
minimum: 0
type: integer
numReplicas:
description: NumReplicas is the number of replicas currently allocated.
minimum: 0
type: integer
ttftAverage:
description: TTFTAverage is the average time to first token for
the current allocation
pattern: ^\d+(\.\d+)?$
type: string
required:
- accelerator
- itlAverage
- load
- maxBatch
- numReplicas
- ttftAverage
type: object
desiredOptimizedAlloc:
description: DesiredOptimizedAlloc indicates the target optimized
allocation based on autoscaling logic.
Expand Down
Loading