Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 0 additions & 46 deletions api/v1alpha1/variantautoscaling_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,6 @@ type VariantAutoscalingSpec struct {
// VariantAutoscalingStatus represents the current status of autoscaling for a variant,
// including the current allocation, desired optimized allocation, and actuation status.
type VariantAutoscalingStatus struct {
// CurrentAlloc specifies the current resource allocation for the variant.
// +kubebuilder:validation:Optional
CurrentAlloc Allocation `json:"currentAlloc,omitempty"`

// DesiredOptimizedAlloc indicates the target optimized allocation based on autoscaling logic.
DesiredOptimizedAlloc OptimizedAlloc `json:"desiredOptimizedAlloc,omitempty"`
Expand All @@ -46,47 +43,6 @@ type VariantAutoscalingStatus struct {
Conditions []metav1.Condition `json:"conditions,omitempty" patchStrategy:"merge" patchMergeKey:"type"`
}

// Allocation describes the current resource allocation for a model variant,
// i.e. what is actually running now (as opposed to the optimizer's target,
// which is carried by OptimizedAlloc).
type Allocation struct {
	// Accelerator is the type of accelerator currently allocated.
	// +kubebuilder:validation:MinLength=1
	Accelerator string `json:"accelerator"`

	// NumReplicas is the number of replicas currently allocated.
	// Zero is valid (e.g. the variant is scaled down).
	// +kubebuilder:validation:Minimum=0
	NumReplicas int `json:"numReplicas"`

	// MaxBatch is the maximum batch size currently allocated.
	// +kubebuilder:validation:Minimum=0
	MaxBatch int `json:"maxBatch"`

	// ITLAverage is the average inter-token latency for the current
	// allocation, encoded as a non-negative decimal string (the pattern
	// allows digits with an optional fractional part, e.g. "45.6").
	// NOTE(review): the unit is not stated anywhere in this type — confirm
	// (milliseconds?) and document it.
	// +kubebuilder:validation:Pattern=`^\d+(\.\d+)?$`
	ITLAverage string `json:"itlAverage"`

	// TTFTAverage is the average time to first token for the current
	// allocation, encoded as a non-negative decimal string.
	// NOTE(review): unit not stated here — confirm and document it.
	// +kubebuilder:validation:Pattern=`^\d+(\.\d+)?$`
	TTFTAverage string `json:"ttftAverage"`

	// Load describes the workload characteristics for the current allocation.
	Load LoadProfile `json:"load"`
}

// LoadProfile represents the configuration for workload characteristics:
// the rate of incoming requests (ArrivalRate) and the average number of
// input and output tokens per request (AvgInputTokens, AvgOutputTokens).
// All fields are specified as strings to allow flexible input formats.
type LoadProfile struct {
	// ArrivalRate is the rate of incoming requests in the inference server.
	ArrivalRate string `json:"arrivalRate"`

	// AvgInputTokens is the average number of input (prefill) tokens per
	// request in the inference server.
	AvgInputTokens string `json:"avgInputTokens"`

	// AvgOutputTokens is the average number of output (decode) tokens per
	// request in the inference server.
	AvgOutputTokens string `json:"avgOutputTokens"`
}

// OptimizedAlloc describes the target optimized allocation for a model variant.
type OptimizedAlloc struct {
// LastRunTime is the timestamp of the last optimization run.
Expand All @@ -112,8 +68,6 @@ type ActuationStatus struct {
// +kubebuilder:resource:shortName=va
// +kubebuilder:printcolumn:name="Target",type=string,JSONPath=".spec.scaleTargetRef.name"
// +kubebuilder:printcolumn:name="Model",type=string,JSONPath=".spec.modelID"
// +kubebuilder:printcolumn:name="Accelerator",type=string,JSONPath=".status.currentAlloc.accelerator"
// +kubebuilder:printcolumn:name="CurrentReplicas",type=integer,JSONPath=".status.currentAlloc.numReplicas"
// +kubebuilder:printcolumn:name="Optimized",type=string,JSONPath=".status.desiredOptimizedAlloc.numReplicas"
// +kubebuilder:printcolumn:name="MetricsReady",type=string,JSONPath=".status.conditions[?(@.type=='MetricsAvailable')].status"
// +kubebuilder:printcolumn:name="Age",type=date,JSONPath=".metadata.creationTimestamp"
Expand Down
28 changes: 7 additions & 21 deletions api/v1alpha1/variantautoscaling_types_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,18 +34,7 @@ func makeValidVA() *VariantAutoscaling {
ModelID: "model-123",
},
Status: VariantAutoscalingStatus{
CurrentAlloc: Allocation{
Accelerator: "nvidia.com/mig-1g.5gb",
NumReplicas: 1,
MaxBatch: 8,
ITLAverage: "45.6",
TTFTAverage: "3.2",
Load: LoadProfile{
ArrivalRate: "12 rps",
AvgOutputTokens: "2.5 s",
AvgInputTokens: "2.5 s",
},
},
// CurrentAlloc: Allocation{...} -- Removed
DesiredOptimizedAlloc: OptimizedAlloc{
LastRunTime: metav1.NewTime(time.Unix(1730000000, 0).UTC()),
Accelerator: "nvidia.com/mig-1g.5gb",
Expand Down Expand Up @@ -86,15 +75,16 @@ func TestDeepCopyIndependence(t *testing.T) {
cp := orig.DeepCopy()

cp.Spec.ModelID = "model-456"
cp.Status.CurrentAlloc.Load.ArrivalRate = "20 rps"
cp.Spec.ModelID = "model-456"
// cp.Status.CurrentAlloc.Load.ArrivalRate = "20 rps" -- Removed

if orig.Spec.ModelID == cp.Spec.ModelID {
t.Errorf("DeepCopy did not create independent copy for Spec.ModelID")
}

if orig.Status.CurrentAlloc.Load.ArrivalRate == cp.Status.CurrentAlloc.Load.ArrivalRate {
t.Errorf("DeepCopy did not create independent copy for nested Status.Load")
}
// if orig.Status.CurrentAlloc.Load.ArrivalRate == cp.Status.CurrentAlloc.Load.ArrivalRate {
// t.Errorf("DeepCopy did not create independent copy for nested Status.Load")
// }
}

func TestJSONRoundTrip(t *testing.T) {
Expand Down Expand Up @@ -170,9 +160,6 @@ func TestStatusOmitEmpty(t *testing.T) {
// Optional: sanity-check a couple of zero values inside status
var probe struct {
Status struct {
CurrentAlloc struct {
Accelerator string `json:"accelerator"`
} `json:"currentAlloc"`
DesiredOptimizedAlloc struct {
LastRunTime *string `json:"lastRunTime"`
NumReplicas int `json:"numReplicas"`
Expand All @@ -185,8 +172,7 @@ func TestStatusOmitEmpty(t *testing.T) {
if err := json.Unmarshal(b, &probe); err != nil {
t.Fatalf("unmarshal probe failed: %v", err)
}
if probe.Status.CurrentAlloc.Accelerator != "" ||
probe.Status.DesiredOptimizedAlloc.NumReplicas != 0 ||
if probe.Status.DesiredOptimizedAlloc.NumReplicas != 0 ||
probe.Status.Actuation.Applied != false {
t.Errorf("unexpected non-zero defaults in status: %+v", probe.Status)
}
Expand Down
32 changes: 0 additions & 32 deletions api/v1alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,6 @@ spec:
- jsonPath: .spec.modelID
name: Model
type: string
- jsonPath: .status.currentAlloc.accelerator
name: Accelerator
type: string
- jsonPath: .status.currentAlloc.numReplicas
name: CurrentReplicas
type: integer
- jsonPath: .status.desiredOptimizedAlloc.numReplicas
name: Optimized
type: string
Expand Down Expand Up @@ -71,67 +65,6 @@ spec:
to be autoscaled.
minLength: 1
type: string
modelProfile:
description: ModelProfile provides resource and performance characteristics
for the model variant.
properties:
accelerators:
description: Accelerators is a list of accelerator profiles for
the model variant.
items:
description: |-
AcceleratorProfile defines the configuration for an accelerator used in autoscaling.
It specifies the type and count of accelerator, as well as parameters for scaling behavior.
properties:
acc:
description: Acc specifies the type or name of the accelerator
(e.g., GPU type).
minLength: 1
type: string
accCount:
description: AccCount specifies the number of accelerator
units to be used.
minimum: 1
type: integer
maxBatchSize:
description: MaxBatchSize is the maximum batch size supported
by the accelerator.
minimum: 1
type: integer
perfParms:
description: PerfParms specifies the prefill and decode parameters
for the TTFT and ITL models
properties:
decodeParms:
additionalProperties:
type: string
description: |-
DecodeParms contains parameters for the decode phase (ITL calculation)
Expected keys: "alpha", "beta" for equation: itl = alpha + beta * maxBatchSize
minProperties: 1
type: object
prefillParms:
additionalProperties:
type: string
description: |-
PrefillParms contains parameters for the prefill phase (TTFT calculation)
Expected keys: "gamma", "delta" for equation: ttft = gamma + delta * tokens * maxBatchSize
minProperties: 1
type: object
required:
- decodeParms
- prefillParms
type: object
required:
- acc
- accCount
- maxBatchSize
type: object
minItems: 1
type: array
required:
- accelerators
type: object
scaleTargetRef:
description: |-
ScaleTargetRef references the scalable resource to manage.
Expand Down Expand Up @@ -237,62 +170,6 @@ spec:
x-kubernetes-list-map-keys:
- type
x-kubernetes-list-type: map
currentAlloc:
description: CurrentAlloc specifies the current resource allocation
for the variant.
properties:
accelerator:
description: Accelerator is the type of accelerator currently
allocated.
minLength: 1
type: string
itlAverage:
description: ITLAverage is the average inter token latency for
the current allocation.
pattern: ^\d+(\.\d+)?$
type: string
load:
description: Load describes the workload characteristics for the
current allocation.
properties:
arrivalRate:
description: ArrivalRate is the rate of incoming requests
in inference server.
type: string
avgInputTokens:
description: AvgInputTokens is the average number of input(prefill)
tokens per request in inference server.
type: string
avgOutputTokens:
description: AvgOutputTokens is the average number of output(decode)
tokens per request in inference server.
type: string
required:
- arrivalRate
- avgInputTokens
- avgOutputTokens
type: object
maxBatch:
description: MaxBatch is the maximum batch size currently allocated.
minimum: 0
type: integer
numReplicas:
description: NumReplicas is the number of replicas currently allocated.
minimum: 0
type: integer
ttftAverage:
description: TTFTAverage is the average time to first token for
the current allocation
pattern: ^\d+(\.\d+)?$
type: string
required:
- accelerator
- itlAverage
- load
- maxBatch
- numReplicas
- ttftAverage
type: object
desiredOptimizedAlloc:
description: DesiredOptimizedAlloc indicates the target optimized
allocation based on autoscaling logic.
Expand Down
62 changes: 0 additions & 62 deletions config/crd/bases/llmd.ai_variantautoscalings.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,6 @@ spec:
- jsonPath: .spec.modelID
name: Model
type: string
- jsonPath: .status.currentAlloc.accelerator
name: Accelerator
type: string
- jsonPath: .status.currentAlloc.numReplicas
name: CurrentReplicas
type: integer
- jsonPath: .status.desiredOptimizedAlloc.numReplicas
name: Optimized
type: string
Expand Down Expand Up @@ -176,62 +170,6 @@ spec:
x-kubernetes-list-map-keys:
- type
x-kubernetes-list-type: map
currentAlloc:
description: CurrentAlloc specifies the current resource allocation
for the variant.
properties:
accelerator:
description: Accelerator is the type of accelerator currently
allocated.
minLength: 1
type: string
itlAverage:
description: ITLAverage is the average inter token latency for
the current allocation.
pattern: ^\d+(\.\d+)?$
type: string
load:
description: Load describes the workload characteristics for the
current allocation.
properties:
arrivalRate:
description: ArrivalRate is the rate of incoming requests
in inference server.
type: string
avgInputTokens:
description: AvgInputTokens is the average number of input(prefill)
tokens per request in inference server.
type: string
avgOutputTokens:
description: AvgOutputTokens is the average number of output(decode)
tokens per request in inference server.
type: string
required:
- arrivalRate
- avgInputTokens
- avgOutputTokens
type: object
maxBatch:
description: MaxBatch is the maximum batch size currently allocated.
minimum: 0
type: integer
numReplicas:
description: NumReplicas is the number of replicas currently allocated.
minimum: 0
type: integer
ttftAverage:
description: TTFTAverage is the average time to first token for
the current allocation
pattern: ^\d+(\.\d+)?$
type: string
required:
- accelerator
- itlAverage
- load
- maxBatch
- numReplicas
- ttftAverage
type: object
desiredOptimizedAlloc:
description: DesiredOptimizedAlloc indicates the target optimized
allocation based on autoscaling logic.
Expand Down
Loading