-
Notifications
You must be signed in to change notification settings - Fork 50
Expand file tree
/
Copy pathsaturation_analyzer.go
More file actions
350 lines (309 loc) · 15 KB
/
saturation_analyzer.go
File metadata and controls
350 lines (309 loc) · 15 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
package interfaces
import (
"context"
"time"
autoscalingv2 "k8s.io/api/autoscaling/v2"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)
// Canonical identifiers used throughout the saturation analysis pipeline.
const (
	// SaturationAnalyzerName is the canonical name for the saturation analyzer.
	SaturationAnalyzerName = "saturation"
	// RoleBoth represents the default role when a variant serves both prefill and decode.
	RoleBoth = "both"
)
// ReplicaMetrics holds per-replica metrics used by both the saturation analyzer
// and the queueing model analyzer. Saturation analysis uses KV cache, queue, and
// token-capacity fields, while the queueing model analyzer uses
// ArrivalRate and MaxBatchSize to model queue dynamics and estimate optimal capacity.
type ReplicaMetrics struct {
	PodName         string
	KvCacheUsage    float64 // KV cache utilization (0.0-1.0)
	QueueLength     int     // Number of requests waiting
	VariantName     string  // Name of the variant this replica belongs to
	Namespace       string
	ModelID         string  // Model ID for grouping variants
	AcceleratorName string  // Accelerator type for this variant
	Cost            float64 // Cost per replica (from CRD spec, default 10)
	// Metadata contains freshness information (optional). nil when the
	// collector did not attach collection timestamps.
	Metadata *ReplicaMetricsMetadata `json:"metadata,omitempty"`
	// --- Fields for Saturation Analyzer V2 and Queueing Model Analyzer ---
	// NumGpuBlocks is the total number of KV cache blocks allocated on GPU.
	// Sourced from vllm:cache_config_info label "num_gpu_blocks".
	// Zero value means cache_config_info metric is not available.
	NumGpuBlocks int64
	// BlockSize is the number of tokens per KV cache block.
	// Sourced from vllm:cache_config_info label "block_size".
	// Zero value means cache_config_info metric is not available.
	BlockSize int64
	// TotalKvCapacityTokens is NumGpuBlocks × BlockSize (total token slots).
	// Computed by the collector after parsing cache_config_info labels.
	// Zero value means capacity data is unavailable.
	TotalKvCapacityTokens int64
	// TokensInUse is the derived current token demand on this replica.
	// Computed as KvCacheUsage × TotalKvCapacityTokens.
	// Zero when TotalKvCapacityTokens is unavailable.
	TokensInUse int64
	// AvgOutputTokens is the average generation tokens per request on this replica.
	// Derived from rate(generation_tokens_sum) / rate(generation_tokens_count).
	// Used by saturation V2 for token-demand estimation (k2 derivation) and by
	// the queueing model analyzer for RequestSize and service rate computation.
	// Zero when metrics are unavailable.
	AvgOutputTokens float64
	// AvgInputTokens is the average prompt tokens per request on this replica.
	// Derived from rate(prompt_tokens_sum) / rate(prompt_tokens_count).
	// Used by saturation V2 for token-demand estimation (k2 derivation) and by
	// the queueing model analyzer for RequestSize and service rate computation.
	// Zero when metrics are unavailable.
	AvgInputTokens float64
	// PrefixCacheHitRate is the fraction of prefix cache queries that were hits (0.0-1.0).
	// Derived from rate(vllm:prefix_cache_hits[5m]) / rate(vllm:prefix_cache_queries[5m]).
	// Used to reduce estimated input token demand for scheduler-queued requests.
	// Zero when prefix caching is disabled or metrics are unavailable.
	PrefixCacheHitRate float64
	// ArrivalRate is the request arrival rate to this replica in requests per second.
	// Sourced from rate(inference_extension_scheduler_attempts_total{status="success"}[5m]) per pod.
	// This represents requests being dispatched to this replica by the scheduler.
	// Used by queueing model analyzer as Lambda (arrival rate) for queue dynamics estimation.
	// Zero when scheduler metrics are unavailable.
	ArrivalRate float64
	// MaxBatchSize is the maximum number of concurrent inference requests this replica can process.
	// Parsed from the --max-num-seqs flag in the pod's parent Deployment container args.
	// Defaults to 256 (vLLM v0.8+ default) when the flag is not explicitly set.
	// Used by queueing model analyzer.
	MaxBatchSize int64
	// AvgTTFT is the average time-to-first-token on this replica in seconds.
	// Derived from rate(vllm:time_to_first_token_seconds_sum[5m]) / rate(..._count[5m]).
	// Used by queueing model tuner as observed TTFT for Kalman filter parameter learning.
	// Zero when metrics are unavailable.
	AvgTTFT float64
	// AvgITL is the average inter-token latency on this replica in seconds.
	// Derived from rate(vllm:time_per_output_token_seconds_sum[5m]) / rate(..._count[5m]).
	// Used by queueing model tuner as observed ITL for Kalman filter parameter learning.
	// Zero when metrics are unavailable.
	AvgITL float64
}
// ReplicaMetricsMetadata contains freshness information for replica metrics.
type ReplicaMetricsMetadata struct {
	// CollectedAt is when the metrics were collected.
	CollectedAt time.Time
	// Age is the age of the metrics at the time they were consumed.
	Age time.Duration
	// FreshnessStatus indicates freshness: "fresh", "stale", "unavailable".
	FreshnessStatus string
}
// ModelSaturationAnalysis holds saturation analysis results for a model (across all variants).
type ModelSaturationAnalysis struct {
	ModelID    string
	Namespace  string
	AnalyzedAt time.Time // Timestamp when analysis was performed
	// Aggregated metrics across all variants of this model.
	TotalReplicas       int
	NonSaturatedCount   int     // Replicas below saturation thresholds
	AvgSpareKvCapacity  float64 // Average spare KV-cache capacity across replicas
	AvgSpareQueueLength float64 // Average spare queue headroom across replicas
	// Scale decision recommendations.
	ShouldScaleUp bool
	ScaleUpReason string // Human-readable explanation when ShouldScaleUp is true
	ScaleDownSafe bool   // Indicates if scale-down simulation passed
	// VariantAnalyses is the detailed per-variant breakdown backing the aggregates above.
	VariantAnalyses []VariantSaturationAnalysis
}
// VariantSaturationAnalysis holds saturation analysis for a single variant.
type VariantSaturationAnalysis struct {
	VariantName         string
	AcceleratorName     string
	Cost                float64 // Cost per replica for this variant
	ReplicaCount        int
	NonSaturatedCount   int      // Replicas of this variant below saturation thresholds
	MaxKvCacheUsage     float64  // Highest per-replica KV cache utilization observed
	MaxQueueLength      int      // Longest per-replica request queue observed
	AvgSpareKvCapacity  float64  // Average spare KV-cache capacity across this variant's replicas
	AvgSpareQueueLength float64  // Average spare queue headroom across this variant's replicas
	SaturatedReplicas   []string // Pod names of saturated replicas
}
// DecisionStep represents a single step in the decision pipeline.
// Each pipeline stage (saturation analysis, resource limiting, etc.) adds its own step.
type DecisionStep struct {
	// Name identifies the pipeline stage (e.g., "saturation", "limiter", "enforcer").
	Name string
	// Action is the action determined by this step.
	Action SaturationAction
	// TargetReplicas is the target replicas after this step.
	TargetReplicas int
	// Reason explains why this step made its decision.
	Reason string
	// WasConstrained is true if this step modified the previous step's target.
	WasConstrained bool
	// Timestamp when this step was executed.
	Timestamp metav1.Time
}
// VariantDecision represents the scaling decision for a single variant.
//
// This type serves as shared state that flows through the decision pipeline.
// Each pipeline stage (saturation analysis, resource limiting, enforcement)
// reads and modifies the decision, adding its step to DecisionSteps.
//
// Pipeline stages modify the state they own:
//   - Saturation analyzer: sets initial Action, TargetReplicas, SaturationBased
//   - Resource limiter: may constrain TargetReplicas, adds limiting step
//   - Enforcer: applies final constraints (min/max), adds enforcement step
type VariantDecision struct {
	// --- Variant identification ---
	VariantName     string
	Namespace       string
	ModelID         string
	AcceleratorName string
	Cost            float64
	Role            string // "prefill", "decode", "both"
	// --- Scaling state ---
	Action                 SaturationAction
	CurrentReplicas        int
	TargetReplicas         int // Current target (modified by pipeline stages)
	OriginalTargetReplicas int // Original target before resource limiting (for logging)
	DesiredReplicas        int // Original desired replicas from optimizer (from CRD status)
	// --- Resource requirements (for resource limiting) ---
	GPUsPerReplica int // GPUs required per replica
	// SpareCapacity indicates how much spare capacity this variant has.
	// 0.0 = fully saturated, 1.0 = completely idle.
	// Used by allocation algorithms to prioritize saturated variants.
	// V1: threshold-relative spare KV capacity (AvgSpareKvCapacity).
	// V2: 1.0 - Utilization (absolute spare).
	SpareCapacity float64
	// Utilization is the variant-level utilization ratio (0.0-1.0) reported for
	// observability. The exact formula differs by analyzer because V1 and V2
	// reason about saturation differently:
	// V1: mean of per-replica KvCacheUsage fractions (matches what V1's
	// per-replica threshold check operates on).
	// V2: TotalDemand / TotalCapacity from AnalyzerResult (token-demand-based).
	// For uniform-capacity replicas the two are numerically equivalent; for
	// mixed-capacity replicas V2's value is capacity-weighted.
	Utilization float64
	// KvCacheTokensUsed is the sum of TokensInUse across this variant's replicas.
	KvCacheTokensUsed int64
	// KvCacheTokensCapacity is the sum of TotalKvCapacityTokens across this variant's replicas.
	KvCacheTokensCapacity int64
	// RequiredCapacity is the model-level required capacity (>0 means scale-up needed).
	// Same value for all variants of a model.
	// V1: binary (1.0 if shouldScaleUp, else 0.0).
	// V2: continuous token-based demand from AnalyzerResult.
	// Use RequiredCapacityUnit to disambiguate the units when consuming this field
	// (or its corresponding Prometheus metric).
	RequiredCapacity float64
	// RequiredCapacityUnit describes the unit of RequiredCapacity ("binary" or "continuous").
	// Exposed as the `unit` Prometheus label on wva_required_capacity so dashboards
	// can filter by semantics rather than by which analyzer produced the value.
	// "binary": V1 path, value is 0.0 or 1.0
	// "continuous": V2 path, value is a token-demand magnitude
	RequiredCapacityUnit string
	// ScaleTargetRef references the Deployment/StatefulSet for scheduling constraints.
	ScaleTargetRef *autoscalingv2.CrossVersionObjectReference
	// --- Pipeline tracking ---
	// DecisionSteps records each pipeline stage's contribution to the final decision.
	// This replaces the single Reason field with structured multi-step tracking.
	DecisionSteps []DecisionStep
	// Reason is kept for backward compatibility and contains the final/summary reason.
	Reason string
	// --- Saturation-specific flags ---
	SaturationBased    bool        // True if decision is primarily saturation-driven
	ModelBasedDecision bool        // True if decision considers model-based optimizer
	SafetyOverride     bool        // True if saturation veto overrode model-based decision
	LastRunTime        metav1.Time // Time when decision was made (for status updates)
	SaturationOnly     bool        // True if operating in saturation-only mode (no model-based analysis)
	// --- Allocation state ---
	// CurrentAllocation carries the collected metrics/allocation state.
	// This helps the Controller update status without re-collecting metrics.
	CurrentAllocation *Allocation
	// --- Resource limiting results ---
	// GPUsAllocated is the number of GPUs allocated by the resource limiter.
	GPUsAllocated int
	// WasLimited indicates if the target was constrained by resource limits.
	WasLimited bool
	// LimitedBy identifies which limiter constrained the decision (if any).
	LimitedBy string
	// --- Replica bounds ---
	// MinReplicas is the minimum number of replicas for this variant (from VA spec field).
	// nil means not set (default: 0).
	MinReplicas *int
	// MaxReplicas is the maximum number of replicas for this variant (from VA spec field).
	// nil means not set (no cap).
	MaxReplicas *int
	// --- Metrics availability ---
	// MetricsAvailable indicates whether saturation metrics were available for this decision.
	MetricsAvailable bool
	// MetricsReason is the reason for the MetricsAvailable condition.
	MetricsReason string
	// MetricsMessage is the human-readable message for the MetricsAvailable condition.
	MetricsMessage string
}
// AddDecisionStep appends a snapshot of the decision's current Action and
// TargetReplicas to the pipeline history, tagged with the stage name, the
// stage's reason, and whether the stage constrained the previous target.
// Each pipeline stage should call this after modifying the decision.
func (d *VariantDecision) AddDecisionStep(name string, reason string, wasConstrained bool) {
	d.DecisionSteps = append(d.DecisionSteps, DecisionStep{
		Name:           name,
		Action:         d.Action,
		TargetReplicas: d.TargetReplicas,
		Reason:         reason,
		WasConstrained: wasConstrained,
		Timestamp:      metav1.Now(), // record when this stage executed
	})
}
// LastStep returns a pointer to the most recently appended decision step,
// or nil when the pipeline history is empty.
func (d *VariantDecision) LastStep() *DecisionStep {
	n := len(d.DecisionSteps)
	if n == 0 {
		return nil
	}
	return &d.DecisionSteps[n-1]
}
// SaturationAction represents the scaling action recommended for a variant.
type SaturationAction string
const (
	// ActionScaleUp recommends adding replicas.
	ActionScaleUp SaturationAction = "scale-up"
	// ActionScaleDown recommends removing replicas.
	ActionScaleDown SaturationAction = "scale-down"
	// ActionNoChange recommends keeping the current replica count.
	ActionNoChange SaturationAction = "no-change"
)
// VariantReplicaState holds the current and desired replica counts for a variant.
type VariantReplicaState struct {
	VariantName     string
	CurrentReplicas int
	DesiredReplicas int // From optimizer/CRD status, 0 if not set
	// PendingReplicas are pods that exist but are not yet ready to serve traffic
	// (CurrentReplicas - ReadyReplicas). This typically occurs during scale-up when
	// new pods are starting (containers initializing, model loading, health checks).
	// Pod startup can take 2-7 minutes depending on model size and hardware.
	// WVA uses this to prevent cascade scaling - avoiding new scale-up requests
	// while pending pods are still becoming ready.
	PendingReplicas int
	// GPUsPerReplica is the number of GPUs required per replica, extracted from
	// the deployment's container resource requests (nvidia.com/gpu, amd.com/gpu, etc.).
	// Defaults to 1 if no GPU requests are found.
	GPUsPerReplica int
	// Role is the P/D disaggregation role: "prefill", "decode", or "both" (default).
	Role string
	// MinReplicas is the minimum number of replicas for this variant (from VA spec field).
	// nil means not set (default: 0, allows scale to zero).
	MinReplicas *int
	// MaxReplicas is the maximum number of replicas for this variant (from VA spec field).
	// nil means not set (no cap).
	MaxReplicas *int
}
// SaturationAnalyzer analyzes replica saturation metrics and recommends scaling decisions.
type SaturationAnalyzer interface {
	// AnalyzeModelSaturation analyzes saturation for all variants of a model.
	// Returns saturation analysis with scale-up/scale-down recommendations.
	AnalyzeModelSaturation(
		ctx context.Context,
		modelID string,
		namespace string,
		replicaMetrics []ReplicaMetrics,
		config AnalyzerConfig,
	) (*ModelSaturationAnalysis, error)
	// CalculateSaturationTargets determines target replicas per variant based on saturation analysis.
	// Step 1: Pure saturation-based target calculation
	// - Uses ready replica count (those with metrics) to avoid excessive scale-up
	// - Preserves desired replicas when desired ≠ current (from previous optimizer run)
	// - Uses cost-based selection (cheapest for scale-up, most expensive for scale-down)
	// Returns: map[variantName]targetReplicas
	CalculateSaturationTargets(
		saturationAnalysis *ModelSaturationAnalysis,
		variantStates []VariantReplicaState,
	) map[string]int
}