Skip to content

Commit b199209

Browse files
committed
fix: V2 analyzer fallback when vllm:cache_config_info is absent
When the model server does not emit the vllm:cache_config_info metric (e.g., llm-d-inference-sim), TotalKvCapacityTokens was 0 and the V2 analyzer skipped the replica entirely, resulting in totalDemand=0 and no scale-up decisions. Add computeReplicaCapacityFallback, which uses the deployment-derived capacity from the capacity store and estimates demand from the KvCacheUsage percentage. This allows V2 to produce scaling decisions with any vLLM-compatible server, not just those emitting cache_config_info.
1 parent f1df0cd commit b199209

File tree

1 file changed

+50
-1
lines changed

1 file changed

+50
-1
lines changed

internal/engines/analyzers/saturation_v2/analyzer.go

Lines changed: 50 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -159,7 +159,12 @@ func (a *SaturationAnalyzer) computeReplicaCapacity(
159159
gpuCount int,
160160
) *ReplicaCapacity {
161161
if rm.TotalKvCapacityTokens <= 0 {
162-
return nil
162+
// TODO: implement proper demand estimation when vllm:cache_config_info is absent.
163+
// Currently we fall back to percentage-based demand using the deployment-derived
164+
// capacity from the capacity store. A better approach would be to estimate
165+
// TotalKvCapacityTokens from deployment args (num_gpu_blocks_override, block_size)
166+
// or use a dedicated percentage-based demand signal.
167+
return a.computeReplicaCapacityFallback(rm, config, modelID, namespace, gpuCount)
163168
}
164169

165170
// Compute demand
@@ -223,6 +228,50 @@ func (a *SaturationAnalyzer) computeReplicaCapacity(
223228
}
224229
}
225230

231+
// computeReplicaCapacityFallback handles the case where vllm:cache_config_info
232+
// is not available (TotalKvCapacityTokens == 0). It uses the deployment-derived
233+
// capacity from the capacity store and estimates demand from KvCacheUsage percentage.
234+
// This allows V2 to work with model servers that don't emit cache_config_info
235+
// (e.g., the llm-d-inference-sim).
236+
func (a *SaturationAnalyzer) computeReplicaCapacityFallback(
237+
rm interfaces.ReplicaMetrics,
238+
config *config.SaturationScalingConfig,
239+
modelID, namespace string,
240+
gpuCount int,
241+
) *ReplicaCapacity {
242+
rec := a.capacityStore.Get(namespace, modelID, rm.VariantName)
243+
if rec == nil || rec.EffectiveCapacity <= 0 {
244+
return nil
245+
}
246+
247+
effectiveCapacity := rec.EffectiveCapacity
248+
249+
// Estimate demand from KV cache usage percentage applied to the stored capacity.
250+
// This is a coarse approximation — KvCacheUsage reflects memory pressure, not
251+
// exact token demand — but it's sufficient when token-level metrics are absent.
252+
replicaDemand := int64(rm.KvCacheUsage * float64(effectiveCapacity))
253+
254+
// Add queue-based demand if we have average input token info
255+
if rm.AvgInputTokens > 0 {
256+
replicaDemand += int64(rm.QueueLength) * int64(rm.AvgInputTokens)
257+
}
258+
259+
isSaturated := replicaDemand >= effectiveCapacity
260+
261+
return &ReplicaCapacity{
262+
PodName: rm.PodName,
263+
VariantName: rm.VariantName,
264+
AcceleratorName: rm.AcceleratorName,
265+
TokensInUse: replicaDemand,
266+
TotalKvCapacityTokens: effectiveCapacity, // synthetic: store-derived
267+
MemoryBoundCapacity: effectiveCapacity,
268+
ComputeBoundCapacity: effectiveCapacity,
269+
EffectiveCapacity: effectiveCapacity,
270+
IsSaturated: isSaturated,
271+
ReplicaDemand: replicaDemand,
272+
}
273+
}
274+
226275
// computeK2 determines the compute-bound capacity using a priority chain:
227276
// 1. Observed (queue saturated) → use tokensInUse as k2
228277
// 2. Historical → rolling average from previous observations

0 commit comments

Comments
 (0)