Skip to content

Commit 2ce4c6e

Browse files
authored
Merge branch 'main' into unify-config
2 parents 576604f + 7fc1e73 commit 2ce4c6e

28 files changed

Lines changed: 2712 additions & 383 deletions

AGENTS.md

Lines changed: 345 additions & 0 deletions
Large diffs are not rendered by default.

CLAUDE.md

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1 @@
1+
see @AGENTS.md for instructions.

charts/workload-variant-autoscaler/templates/rbac/role.yaml

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -75,6 +75,14 @@ rules:
7575
- patch
7676
- update
7777
- watch
78+
- apiGroups:
79+
- apps
80+
resources:
81+
- replicasets
82+
verbs:
83+
- get
84+
- list
85+
- watch
7886
- apiGroups:
7987
- llmd.ai
8088
resources:

charts/workload-variant-autoscaler/templates/variantautoscaling.yaml

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -18,6 +18,7 @@ spec:
1818
# ScaleTargetRef references the target resource to scale (similar to HPA)
1919
# TODO: Support templating for scaleTargetRef to enable managing groups of deployments
2020
scaleTargetRef:
21+
apiVersion: apps/v1
2122
kind: Deployment
2223
name: {{ .Values.llmd.deploymentName | default (printf "%s-decode" .Values.llmd.modelName) }}
2324
# OpenAI API compatible name of the model

cmd/main.go

Lines changed: 9 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -51,6 +51,7 @@ import (
5151
"github.com/llm-d-incubation/workload-variant-autoscaler/internal/datastore"
5252
"github.com/llm-d-incubation/workload-variant-autoscaler/internal/engines/saturation"
5353
"github.com/llm-d-incubation/workload-variant-autoscaler/internal/engines/scalefromzero"
54+
"github.com/llm-d-incubation/workload-variant-autoscaler/internal/indexers"
5455
"github.com/llm-d-incubation/workload-variant-autoscaler/internal/logging"
5556
"github.com/llm-d-incubation/workload-variant-autoscaler/internal/metrics"
5657
"github.com/llm-d-incubation/workload-variant-autoscaler/internal/utils"
@@ -348,6 +349,14 @@ func main() {
348349
os.Exit(1)
349350
}
350351

352+
// Setup custom indexes for lookups on VariantAutoscalings
353+
setupLog.Info("Setting up indexes")
354+
if err := indexers.SetupIndexes(context.Background(), mgr); err != nil {
355+
setupLog.Error(err, "unable to setup indexes")
356+
os.Exit(1)
357+
}
358+
setupLog.Info("Indexes setup completed")
359+
351360
// Initialize metrics
352361
setupLog.Info("Creating metrics emitter instance")
353362
// Force initialization of metrics by creating a metrics emitter
Lines changed: 25 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,25 @@
1+
apiVersion: v1
2+
kind: ConfigMap
3+
# This ConfigMap defines saturation-based scaling thresholds for model variants.
4+
# Saturation scaling uses KV cache utilization and queue length metrics to determine
5+
# when replicas are saturated and when to scale up.
6+
#
7+
# Configuration structure:
8+
# - 'default' entry: Global default thresholds applied to all variants
9+
# - Override entries: Per-model/namespace custom thresholds (must include model_id and namespace)
10+
metadata:
11+
name: saturation-scaling-config
12+
namespace: workload-variant-autoscaler-system
13+
labels:
14+
app.kubernetes.io/name: workload-variant-autoscaler
15+
app.kubernetes.io/managed-by: kustomize
16+
data:
17+
# Global defaults applied to all variants unless overridden
18+
default: |
19+
kvCacheThreshold: 0.80
20+
queueLengthThreshold: 5
21+
kvSpareTrigger: 0.1
22+
queueSpareTrigger: 3
23+
# Enable GPU limiter to constrain scaling based on available cluster resources
24+
# When true, scale-up decisions are limited by available GPU capacity
25+
enableLimiter: false

config/manager/kustomization.yaml

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,7 @@
11
resources:
22
- manager.yaml
33
- configmap.yaml
4+
- configmap-saturation-scaling.yaml
45
apiVersion: kustomize.config.k8s.io/v1beta1
56
kind: Kustomization
67
images:

config/manager/manager.yaml

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -108,6 +108,9 @@ spec:
108108
valueFrom:
109109
fieldRef:
110110
fieldPath: metadata.namespace
111+
# Saturation scaling ConfigMap name (must match kustomize namePrefix + base name)
112+
- name: SATURATION_CONFIG_MAP_NAME
113+
value: "workload-variant-autoscaler-saturation-scaling-config"
111114
name: manager
112115
ports: []
113116
securityContext:

config/rbac/role.yaml

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -57,6 +57,14 @@ rules:
5757
- patch
5858
- update
5959
- watch
60+
- apiGroups:
61+
- apps
62+
resources:
63+
- replicasets
64+
verbs:
65+
- get
66+
- list
67+
- watch
6068
- apiGroups:
6169
- llmd.ai
6270
resources:

internal/collector/registration/saturation.go

Lines changed: 84 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -9,6 +9,16 @@ const (
99
// Saturation queries (per-pod peak metrics over time windows)
1010
QueryKvCacheUsage = "kv_cache_usage"
1111
QueryQueueLength = "queue_length"
12+
13+
// V2 queries (token-based capacity analysis)
14+
QueryCacheConfigInfo = "cache_config_info"
15+
QueryAvgOutputTokens = "avg_output_tokens"
16+
QueryAvgInputTokens = "avg_input_tokens"
17+
QueryPrefixCacheHitRate = "prefix_cache_hit_rate"
18+
19+
// Scheduler flow control queries (model-level, from inference scheduler)
20+
QuerySchedulerQueueSize = "scheduler_queue_size"
21+
QuerySchedulerQueueBytes = "scheduler_queue_bytes"
1222
)
1323

1424
// RegisterSaturationQueries registers queries used by the saturation analyzer.
@@ -35,4 +45,78 @@ func RegisterSaturationQueries(sourceRegistry *source.SourceRegistry) {
3545
Description: "Peak queue length per pod over last minute",
3646
})
3747

48+
// --- V2 queries for token-based capacity analysis ---
49+
50+
// Cache config info per pod (static labels with block size and GPU blocks count)
51+
// Uses max to deduplicate when multiple series exist per pod with different label combinations
52+
// Used by Saturation Analyzer V2 for token capacity computation
53+
registry.MustRegister(source.QueryTemplate{
54+
Name: QueryCacheConfigInfo,
55+
Type: source.QueryTypePromQL,
56+
Template: `max by (pod, num_gpu_blocks, block_size) (vllm:cache_config_info{namespace="{{.namespace}}",model_name="{{.modelID}}"})`,
57+
Params: []string{source.ParamNamespace, source.ParamModelID},
58+
Description: "KV cache configuration info per pod (num_gpu_blocks and block_size as labels)",
59+
})
60+
61+
// Average output (generation) tokens per completed request
62+
// Used for output-length-dependent k2 estimation
63+
registry.MustRegister(source.QueryTemplate{
64+
Name: QueryAvgOutputTokens,
65+
Type: source.QueryTypePromQL,
66+
Template: `max by (pod) (rate(vllm:request_generation_tokens_sum{namespace="{{.namespace}}",model_name="{{.modelID}}"}[5m]) / rate(vllm:request_generation_tokens_count{namespace="{{.namespace}}",model_name="{{.modelID}}"}[5m]))`,
67+
Params: []string{source.ParamNamespace, source.ParamModelID},
68+
Description: "Average output tokens per completed request (5m rate)",
69+
})
70+
71+
// Average input (prompt) tokens per completed request
72+
// Used in k2 derivation formula: k2 = N_max × (I + O/2)
73+
registry.MustRegister(source.QueryTemplate{
74+
Name: QueryAvgInputTokens,
75+
Type: source.QueryTypePromQL,
76+
Template: `max by (pod) (rate(vllm:request_prompt_tokens_sum{namespace="{{.namespace}}",model_name="{{.modelID}}"}[5m]) / rate(vllm:request_prompt_tokens_count{namespace="{{.namespace}}",model_name="{{.modelID}}"}[5m]))`,
77+
Params: []string{source.ParamNamespace, source.ParamModelID},
78+
Description: "Average input tokens per completed request (5m rate)",
79+
})
80+
81+
// Prefix cache hit rate per pod (5m rate)
82+
// Used to reduce estimated input token demand for scheduler-queued requests.
83+
// Returns 0..1 where 1 means all prefix lookups were cache hits.
84+
registry.MustRegister(source.QueryTemplate{
85+
Name: QueryPrefixCacheHitRate,
86+
Type: source.QueryTypePromQL,
87+
Template: `max by (pod) (rate(vllm:prefix_cache_hits{namespace="{{.namespace}}",model_name="{{.modelID}}"}[5m]) / rate(vllm:prefix_cache_queries{namespace="{{.namespace}}",model_name="{{.modelID}}"}[5m]))`,
88+
Params: []string{source.ParamNamespace, source.ParamModelID},
89+
Description: "Prefix cache hit rate per pod (0.0-1.0, 5m rate)",
90+
})
91+
92+
// --- Scheduler flow control queries (model-level) ---
93+
// These come from the llm-d inference scheduler, not vLLM pods.
94+
// They use target_model_name when available, falling back to model_name.
95+
// The "or" clause handles cases where target_model_name is not set.
96+
//
97+
// TODO(#2309): These metrics currently lack a namespace label in the upstream
98+
// gateway-api-inference-extension EPP. If the same model name exists in
99+
// different namespaces, these queries will aggregate across all of them.
100+
// Once the upstream adds a namespace label, these queries should filter by it.
101+
102+
// Number of requests queued in the scheduler's flow control layer
103+
registry.MustRegister(source.QueryTemplate{
104+
Name: QuerySchedulerQueueSize,
105+
Type: source.QueryTypePromQL,
106+
Template: `sum(inference_extension_flow_control_queue_size{target_model_name="{{.modelID}}"})` +
107+
` or sum(inference_extension_flow_control_queue_size{model_name="{{.modelID}}",target_model_name=""})`,
108+
Params: []string{source.ParamModelID},
109+
Description: "Total requests queued in scheduler flow control for this model",
110+
})
111+
112+
// Total bytes of request bodies queued in the scheduler's flow control layer
113+
registry.MustRegister(source.QueryTemplate{
114+
Name: QuerySchedulerQueueBytes,
115+
Type: source.QueryTypePromQL,
116+
Template: `sum(inference_extension_flow_control_queue_bytes{target_model_name="{{.modelID}}"})` +
117+
` or sum(inference_extension_flow_control_queue_bytes{model_name="{{.modelID}}",target_model_name=""})`,
118+
Params: []string{source.ParamModelID},
119+
Description: "Total bytes queued in scheduler flow control for this model",
120+
})
121+
38122
}

0 commit comments

Comments (0)