99 // Saturation queries (per-pod peak metrics over time windows)
1010 QueryKvCacheUsage = "kv_cache_usage"
1111 QueryQueueLength = "queue_length"
12+
13+ // V2 queries (token-based capacity analysis)
14+ QueryCacheConfigInfo = "cache_config_info"
15+ QueryAvgOutputTokens = "avg_output_tokens"
16+ QueryAvgInputTokens = "avg_input_tokens"
17+ QueryPrefixCacheHitRate = "prefix_cache_hit_rate"
18+
19+ // Scheduler flow control queries (model-level, from inference scheduler)
20+ QuerySchedulerQueueSize = "scheduler_queue_size"
21+ QuerySchedulerQueueBytes = "scheduler_queue_bytes"
1222)
1323
1424// RegisterSaturationQueries registers queries used by the saturation analyzer.
@@ -35,4 +45,78 @@ func RegisterSaturationQueries(sourceRegistry *source.SourceRegistry) {
3545 Description : "Peak queue length per pod over last minute" ,
3646 })
3747
48+ // --- V2 queries for token-based capacity analysis ---
49+
50+ // Cache config info per pod (static labels with block size and GPU blocks count)
51+ // Uses max to deduplicate when multiple series exist per pod with different label combinations
52+ // Used by Saturation Analyzer V2 for token capacity computation
53+ registry .MustRegister (source.QueryTemplate {
54+ Name : QueryCacheConfigInfo ,
55+ Type : source .QueryTypePromQL ,
56+ Template : `max by (pod, num_gpu_blocks, block_size) (vllm:cache_config_info{namespace="{{.namespace}}",model_name="{{.modelID}}"})` ,
57+ Params : []string {source .ParamNamespace , source .ParamModelID },
58+ Description : "KV cache configuration info per pod (num_gpu_blocks and block_size as labels)" ,
59+ })
60+
61+ // Average output (generation) tokens per completed request
62+ // Used for output-length-dependent k2 estimation
63+ registry .MustRegister (source.QueryTemplate {
64+ Name : QueryAvgOutputTokens ,
65+ Type : source .QueryTypePromQL ,
66+ Template : `max by (pod) (rate(vllm:request_generation_tokens_sum{namespace="{{.namespace}}",model_name="{{.modelID}}"}[5m]) / rate(vllm:request_generation_tokens_count{namespace="{{.namespace}}",model_name="{{.modelID}}"}[5m]))` ,
67+ Params : []string {source .ParamNamespace , source .ParamModelID },
68+ Description : "Average output tokens per completed request (5m rate)" ,
69+ })
70+
71+ // Average input (prompt) tokens per completed request
72+ // Used in k2 derivation formula: k2 = N_max × (I + O/2)
73+ registry .MustRegister (source.QueryTemplate {
74+ Name : QueryAvgInputTokens ,
75+ Type : source .QueryTypePromQL ,
76+ Template : `max by (pod) (rate(vllm:request_prompt_tokens_sum{namespace="{{.namespace}}",model_name="{{.modelID}}"}[5m]) / rate(vllm:request_prompt_tokens_count{namespace="{{.namespace}}",model_name="{{.modelID}}"}[5m]))` ,
77+ Params : []string {source .ParamNamespace , source .ParamModelID },
78+ Description : "Average input tokens per completed request (5m rate)" ,
79+ })
80+
81+ // Prefix cache hit rate per pod (5m rate)
82+ // Used to reduce estimated input token demand for scheduler-queued requests.
83+ // Returns 0..1 where 1 means all prefix lookups were cache hits.
84+ registry .MustRegister (source.QueryTemplate {
85+ Name : QueryPrefixCacheHitRate ,
86+ Type : source .QueryTypePromQL ,
87+ Template : `max by (pod) (rate(vllm:prefix_cache_hits{namespace="{{.namespace}}",model_name="{{.modelID}}"}[5m]) / rate(vllm:prefix_cache_queries{namespace="{{.namespace}}",model_name="{{.modelID}}"}[5m]))` ,
88+ Params : []string {source .ParamNamespace , source .ParamModelID },
89+ Description : "Prefix cache hit rate per pod (0.0-1.0, 5m rate)" ,
90+ })
91+
92+ // --- Scheduler flow control queries (model-level) ---
93+ // These come from the llm-d inference scheduler, not vLLM pods.
94+ // They use target_model_name when available, falling back to model_name.
95+ // The "or" clause handles cases where target_model_name is not set.
96+ //
97+ // TODO(#2309): These metrics currently lack a namespace label in the upstream
98+ // gateway-api-inference-extension EPP. If the same model name exists in
99+ // different namespaces, these queries will aggregate across all of them.
100+ // Once the upstream adds a namespace label, these queries should filter by it.
101+
102+ // Number of requests queued in the scheduler's flow control layer
103+ registry .MustRegister (source.QueryTemplate {
104+ Name : QuerySchedulerQueueSize ,
105+ Type : source .QueryTypePromQL ,
106+ Template : `sum(inference_extension_flow_control_queue_size{target_model_name="{{.modelID}}"})` +
107+ ` or sum(inference_extension_flow_control_queue_size{model_name="{{.modelID}}",target_model_name=""})` ,
108+ Params : []string {source .ParamModelID },
109+ Description : "Total requests queued in scheduler flow control for this model" ,
110+ })
111+
112+ // Total bytes of request bodies queued in the scheduler's flow control layer
113+ registry .MustRegister (source.QueryTemplate {
114+ Name : QuerySchedulerQueueBytes ,
115+ Type : source .QueryTypePromQL ,
116+ Template : `sum(inference_extension_flow_control_queue_bytes{target_model_name="{{.modelID}}"})` +
117+ ` or sum(inference_extension_flow_control_queue_bytes{model_name="{{.modelID}}",target_model_name=""})` ,
118+ Params : []string {source .ParamModelID },
119+ Description : "Total bytes queued in scheduler flow control for this model" ,
120+ })
121+
38122}