|
| 1 | +// This file provides queueing model analyzer metrics collection using the source |
| 2 | +// infrastructure with registered query templates. |
| 3 | +package registration |
| 4 | + |
| 5 | +import ( |
| 6 | + "github.com/llm-d/llm-d-workload-variant-autoscaler/internal/collector/source" |
| 7 | +) |
| 8 | + |
// Names under which the queueing model analyzer's queries are registered.
const (
	// QuerySchedulerDispatchRate identifies the per-endpoint request dispatch
	// rate query. The value is the arrival rate (requests/sec) that the
	// scheduler routes to each replica.
	// Backing metric: inference_extension_scheduler_attempts_total
	// (gateway-api-inference-extension).
	QuerySchedulerDispatchRate = "scheduler_dispatch_rate"

	// QueryAvgTTFT identifies the per-pod mean time-to-first-token query
	// (seconds). Backing metric: vllm:time_to_first_token_seconds histogram.
	QueryAvgTTFT = "avg_ttft"

	// QueryAvgITL identifies the per-pod mean inter-token latency query
	// (seconds). Backing metric: vllm:time_per_output_token_seconds histogram.
	QueryAvgITL = "avg_itl"
)
| 24 | + |
| 25 | +// RegisterQueueingModelQueries registers queries used by the queueing model analyzer. |
| 26 | +func RegisterQueueingModelQueries(sourceRegistry *source.SourceRegistry) { |
| 27 | + registry := sourceRegistry.Get("prometheus").QueryList() |
| 28 | + |
| 29 | + // Scheduler dispatch rate per endpoint (per-pod arrival rate) |
| 30 | + // Records successful scheduling attempts with endpoint and model information. |
| 31 | + // Metric labels: status, pod_name, namespace, port, model_name, target_model_name |
| 32 | + // We filter by status="success" and match model identity using target_model_name |
| 33 | + // (resolved model after routing, e.g. specific LoRA adapter) with fallback to |
| 34 | + // model_name (original request model) when target_model_name is not set. |
| 35 | + // This follows the same pattern as scheduler flow control queries. |
| 36 | + // Uses sum (not max) because dispatch rate is an additive counter — multiple |
| 37 | + // series per pod should be summed. Uses rate() over 1m window for requests/sec. |
| 38 | + registry.MustRegister(source.QueryTemplate{ |
| 39 | + Name: QuerySchedulerDispatchRate, |
| 40 | + Type: source.QueryTypePromQL, |
| 41 | + Template: `sum by (pod_name, namespace) (rate(inference_extension_scheduler_attempts_total{status="success",namespace="{{.namespace}}",target_model_name="{{.modelID}}"}[1m]))` + |
| 42 | + ` or sum by (pod_name, namespace) (rate(inference_extension_scheduler_attempts_total{status="success",namespace="{{.namespace}}",model_name="{{.modelID}}",target_model_name=""}[1m]))`, |
| 43 | + Params: []string{source.ParamNamespace, source.ParamModelID}, |
| 44 | + Description: "Request dispatch rate per endpoint (requests/sec) from scheduler, " + |
| 45 | + "representing the arrival rate to each replica for a specific model", |
| 46 | + }) |
| 47 | + |
| 48 | + // Average time-to-first-token per pod (seconds). |
| 49 | + // Uses histogram _sum/_count from vLLM over a 1m rate window. |
| 50 | + // Used by queueing model tuner as the observed TTFT for Kalman filter updates. |
| 51 | + registry.MustRegister(source.QueryTemplate{ |
| 52 | + Name: QueryAvgTTFT, |
| 53 | + Type: source.QueryTypePromQL, |
| 54 | + Template: `max by (pod) (rate(vllm:time_to_first_token_seconds_sum{namespace="{{.namespace}}",model_name="{{.modelID}}"}[1m]) / rate(vllm:time_to_first_token_seconds_count{namespace="{{.namespace}}",model_name="{{.modelID}}"}[1m]))`, |
| 55 | + Params: []string{source.ParamNamespace, source.ParamModelID}, |
| 56 | + Description: "Average time-to-first-token per pod (seconds), " + |
| 57 | + "used by queueing model tuner for parameter learning", |
| 58 | + }) |
| 59 | + |
| 60 | + // Average inter-token latency per pod (seconds). |
| 61 | + // Uses histogram _sum/_count from vLLM over a 1m rate window. |
| 62 | + // Used by queueing model tuner as the observed ITL for Kalman filter updates. |
| 63 | + registry.MustRegister(source.QueryTemplate{ |
| 64 | + Name: QueryAvgITL, |
| 65 | + Type: source.QueryTypePromQL, |
| 66 | + Template: `max by (pod) (rate(vllm:time_per_output_token_seconds_sum{namespace="{{.namespace}}",model_name="{{.modelID}}"}[1m]) / rate(vllm:time_per_output_token_seconds_count{namespace="{{.namespace}}",model_name="{{.modelID}}"}[1m]))`, |
| 67 | + Params: []string{source.ParamNamespace, source.ParamModelID}, |
| 68 | + Description: "Average inter-token latency per pod (seconds), " + |
| 69 | + "used by queueing model tuner for parameter learning", |
| 70 | + }) |
| 71 | + |
| 72 | + // Note: MaxBatchSize (max_num_seqs) is not available as a Prometheus metric from vLLM. |
| 73 | + // It is sourced from the Deployment's container args using the deployment parser |
| 74 | + // (see saturation_v2.ParseVLLMArgs). The collector populates ReplicaMetrics.MaxBatchSize |
| 75 | + // by parsing the --max-num-seqs flag from the pod's parent Deployment spec. |
| 76 | +} |