|
| 1 | +// This file provides queueing model analyzer metrics collection using the source |
| 2 | +// infrastructure with registered query templates. |
| 3 | +package registration |
| 4 | + |
| 5 | +import ( |
| 6 | + "github.com/llm-d/llm-d-workload-variant-autoscaler/internal/collector/source" |
| 7 | +) |
| 8 | + |
// Names under which the queueing model analyzer's queries are registered.
const (
	// QuerySchedulerDispatchRate identifies the per-endpoint request dispatch
	// rate query. The value is the arrival rate (requests/sec) that the
	// scheduler routes to each replica.
	// Backing metric: inference_extension_scheduler_attempts_total
	// (gateway-api-inference-extension).
	QuerySchedulerDispatchRate = "scheduler_dispatch_rate"

	// QueryAvgTTFT identifies the per-pod mean time-to-first-token query
	// (seconds). Backing metric: vllm:time_to_first_token_seconds histogram.
	QueryAvgTTFT = "avg_ttft"

	// QueryAvgITL identifies the per-pod mean inter-token latency query
	// (seconds). Backing metric: vllm:time_per_output_token_seconds histogram.
	QueryAvgITL = "avg_itl"
)
| 24 | + |
| 25 | +// RegisterQueueingModelQueries registers queries used by the queueing model analyzer. |
| 26 | +func RegisterQueueingModelQueries(sourceRegistry *source.SourceRegistry) { |
| 27 | + registry := sourceRegistry.Get("prometheus").QueryList() |
| 28 | + |
| 29 | + // Scheduler dispatch rate per endpoint (per-pod arrival rate) |
| 30 | + // Records successful scheduling attempts with endpoint and model information. |
| 31 | + // Metric labels: status, pod_name, namespace, port, model_name, target_model_name |
| 32 | + // We filter by status="success" and match model identity using target_model_name |
| 33 | + // (resolved model after routing, e.g. specific LoRA adapter) with fallback to |
| 34 | + // model_name (original request model) when target_model_name is not set. |
| 35 | + // This follows the same pattern as scheduler flow control queries. |
| 36 | + // Uses sum (not max) because dispatch rate is an additive counter — multiple |
| 37 | + // series per pod should be summed. Uses rate() over 1m window for requests/sec. |
| 38 | + registry.MustRegister(source.QueryTemplate{ |
| 39 | + Name: QuerySchedulerDispatchRate, |
| 40 | + Type: source.QueryTypePromQL, |
| 41 | + Template: `sum by (pod_name, namespace) (rate(inference_extension_scheduler_attempts_total{status="success",namespace="{{.namespace}}",target_model_name="{{.modelID}}"}[1m]))` + |
| 42 | + ` or sum by (pod_name, namespace) (rate(inference_extension_scheduler_attempts_total{status="success",namespace="{{.namespace}}",model_name="{{.modelID}}",target_model_name=""}[1m]))`, |
| 43 | + Params: []string{source.ParamNamespace, source.ParamModelID}, |
| 44 | + Description: "Request dispatch rate per endpoint (requests/sec) from scheduler, " + |
| 45 | + "representing the arrival rate to each replica for a specific model", |
| 46 | + }) |
| 47 | + |
| 48 | + // Average time-to-first-token per pod (seconds). |
| 49 | + // Uses histogram _sum/_count from vLLM over a 1m rate window. |
| 50 | + // Used by queueing model tuner as the observed TTFT for Kalman filter updates. |
| 51 | + registry.MustRegister(source.QueryTemplate{ |
| 52 | + Name: QueryAvgTTFT, |
| 53 | + Type: source.QueryTypePromQL, |
| 54 | + Template: `max by (pod) (rate(vllm:time_to_first_token_seconds_sum{namespace="{{.namespace}}",model_name="{{.modelID}}"}[1m]) / rate(vllm:time_to_first_token_seconds_count{namespace="{{.namespace}}",model_name="{{.modelID}}"}[1m]))`, |
| 55 | + Params: []string{source.ParamNamespace, source.ParamModelID}, |
| 56 | + Description: "Average time-to-first-token per pod (seconds), " + |
| 57 | + "used by queueing model tuner for parameter learning", |
| 58 | + }) |
| 59 | + |
| 60 | + // Average inter-token latency per pod (seconds). |
| 61 | + // Uses histogram _sum/_count from vLLM over a 1m rate window. |
| 62 | + // Used by queueing model tuner as the observed ITL for Kalman filter updates. |
| 63 | + registry.MustRegister(source.QueryTemplate{ |
| 64 | + Name: QueryAvgITL, |
| 65 | + Type: source.QueryTypePromQL, |
| 66 | + Template: `max by (pod) (rate(vllm:time_per_output_token_seconds_sum{namespace="{{.namespace}}",model_name="{{.modelID}}"}[1m]) / rate(vllm:time_per_output_token_seconds_count{namespace="{{.namespace}}",model_name="{{.modelID}}"}[1m]))`, |
| 67 | + Params: []string{source.ParamNamespace, source.ParamModelID}, |
| 68 | + Description: "Average inter-token latency per pod (seconds), " + |
| 69 | + "used by queueing model tuner for parameter learning", |
| 70 | + }) |
| 71 | + |
| 72 | + // Note: MaxBatchSize (max_num_seqs) is not available as a Prometheus metric from vLLM. |
| 73 | + // It is sourced from the Deployment's container args using the deployment parser |
| 74 | + // (see saturation_v2.ParseVLLMArgs). The collector populates ReplicaMetrics.MaxBatchSize |
| 75 | + // by parsing the --max-num-seqs flag from the pod's parent Deployment spec. |
| 76 | +} |