Skip to content

Commit 2ce4c6e

Browse files
authored
Merge branch 'main' into unify-config
2 parents 576604f + 7fc1e73 commit 2ce4c6e

28 files changed

Lines changed: 2712 additions & 383 deletions

AGENTS.md

Lines changed: 345 additions & 0 deletions
Large diffs are not rendered by default.

CLAUDE.md

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1 @@
1+
see @AGENTS.md for instructions.

charts/workload-variant-autoscaler/templates/rbac/role.yaml

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -75,6 +75,14 @@ rules:
7575
- patch
7676
- update
7777
- watch
78+
- apiGroups:
79+
- apps
80+
resources:
81+
- replicasets
82+
verbs:
83+
- get
84+
- list
85+
- watch
7886
- apiGroups:
7987
- llmd.ai
8088
resources:

charts/workload-variant-autoscaler/templates/variantautoscaling.yaml

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -18,6 +18,7 @@ spec:
1818
# ScaleTargetRef references the target resource to scale (similar to HPA)
1919
# TODO: Support templating for scaleTargetRef to enable managing groups of deployments
2020
scaleTargetRef:
21+
apiVersion: apps/v1
2122
kind: Deployment
2223
name: {{ .Values.llmd.deploymentName | default (printf "%s-decode" .Values.llmd.modelName) }}
2324
# OpenAI API compatible name of the model

cmd/main.go

Lines changed: 9 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -51,6 +51,7 @@ import (
5151
"github.com/llm-d-incubation/workload-variant-autoscaler/internal/datastore"
5252
"github.com/llm-d-incubation/workload-variant-autoscaler/internal/engines/saturation"
5353
"github.com/llm-d-incubation/workload-variant-autoscaler/internal/engines/scalefromzero"
54+
"github.com/llm-d-incubation/workload-variant-autoscaler/internal/indexers"
5455
"github.com/llm-d-incubation/workload-variant-autoscaler/internal/logging"
5556
"github.com/llm-d-incubation/workload-variant-autoscaler/internal/metrics"
5657
"github.com/llm-d-incubation/workload-variant-autoscaler/internal/utils"
@@ -348,6 +349,14 @@ func main() {
348349
os.Exit(1)
349350
}
350351

352+
// Setup custom indexes for lookups on VariantAutoscalings
353+
setupLog.Info("Setting up indexes")
354+
if err := indexers.SetupIndexes(context.Background(), mgr); err != nil {
355+
setupLog.Error(err, "unable to setup indexes")
356+
os.Exit(1)
357+
}
358+
setupLog.Info("Indexes setup completed")
359+
351360
// Initialize metrics
352361
setupLog.Info("Creating metrics emitter instance")
353362
// Force initialization of metrics by creating a metrics emitter
Lines changed: 25 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,25 @@
1+
apiVersion: v1
2+
kind: ConfigMap
3+
# This ConfigMap defines saturation-based scaling thresholds for model variants.
4+
# Saturation scaling uses KV cache utilization and queue length metrics to determine
5+
# when replicas are saturated and when to scale up.
6+
#
7+
# Configuration structure:
8+
# - 'default' entry: Global default thresholds applied to all variants
9+
# - Override entries: Per-model/namespace custom thresholds (must include model_id and namespace)
10+
metadata:
11+
name: saturation-scaling-config
12+
namespace: workload-variant-autoscaler-system
13+
labels:
14+
app.kubernetes.io/name: workload-variant-autoscaler
15+
app.kubernetes.io/managed-by: kustomize
16+
data:
17+
# Global defaults applied to all variants unless overridden
18+
default: |
19+
kvCacheThreshold: 0.80
20+
queueLengthThreshold: 5
21+
kvSpareTrigger: 0.1
22+
queueSpareTrigger: 3
23+
# Enable GPU limiter to constrain scaling based on available cluster resources
24+
# When true, scale-up decisions are limited by available GPU capacity
25+
enableLimiter: false

config/manager/kustomization.yaml

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,7 @@
11
resources:
22
- manager.yaml
33
- configmap.yaml
4+
- configmap-saturation-scaling.yaml
45
apiVersion: kustomize.config.k8s.io/v1beta1
56
kind: Kustomization
67
images:

config/manager/manager.yaml

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -108,6 +108,9 @@ spec:
108108
valueFrom:
109109
fieldRef:
110110
fieldPath: metadata.namespace
111+
# Saturation scaling ConfigMap name (must match kustomize namePrefix + base name)
112+
- name: SATURATION_CONFIG_MAP_NAME
113+
value: "workload-variant-autoscaler-saturation-scaling-config"
111114
name: manager
112115
ports: []
113116
securityContext:

config/rbac/role.yaml

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -57,6 +57,14 @@ rules:
5757
- patch
5858
- update
5959
- watch
60+
- apiGroups:
61+
- apps
62+
resources:
63+
- replicasets
64+
verbs:
65+
- get
66+
- list
67+
- watch
6068
- apiGroups:
6169
- llmd.ai
6270
resources:

internal/collector/registration/saturation.go

Lines changed: 84 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -9,6 +9,16 @@ const (
99
// Saturation queries (per-pod peak metrics over time windows)
1010
QueryKvCacheUsage = "kv_cache_usage"
1111
QueryQueueLength = "queue_length"
12+
13+
// V2 queries (token-based capacity analysis)
14+
QueryCacheConfigInfo = "cache_config_info"
15+
QueryAvgOutputTokens = "avg_output_tokens"
16+
QueryAvgInputTokens = "avg_input_tokens"
17+
QueryPrefixCacheHitRate = "prefix_cache_hit_rate"
18+
19+
// Scheduler flow control queries (model-level, from inference scheduler)
20+
QuerySchedulerQueueSize = "scheduler_queue_size"
21+
QuerySchedulerQueueBytes = "scheduler_queue_bytes"
1222
)
1323

1424
// RegisterSaturationQueries registers queries used by the saturation analyzer.
@@ -35,4 +45,78 @@ func RegisterSaturationQueries(sourceRegistry *source.SourceRegistry) {
3545
Description: "Peak queue length per pod over last minute",
3646
})
3747

48+
// --- V2 queries for token-based capacity analysis ---
49+
50+
// Cache config info per pod (static labels with block size and GPU blocks count)
51+
// Uses max to deduplicate when multiple series exist per pod with different label combinations
52+
// Used by Saturation Analyzer V2 for token capacity computation
53+
registry.MustRegister(source.QueryTemplate{
54+
Name: QueryCacheConfigInfo,
55+
Type: source.QueryTypePromQL,
56+
Template: `max by (pod, num_gpu_blocks, block_size) (vllm:cache_config_info{namespace="{{.namespace}}",model_name="{{.modelID}}"})`,
57+
Params: []string{source.ParamNamespace, source.ParamModelID},
58+
Description: "KV cache configuration info per pod (num_gpu_blocks and block_size as labels)",
59+
})
60+
61+
// Average output (generation) tokens per completed request
62+
// Used for output-length-dependent k2 estimation
63+
registry.MustRegister(source.QueryTemplate{
64+
Name: QueryAvgOutputTokens,
65+
Type: source.QueryTypePromQL,
66+
Template: `max by (pod) (rate(vllm:request_generation_tokens_sum{namespace="{{.namespace}}",model_name="{{.modelID}}"}[5m]) / rate(vllm:request_generation_tokens_count{namespace="{{.namespace}}",model_name="{{.modelID}}"}[5m]))`,
67+
Params: []string{source.ParamNamespace, source.ParamModelID},
68+
Description: "Average output tokens per completed request (5m rate)",
69+
})
70+
71+
// Average input (prompt) tokens per completed request
72+
// Used in k2 derivation formula: k2 = N_max × (I + O/2)
73+
registry.MustRegister(source.QueryTemplate{
74+
Name: QueryAvgInputTokens,
75+
Type: source.QueryTypePromQL,
76+
Template: `max by (pod) (rate(vllm:request_prompt_tokens_sum{namespace="{{.namespace}}",model_name="{{.modelID}}"}[5m]) / rate(vllm:request_prompt_tokens_count{namespace="{{.namespace}}",model_name="{{.modelID}}"}[5m]))`,
77+
Params: []string{source.ParamNamespace, source.ParamModelID},
78+
Description: "Average input tokens per completed request (5m rate)",
79+
})
80+
81+
// Prefix cache hit rate per pod (5m rate)
82+
// Used to reduce estimated input token demand for scheduler-queued requests.
83+
// Returns 0..1 where 1 means all prefix lookups were cache hits.
84+
registry.MustRegister(source.QueryTemplate{
85+
Name: QueryPrefixCacheHitRate,
86+
Type: source.QueryTypePromQL,
87+
Template: `max by (pod) (rate(vllm:prefix_cache_hits{namespace="{{.namespace}}",model_name="{{.modelID}}"}[5m]) / rate(vllm:prefix_cache_queries{namespace="{{.namespace}}",model_name="{{.modelID}}"}[5m]))`,
88+
Params: []string{source.ParamNamespace, source.ParamModelID},
89+
Description: "Prefix cache hit rate per pod (0.0-1.0, 5m rate)",
90+
})
91+
92+
// --- Scheduler flow control queries (model-level) ---
93+
// These come from the llm-d inference scheduler, not vLLM pods.
94+
// They use target_model_name when available, falling back to model_name.
95+
// The "or" clause handles cases where target_model_name is not set.
96+
//
97+
// TODO(#2309): These metrics currently lack a namespace label in the upstream
98+
// gateway-api-inference-extension EPP. If the same model name exists in
99+
// different namespaces, these queries will aggregate across all of them.
100+
// Once the upstream adds a namespace label, these queries should filter by it.
101+
102+
// Number of requests queued in the scheduler's flow control layer
103+
registry.MustRegister(source.QueryTemplate{
104+
Name: QuerySchedulerQueueSize,
105+
Type: source.QueryTypePromQL,
106+
Template: `sum(inference_extension_flow_control_queue_size{target_model_name="{{.modelID}}"})` +
107+
` or sum(inference_extension_flow_control_queue_size{model_name="{{.modelID}}",target_model_name=""})`,
108+
Params: []string{source.ParamModelID},
109+
Description: "Total requests queued in scheduler flow control for this model",
110+
})
111+
112+
// Total bytes of request bodies queued in the scheduler's flow control layer
113+
registry.MustRegister(source.QueryTemplate{
114+
Name: QuerySchedulerQueueBytes,
115+
Type: source.QueryTypePromQL,
116+
Template: `sum(inference_extension_flow_control_queue_bytes{target_model_name="{{.modelID}}"})` +
117+
` or sum(inference_extension_flow_control_queue_bytes{model_name="{{.modelID}}",target_model_name=""})`,
118+
Params: []string{source.ParamModelID},
119+
Description: "Total bytes queued in scheduler flow control for this model",
120+
})
121+
38122
}

0 commit comments

Comments (0)