Commit b5d8486

feat: add support for routing-profiles (#1944)

Signed-off-by: varungupta <varungup90@gmail.com>

1 parent 304ffd6 · commit b5d8486

24 files changed: +1499 −293 lines

development/app/config/mock/config-profile.yaml
Lines changed: 44 additions & 0 deletions
@@ -0,0 +1,44 @@
apiVersion: apps/v1
kind: Deployment
metadata:
  name: mock-qwen3-8b
  labels:
    model.aibrix.ai/name: "qwen3-8b"
    model.aibrix.ai/port: "8000"
    adapter.model.aibrix.ai/enabled: "true"
spec:
  replicas: 1
  selector:
    matchLabels:
      adapter.model.aibrix.ai/enabled: "true"
      model.aibrix.ai/name: "qwen3-8b"
      app: "mock-qwen3-8b"
  template:
    metadata:
      labels:
        adapter.model.aibrix.ai/enabled: "true"
        model.aibrix.ai/name: "qwen3-8b"
        app: "mock-qwen3-8b"
      annotations:
        model.aibrix.ai/config: |
          {
            "defaultProfile": "least-request",
            "profiles": {
              "least-request": {
                "routingStrategy": "least-request"
              },
              "throughput": {
                "routingStrategy": "throughput"
              }
            }
          }
    spec:
      serviceAccountName: mocked-app-sa
      containers:
        - name: llm-engine
          image: aibrix/vllm-mock:nightly
          command:
            - python3
            - app.py
            - --api_key
            - test-key-1234567890

development/app/config/mock/kustomization.yaml

Lines changed: 1 addition & 0 deletions
@@ -1,6 +1,7 @@
 resources:
 - ../templates/deployment
 - components.yaml
+- config-profile.yaml

 # enable following patch when we test lora + api-key
 patches:

docs/source/designs/model-config-profiles.rst
Lines changed: 162 additions & 0 deletions
@@ -0,0 +1,162 @@
.. _model_config_profiles:

=========================
Model Config and Profiles
=========================

This design describes how to supply **model/gateway configuration** (routing strategy, PD bucket bounds, combined mode, etc.) via a **single annotation** (or ConfigMap), with support for **multiple named profiles** selectable at **runtime** by the client.

Motivation
----------

Today, options are encoded as many pod labels (e.g. ``model.aibrix.ai/name``, ``model.aibrix.ai/port``, ``model.aibrix.ai/routing-strategy``, ``prompt-min-length``, etc.). Adding new options requires new labels and gateway changes to read them. This does not scale. Using a single structured annotation with **multiple profiles** allows:

* One place to add new options (extend the JSON schema).
* Different configurations for the same model (e.g. ``default``, ``pd``, ``low-latency``), selectable per request via a header.

Overview
--------

* **Annotation** (on the pod): ``model.aibrix.ai/config`` holds a JSON object with a ``profiles`` map. Each profile is a set of gateway options: ``routingStrategy``, ``promptLenBucketMinLength``, ``promptLenBucketMaxLength``, ``combined``.
* **Runtime selection**: the client sends the header ``config-profile: <profile-name>`` (e.g. ``pd``, ``low-latency``). If omitted, the ``defaultProfile`` (or ``"default"``) is used.

JSON Schema (Implementation)
----------------------------

The implementation parses the following structure. Extra fields in the JSON (e.g. ``name``, ``port``, ``engine``) are ignored.

Root object:

* ``defaultProfile`` (string, optional): Profile name to use when the header is empty or the named profile is not found. Default: ``"default"``.
* ``profiles`` (object, required): Map of profile name → profile object.

Profile object (``ModelConfigProfile``):

* ``routingStrategy`` (string): e.g. ``random``, ``pd``, ``least-latency``.
* ``promptLenBucketMinLength`` (int, optional): Lower bound for bucketing. Default: ``0``. Negative values are normalized to ``0``.
* ``promptLenBucketMaxLength`` (int, optional): Upper bound for bucketing. Default: ``math.MaxInt32`` when ``0`` or omitted.
* ``combined`` (bool, optional): When true, indicates a combined prefill/decode pod for PD routing.

Single profile (backward compatible):

.. code-block:: json

   {
     "profiles": {
       "default": {
         "routingStrategy": "pd",
         "promptLenBucketMinLength": 0,
         "promptLenBucketMaxLength": 2048
       }
     }
   }

Multiple profiles with a default:

.. code-block:: json

   {
     "defaultProfile": "pd",
     "profiles": {
       "default": {
         "routingStrategy": "random",
         "promptLenBucketMinLength": 0,
         "promptLenBucketMaxLength": 4096
       },
       "pd": {
         "routingStrategy": "pd",
         "promptLenBucketMinLength": 0,
         "promptLenBucketMaxLength": 2048
       },
       "low-latency": {
         "routingStrategy": "least-latency",
         "promptLenBucketMinLength": 0,
         "promptLenBucketMaxLength": 2048
       }
     }
   }
Runtime Behavior
----------------

1. The gateway resolves config from the pod annotation ``model.aibrix.ai/config`` (ConfigMap lookup is not yet implemented). If there is no annotation, it falls back to the existing label-based resolution.
2. The gateway reads ``config-profile`` from the request headers. If missing, it uses ``defaultProfile`` from the JSON, or ``"default"``.
3. The gateway selects the profile via ``GetProfile(profileName)``: exact match first, then fallback to ``defaultProfile``, then ``"default"``.
4. The resolved profile is stored on ``RoutingContext.ConfigProfile`` (``ResolvedConfigProfile``) for the request.
5. The routing strategy is derived from: request headers → ``ConfigProfile.RoutingStrategy`` → env ``ROUTING_ALGORITHM``.
6. The PD router uses ``ResolveProfileFromPod(pod, routingCtx.ReqConfigProfile)`` with fallback to the default profile; prompt bounds and ``combined`` are read from the selected profile.
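The profile-selection fallback in step 3 can be sketched as follows (a simplified standalone sketch; the real ``GetProfile`` is part of the ``configprofiles`` package and its types carry more fields):

```go
package main

import "fmt"

type ModelConfigProfile struct {
	RoutingStrategy string
}

type ModelConfigProfiles struct {
	DefaultProfile string
	Profiles       map[string]ModelConfigProfile
}

// GetProfile tries an exact match on name, then falls back to
// DefaultProfile, then to the literal "default". Returns nil if none match.
func (c *ModelConfigProfiles) GetProfile(name string) *ModelConfigProfile {
	for _, candidate := range []string{name, c.DefaultProfile, "default"} {
		if candidate == "" {
			continue // empty header or unset defaultProfile
		}
		if p, ok := c.Profiles[candidate]; ok {
			return &p
		}
	}
	return nil
}

func main() {
	cfg := &ModelConfigProfiles{
		DefaultProfile: "pd",
		Profiles: map[string]ModelConfigProfile{
			"default": {RoutingStrategy: "random"},
			"pd":      {RoutingStrategy: "pd"},
		},
	}
	fmt.Println(cfg.GetProfile("pd").RoutingStrategy)      // exact match: pd
	fmt.Println(cfg.GetProfile("missing").RoutingStrategy) // falls back to defaultProfile: pd
	fmt.Println(cfg.GetProfile("").RoutingStrategy)        // empty header → defaultProfile: pd
}
```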
Annotation Example (StormService pod template)
----------------------------------------------

.. code-block:: yaml

   template:
     metadata:
       labels:
         app: sglang-qwen3-8b-1p1d-0-2k
         model.aibrix.ai/name: qwen3-8B
       annotations:
         prometheus.io/scrape: "true"
         prometheus.io/port: "30000"
         prometheus.io/path: "/metrics"
         model.aibrix.ai/config: |
           {
             "defaultProfile": "pd",
             "profiles": {
               "default": {
                 "routingStrategy": "random",
                 "promptLenBucketMinLength": 0,
                 "promptLenBucketMaxLength": 4096
               },
               "pd": {
                 "routingStrategy": "pd",
                 "promptLenBucketMinLength": 0,
                 "promptLenBucketMaxLength": 2048
               }
             }
           }

Client Usage
------------

* Use the default profile: do not set any header (or set ``config-profile: default``).
* Use a specific profile: set the header ``config-profile: pd`` or ``config-profile: low-latency``.
Implementation
--------------

Package: ``pkg/plugins/gateway/configprofiles/``

* ``ModelConfigProfile``: struct with ``RoutingStrategy``, ``PromptLenBucketMinLength``, ``PromptLenBucketMaxLength``, ``Combined``.
* ``ModelConfigProfiles``: struct with ``DefaultProfile`` and ``Profiles map[string]ModelConfigProfile``.
* ``ParseModelConfig(jsonStr)``: parses the JSON and normalizes the prompt length bounds: ``promptLenBucketMinLength`` (< 0 → 0) and ``promptLenBucketMaxLength`` (0 → ``math.MaxInt32``).
* ``GetProfile(name)``: returns the profile by name; falls back to ``defaultProfile``, then ``"default"``.
* ``ResolveProfile(pods, headerProfile)``: iterates pods and returns the first non-nil result from ``ResolveProfileFromPod``.
* ``ResolveProfileFromPod(pod, headerProfile)``: reads ``model.aibrix.ai/config`` from the pod, parses it, and returns ``GetProfile(headerProfile)``.

Constants: ``ModelAnnoConfig`` (pkg/constants/model.go), ``HeaderConfigProfile`` (pkg/plugins/gateway/types.go).

Gateway flow:

* ``HandleRequestHeaders``: captures ``config-profile`` into ``ReqConfigProfile``.
* ``HandleRequestBody``: calls ``applyConfigProfile``, which resolves the config from the pod annotation, sets ``routingCtx.ConfigProfile``, and provides the routing strategy to ``deriveRoutingStrategyFromContext``.
* ``deriveRoutingStrategyFromContext``: chooses the routing strategy for the request using this precedence: (1) the request header ``routing-strategy``, if present and non-empty; (2) ``routingCtx.ConfigProfile.RoutingStrategy`` from the resolved profile (config-profile header + pod annotation); (3) the environment default. It returns the strategy and whether it was explicitly set (used to validate and set ``routingCtx.Algorithm`` in ``HandleRequestBody``).
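The precedence in ``deriveRoutingStrategyFromContext`` can be sketched as a pure function. The signature here is simplified for illustration (the real function takes the routing context rather than plain strings):

```go
package main

import "fmt"

// deriveRoutingStrategy sketches the documented precedence:
// request header → resolved config profile → environment default.
// The boolean reports whether the strategy was explicitly requested.
func deriveRoutingStrategy(headerStrategy, profileStrategy, envDefault string) (string, bool) {
	if headerStrategy != "" {
		return headerStrategy, true // (1) routing-strategy request header
	}
	if profileStrategy != "" {
		return profileStrategy, true // (2) resolved config profile
	}
	return envDefault, false // (3) env ROUTING_ALGORITHM fallback
}

func main() {
	s, explicit := deriveRoutingStrategy("least-request", "pd", "random")
	fmt.Println(s, explicit) // least-request true

	s, explicit = deriveRoutingStrategy("", "pd", "random")
	fmt.Println(s, explicit) // pd true

	s, explicit = deriveRoutingStrategy("", "", "random")
	fmt.Println(s, explicit) // random false
}
```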
PD router:

* ``isPodSuitableForPromptLength(routingCtx, pod, promptLength)``: uses ``ResolveProfileFromPod(pod, routingCtx.ReqConfigProfile)`` for ``promptLenBucketMinLength``/``promptLenBucketMaxLength``.
* ``isCombinedPod(routingCtx, pod)``: uses ``ResolveProfileFromPod(pod, routingCtx.ReqConfigProfile)`` for ``combined``.

Backward Compatibility
----------------------

If no annotation is present, ``ResolveProfile`` returns nil and the gateway continues to use the existing pod labels and environment variables for routing strategy, port, engine, etc.

Future Work
-----------

* ConfigMap lookup (wire it in when the gateway config supports it).
* Extend the profile schema with ``port``, ``metricPort``, ``engine``, and ``name`` for full parity with labels.
* Use the request-level ``ConfigProfile`` (from ``config-profile``) for PD bucketing instead of the per-pod ``"pd"`` profile.

pkg/constants/model.go

Lines changed: 5 additions & 0 deletions
@@ -45,4 +45,9 @@ const (
 	// ModelAnnoRouterCustomPath is the anno for add PathPrefixes in httpRoute, split by comma
 	// Example: "model.aibrix.ai/model-router-custom-paths": "/score,/version"
 	ModelAnnoRouterCustomPath = "model.aibrix.ai/model-router-custom-paths"
+
+	// ModelAnnoConfig is the annotation holding JSON model config with multiple profiles.
+	// Clients select a profile at runtime via the config-profile header; otherwise defaultProfile is used.
+	// See docs/source/designs/model-config-profiles.rst for the schema.
+	ModelAnnoConfig = "model.aibrix.ai/config"
 )

pkg/plugins/gateway/algorithms/pd_disaggregation.go

Lines changed: 27 additions & 31 deletions
@@ -33,6 +33,7 @@ import (
 	"github.com/vllm-project/aibrix/pkg/cache"
 	"github.com/vllm-project/aibrix/pkg/constants"
 	"github.com/vllm-project/aibrix/pkg/metrics"
+	"github.com/vllm-project/aibrix/pkg/plugins/gateway/configprofiles"
 	"github.com/vllm-project/aibrix/pkg/types"
 	"github.com/vllm-project/aibrix/pkg/utils"
 	"github.com/vllm-project/aibrix/pkg/utils/prefixcacheindexer"
@@ -50,11 +51,10 @@ const (
 	LLMEngineIdentifier string = constants.ModelLabelEngine
 	PDRoleSetIdentifier string = "roleset-name"
 	PDRoleIdentifier    string = "role-name"
-	CombinedIdentifier  string = "model.aibrix.ai/combined"
 	RoleReplicaIndex    string = "stormservice.orchestration.aibrix.ai/role-replica-index"
 	PodGroupIndex       string = "stormservice.orchestration.aibrix.ai/pod-group-index"
-	PromptMinLength     string = "prompt-min-length"
-	PromptMaxLength     string = "prompt-max-length"
+	PromptLenBucketMinLength string = "prompt-len-bucket-min-length"
+	PromptLenBucketMaxLength string = "prompt-len-bucket-max-length"
 	defaultPrefillRequestTimeout int = 30

 	defaultMaxRequest float64 = 32
@@ -73,6 +73,9 @@ const (
 	// KV connector types for different backends
 	KVConnectorTypeSHFS = "shfs" // Default - AIBrix SHFS/KVCacheManager (GPU)
 	KVConnectorTypeNIXL = "nixl" // NIXL for Neuron (uses disagg_prefill_resp wrapper)
+
+	HeaderPrefillTargetPodIP = "prefill-target-pod-ip"
+	HeaderPrefillTargetPod   = "prefill-target-pod"
 )

 var (
@@ -172,6 +175,11 @@ func (r *pdRouter) Route(ctx *types.RoutingContext, readyPodList types.PodList)

 	if prefillPod != nil {
 		klog.InfoS("selected prefill/decode pods", "request_id", ctx.RequestID, "prefill_pod", prefillPod.Name, "decode_pod", decodePod.Name)
+		if ctx.RespHeaders == nil {
+			ctx.RespHeaders = make(map[string]string)
+		}
+		ctx.RespHeaders[HeaderPrefillTargetPod] = prefillPod.Name
+		ctx.RespHeaders[HeaderPrefillTargetPodIP] = prefillPod.Status.PodIP
 		err = r.doPrefillRequest(ctx, prefillPod, llmEngine)
 		if err != nil {
 			metrics.EmitMetricToPrometheus(ctx, nil, metrics.GatewayPrefillRequestFailTotal, &metrics.SimpleMetricValue{Value: 1.0},
@@ -203,7 +211,7 @@ func (r *pdRouter) filterPrefillDecodePods(routingCtx *types.RoutingContext, rea
 		klog.V(4).InfoS("prompt length based filtering enabled", "request_id", routingCtx.RequestID, "prompt_length", promptLength)
 	}

-	prefillPods, decodePods, promptLengthBucketingPrefillPods, promptLengthBucketingDecodePods, combinedPods := r.collectAndBucketPods(readyPods, promptLength)
+	prefillPods, decodePods, promptLengthBucketingPrefillPods, promptLengthBucketingDecodePods, combinedPods := r.collectAndBucketPods(routingCtx, readyPods, promptLength)
 	combinedAvailable := aibrixPromptLengthBucketing && len(combinedPods) > 0
 	if len(prefillPods) == 0 && !combinedAvailable {
 		return nil, nil, fmt.Errorf("prefill pods are not ready: prefill=%d, decode=%d", len(prefillPods), len(decodePods))
@@ -932,8 +940,12 @@ func (t *PrefillRequestTracker) GetPrefillRequestCountsForPod(podname string) in
 	return int(countInterface.(*atomic.Int32).Load())
 }

-func (r *pdRouter) isPodSuitableForPromptLength(pod *v1.Pod, promptLength int) bool {
-	minLength, maxLength := r.getPodPromptRange(pod)
+func (r *pdRouter) isPodSuitableForPromptLength(routingCtx *types.RoutingContext, pod *v1.Pod, promptLength int) bool {
+	profile := configprofiles.ResolveProfileFromPod(pod, routingCtx.ReqConfigProfile)
+	if profile == nil {
+		return false
+	}
+	minLength, maxLength := profile.PromptLenBucketMinLength, profile.PromptLenBucketMaxLength

 	if minLength > maxLength {
 		return false
@@ -946,31 +958,15 @@ func (r *pdRouter) isPodSuitableForPromptLength(pod *v1.Pod, promptLength int) b
 	return promptLength >= minLength && promptLength <= maxLength
 }

-// getPodPromptRange retrieves the minimum and maximum prompt lengths from pod labels.
-func (r *pdRouter) getPodPromptRange(pod *v1.Pod) (int, int) {
-	minLength := 0
-	maxLength := math.MaxInt32
-
-	if val, ok := pod.Labels[PromptMinLength]; ok {
-		if parsed, err := strconv.Atoi(val); err == nil {
-			minLength = parsed
-		}
-	}
-
-	if val, ok := pod.Labels[PromptMaxLength]; ok {
-		if parsed, err := strconv.Atoi(val); err == nil {
-			maxLength = parsed
-		}
+func isCombinedPod(routingCtx *types.RoutingContext, pod *v1.Pod) bool {
+	profile := configprofiles.ResolveProfileFromPod(pod, routingCtx.ReqConfigProfile)
+	if profile == nil {
+		return false
 	}
-
-	return minLength, maxLength
-}
-
-func isCombinedPod(pod *v1.Pod) bool {
-	return pod != nil && pod.Labels[CombinedIdentifier] == "true"
+	return profile.Combined
 }

-func (r *pdRouter) collectAndBucketPods(readyPods []*v1.Pod, promptLength int) ([]*v1.Pod, []*v1.Pod, []*v1.Pod, []*v1.Pod, []*v1.Pod) {
+func (r *pdRouter) collectAndBucketPods(routingCtx *types.RoutingContext, readyPods []*v1.Pod, promptLength int) ([]*v1.Pod, []*v1.Pod, []*v1.Pod, []*v1.Pod, []*v1.Pod) {
 	prefillPods, decodePods := []*v1.Pod{}, []*v1.Pod{}
 	promptLengthBucketingPrefillPods, promptLengthBucketingDecodePods, promptLengthBucketingCombinedPods := []*v1.Pod{}, []*v1.Pod{}, []*v1.Pod{}

@@ -991,16 +987,16 @@ func (r *pdRouter) collectAndBucketPods(readyPods []*v1.Pod, promptLength int) (
 		switch pod.Labels[PDRoleIdentifier] {
 		case "prefill":
 			prefillPods = append(prefillPods, pod)
-			if aibrixPromptLengthBucketing && r.isPodSuitableForPromptLength(pod, promptLength) {
+			if aibrixPromptLengthBucketing && r.isPodSuitableForPromptLength(routingCtx, pod, promptLength) {
 				promptLengthBucketingPrefillPods = append(promptLengthBucketingPrefillPods, pod)
 			}
 		case "decode":
 			decodePods = append(decodePods, pod)
-			if aibrixPromptLengthBucketing && r.isPodSuitableForPromptLength(pod, promptLength) {
+			if aibrixPromptLengthBucketing && r.isPodSuitableForPromptLength(routingCtx, pod, promptLength) {
 				promptLengthBucketingDecodePods = append(promptLengthBucketingDecodePods, pod)
 			}
 		default:
-			if aibrixPromptLengthBucketing && isCombinedPod(pod) && r.isPodSuitableForPromptLength(pod, promptLength) {
+			if aibrixPromptLengthBucketing && isCombinedPod(routingCtx, pod) && r.isPodSuitableForPromptLength(routingCtx, pod, promptLength) {
 				promptLengthBucketingCombinedPods = append(promptLengthBucketingCombinedPods, pod)
 			}
 		}