diff --git a/development/app/config/mock/config-profile.yaml b/development/app/config/mock/config-profile.yaml new file mode 100644 index 000000000..bbedb0d3e --- /dev/null +++ b/development/app/config/mock/config-profile.yaml @@ -0,0 +1,44 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: mock-qwen3-8b + labels: + model.aibrix.ai/name: "qwen3-8b" + model.aibrix.ai/port: "8000" + adapter.model.aibrix.ai/enabled: "true" +spec: + replicas: 1 + selector: + matchLabels: + adapter.model.aibrix.ai/enabled: "true" + model.aibrix.ai/name: "qwen3-8b" + app: "mock-qwen3-8b" + template: + metadata: + labels: + adapter.model.aibrix.ai/enabled: "true" + model.aibrix.ai/name: "qwen3-8b" + app: "mock-qwen3-8b" + annotations: + model.aibrix.ai/config: | + { + "defaultProfile": "least-request", + "profiles": { + "least-request": { + "routingStrategy": "least-request" + }, + "throughput": { + "routingStrategy": "throughput" + } + } + } + spec: + serviceAccountName: mocked-app-sa + containers: + - name: llm-engine + image: aibrix/vllm-mock:nightly + command: + - python3 + - app.py + - --api_key + - test-key-1234567890 \ No newline at end of file diff --git a/development/app/config/mock/kustomization.yaml b/development/app/config/mock/kustomization.yaml index 412c56f20..5a6f6f8f4 100644 --- a/development/app/config/mock/kustomization.yaml +++ b/development/app/config/mock/kustomization.yaml @@ -1,6 +1,7 @@ resources: - ../templates/deployment - components.yaml + - config-profile.yaml # enable following patch when we test lora + api-key patches: diff --git a/docs/source/designs/model-config-profiles.rst b/docs/source/designs/model-config-profiles.rst new file mode 100644 index 000000000..6c22c346f --- /dev/null +++ b/docs/source/designs/model-config-profiles.rst @@ -0,0 +1,162 @@ +.. 
_model_config_profiles: + +========================= +Model Config and Profiles +========================= + +This design describes how to supply **model/gateway configuration** (routing strategy, PD bucket bounds, combined mode, etc.) via a **single annotation** (or ConfigMap), with support for **multiple named profiles** selectable at **runtime** by the client. + +Motivation +---------- + +Today, options are encoded as many pod labels (e.g. ``model.aibrix.ai/name``, ``model.aibrix.ai/port``, ``model.aibrix.ai/routing-strategy``, ``prompt-min-length``, etc.). Adding new options requires new labels and gateway changes to read them. This does not scale. Using a single structured annotation with **multiple profiles** allows: + +* One place to add new options (extend the JSON schema). +* Different configurations for the same model (e.g. ``default``, ``pd``, ``low-latency``) selectable per request via a header. + +Overview +-------- + +* **Annotation** (on the pod): ``model.aibrix.ai/config`` holds a JSON object with a ``profiles`` map. Each profile is a set of gateway options: ``routingStrategy``, ``promptLenBucketMinLength``, ``promptLenBucketMaxLength``, ``combined``. +* **Runtime selection**: Client sends header ``config-profile: `` (e.g. ``pd``, ``low-latency``). If omitted, the ``defaultProfile`` (or ``"default"``) is used. + +JSON Schema (Implementation) +---------------------------- + +The implementation parses the following structure. Extra fields (e.g. ``name``, ``port``, ``engine``) in the JSON are ignored. + +Root object: + +* ``defaultProfile`` (string, optional): Profile name to use when header is empty or profile not found. Default: ``"default"``. +* ``profiles`` (object, required): Map of profile name → profile object. + +Profile object (``ModelConfigProfile``): + +* ``routingStrategy`` (string): e.g. ``random``, ``pd``, ``least-latency``. +* ``promptLenBucketMinLength`` (int, optional): Lower bound for bucketing. Default: ``0``. 
If negative, normalized to ``0``. +* ``promptLenBucketMaxLength`` (int, optional): Upper bound for bucketing. Default: ``math.MaxInt32`` when ``0`` or omitted. +* ``combined`` (bool, optional): When true, indicates combined prefill/decode pod for PD routing. + +Single profile (backward compatible): + +.. code-block:: json + + { + "profiles": { + "default": { + "routingStrategy": "pd", + "promptLenBucketMinLength": 0, + "promptLenBucketMaxLength": 2048 + } + } + } + +Multiple profiles with default: + +.. code-block:: json + + { + "defaultProfile": "pd", + "profiles": { + "default": { + "routingStrategy": "random", + "promptLenBucketMinLength": 0, + "promptLenBucketMaxLength": 4096 + }, + "pd": { + "routingStrategy": "pd", + "promptLenBucketMinLength": 0, + "promptLenBucketMaxLength": 2048 + }, + "low-latency": { + "routingStrategy": "least-latency", + "promptLenBucketMinLength": 0, + "promptLenBucketMaxLength": 2048 + } + } + } + +Runtime Behavior +---------------- + +1. Gateway resolves config from pod annotation ``model.aibrix.ai/config``. ConfigMap lookup is not yet implemented. If no annotation, fall back to existing label-based resolution. +2. Gateway reads ``config-profile`` from request headers. If missing, use ``defaultProfile`` from the JSON, or ``"default"``. +3. Gateway selects the profile via ``GetProfile(profileName)``: exact match first, then fallback to ``defaultProfile``, then ``"default"``. +4. The resolved profile is stored on ``RoutingContext.ConfigProfile`` (``ResolvedConfigProfile``) for the request. +5. Routing strategy is derived from: request headers → ``ConfigProfile.RoutingStrategy`` → env ``ROUTING_ALGORITHM``. +6. PD router uses ``ResolveProfileFromPod(pod, routingCtx.ReqConfigProfile)`` with fallback to the default profile; prompt bounds and ``combined`` are read from the selected profile. + +Annotation Example (StormService pod template) +---------------------------------------------- + +.. 
code-block:: yaml + + template: + metadata: + labels: + app: sglang-qwen3-8b-1p1d-0-2k + model.aibrix.ai/name: qwen3-8B + annotations: + prometheus.io/scrape: "true" + prometheus.io/port: "30000" + prometheus.io/path: "/metrics" + model.aibrix.ai/config: | + { + "defaultProfile": "pd", + "profiles": { + "default": { + "routingStrategy": "random", + "promptLenBucketMinLength": 0, + "promptLenBucketMaxLength": 4096 + }, + "pd": { + "routingStrategy": "pd", + "promptLenBucketMinLength": 0, + "promptLenBucketMaxLength": 2048 + } + } + } + +Client Usage +------------ + +* Use default profile: do not set any header (or set ``config-profile: default``). +* Use a specific profile: set header ``config-profile: pd`` or ``config-profile: low-latency``. + +Implementation +------------- + +Package: ``pkg/plugins/gateway/configprofiles/`` + +* ``ModelConfigProfile``: struct with ``RoutingStrategy``, ``PromptLenBucketMinLength``, ``PromptLenBucketMaxLength``, ``Combined``. +* ``ModelConfigProfiles``: struct with ``DefaultProfile``, ``Profiles map[string]ModelConfigProfile``. +* ``ParseModelConfig(jsonStr)``: parses JSON; normalizes ``promptLenBucketMinLength`` (≥0) and ``promptLenBucketMaxLength`` (0→MaxInt32). +* ``GetProfile(name)``: returns profile by name; falls back to ``defaultProfile`` then ``"default"``. +* ``ResolveProfile(pods, headerProfile)``: iterates pods, returns first non-nil from ``ResolveProfileFromPod``. +* ``ResolveProfileFromPod(pod, headerProfile)``: reads ``model.aibrix.ai/config`` from pod, parses, returns ``GetProfile(headerProfile)``. +* Prompt length bounds normalization occurs in ``ParseModelConfig``: ``promptLenBucketMinLength`` (<0 → 0), ``promptLenBucketMaxLength`` (0 → ``math.MaxInt32``). + +Constants: ``ModelAnnoConfig`` (pkg/constants/model.go), ``HeaderConfigProfile`` (pkg/plugins/gateway/types.go). + +Gateway flow: + +* ``HandleRequestHeaders``: captures ``config-profile`` into ``ReqConfigProfile``. 
+* ``HandleRequestBody``: calls ``applyConfigProfile`` which resolves config from pod annotation, sets ``routingCtx.ConfigProfile``, and provides routing strategy to ``deriveRoutingStrategyFromContext``.
+* ``deriveRoutingStrategyFromContext``: chooses the routing strategy for the request using this precedence: (1) request header ``routing-strategy`` if present and non-empty; (2) ``routingCtx.ConfigProfile.RoutingStrategy`` from the resolved profile (config-profile + pod annotation); (3) environment default. Returns the strategy and whether it was explicitly set (used to validate and set ``routingCtx.Algorithm`` in ``HandleRequestBody``).
+
+PD router:
+
+* ``isPodSuitableForPromptLength(routingCtx, pod, promptLength)``: uses ``ResolveProfileFromPod(pod, routingCtx.ReqConfigProfile)`` for ``promptLenBucketMinLength``/``promptLenBucketMaxLength``.
+* ``isCombinedPod(routingCtx, pod)``: uses ``ResolveProfileFromPod(pod, routingCtx.ReqConfigProfile)`` for ``combined``.
+
+Backward Compatibility
+----------------------
+
+If no annotation is present, ``ResolveProfile`` returns nil. Gateway continues to use existing pod labels and env for routing strategy, port, engine, etc.
+
+Future Work
+-----------
+
+* ConfigMap lookup (wire when gateway config supports it).
+* Extend profile schema: ``port``, ``metricPort``, ``engine``, ``name`` for full parity with labels.
+* Use request-level ``ConfigProfile`` (from ``config-profile``) for PD bucketing instead of per-pod ``"pd"`` profile.
diff --git a/observability/grafana/AIBrix_Envoy_Gateway_Plugins_Dashboard.json b/observability/grafana/AIBrix_Envoy_Gateway_Plugins_Dashboard.json index d0ddbb9e1..a35d5a67d 100644 --- a/observability/grafana/AIBrix_Envoy_Gateway_Plugins_Dashboard.json +++ b/observability/grafana/AIBrix_Envoy_Gateway_Plugins_Dashboard.json @@ -4,8 +4,8 @@ { "builtIn": 1, "datasource": { - "type": "grafana", - "uid": "-- Grafana --" + "type": "datasource", + "uid": "grafana" }, "enable": true, "hide": true, @@ -18,44 +18,40 @@ "editable": true, "fiscalYearStartMonth": 0, "graphTooltip": 0, - "id": null, + "id": 4, "links": [], - "liveNow": false, "panels": [ { "collapsed": false, - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 }, - "id": 10, + "id": 34, "panels": [], - "title": "Router: vtc-basic", + "title": "Priority 0 - Service Health & Availability", "type": "row" }, { "datasource": { "type": "prometheus", - "uid": "prometheus" + "uid": "ff75su0268000c" }, - "description": "Shows whether the adaptive bucket size stays stable or jumps around. 
Big jumps = the algorithm is reacting too quickly.", "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { + "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", - "axisLabel": "Bucket Size", + "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, + "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none", @@ -64,8 +60,9 @@ "tooltip": false, "viz": false }, - "lineInterpolation": "smooth", - "lineWidth": 2, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, "pointSize": 5, "scaleDistribution": { "type": "linear" @@ -77,7 +74,7 @@ "mode": "none" }, "thresholdsStyle": { - "mode": "area" + "mode": "off" } }, "mappings": [], @@ -85,93 +82,305 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" + }, + { + "color": "red", + "value": 80 } ] }, - "unit": "none" + "unit": "short" }, - "overrides": [] + "overrides": [ + { + "__systemRef": "hideSeriesFrom", + "matcher": { + "id": "byNames", + "options": { + "mode": "exclude", + "names": [ + "qwen3-8B-200" + ], + "prefix": "All except:", + "readOnly": true + } + }, + "properties": [ + { + "id": "custom.hideFrom", + "value": { + "legend": false, + "tooltip": false, + "viz": true + } + } + ] + } + ] }, "gridPos": { - "h": 12, - "w": 16, + "h": 8, + "w": 24, "x": 0, "y": 1 }, - "id": 1, + "id": 10, "options": { + "alertThreshold": true, "legend": { - "calcs": ["min", "max", "mean", "stdDev"], + "calcs": [ + "lastNotNull", + "max" + ], "displayMode": "table", - "placement": "bottom", + "placement": "right", "showLegend": true }, "tooltip": { + "hideZeros": false, "mode": "multi", "sort": "none" } }, + "pluginVersion": "12.0.2", "targets": [ { "datasource": { "type": "prometheus", - "uid": "prometheus" + "uid": "ff75su0268000c" }, "editorMode": "code", - "expr": "vtc_bucket_size_active", - "legendFormat": "{{exported_pod}} ({{model}}) - {{namespace}}", + "expr": "sum by (model, 
code) (rate(gateway_request_total[5m]))", + "hide": false, + "instant": false, + "legendFormat": "{{model}}-{{code}}", "range": true, "refId": "A" } ], - "title": "VTC Bucket Size", + "title": "gateway_request_rate-nodata", "type": "timeseries" }, { "datasource": { "type": "prometheus", - "uid": "prometheus" + "uid": "ff75su0268000c" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] }, - "description": "How to read the VTC bucket size metric", "gridPos": { - "h": 12, - "w": 8, - "x": 16, - "y": 1 + "h": 8, + "w": 24, + "x": 0, + "y": 9 }, - "id": 2, + "id": 32, "options": { - "code": { - "language": "plaintext", - "showLineNumbers": false, - "showMiniMap": false + "alertThreshold": true, + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true }, - "content": "## VTC Bucket Size Metric\n\n**Metric**: `vtc_bucket_size_active` (gauge with pod,model labels)\n\n**Why we track it**: Shows whether the adaptive bucket size stays stable or jumps around. 
Big jumps indicate the algorithm is reacting too quickly.\n\n**How to read / act**:\n- **Smooth, gradual slope** → Algorithm is working well\n- **Saw-tooth jumps** → Algorithm needs tuning (increase minimum bucket size or lengthen adjustment window)\n\n**What to look for**:\n- **Stability**: Look for consistent patterns across pods\n- **Oscillations**: Watch for rapid up/down movements\n- **Correlation**: Check if changes correlate with load patterns\n\n**Configuration**:\n See env `AIBRIX_ROUTER_VTC_*`", - "mode": "markdown" + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } }, - "pluginVersion": "10.0.3", - "title": "Interpretation Guide", - "type": "text" + "pluginVersion": "12.0.2", + "targets": [ + { + "aggregator": "max", + "alias": "", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "currentTagKey": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "disableDownsampling": false, + "disableTextWrap": false, + "downsampleAggregator": "max", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "downsampleInterval": "", + "editorMode": "code", + "explicitTags": true, + "expr": "rate(num_requests_running[5m])", + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "role", + "type": "literal_or" + }, + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "literal_or" + }, + { + "filter": "*", + "groupBy": true, + "tagk": "pod", + "type": "literal_or" + } + ], + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "isCounter": false, + "legendFormat": "running-{{engine_type}}-{{roleset}}", + "metric": "inf.aibrix.num_requests_running", + "range": true, + "rateDownsampleType": "before_downsample", + "refId": "RUNNING", + "shouldComputeDelta": false, + "shouldComputeRate": false, + "shouldComputeTopK": false, + "tenant": 
"default", + "useBackend": false + }, + { + "aggregator": "max", + "alias": "", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "currentTagKey": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "disableTextWrap": false, + "downsampleAggregator": "avg", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "downsampleInterval": "", + "editorMode": "code", + "explicitTags": true, + "expr": "rate(num_requests_waiting[5m])", + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "role", + "type": "literal_or" + }, + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "literal_or" + }, + { + "filter": "*", + "groupBy": true, + "tagk": "pod", + "type": "literal_or" + } + ], + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "isCounter": false, + "legendFormat": "waiting-{{engine_type}}-{{roleset}}", + "metric": "inf.aibrix.num_requests_waiting", + "range": true, + "rateDownsampleType": "before_downsample", + "refId": "WAITING", + "shouldComputeDelta": false, + "shouldComputeRate": false, + "tenant": "default", + "useBackend": false + } + ], + "title": "requests_running/waiting_rate", + "type": "timeseries" }, { "datasource": { "type": "prometheus", - "uid": "prometheus" + "uid": "ff75su0268000c" }, - "description": "Shows how quickly the bucket size is changing. 
Large spikes indicate rapid adjustments that might need tuning.", "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { - "axisCenteredZero": true, + "axisBorderShow": false, + "axisCenteredZero": false, "axisColorMode": "text", - "axisLabel": "Change Rate", + "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, + "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none", @@ -180,8 +389,9 @@ "tooltip": false, "viz": false }, - "lineInterpolation": "smooth", - "lineWidth": 2, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, "pointSize": 5, "scaleDistribution": { "type": "linear" @@ -193,7 +403,7 @@ "mode": "none" }, "thresholdsStyle": { - "mode": "area" + "mode": "off" } }, "mappings": [], @@ -201,56 +411,93 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" + }, + { + "color": "red", + "value": 80 } ] }, - "unit": "none" + "unit": "short" }, "overrides": [] }, "gridPos": { - "h": 12, - "w": 24, + "h": 8, + "w": 12, "x": 0, - "y": 13 + "y": 17 }, - "id": 3, + "id": 24, "options": { + "alertThreshold": true, "legend": { - "calcs": ["min", "max", "mean", "stdDev"], + "calcs": [ + "lastNotNull" + ], "displayMode": "table", "placement": "bottom", "showLegend": true }, "tooltip": { + "hideZeros": false, "mode": "multi", "sort": "none" } }, + "pluginVersion": "12.0.2", "targets": [ { + "aggregator": "max", + "alias": "$tag_model:$tag_pod:awake", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", "datasource": { "type": "prometheus", - "uid": "prometheus" + "uid": "ff75su0268000c" }, + "disableDownsampling": true, + "downsampleAggregator": "max", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "downsampleInterval": "", "editorMode": "code", - "expr": "abs(deriv(vtc_bucket_size_active[1m]))", - 
"legendFormat": "{{exported_pod}} ({{model}}) - {{namespace}}", + "explicitTags": true, + "expr": "up{pod=~\"sglang-.*\"}", + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "literal_or" + }, + { + "filter": "*", + "groupBy": true, + "tagk": "pod", + "type": "literal_or" + } + ], + "legendFormat": "{{pod}}", + "metric": "inf.aibrix.engine_sleep_state", "range": true, - "refId": "A" + "rateDownsampleType": "before_downsample", + "refId": "A", + "tenant": "default" } ], - "title": "VTC Bucket Size Rate of Change", + "title": "Engine Sleep State", "type": "timeseries" }, { "datasource": { "type": "prometheus", - "uid": "prometheus" + "uid": "ff75su0268000c" }, - "description": "Time-to-First-Token latency by model, showing impact of router changes on user experience", "fieldConfig": { "defaults": { "color": { @@ -260,7 +507,7 @@ "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", - "axisLabel": "Latency (s)", + "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "barWidthFactor": 0.6, @@ -272,8 +519,9 @@ "tooltip": false, "viz": false }, - "lineInterpolation": "smooth", - "lineWidth": 2, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, "pointSize": 5, "scaleDistribution": { "type": "linear" @@ -285,7 +533,7 @@ "mode": "none" }, "thresholdsStyle": { - "mode": "area" + "mode": "off" } }, "mappings": [], @@ -293,92 +541,6553 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" + }, + { + "color": "red", + "value": 80 } ] }, - "unit": "s" + "unit": "short" }, "overrides": [] }, "gridPos": { - "h": 12, - "w": 16, - "x": 0, - "y": 25 + "h": 8, + "w": 12, + "x": 12, + "y": 17 }, - "id": 4, + "id": 64, "options": { + "alertThreshold": true, "legend": { - "calcs": ["min", "max", "mean", "stdDev"], + "calcs": [ + "lastNotNull" + ], "displayMode": "table", - "placement": "bottom", + "placement": "right", "showLegend": true }, "tooltip": { + 
"hideZeros": false, "mode": "multi", "sort": "none" } }, + "pluginVersion": "12.0.2", "targets": [ { + "aggregator": "max", + "alias": "$tag_model-pdqueue_threshold", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", "datasource": { "type": "prometheus", - "uid": "prometheus" + "uid": "ff75su0268000c" }, + "downsampleAggregator": "avg", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, "editorMode": "code", - "expr": "histogram_quantile(0.99, sum by(le, model_name) (rate(vllm:time_to_first_token_seconds_bucket[$__rate_interval])))", - "legendFormat": "P99 - {{model_name}}", + "expr": "pd_queue_exceeds_threshold", + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "iliteral_or" + }, + { + "filter": "*", + "groupBy": true, + "tagk": "pod_name", + "type": "literal_or" + } + ], + "hide": false, + "legendFormat": "__auto", + "metric": "inf.aibrix.pd_queue_exceeds_threshold", "range": true, - "refId": "A" + "rateDownsampleType": "before_downsample", + "refId": "pd_threshold", + "tenant": "default" + } + ], + "title": "pd_queue_threshold", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 25 + }, + "id": 68, + "panels": [], + "title": "Request Metrics", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 
5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 26 + }, + "id": 66, + "options": { + "alertThreshold": true, + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "12.0.2", + "targets": [ { + "aggregator": "sum", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", "datasource": { "type": "prometheus", - "uid": "prometheus" + "uid": "ff75su0268000c" }, + "disableDownsampling": true, + "disableTextWrap": false, + "downsampleAggregator": "avg", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, "editorMode": "code", - "expr": "histogram_quantile(0.50, sum by(le, model_name) (rate(vllm:time_to_first_token_seconds_bucket[$__rate_interval])))", - "legendFormat": "P50 - {{model_name}}", + "explicitTags": true, + "expr": "rate(sglang:prompt_tokens_total[5m])", + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "wildcard" + }, + { + "filter": "*", + "groupBy": true, + "tagk": "bucket", + "type": "literal_or" + } + ], + "fullMetaSearch": false, + "includeNullMetadata": true, + "legendFormat": "{{pod}}", + "metric": "inf.aibrix.gateway_prompt_token_bucket_total", "range": true, - "refId": "B" + "rateDownsampleType": "before_downsample", + "refId": "A", + "tenant": "default", + "useBackend": false } ], - "title": "Time to First Token by 
Model", + "title": "prompt_token_rate", "type": "timeseries" }, { "datasource": { "type": "prometheus", - "uid": "prometheus" + "uid": "ff75su0268000c" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] }, - "description": "How to interpret TTFT patterns and their correlation with router changes", "gridPos": { - "h": 12, - "w": 8, - "x": 16, - "y": 25 + "h": 8, + "w": 12, + "x": 12, + "y": 26 }, - "id": 5, + "id": 75, "options": { - "code": { - "language": "plaintext", - "showLineNumbers": false, - "showMiniMap": false + "alertThreshold": true, + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true }, - "content": "## TTFT Monitoring Guide\n\n**What to look for**:\n- **Healthy Pattern**: P99 close to P50\n- **Warning Sign**: P99 widens significantly\n\n**Correlation with Router**:\n- If P99 spikes when bucket size changes → router needs tuning\n- If P99 stays stable during changes → router is working well\n\n**Model Differences**:\n- Different models may show different latency characteristics\n- Compare models to identify performance differences\n\n**Action 
Items**:\n- If P99 widens: Check bucket size changes\n- If model differences grow: Review routing fairness", - "mode": "markdown" + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } }, - "pluginVersion": "10.0.3", - "title": "TTFT Interpretation Guide", - "type": "text" + "pluginVersion": "12.0.2", + "targets": [ + { + "aggregator": "sum", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "disableDownsampling": true, + "disableTextWrap": false, + "downsampleAggregator": "avg", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "editorMode": "code", + "explicitTags": true, + "expr": "rate(gateway_completion_token_bucket_total[1m])", + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "wildcard" + }, + { + "filter": "*", + "groupBy": true, + "tagk": "bucket", + "type": "literal_or" + } + ], + "fullMetaSearch": false, + "includeNullMetadata": true, + "legendFormat": "{{roleset}}-{{role}}-{{bucket}} tokens", + "metric": "inf.aibrix.gateway_completion_token_bucket_total", + "range": true, + "rateDownsampleType": "before_downsample", + "refId": "A", + "tenant": "default", + "useBackend": false + } + ], + "title": "generation_token rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", 
+ "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 34 + }, + "id": 80, + "options": { + "alertThreshold": true, + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "12.0.2", + "targets": [ + { + "aggregator": "sum", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "disableDownsampling": true, + "disableTextWrap": false, + "downsampleAggregator": "avg", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "editorMode": "code", + "explicitTags": true, + "expr": "pd_selected_prefill_pod_total", + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "wildcard" + }, + { + "filter": "*", + "groupBy": true, + "tagk": "bucket", + "type": "literal_or" + } + ], + "fullMetaSearch": false, + "includeNullMetadata": true, + "legendFormat": "{{model}}-prefill-roleset:{{roleset}}", + "metric": "inf.aibrix.gateway_prompt_token_bucket_total", + "range": true, + "rateDownsampleType": "before_downsample", + "refId": "A", + "tenant": "default", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "editorMode": "code", + "expr": "pd_selected_decode_pod_total", + "hide": false, + "instant": 
false, + "legendFormat": "{{model}}-decode-roleset:{{roleset}}", + "range": true, + "refId": "B" + } + ], + "title": "pd_request_counter", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 34 + }, + "id": 77, + "options": { + "alertThreshold": true, + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "12.0.2", + "targets": [ + { + "aggregator": "sum", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "disableDownsampling": true, + "downsampleAggregator": "avg", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "editorMode": "code", + "explicitTags": true, + "expr": 
"gateway_routing_time_bucket_total", + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "wildcard" + }, + { + "filter": "*", + "groupBy": true, + "tagk": "bucket", + "type": "literal_or" + } + ], + "legendFormat": "{{model}}-range[{{bucket}}]", + "metric": "inf.aibrix.gateway_routing_time_bucket_total", + "range": true, + "rateDownsampleType": "before_downsample", + "refId": "A", + "tenant": "default" + } + ], + "title": "routing_time_taken", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 42 + }, + "id": 81, + "options": { + "alertThreshold": true, + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "12.0.2", + "targets": [ + { + "aggregator": "sum", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + 
"currentFilterValue": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "disableDownsampling": true, + "disableTextWrap": false, + "downsampleAggregator": "avg", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "editorMode": "code", + "explicitTags": true, + "expr": "rate(gateway_prefill_time_bucket_total[5m])", + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "wildcard" + }, + { + "filter": "*", + "groupBy": true, + "tagk": "bucket", + "type": "literal_or" + } + ], + "fullMetaSearch": false, + "includeNullMetadata": true, + "legendFormat": "{{model}}-prefill-roleset:{{roleset}}-[{{bucket}}]", + "metric": "inf.aibrix.gateway_prompt_token_bucket_total", + "range": true, + "rateDownsampleType": "before_downsample", + "refId": "A", + "tenant": "default", + "useBackend": false + } + ], + "title": "prefill_time bucket ", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 
12, + "x": 12, + "y": 42 + }, + "id": 82, + "options": { + "alertThreshold": true, + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "12.0.2", + "targets": [ + { + "aggregator": "sum", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "disableDownsampling": true, + "disableTextWrap": false, + "downsampleAggregator": "avg", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "editorMode": "code", + "explicitTags": true, + "expr": "rate(gateway_decode_time_bucket_total[5m])", + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "wildcard" + }, + { + "filter": "*", + "groupBy": true, + "tagk": "bucket", + "type": "literal_or" + } + ], + "fullMetaSearch": false, + "includeNullMetadata": true, + "legendFormat": "{{model}}-decode-roleset:{{roleset}}-[{{bucket}}]", + "metric": "inf.aibrix.gateway_prompt_token_bucket_total", + "range": true, + "rateDownsampleType": "before_downsample", + "refId": "A", + "tenant": "default", + "useBackend": false + } + ], + "title": "decode_time bucket", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation":
"linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 50 + }, + "id": 70, + "options": { + "alertThreshold": true, + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "12.0.2", + "targets": [ + { + "aggregator": "sum", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "disableDownsampling": true, + "disableTextWrap": false, + "downsampleAggregator": "avg", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "editorMode": "code", + "explicitTags": true, + "expr": "sglang:kv_transfer_latency_ms", + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "wildcard" + }, + { + "filter": "*", + "groupBy": true, + "tagk": "bucket", + "type": "literal_or" + } + ], + "fullMetaSearch": false, + "includeNullMetadata": true, + "legendFormat": "{{pod}}", + "metric": "inf.aibrix.gateway_kv_transfer_time_bucket_total", + "range": true, + "rateDownsampleType": "before_downsample", + "refId": "A", + "tenant": "default", + "useBackend": false + } + ], + "title": "kv_transfer_time", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "fieldConfig": { + "defaults": { + "color": { + 
"mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 50 + }, + "id": 71, + "options": { + "alertThreshold": true, + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.0.2", + "targets": [ + { + "aggregator": "sum", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "disableDownsampling": true, + "disableTextWrap": false, + "downsampleAggregator": "avg", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "editorMode": "code", + "explicitTags": true, + "expr": "time_to_first_token_seconds_p99", + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "wildcard" + }, + { + "filter": "*", + "groupBy": true, + "tagk": "bucket", + "type": "literal_or" + } + ], + "fullMetaSearch": false, + "includeNullMetadata": true, + 
"legendFormat": "{{model}}-{{engine_type}}-p99", + "metric": "inf.aibrix.gateway_ttft_bucket_total", + "range": true, + "rateDownsampleType": "before_downsample", + "refId": "A", + "tenant": "default", + "useBackend": false + }, + { + "aggregator": "sum", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "disableDownsampling": true, + "disableTextWrap": false, + "downsampleAggregator": "avg", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "editorMode": "code", + "explicitTags": true, + "expr": "time_to_first_token_seconds_p90", + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "wildcard" + }, + { + "filter": "*", + "groupBy": true, + "tagk": "bucket", + "type": "literal_or" + } + ], + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "legendFormat": "{{model}}-{{engine_type}}-p90", + "metric": "inf.aibrix.gateway_ttft_bucket_total", + "range": true, + "rateDownsampleType": "before_downsample", + "refId": "B", + "tenant": "default", + "useBackend": false + }, + { + "aggregator": "sum", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "disableDownsampling": true, + "disableTextWrap": false, + "downsampleAggregator": "avg", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "editorMode": "code", + "explicitTags": true, + "expr": "time_to_first_token_seconds_p50", + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "wildcard" + }, + { + "filter": "*", + "groupBy": true, + "tagk": "bucket", + "type": "literal_or" + } + ], + "fullMetaSearch": false, + 
"hide": false, + "includeNullMetadata": true, + "legendFormat": "{{model}}-{{engine_type}}-p50", + "metric": "inf.aibrix.gateway_ttft_bucket_total", + "range": true, + "rateDownsampleType": "before_downsample", + "refId": "C", + "tenant": "default", + "useBackend": false + } + ], + "title": "ttft_time", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 58 + }, + "id": 83, + "options": { + "alertThreshold": true, + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.0.2", + "targets": [ + { + "aggregator": "sum", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "disableDownsampling": true, + "disableTextWrap": false, + "downsampleAggregator": "avg", + 
"downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "editorMode": "code", + "explicitTags": true, + "expr": "time_per_output_token_seconds_p99", + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "wildcard" + }, + { + "filter": "*", + "groupBy": true, + "tagk": "bucket", + "type": "literal_or" + } + ], + "fullMetaSearch": false, + "includeNullMetadata": true, + "legendFormat": "{{model}}-{{engine_type}}-p99", + "metric": "inf.aibrix.gateway_ttft_bucket_total", + "range": true, + "rateDownsampleType": "before_downsample", + "refId": "A", + "tenant": "default", + "useBackend": false + }, + { + "aggregator": "sum", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "disableDownsampling": true, + "disableTextWrap": false, + "downsampleAggregator": "avg", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "editorMode": "code", + "explicitTags": true, + "expr": "time_per_output_token_seconds_p90", + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "wildcard" + }, + { + "filter": "*", + "groupBy": true, + "tagk": "bucket", + "type": "literal_or" + } + ], + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "legendFormat": "{{model}}-{{engine_type}}-p90", + "metric": "inf.aibrix.gateway_ttft_bucket_total", + "range": true, + "rateDownsampleType": "before_downsample", + "refId": "B", + "tenant": "default", + "useBackend": false + }, + { + "aggregator": "sum", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "disableDownsampling": true, + "disableTextWrap": false, + 
"downsampleAggregator": "avg", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "editorMode": "code", + "explicitTags": true, + "expr": "time_per_output_token_seconds_p50", + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "wildcard" + }, + { + "filter": "*", + "groupBy": true, + "tagk": "bucket", + "type": "literal_or" + } + ], + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "legendFormat": "{{model}}-{{engine_type}}-p50", + "metric": "inf.aibrix.gateway_ttft_bucket_total", + "range": true, + "rateDownsampleType": "before_downsample", + "refId": "C", + "tenant": "default", + "useBackend": false + } + ], + "title": "tpot_time", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 58 + }, + "id": 73, + "options": { + "alertThreshold": true, + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "right", + 
"showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "12.0.2", + "targets": [ + { + "aggregator": "sum", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "disableDownsampling": true, + "downsampleAggregator": "avg", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "editorMode": "code", + "explicitTags": true, + "expr": "gateway_total_time_bucket_total", + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "wildcard" + }, + { + "filter": "*", + "groupBy": true, + "tagk": "bucket", + "type": "literal_or" + } + ], + "legendFormat": "{{model}}-{{bucket}}", + "metric": "inf.aibrix.gateway_total_time_bucket_total", + "range": true, + "rateDownsampleType": "before_downsample", + "refId": "A", + "tenant": "default" + } + ], + "title": "total_time", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 66 + }, + "id": 36, + "panels": [], + "title": "Priority 1 - End-to-End Latency (User-visible SLOs)", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { 
+ "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 67 + }, + "id": 21, + "options": { + "alertThreshold": true, + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "12.0.2", + "targets": [ + { + "aggregator": "max", + "alias": "$tag_model-$tag_role-p99", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "currentTagKey": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "disableDownsampling": true, + "disableTextWrap": false, + "downsampleAggregator": "max", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "downsampleInterval": "", + "editorMode": "builder", + "explicitTags": true, + "expr": "e2e_request_latency_seconds_p99", + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "literal_or" + }, + { + "filter": "*", + "groupBy": true, + "tagk": "role", + "type": "literal_or" + } + ], + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "isCounter": true, + "legendFormat": "{{model}}-{{engine_type}}-p99", + "metric": "inf.aibrix.e2e_request_latency_seconds_p99", + "range": true, + "rateDownsampleType": "before_downsample", + "refId": "e2e_request_latency_p99", + "shouldComputeDelta": false, + "shouldComputeRate": false, + "tenant": "default", + "useBackend": false + }, + { + "aggregator": "max", + "alias": "$tag_model-$tag_role-p90", + "currentFilterGroupBy": false, + "currentFilterKey": "", + 
"currentFilterType": "literal_or", + "currentFilterValue": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "disableDownsampling": true, + "disableTextWrap": false, + "downsampleAggregator": "avg", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "editorMode": "builder", + "expr": "e2e_request_latency_seconds_p90", + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "iliteral_or" + }, + { + "filter": "*", + "groupBy": true, + "tagk": "role", + "type": "literal_or" + } + ], + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "legendFormat": "{{model}}-{{engine_type}}-p90", + "metric": "inf.aibrix.e2e_request_latency_seconds_p90", + "range": true, + "rateDownsampleType": "before_downsample", + "refId": "e2e_request_latency_p90", + "tenant": "default", + "useBackend": false + }, + { + "aggregator": "max", + "alias": "$tag_model-$tag_role-p50", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "currentTagKey": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "disableDownsampling": true, + "disableTextWrap": false, + "downsampleAggregator": "max", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "editorMode": "builder", + "explicitTags": true, + "expr": "e2e_request_latency_seconds_p50", + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "literal_or" + }, + { + "filter": "*", + "groupBy": true, + "tagk": "role", + "type": "literal_or" + } + ], + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "isCounter": true, + "legendFormat": "{{model}}-{{engine_type}}-p50", + "metric": "inf.aibrix.e2e_request_latency_seconds_p50", + "range": true, + "rateDownsampleType": "before_downsample", + 
"refId": "e2e_request_latency_p50", + "shouldComputeDelta": false, + "shouldComputeRate": false, + "tenant": "default", + "useBackend": false + } + ], + "title": "e2e_request_latency_seconds", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 67 + }, + "id": 9, + "options": { + "alertThreshold": true, + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "12.0.2", + "targets": [ + { + "aggregator": "avg", + "alias": "$tag_model_name:$tag_role:$tag_pod", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "currentTagKey": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "disableDownsampling": true, + "downsampleAggregator": "count", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", 
+ "downsampleFillPolicyDisabled": false, + "editorMode": "code", + "explicitTags": true, + "expr": "sum(up{pod=~\".*prefill.*\"})", + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model_name", + "type": "literal_or" + }, + { + "filter": "*", + "groupBy": true, + "tagk": "role", + "type": "literal_or" + }, + { + "filter": "*", + "groupBy": true, + "tagk": "pod", + "type": "literal_or" + } + ], + "legendFormat": "prefill", + "metric": "inf.aibrix.model_replicas", + "range": true, + "rateDownsampleType": "before_downsample", + "refId": "A", + "shouldComputeDelta": false, + "shouldComputeRate": false, + "tenant": "default" + }, + { + "aggregator": "avg", + "alias": "$tag_model_name:$tag_role:$tag_pod", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "currentTagKey": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "disableDownsampling": true, + "downsampleAggregator": "count", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "editorMode": "code", + "explicitTags": true, + "expr": "sum(up{pod=~\".*decode.*\"})", + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model_name", + "type": "literal_or" + }, + { + "filter": "*", + "groupBy": true, + "tagk": "role", + "type": "literal_or" + }, + { + "filter": "*", + "groupBy": true, + "tagk": "pod", + "type": "literal_or" + } + ], + "hide": false, + "legendFormat": "decode", + "metric": "inf.aibrix.model_replicas", + "range": true, + "rateDownsampleType": "before_downsample", + "refId": "B", + "shouldComputeDelta": false, + "shouldComputeRate": false, + "tenant": "default" + } + ], + "title": "P/D Replica Count", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { +
"axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 75 + }, + "id": 31, + "options": { + "alertThreshold": true, + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "12.0.2", + "targets": [ + { + "aggregator": "sum", + "alias": "$tag_model-p50", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "currentTagKey": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "downsampleAggregator": "max", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "downsampleInterval": "", + "editorMode": "code", + "expr": "request_inference_time_seconds_p50", + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "literal_or" + } + ], + "hide": true, + "isCounter": true, + "legendFormat": "__auto", + "metric": "inf.aibrix.request_inference_time_seconds_p50", + "range": true, + "rateDownsampleType": "before_downsample", + "refId": "A", + 
"shouldComputeDelta": false, + "shouldComputeRate": false, + "tenant": "default" + }, + { + "aggregator": "sum", + "alias": "$tag_model-p90", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "currentTagKey": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "downsampleAggregator": "max", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "downsampleInterval": "", + "editorMode": "code", + "expr": "request_inference_time_seconds_p90", + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "literal_or" + } + ], + "hide": true, + "isCounter": true, + "legendFormat": "__auto", + "metric": "inf.aibrix.request_inference_time_seconds_p90", + "range": true, + "rateDownsampleType": "before_downsample", + "refId": "B", + "shouldComputeDelta": false, + "shouldComputeRate": false, + "tenant": "default" + }, + { + "aggregator": "max", + "alias": "$tag_model-$tag_role-p99", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "currentTagKey": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "disableDownsampling": true, + "downsampleAggregator": "max", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "downsampleInterval": "", + "editorMode": "code", + "explicitTags": true, + "expr": "request_inference_time_seconds_p99", + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "literal_or" + }, + { + "filter": "*", + "groupBy": true, + "tagk": "role", + "type": "literal_or" + } + ], + "hide": false, + "isCounter": true, + "legendFormat": "__auto", + "metric": "inf.aibrix.request_inference_time_seconds_p99", + "range": true, + "rateDownsampleType": "before_downsample", + "refId": "P99", + 
"shouldComputeDelta": false, + "shouldComputeRate": false, + "tenant": "default" + } + ], + "title": "request inference time", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 75 + }, + "id": 29, + "options": { + "alertThreshold": true, + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "12.0.2", + "targets": [ + { + "aggregator": "max", + "alias": "$tag_model-$tag_role-p50", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "disableDownsampling": true, + "disableTextWrap": false, + "downsampleAggregator": "avg", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "downsampleInterval": "", + 
"editorMode": "code", + "explicitTags": true, + "expr": "histogram_quantile(0.50, sum(rate(sglang:queue_time_seconds_bucket[5m])) by (le, model_name))", + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "iliteral_or" + }, + { + "filter": "*", + "groupBy": true, + "tagk": "role", + "type": "literal_or" + } + ], + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "legendFormat": "{{model_name}}-p50", + "metric": "inf.aibrix.request_queue_time_seconds_p50", + "range": true, + "rateDownsampleType": "before_downsample", + "refId": "A", + "shouldComputeRate": false, + "tenant": "default", + "useBackend": false + }, + { + "aggregator": "max", + "alias": "$tag_model-$tag_role-p90", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "disableDownsampling": true, + "downsampleAggregator": "avg", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "downsampleInterval": "", + "editorMode": "code", + "explicitTags": true, + "expr": "histogram_quantile(0.90, sum(rate(sglang:queue_time_seconds_bucket[5m])) by (le, model_name))", + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "iliteral_or" + }, + { + "filter": "*", + "groupBy": true, + "tagk": "role", + "type": "literal_or" + } + ], + "hide": false, + "legendFormat": "{{model_name}}-p90", + "metric": "inf.aibrix.request_queue_time_seconds_p90", + "range": true, + "rateDownsampleType": "before_downsample", + "refId": "B", + "shouldComputeRate": false, + "tenant": "default" + }, + { + "aggregator": "max", + "alias": "$tag_model-$tag_role-p99", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "datasource": { + "type": "prometheus", + "uid": 
"ff75su0268000c" + }, + "disableDownsampling": true, + "downsampleAggregator": "avg", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "downsampleInterval": "", + "editorMode": "code", + "explicitTags": true, + "expr": "histogram_quantile(0.99, sum(rate(sglang:queue_time_seconds_bucket[5m])) by (le, model_name))", + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "iliteral_or" + }, + { + "filter": "*", + "groupBy": true, + "tagk": "role", + "type": "literal_or" + } + ], + "hide": false, + "legendFormat": "{{model_name}}-p99", + "metric": "inf.aibrix.request_queue_time_seconds_p99", + "range": true, + "rateDownsampleType": "before_downsample", + "refId": "P99", + "shouldComputeRate": false, + "tenant": "default" + } + ], + "title": "Request queue time ", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 83 + }, + "id": 37, + "options": { + "alertThreshold": true, + "legend": { 
+ "calcs": [ + "lastNotNull" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "12.0.2", + "targets": [ + { + "aggregator": "max", + "alias": "", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "currentTagKey": "model", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "disableDownsampling": true, + "downsampleAggregator": "count", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "downsampleInterval": "", + "editorMode": "code", + "explicitTags": true, + "expr": "http_request_duration_seconds_p50", + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "role", + "type": "literal_or" + } + ], + "legendFormat": "__auto", + "metric": "inf.aibrix.http_request_duration_seconds_p50", + "range": true, + "rateDownsampleType": "before_downsample", + "refId": "A", + "shouldComputeDelta": false, + "shouldComputeRate": false, + "tenant": "default" + }, + { + "aggregator": "max", + "alias": "", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "currentTagKey": "model", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "disableDownsampling": true, + "downsampleAggregator": "count", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "downsampleInterval": "", + "editorMode": "code", + "explicitTags": true, + "expr": "http_request_duration_seconds_p90", + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "role", + "type": "literal_or" + } + ], + "hide": false, + "legendFormat": "__auto", + "metric": "inf.aibrix.http_request_duration_seconds_p90", + "range": true, + "rateDownsampleType": 
"before_downsample", + "refId": "B", + "shouldComputeDelta": false, + "shouldComputeRate": false, + "tenant": "default" + }, + { + "aggregator": "max", + "alias": "", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "currentTagKey": "model", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "disableDownsampling": true, + "downsampleAggregator": "count", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "downsampleInterval": "", + "editorMode": "code", + "explicitTags": true, + "expr": "http_request_duration_seconds_p99", + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "role", + "type": "literal_or" + } + ], + "hide": false, + "legendFormat": "__auto", + "metric": "inf.aibrix.http_request_duration_seconds_p99", + "range": true, + "rateDownsampleType": "before_downsample", + "refId": "C", + "shouldComputeDelta": false, + "shouldComputeRate": false, + "tenant": "default" + } + ], + "title": "HTTPRequestDurationSeconds", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ 
+ { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 83 + }, + "id": 38, + "options": { + "alertThreshold": true, + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "12.0.2", + "targets": [ + { + "aggregator": "max", + "alias": "", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "currentTagKey": "model", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "downsampleAggregator": "count", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "downsampleInterval": "", + "editorMode": "code", + "explicitTags": true, + "expr": "http_request_duration_highr_seconds_p50", + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "role", + "type": "literal_or" + } + ], + "legendFormat": "__auto", + "metric": "inf.aibrix.http_request_duration_highr_seconds_p50", + "range": true, + "rateDownsampleType": "before_downsample", + "refId": "A", + "shouldComputeDelta": false, + "shouldComputeRate": false, + "tenant": "default" + }, + { + "aggregator": "max", + "alias": "", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "currentTagKey": "model", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "downsampleAggregator": "count", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "downsampleInterval": "", + "editorMode": "code", + "explicitTags": true, + "expr": "http_request_duration_highr_seconds_p90", + "filters": [ + { + "filter": "*", + "groupBy": 
true, + "tagk": "role", + "type": "literal_or" + } + ], + "hide": false, + "legendFormat": "__auto", + "metric": "inf.aibrix.http_request_duration_highr_seconds_p90", + "range": true, + "rateDownsampleType": "before_downsample", + "refId": "B", + "shouldComputeDelta": false, + "shouldComputeRate": false, + "tenant": "default" + }, + { + "aggregator": "max", + "alias": "", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "currentTagKey": "model", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "downsampleAggregator": "count", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "downsampleInterval": "", + "editorMode": "code", + "explicitTags": true, + "expr": "http_request_duration_highr_seconds_p99", + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "role", + "type": "literal_or" + } + ], + "hide": false, + "legendFormat": "__auto", + "metric": "inf.aibrix.http_request_duration_highr_seconds_p99", + "range": true, + "rateDownsampleType": "before_downsample", + "refId": "C", + "shouldComputeDelta": false, + "shouldComputeRate": false, + "tenant": "default" + } + ], + "title": "HTTPRequestDurationHighRSeconds", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 91 + }, + "id": 17, + "panels": [], + "title": "Priority 2 - Time To First Token (Prefill Performance)", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + 
"tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 92 + }, + "id": 14, + "options": { + "alertThreshold": true, + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "12.0.2", + "targets": [ + { + "aggregator": "max", + "alias": "$tag_model-p50", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "disableDownsampling": true, + "downsampleAggregator": "avg", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "editorMode": "code", + "explicitTags": true, + "expr": "time_to_first_token_seconds_p50", + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "iliteral_or" + } + ], + "hide": false, + "legendFormat": "{{model}}-{{roleset}}-{{role}}-p50", + "metric": "inf.aibrix.time_to_first_token_seconds_p50", + "range": true, + "rateDownsampleType": "before_downsample", + "refId": "P50", + "tenant": "default" + }, + { + "aggregator": "max", + "alias": "$tag_model-p50", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "datasource": { + "type": "prometheus", + "uid": 
"ff75su0268000c" + }, + "disableDownsampling": true, + "downsampleAggregator": "avg", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "editorMode": "code", + "explicitTags": true, + "expr": "time_to_first_token_seconds_p90", + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "iliteral_or" + } + ], + "hide": false, + "legendFormat": "{{model}}-{{roleset}}-{{role}}-p90", + "metric": "inf.aibrix.time_to_first_token_seconds_p90", + "range": true, + "rateDownsampleType": "before_downsample", + "refId": "A", + "tenant": "default" + }, + { + "aggregator": "max", + "alias": "$tag_model-p99", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "disableDownsampling": true, + "downsampleAggregator": "avg", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "editorMode": "code", + "explicitTags": true, + "expr": "time_to_first_token_seconds_p99", + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "iliteral_or" + } + ], + "hide": false, + "legendFormat": "{{model}}-{{roleset}}-{{role}}-p99", + "metric": "inf.aibrix.time_to_first_token_seconds_p99", + "range": true, + "rateDownsampleType": "before_downsample", + "refId": "B", + "tenant": "default" + } + ], + "title": "time_to_first_token_seconds", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + 
"hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 92 + }, + "id": 19, + "options": { + "alertThreshold": true, + "legend": { + "calcs": [ + "mean", + "max", + "min" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "12.0.2", + "targets": [ + { + "aggregator": "avg", + "alias": "$tag_model-p99", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "currentTagKey": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "downsampleAggregator": "max", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "editorMode": "code", + "expr": "request_prefill_time_seconds_p50", + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "literal_or" + } + ], + "isCounter": true, + "legendFormat": "__auto", + "metric": "inf.aibrix.request_prefill_time_seconds_p99", + "range": true, + "rateDownsampleType": "before_downsample", + "refId": "A", + "shouldComputeDelta": false, + "shouldComputeRate": false, + "tenant": "default" + }, + { + "aggregator": "avg", + "alias": "$tag_model-p90", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + 
"currentTagKey": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "downsampleAggregator": "max", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "editorMode": "code", + "expr": "request_prefill_time_seconds_p90", + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "literal_or" + } + ], + "hide": false, + "isCounter": true, + "legendFormat": "__auto", + "metric": "inf.aibrix.request_prefill_time_seconds_p90", + "range": true, + "rateDownsampleType": "before_downsample", + "refId": "B", + "shouldComputeDelta": false, + "shouldComputeRate": false, + "tenant": "default" + }, + { + "aggregator": "avg", + "alias": "$tag_model-p50", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "currentTagKey": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "downsampleAggregator": "max", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "editorMode": "code", + "expr": "request_prefill_time_seconds_p99", + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "literal_or" + } + ], + "hide": false, + "isCounter": true, + "legendFormat": "__auto", + "metric": "inf.aibrix.request_prefill_time_seconds_p50", + "range": true, + "rateDownsampleType": "before_downsample", + "refId": "C", + "shouldComputeDelta": false, + "shouldComputeRate": false, + "tenant": "default" + } + ], + "title": "request_prefill_time_seconds", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, 
+ "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 100 + }, + "id": 42, + "options": { + "alertThreshold": true, + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "12.0.2", + "targets": [ + { + "aggregator": "max", + "alias": "$tag_model-p50", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "currentTagKey": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "disableDownsampling": true, + "downsampleAggregator": "max", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "editorMode": "code", + "explicitTags": true, + "expr": "rate(gateway_prompt_token_bucket_total[5m])", + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "literal_or" + } + ], + "hide": false, + "isCounter": true, + "legendFormat": "{{model}}-{{role}}-{{roleset}}-{{bucket}}", + "metric": "inf.aibrix.request_prompt_tokens_p50", + "range": true, + "rateDownsampleType": "before_downsample", + "refId": "A", + "shouldComputeDelta": false, + "shouldComputeRate": false, + "tenant": 
"default" + } + ], + "title": "request_prompt_tokens rate", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 108 + }, + "id": 40, + "panels": [], + "title": "Priority 2 - Time To First Token (Decode Performance)", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 109 + }, + "id": 84, + "options": { + "alertThreshold": true, + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "12.0.2", + "targets": [ + { + "aggregator": "max", + "alias": "$tag_model-p50", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "disableDownsampling": true, + "downsampleAggregator": "avg", + "downsampleAggregatorDisabled": false, + 
"downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "editorMode": "code", + "explicitTags": true, + "expr": "time_per_output_token_seconds_p50", + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "iliteral_or" + } + ], + "hide": false, + "legendFormat": "{{model}}-{{roleset}}-{{role}}-p50", + "metric": "inf.aibrix.time_per_output_token_seconds_p50", + "range": true, + "rateDownsampleType": "before_downsample", + "refId": "P50", + "tenant": "default" + }, + { + "aggregator": "max", + "alias": "$tag_model-p90", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "disableDownsampling": true, + "downsampleAggregator": "avg", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "editorMode": "code", + "explicitTags": true, + "expr": "time_per_output_token_seconds_p90", + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "iliteral_or" + } + ], + "hide": false, + "legendFormat": "{{model}}-{{roleset}}-{{role}}-p90", + "metric": "inf.aibrix.time_per_output_token_seconds_p90", + "range": true, + "rateDownsampleType": "before_downsample", + "refId": "A", + "tenant": "default" + }, + { + "aggregator": "max", + "alias": "$tag_model-p99", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "disableDownsampling": true, + "downsampleAggregator": "avg", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "editorMode": "code", + "explicitTags": true, + "expr": "time_per_output_token_seconds_p99", + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": 
"iliteral_or" + } + ], + "hide": false, + "legendFormat": "{{model}}-{{roleset}}-{{role}}-p99", + "metric": "inf.aibrix.time_per_output_token_seconds_p99", + "range": true, + "rateDownsampleType": "before_downsample", + "refId": "B", + "tenant": "default" + } + ], + "title": "time_per_output_token_seconds", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 109 + }, + "id": 18, + "options": { + "alertThreshold": true, + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.0.2", + "targets": [ + { + "aggregator": "avg", + "alias": "$tag_model-p99", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "currentTagKey": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "downsampleAggregator": "avg", + "downsampleAggregatorDisabled": false, + 
"downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "editorMode": "code", + "expr": "request_decode_time_seconds_p99", + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "literal_or" + } + ], + "isCounter": true, + "legendFormat": "__auto", + "metric": "inf.aibrix.request_decode_time_seconds_p99", + "range": true, + "rateDownsampleType": "before_downsample", + "refId": "A", + "shouldComputeDelta": false, + "shouldComputeRate": false, + "tenant": "default" + }, + { + "aggregator": "avg", + "alias": "$tag_model-p90", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "currentTagKey": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "downsampleAggregator": "avg", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "downsampleInterval": "", + "editorMode": "code", + "expr": "request_decode_time_seconds_p90", + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "literal_or" + } + ], + "hide": false, + "isCounter": true, + "legendFormat": "__auto", + "metric": "inf.aibrix.request_decode_time_seconds_p90", + "range": true, + "rateDownsampleType": "before_downsample", + "refId": "B", + "shouldComputeDelta": false, + "shouldComputeRate": false, + "tenant": "default" + }, + { + "aggregator": "avg", + "alias": "$tag_model-p50", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "currentTagKey": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "downsampleAggregator": "avg", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "editorMode": "code", + "expr": "request_decode_time_seconds_p50", + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "literal_or" + } + ], + "hide": false, + "isCounter": true, + "legendFormat": "__auto", + "metric": "inf.aibrix.request_decode_time_seconds_p50", + "range": true, + "rateDownsampleType": "before_downsample", + "refId": "C", + 
"shouldComputeDelta": false, + "shouldComputeRate": false, + "tenant": "default" + } + ], + "title": "request_decode_time_seconds", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 117 + }, + "id": 85, + "options": { + "alertThreshold": true, + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "12.0.2", + "targets": [ + { + "aggregator": "max", + "alias": "$tag_model-p50", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "currentTagKey": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "disableDownsampling": true, + "downsampleAggregator": "max", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "editorMode": "code", + "explicitTags": 
true, + "expr": "avg_generation_throughput_toks_per_s", + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "literal_or" + } + ], + "hide": false, + "isCounter": true, + "legendFormat": "{{model}}-{{role}}-{{roleset}}", + "metric": "inf.aibrix.request_prompt_tokens_p50", + "range": true, + "rateDownsampleType": "before_downsample", + "refId": "A", + "shouldComputeDelta": false, + "shouldComputeRate": false, + "tenant": "default" + } + ], + "title": "generation_token_rate/s", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 117 + }, + "id": 45, + "options": { + "alertThreshold": true, + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.0.2", + "targets": [ + { + "aggregator": "sum", + "alias": "$tag_model-p50", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + 
"currentFilterValue": "", + "currentTagKey": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "downsampleAggregator": "max", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "editorMode": "code", + "expr": "iteration_tokens_total_p50", + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "literal_or" + } + ], + "hide": false, + "isCounter": true, + "legendFormat": "__auto", + "metric": "inf.aibrix.iteration_tokens_total_p50", + "range": true, + "rateDownsampleType": "before_downsample", + "refId": "C", + "shouldComputeDelta": false, + "shouldComputeRate": false, + "tenant": "default" + }, + { + "aggregator": "sum", + "alias": "$tag_model-p90", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "currentTagKey": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "downsampleAggregator": "max", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "editorMode": "code", + "expr": "iteration_tokens_total_p90", + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "literal_or" + } + ], + "hide": false, + "isCounter": true, + "legendFormat": "__auto", + "metric": "inf.aibrix.iteration_tokens_total_p90", + "range": true, + "rateDownsampleType": "before_downsample", + "refId": "B", + "shouldComputeDelta": false, + "shouldComputeRate": false, + "tenant": "default" + }, + { + "aggregator": "sum", + "alias": "$tag_model-p99", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "currentTagKey": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "downsampleAggregator": "max", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + 
"downsampleFillPolicyDisabled": false, + "editorMode": "code", + "expr": "iteration_tokens_total_p99", + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "literal_or" + } + ], + "isCounter": true, + "legendFormat": "__auto", + "metric": "inf.aibrix.iteration_tokens_total_p99", + "range": true, + "rateDownsampleType": "before_downsample", + "refId": "A", + "shouldComputeDelta": false, + "shouldComputeRate": false, + "tenant": "default" + } + ], + "title": "iteration_tokens_total", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 125 + }, + "id": 44, + "options": { + "alertThreshold": true, + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.0.2", + "targets": [ + { + "aggregator": "sum", + "alias": "$tag_model-p50", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": 
"literal_or", + "currentFilterValue": "", + "currentTagKey": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "downsampleAggregator": "max", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "literal_or" + } + ], + "hide": false, + "isCounter": true, + "metric": "inf.aibrix.request_max_num_generation_tokens_p50", + "rateDownsampleType": "before_downsample", + "refId": "C", + "shouldComputeDelta": false, + "shouldComputeRate": false, + "tenant": "default" + }, + { + "aggregator": "sum", + "alias": "$tag_model-p90", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "currentTagKey": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "downsampleAggregator": "max", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "literal_or" + } + ], + "hide": false, + "isCounter": true, + "metric": "inf.aibrix.request_max_num_generation_tokens_p90", + "rateDownsampleType": "before_downsample", + "refId": "B", + "shouldComputeDelta": false, + "shouldComputeRate": false, + "tenant": "default" + }, + { + "aggregator": "sum", + "alias": "$tag_model-p99", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "currentTagKey": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "downsampleAggregator": "max", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "literal_or" + } + ], + "isCounter": true, + "metric": 
"inf.aibrix.request_max_num_generation_tokens_p99", + "rateDownsampleType": "before_downsample", + "refId": "A", + "shouldComputeDelta": false, + "shouldComputeRate": false, + "tenant": "default" + } + ], + "title": "request_max_num_generation_tokens", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 133 + }, + "id": 47, + "panels": [], + "title": "KVCache + NIXL", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 134 + }, + "id": 51, + "options": { + "alertThreshold": true, + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.0.2", + "targets": [ + { + "aggregator": "max", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + 
"datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "disableDownsampling": true, + "downsampleAggregator": "avg", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "editorMode": "code", + "explicitTags": true, + "expr": "kv_cache_usage_perc", + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "iliteral_or" + }, + { + "filter": "*", + "groupBy": true, + "tagk": "role", + "type": "literal_or" + } + ], + "legendFormat": "{{model}}-{{role}}-{{roleset}}", + "metric": "inf.aibrix.kv_cache_usage_perc", + "range": true, + "rateDownsampleType": "before_downsample", + "refId": "A", + "tenant": "default" + } + ], + "title": "KVCacheUsagePerc", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 134 + }, + "id": 54, + "options": { + "alertThreshold": true, + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": 
false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.0.2", + "targets": [ + { + "aggregator": "max", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "disableDownsampling": true, + "downsampleAggregator": "avg", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "editorMode": "code", + "explicitTags": true, + "expr": "kv", + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "iliteral_or" + }, + { + "filter": "*", + "groupBy": true, + "tagk": "role", + "type": "literal_or" + } + ], + "legendFormat": "__auto", + "metric": "inf.aibrix.nixl_num_failed_notifications_total", + "range": true, + "rateDownsampleType": "before_downsample", + "refId": "A", + "tenant": "default" + } + ], + "title": "NixlNumFailedNotifications", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + 
"h": 8, + "w": 12, + "x": 0, + "y": 142 + }, + "id": 56, + "options": { + "alertThreshold": true, + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.0.2", + "targets": [ + { + "aggregator": "max", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "disableDownsampling": true, + "downsampleAggregator": "avg", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "editorMode": "code", + "explicitTags": false, + "expr": "", + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "literal_or" + }, + { + "filter": "*", + "groupBy": true, + "tagk": "role", + "type": "literal_or" + } + ], + "isCounter": true, + "legendFormat": "__auto", + "metric": "inf.aibrix.prefix_cache_hits_total", + "range": true, + "rateDownsampleType": "before_downsample", + "refId": "A", + "shouldComputeDelta": false, + "shouldComputeRate": false, + "tenant": "default" + } + ], + "title": "PrefixCacheHitTotal", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, 
+ "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 142 + }, + "id": 55, + "options": { + "alertThreshold": true, + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.0.2", + "targets": [ + { + "aggregator": "max", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "disableDownsampling": true, + "downsampleAggregator": "avg", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "explicitTags": true, + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "iliteral_or" + }, + { + "filter": "*", + "groupBy": true, + "tagk": "role", + "type": "literal_or" + } + ], + "metric": "inf.aibrix.prefix_cache_queries_total", + "rateDownsampleType": "before_downsample", + "refId": "A", + "tenant": "default" + } + ], + "title": "PrefixCacheQueriesTotal", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": 
"linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 150 + }, + "id": 52, + "options": { + "alertThreshold": true, + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.0.2", + "targets": [ + { + "aggregator": "max", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "disableDownsampling": true, + "downsampleAggregator": "avg", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "explicitTags": true, + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "iliteral_or" + }, + { + "filter": "*", + "groupBy": true, + "tagk": "role", + "type": "literal_or" + } + ], + "metric": "inf.aibrix.nixl_num_failed_transfers_total", + "rateDownsampleType": "before_downsample", + "refId": "A", + "tenant": "default" + } + ], + "title": "NixlNumFailedTransfers", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + 
"gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 150 + }, + "id": 60, + "options": { + "alertThreshold": true, + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.0.2", + "targets": [ + { + "aggregator": "max", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "disableDownsampling": true, + "downsampleAggregator": "avg", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "explicitTags": true, + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "iliteral_or" + }, + { + "filter": "*", + "groupBy": true, + "tagk": "role", + "type": "literal_or" + } + ], + "metric": "inf.aibrix.external_prefix_cache_hits_total", + "rateDownsampleType": "before_downsample", + "refId": "A", + "tenant": "default" + } + ], + "title": "ExternalPrefilCacheHitsTotal", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + 
"axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 158 + }, + "id": 59, + "options": { + "alertThreshold": true, + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.0.2", + "targets": [ + { + "aggregator": "avg", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "disableDownsampling": true, + "downsampleAggregator": "avg", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "explicitTags": true, + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "iliteral_or" + }, + { + "filter": "*", + "groupBy": true, + "tagk": "role", + "type": "literal_or" + } + ], + "metric": "inf.aibrix.external_prefix_cache_hits_total", + "rateDownsampleType": "before_downsample", + "refId": "A", + "shouldComputeRate": false, + "tenant": "default" + } + ], + "title": "ExternalPrefilCacheHitsTotal", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + 
"uid": "ff75su0268000c" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 158 + }, + "id": 61, + "options": { + "alertThreshold": true, + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.0.2", + "targets": [ + { + "aggregator": "max", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "disableDownsampling": true, + "downsampleAggregator": "avg", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "explicitTags": true, + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "iliteral_or" + }, + { + "filter": "decode", + "groupBy": true, + "tagk": "role", + "type": "literal_or" + } + ], + "metric": "inf.aibrix.nixl_post_time_seconds_p50", + "rateDownsampleType": "before_downsample", + "refId": "P50", + 
"tenant": "default" + }, + { + "aggregator": "max", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "disableDownsampling": true, + "downsampleAggregator": "avg", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "explicitTags": true, + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "iliteral_or" + }, + { + "filter": "decode", + "groupBy": true, + "tagk": "role", + "type": "literal_or" + } + ], + "hide": false, + "metric": "inf.aibrix.nixl_post_time_seconds_p90", + "rateDownsampleType": "before_downsample", + "refId": "P90", + "tenant": "default" + }, + { + "aggregator": "max", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "disableDownsampling": true, + "downsampleAggregator": "avg", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "explicitTags": true, + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "iliteral_or" + }, + { + "filter": "decode", + "groupBy": true, + "tagk": "role", + "type": "literal_or" + } + ], + "hide": false, + "metric": "inf.aibrix.nixl_post_time_seconds_p99", + "rateDownsampleType": "before_downsample", + "refId": "P99", + "tenant": "default" + } + ], + "title": "NixlPostTimeSeconds", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + 
"barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 166 + }, + "id": 58, + "options": { + "alertThreshold": true, + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.0.2", + "targets": [ + { + "aggregator": "max", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "disableDownsampling": true, + "downsampleAggregator": "avg", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "explicitTags": true, + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "iliteral_or" + }, + { + "filter": "decode", + "groupBy": true, + "tagk": "role", + "type": "literal_or" + } + ], + "metric": "inf.aibrix.nixl_xfer_time_seconds_p50", + "rateDownsampleType": "before_downsample", + "refId": "P50", + "tenant": "default" + }, + { + "aggregator": "max", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "disableDownsampling": 
true, + "downsampleAggregator": "avg", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "explicitTags": true, + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "iliteral_or" + }, + { + "filter": "decode", + "groupBy": true, + "tagk": "role", + "type": "literal_or" + } + ], + "hide": false, + "metric": "inf.aibrix.nixl_xfer_time_seconds_p90", + "rateDownsampleType": "before_downsample", + "refId": "P90", + "tenant": "default" + }, + { + "aggregator": "max", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "disableDownsampling": true, + "downsampleAggregator": "avg", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "explicitTags": true, + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "iliteral_or" + }, + { + "filter": "decode", + "groupBy": true, + "tagk": "role", + "type": "literal_or" + } + ], + "hide": false, + "metric": "inf.aibrix.nixl_xfer_time_seconds_p99", + "rateDownsampleType": "before_downsample", + "refId": "P99", + "tenant": "default" + } + ], + "title": "NixlXferTimeSeconds", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 174 + }, + "id": 5, + "panels": [], + "title": "machines metrics", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + 
"legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 175 + }, + "id": 2, + "options": { + "alertThreshold": true, + "legend": { + "calcs": [ + "lastNotNull", + "max", + "min" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "12.0.2", + "targets": [ + { + "aggregator": "avg", + "alias": "$tag_cluster-$tag_container_name", + "currentField": "", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "currentTagKey": "", + "currentTagValue": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "downsampleAggregator": "avg", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "downsampleInterval": "30s", + "editorMode": "code", + "expr": "sum by (pod) (\n rate(container_cpu_usage_seconds_total{pod=~\".*aibrix.*\", container!=\"\", container!=\"POD\"}[5m])\n)\n/\nsum by (pod) (\n kube_pod_container_resource_limits{pod=~\".*aibrix.*\", resource=\"cpu\", unit=\"core\"}\n)", + "fields": [ + "usage_ratio" + ], + "filters": [], + "legendFormat": "__auto", + "metric": "tce.container.cpu_usage.mt", + "range": true, + "rateDownsampleType": "before_downsample", + "refId": "A", + "shouldComputeMulti": true, + "tags": { + "_psm": "inf.aibrix.gateway", + 
"cluster": "echo*", + "container_name": "gateway-plugin|envoy" + }, + "tenant": "computation.tce" + } + ], + "title": "aibrix component cpu usage ratio", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 175 + }, + "id": 3, + "options": { + "alertThreshold": true, + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "12.0.2", + "targets": [ + { + "aggregator": "avg", + "alias": "$tag_cluster-$tag_container_name", + "currentField": "", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "currentTagKey": "", + "currentTagValue": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "downsampleAggregator": "avg", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + 
"downsampleFillPolicyDisabled": false, + "downsampleInterval": "30s", + "editorMode": "code", + "expr": "sum by (pod) (\n container_memory_working_set_bytes{pod=~\".*aibrix.*\", container!=\"\", container!=\"POD\"}\n)\n/\nsum by (pod) (\n kube_pod_container_resource_limits{pod=~\".*aibrix.*\", resource=\"memory\", unit=\"byte\"}\n)", + "fields": [ + "utilization" + ], + "filters": [], + "legendFormat": "__auto", + "metric": "tce.container.mem_usage.mt", + "range": true, + "rateDownsampleType": "before_downsample", + "refId": "A", + "shouldComputeMulti": true, + "tags": { + "_psm": "inf.aibrix.gateway", + "cluster": "echo*", + "container_name": "gateway-plugin|envoy" + }, + "tenant": "computation.tce" + } + ], + "title": "gateway memory utilization", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "decbytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 183 + }, + "id": 6, + "options": { + "alertThreshold": true, + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "table", + "placement": "right", + "showLegend": 
true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "12.0.2", + "targets": [ + { + "aggregator": "avg", + "alias": "$tag_cluster-$tag_container_name", + "currentField": "", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "currentTagKey": "", + "currentTagValue": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "downsampleAggregator": "avg", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "downsampleInterval": "30s", + "editorMode": "code", + "expr": "sum by (pod) (\n container_memory_working_set_bytes{pod=~\".*aibrix.*\", container!=\"\", container!=\"POD\"}\n)", + "fields": [ + "used" + ], + "filters": [], + "legendFormat": "__auto", + "metric": "tce.container.mem_usage.mt", + "range": true, + "rateDownsampleType": "before_downsample", + "refId": "A", + "shouldComputeMulti": true, + "tags": { + "_psm": "inf.aibrix.gateway", + "cluster": "echo*", + "container_name": "gateway-plugin|envoy" + }, + "tenant": "computation.tce" + } + ], + "title": "aibrix component memory used", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + 
"thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "decbytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 183 + }, + "id": 7, + "options": { + "alertThreshold": true, + "legend": { + "calcs": [ + "lastNotNull", + "max", + "min" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "12.0.2", + "targets": [ + { + "aggregator": "avg", + "alias": "$tag_cluster-$tag_container_name-rx-bytes", + "currentField": "", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "currentTagKey": "", + "currentTagValue": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "downsampleAggregator": "avg", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "downsampleInterval": "30s", + "editorMode": "code", + "expr": "sum by (pod) (\n rate(container_network_receive_bytes_total{pod=~\".*aibrix.*\"}[5m])\n)", + "fields": [ + "rx_bytes" + ], + "filters": [], + "legendFormat": "{{pod}}-receive-bytes", + "metric": "tce.container.net_tcp.mt", + "range": true, + "rateDownsampleType": "before_downsample", + "refId": "A", + "shouldComputeMulti": true, + "tags": { + "_psm": "inf.aibrix.gateway", + "cluster": "echo*", + "container_name": "gateway-plugin|envoy" + }, + "tenant": "computation.tce" + }, + { + "aggregator": "avg", + "alias": "$tag_cluster-tx-bytes", + "currentField": "", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "currentTagKey": "", + "currentTagValue": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + 
}, + "downsampleAggregator": "avg", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "downsampleInterval": "30s", + "editorMode": "code", + "expr": "sum by (pod) (\n rate(container_network_transmit_bytes_total{pod=~\".*aibrix.*\"}[5m])\n)", + "fields": [ + "tx_bytes" + ], + "filters": [], + "hide": false, + "legendFormat": "{{pod}}-send-bytes", + "metric": "tce.container.net_tcp.mt", + "range": true, + "rateDownsampleType": "before_downsample", + "refId": "B", + "shouldComputeMulti": true, + "tags": { + "_psm": "inf.aibrix.gateway", + "cluster": "echo*" + }, + "tenant": "computation.tce" + } + ], + "title": "component network", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "decbytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 191 + }, + "id": 62, + "options": { + "alertThreshold": true, + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": 
"multi", + "sort": "none" + } + }, + "pluginVersion": "12.0.2", + "targets": [ + { + "aggregator": "max", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "disableDownsampling": true, + "downsampleAggregator": "avg", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "explicitTags": true, + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "iliteral_or" + }, + { + "filter": "decode", + "groupBy": true, + "tagk": "role", + "type": "literal_or" + } + ], + "metric": "inf.aibrix.nixl_bytes_transferred_p50", + "rateDownsampleType": "before_downsample", + "refId": "P50", + "tenant": "default" + }, + { + "aggregator": "max", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "disableDownsampling": true, + "downsampleAggregator": "avg", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "explicitTags": true, + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "iliteral_or" + }, + { + "filter": "decode", + "groupBy": true, + "tagk": "role", + "type": "literal_or" + } + ], + "hide": false, + "metric": "inf.aibrix.nixl_bytes_transferred_p90", + "rateDownsampleType": "before_downsample", + "refId": "P90", + "tenant": "default" + }, + { + "aggregator": "max", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "disableDownsampling": true, + "downsampleAggregator": "avg", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + 
"downsampleFillPolicyDisabled": false, + "explicitTags": true, + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "iliteral_or" + }, + { + "filter": "decode", + "groupBy": true, + "tagk": "role", + "type": "literal_or" + } + ], + "hide": false, + "metric": "inf.aibrix.nixl_bytes_transferred_p99", + "rateDownsampleType": "before_downsample", + "refId": "P99", + "tenant": "default" + } + ], + "title": "NixlBytesTransferred", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 191 + }, + "id": 63, + "options": { + "alertThreshold": true, + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "12.0.2", + "targets": [ + { + "aggregator": "max", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "datasource": { 
+ "type": "prometheus", + "uid": "ff75su0268000c" + }, + "disableDownsampling": true, + "downsampleAggregator": "avg", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "explicitTags": true, + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "iliteral_or" + }, + { + "filter": "decode", + "groupBy": true, + "tagk": "role", + "type": "literal_or" + } + ], + "metric": "inf.aibrix.nixl_num_descriptors_p50", + "rateDownsampleType": "before_downsample", + "refId": "P50", + "tenant": "default" + }, + { + "aggregator": "max", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "disableDownsampling": true, + "downsampleAggregator": "avg", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "explicitTags": true, + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "iliteral_or" + }, + { + "filter": "decode", + "groupBy": true, + "tagk": "role", + "type": "literal_or" + } + ], + "hide": false, + "metric": "inf.aibrix.nixl_num_descriptors_p90", + "rateDownsampleType": "before_downsample", + "refId": "P90", + "tenant": "default" + }, + { + "aggregator": "max", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "disableDownsampling": true, + "downsampleAggregator": "avg", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "explicitTags": true, + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "iliteral_or" + }, + { + "filter": "decode", + "groupBy": true, + "tagk": "role", + "type": "literal_or" 
+ } + ], + "hide": false, + "metric": "inf.aibrix.nixl_num_descriptors_p99", + "rateDownsampleType": "before_downsample", + "refId": "P99", + "tenant": "default" + } + ], + "title": "NixlNumDescriptors", + "type": "timeseries" } ], - "refresh": "5s", - "schemaVersion": 38, - "style": "dark", - "tags": ["vtc", "metrics", "gateway", "router"], + "preload": false, + "refresh": "1m", + "schemaVersion": 41, + "tags": [], "templating": { "list": [] }, @@ -388,8 +7097,7 @@ }, "timepicker": {}, "timezone": "", - "title": "AIBrix Envoy Gateway Plugins Dashboard", - "uid": "aibrix-envoy-gateway-plugins", - "version": 1, - "weekStart": "" -} + "title": "AIBrix gateway", + "uid": "5rRLZ0zDz", + "version": 46 +} \ No newline at end of file diff --git a/pkg/cache/cache_init.go b/pkg/cache/cache_init.go index e4ba517cb..fccb081bc 100644 --- a/pkg/cache/cache_init.go +++ b/pkg/cache/cache_init.go @@ -17,6 +17,7 @@ limitations under the License. package cache import ( + "container/list" "context" "errors" "fmt" @@ -113,6 +114,9 @@ type Store struct { // KV event management - optional enhancement kvEventManager *KVEventManager + + // Prometheus event queue + promqlJobs chan *Pod } // Get retrieves the cache instance @@ -379,6 +383,7 @@ func InitWithOptions(config *rest.Config, stopCh <-chan struct{}, opts InitOptio // stopCh: Stop signal channel func initMetricsCache(store *Store, stopCh <-chan struct{}) { ticker := time.NewTicker(podMetricRefreshInterval) + store.initPromQLWorker(stopCh) go func() { for { select { @@ -599,3 +604,102 @@ func (s *Store) Close() { // Other cleanup can be added here in the future } + +func (c *Store) enqueuePromQL(pod *Pod) { + if c.promqlJobs == nil { + return + } + // Non-blocking enqueue so slow PromQL queries do not affect the main path. + select { + case c.promqlJobs <- pod: + default: + // Drop when the queue is full (the next pod refresh cycle will enqueue again). 
+ klog.V(5).InfoS("PromQL queue full, dropping promql job", "pod", pod.Name) + } +} + +func (c *Store) initPromQLWorker(stopCh <-chan struct{}) { + if c.prometheusApi == nil { + klog.InfoS("Prometheus API is nil, skip initializing PromQL worker") + return + } + c.promqlJobs = make(chan *Pod, 2*c.podMetricsWorkerCount) + go c.promQueryLoop(stopCh) +} + +func (c *Store) promQueryLoop(stopCh <-chan struct{}) { + ticker := time.NewTicker(promQueryInterval) + defer ticker.Stop() + + // pendingPods keeps at most one pending job per pod key (ns/name). + // If the same pod is enqueued multiple times, we overwrite with the latest *Pod. + pendingPods := make(map[string]*Pod) + + // fifoKeys records the processing order of pending pod keys. + // A key is appended only when it is first seen in pendingPods. + fifoKeys := list.New() + + // Build stable key for dedupe/order. + podKey := func(p *Pod) string { + ns := p.Namespace + if ns == "" && p.Pod != nil { + ns = p.Pod.Namespace + } + return ns + "/" + p.Name + } + + // Helper: enqueue into (pendingPods + fifoKeys) with dedupe. + enqueuePending := func(key string, p *Pod) { + if _, exists := pendingPods[key]; !exists { + fifoKeys.PushBack(key) // first time seen: record order + } + pendingPods[key] = p // always keep latest pod pointer + } + + for { + select { + case <-stopCh: + return + + // Accept pods from worker and deduplicate. + case p := <-c.promqlJobs: + if p == nil || p.Pod == nil || !utils.FilterReadyPod(p.Pod) { + continue + } + key := podKey(p) + if key == "" || key == "/" { + continue + } + enqueuePending(key, p) + + // Every tick, process exactly one pending pod to cap QPS. + case <-ticker.C: + if fifoKeys.Len() == 0 { + continue + } + + // Pop head key (FIFO). + element := fifoKeys.Front() + key := element.Value.(string) + fifoKeys.Remove(element) + + // Get latest pod pointer and mark it as dequeued. + p := pendingPods[key] + delete(pendingPods, key) + + // Pod may become unready while waiting in queue. 
+ if p == nil || p.Pod == nil || !utils.FilterReadyPod(p.Pod) { + continue + } + + ctx, cancel := context.WithTimeout(context.Background(), promQueryTimeout) + err := c.updateMetricFromPromQL(ctx, p) + cancel() + + if err != nil { + // Best-effort retry: put it back to the tail. + enqueuePending(key, p) + } + } + } +} diff --git a/pkg/cache/cache_metrics.go b/pkg/cache/cache_metrics.go index 2b4be2554..482cc1d88 100644 --- a/pkg/cache/cache_metrics.go +++ b/pkg/cache/cache_metrics.go @@ -42,6 +42,8 @@ const ( defaultEngineLabelValue = "vllm" defaultPodMetricRefreshIntervalInMS = 50 defaultPodMetricsWorkerCount = 10 + defaultPromQueryIntervalInMS = 200 + defaultPromQueryTimeoutInMS = 2000 ) var ( @@ -82,6 +84,7 @@ var ( metrics.RequestDecodeTimeSeconds, metrics.RequestPrefillTimeSeconds, metrics.HTTPRequestDurationSeconds, + metrics.PerStageReqLatencySeconds, metrics.HTTPRequestDurationHighRSeconds, metrics.RequestPromptTokens, metrics.RequestGenerationTokens, @@ -113,6 +116,8 @@ var ( metrics.RunningLoraAdapters, } podMetricRefreshInterval = time.Duration(utils.LoadEnvInt("AIBRIX_POD_METRIC_REFRESH_INTERVAL_MS", defaultPodMetricRefreshIntervalInMS)) * time.Millisecond + promQueryInterval = time.Duration(utils.LoadEnvInt("AIBRIX_PROMETHEUS_QUERY_INTERVAL_MS", defaultPromQueryIntervalInMS)) * time.Millisecond + promQueryTimeout = time.Duration(utils.LoadEnvInt("AIBRIX_PROMETHEUS_QUERY_TIMEOUT_MS", defaultPromQueryTimeoutInMS)) * time.Millisecond ) // MetricSnapshot represents a metric value at a specific timestamp @@ -247,12 +252,11 @@ func (c *Store) worker(jobs <-chan *Pod) { continue } - podLabelNames, podLabelValues := buildMetricLabels(pod, engineType, "") for metricName, metricValue := range result.Metrics { if shouldSkipMetric(pod.Name, metricName) { continue } - metrics.EmitMetricToPrometheus(metricName, metricValue, podLabelNames, podLabelValues) + metrics.EmitMetricToPrometheus(&types.RoutingContext{Model: ""}, pod.Pod, metricName, metricValue, 
metricValue.GetLabelValues()) } for metricName, metricValue := range result.ModelMetrics { @@ -267,8 +271,6 @@ func (c *Store) worker(jobs <-chan *Pod) { continue } - labelNames, labelValues := buildMetricLabels(pod, engineType, model) - var rateMetricName string if strings.Contains(pod.Name, "prefill") && metric == metrics.PromptTokenTotal { rateMetricName = metrics.AvgPromptThroughputToksPerS @@ -279,20 +281,19 @@ func (c *Store) worker(jobs <-chan *Pod) { perSecRate := c.calculatePerSecondRate(pod, model, metric, metricValue.GetSimpleValue()) if perSecRate >= 0 { rateValue := &metrics.SimpleMetricValue{Value: perSecRate} - metrics.SetGaugeMetric(rateMetricName, metrics.GetMetricHelp(rateMetricName), rateValue.GetSimpleValue(), labelNames, labelValues...) + metrics.EmitMetricToPrometheus(&types.RoutingContext{Model: model}, pod.Pod, rateMetricName, rateValue, metricValue.GetLabelValues()) _ = c.updatePodRecord(pod, model, rateMetricName, metrics.PodModelMetricScope, rateValue) klog.V(4).InfoS("get metric per sec rate", "metric", rateMetricName, "raw_value", metricValue.GetSimpleValue(), "per_sec_rate", rateValue.GetSimpleValue()) } } - - metrics.EmitMetricToPrometheus(metric, metricValue, labelNames, labelValues) + metrics.EmitMetricToPrometheus(&types.RoutingContext{Model: model}, pod.Pod, metric, metricValue, metricValue.GetLabelValues()) } // Update pod metrics using typed results c.updatePodMetricsFromTypedResult(pod, result) // Handle Prometheus-based metrics separately (these require PromQL queries) if c.prometheusApi != nil { - c.updateMetricFromPromQL(ctx, pod) + c.enqueuePromQL(pod) } else { klog.V(4).InfoS("Prometheus API not initialized, skipping PromQL metrics", "pod", pod.Name) } @@ -308,7 +309,7 @@ func (c *Store) worker(jobs <-chan *Pod) { } } -func (c *Store) updateMetricFromPromQL(ctx context.Context, pod *Pod) { +func (c *Store) updateMetricFromPromQL(ctx context.Context, pod *Pod) (queryErr error) { podName := pod.Name podMetricPort := 
getPodMetricPort(pod) for _, metricName := range prometheusMetricNames { @@ -325,6 +326,9 @@ func (c *Store) updateMetricFromPromQL(ctx context.Context, pod *Pod) { err := c.queryUpdatePromQLMetrics(ctx, metric, queryLabels, pod, "", metricName, podMetricPort) if err != nil { klog.V(4).Infof("Failed to query and update PromQL metrics: %v", err) + if queryErr == nil { + queryErr = err + } continue } } else if scope == metrics.PodModelMetricScope { @@ -334,6 +338,9 @@ func (c *Store) updateMetricFromPromQL(ctx context.Context, pod *Pod) { err := c.queryUpdatePromQLMetrics(ctx, metric, queryLabels, pod, modelName, metricName, podMetricPort) if err != nil { klog.V(4).Infof("Failed to query and update PromQL metrics: %v", err) + if queryErr == nil { + queryErr = err + } continue } } @@ -344,6 +351,7 @@ func (c *Store) updateMetricFromPromQL(ctx context.Context, pod *Pod) { klog.V(4).Infof("Scope %v is not supported", scope) } } + return queryErr } func (c *Store) queryUpdatePromQLMetrics(ctx context.Context, metric metrics.Metric, queryLabels map[string]string, pod *Pod, modelName string, metricName string, podMetricPort int) error { @@ -352,7 +360,7 @@ func (c *Store) queryUpdatePromQLMetrics(ctx context.Context, metric metrics.Met // Querying metrics result, warnings, err := c.prometheusApi.Query(ctx, query, time.Now()) if err != nil { - metrics.EmitCounterMetric(&types.RoutingContext{Model: modelName}, pod.Pod, metrics.PrometheusQueryFail, 1.0, nil) + metrics.EmitMetricToPrometheus(&types.RoutingContext{Model: modelName}, pod.Pod, metrics.PrometheusQueryFail, &metrics.SimpleMetricValue{Value: 1.0}, nil) // Skip this model fetching if an error is thrown return fmt.Errorf("error executing query: %v", err) } diff --git a/pkg/cache/cache_metrics_test.go b/pkg/cache/cache_metrics_test.go index 650848f02..43143e3b3 100644 --- a/pkg/cache/cache_metrics_test.go +++ b/pkg/cache/cache_metrics_test.go @@ -26,6 +26,7 @@ import ( "github.com/prometheus/client_golang/prometheus" 
"github.com/stretchr/testify/require" "github.com/vllm-project/aibrix/pkg/metrics" + "github.com/vllm-project/aibrix/pkg/types" v1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/client-go/rest" @@ -153,10 +154,13 @@ func TestEmitMetricToPrometheus_GaugeAndCounter(t *testing.T) { }{name: name, value: value}) } - labels := []string{"pod"} - values := []string{"p1"} - - metrics.EmitMetricToPrometheus(metrics.NumRequestsRunning, &metrics.SimpleMetricValue{Value: 3}, labels, values) + pod := &v1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "p1", + Namespace: "ns1", + }, + } + metrics.EmitMetricToPrometheus(&types.RoutingContext{Model: ""}, pod, metrics.NumRequestsRunning, &metrics.SimpleMetricValue{Value: 3}, nil) require.Len(t, gaugeCalls, 1) require.Equal(t, metrics.NumRequestsRunning, gaugeCalls[0].name) require.Equal(t, 3.0, gaugeCalls[0].value) @@ -189,7 +193,13 @@ func TestEmitMetricToPrometheus_HistogramAlsoEmitsQuantiles(t *testing.T) { "+Inf": 2, }, } - metrics.EmitMetricToPrometheus(metrics.TimeToFirstTokenSeconds, hv, []string{"pod"}, []string{"p1"}) + pod := &v1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "p1", + Namespace: "ns1", + }, + } + metrics.EmitMetricToPrometheus(&types.RoutingContext{Model: ""}, pod, metrics.TimeToFirstTokenSeconds, hv, nil) require.Contains(t, gaugeMetricNames, metrics.TimeToFirstTokenSeconds+"_p50") require.Contains(t, gaugeMetricNames, metrics.TimeToFirstTokenSeconds+"_p90") diff --git a/pkg/cache/kvcache/event_types.go b/pkg/cache/kvcache/event_types.go index a69beb519..6d7f21c1e 100644 --- a/pkg/cache/kvcache/event_types.go +++ b/pkg/cache/kvcache/event_types.go @@ -82,11 +82,16 @@ type KVEvent interface { // ------------------------------------------------------------ // // lora_id and medium are unused for now. 
+// +// Note: BlockHashes are converted at decode time: +// - vLLM legacy format (int64) → stored as-is +// - vLLM new format (32-byte SHA-256 from PR #23673) → first 8 bytes converted to int64 +// This ensures internal consistency and compatibility with existing code. type BlockStoredEvent struct { _ struct{} `msgpack:",array"` // msgspec array encoding Type EventType `msgpack:"-"` - BlockHashes []int64 - ParentBlockHash *int64 + BlockHashes []int64 // Decoded from vLLM, supports both old and new formats + ParentBlockHash *int64 // Decoded from vLLM, supports both old and new formats TokenIDs [][]byte // NOTE: These are NOT part of msgpack @@ -110,10 +115,14 @@ func (e *BlockStoredEvent) setPodName(name string) { e.PodName = name } // ------------------------------------------------------------ // // lora_id is unused for now. +// +// Note: BlockHashes are converted at decode time: +// - vLLM legacy format (int64) → stored as-is +// - vLLM new format (32-byte SHA-256 from PR #23673) → first 8 bytes converted to int64 type BlockRemovedEvent struct { _ struct{} `msgpack:",array"` Type EventType `msgpack:"-"` - BlockHashes []int64 + BlockHashes []int64 // Decoded from vLLM, supports both old and new formats // NOTE: These are NOT part of msgpack Timestamp time.Time `msgpack:"-"` diff --git a/pkg/cache/kvcache/msgpack_decoder.go b/pkg/cache/kvcache/msgpack_decoder.go index 471c315af..5cc72e374 100644 --- a/pkg/cache/kvcache/msgpack_decoder.go +++ b/pkg/cache/kvcache/msgpack_decoder.go @@ -21,6 +21,7 @@ import ( "time" msgpack "github.com/vmihailenco/msgpack/v5" + "k8s.io/klog/v2" ) // DecodeEventBatch parses a raw msgpack batch of events. 
@@ -35,7 +36,15 @@ func DecodeEventBatch( if err := msgpack.Unmarshal(data, &rawBatch); err != nil { return nil, fmt.Errorf("failed to unmarshal event batch: %w", err) } - if len(rawBatch) != 2 { + // if size of rawBatch is 3, the third element is the data parallel rank + // data_parallel_rank is not used in aibrix now + if len(rawBatch) == 3 { + if data_parallel_rank, err := parseInt(rawBatch[2]); err != nil { + return nil, fmt.Errorf("data_parallel_rank is not an int: %T", rawBatch[2]) + } else { + klog.V(4).Infof("event has data_parallel_rank: %d", data_parallel_rank) + } + } else if len(rawBatch) != 2 { return nil, fmt.Errorf("expected 2 elements in batch (ts, events), got %d", len(rawBatch)) } @@ -97,13 +106,13 @@ func parseEventArray(arr []interface{}) (KVEvent, error) { } // 1: block_hashes - blockHashes, err := toInt64Slice(arr[1]) + blockHashes, err := toBlockHashSlice(arr[1]) if err != nil { return nil, fmt.Errorf("invalid block_hashes: %w", err) } // 2: parent_block_hash - parentHash, err := toInt64Ptr(arr[2]) + parentHash, err := toBlockHashPtr(arr[2]) if err != nil { return nil, fmt.Errorf("invalid parent_block_hash: %w", err) } @@ -148,7 +157,7 @@ func parseEventArray(arr []interface{}) (KVEvent, error) { return nil, fmt.Errorf("BlockRemoved expects ≥2 fields, got %d", len(arr)) } - blockHashes, err := toInt64Slice(arr[1]) + blockHashes, err := toBlockHashSlice(arr[1]) if err != nil { return nil, fmt.Errorf("invalid block_hashes: %w", err) } @@ -190,6 +199,128 @@ func applyBatchMetadata(evt KVEvent, ts time.Time, model, pod string) { } } +// toBlockHashSlice converts block_hashes field to []int64. +// Supports both legacy int64 format and new bytes format from vLLM PR #23673. +// This function handles the conversion at decode time, keeping the rest of the codebase simple. 
+func toBlockHashSlice(v any) ([]int64, error) { + raw, ok := v.([]interface{}) + if !ok { + return nil, fmt.Errorf("expected []interface{}, got %T", v) + } + + out := make([]int64, len(raw)) + for i, x := range raw { + hash, err := parseBlockHashToInt64(x) + if err != nil { + return nil, fmt.Errorf("block_hashes[%d]: %w", i, err) + } + out[i] = hash + } + return out, nil +} + +// bytesToInt64 converts a byte array to int64 using big-endian encoding. +// If the byte array is shorter than 8 bytes, it pads with leading zeros. +func bytesToInt64(b []byte) int64 { + if len(b) >= 8 { + // Use first 8 bytes for both 8-byte and 32-byte formats + return int64(binary.BigEndian.Uint64(b[:8])) + } + // Unexpected short byte array: pad with leading zeros for big-endian + padded := make([]byte, 8) + copy(padded[8-len(b):], b) + return int64(binary.BigEndian.Uint64(padded)) +} + +// parseBlockHashToInt64 parses a single block hash and converts it to int64. +// Supports: +// 1. int64 types (legacy format from old vLLM) → used directly +// 2. []byte (new format from vLLM PR #23673): +// - 8 bytes: big-endian int64 +// - 32 bytes: SHA-256, uses first 8 bytes +// +// 3. 
string (msgpack may decode bytes as string) → same as []byte +// +// Using the first 8 bytes of SHA-256 provides sufficient uniqueness: +// - Collision probability ≈ 1/2^64 ≈ 10^-19 (extremely low) +// - In typical scenarios (thousands to millions of blocks), collisions are virtually impossible +func parseBlockHashToInt64(v any) (int64, error) { + switch x := v.(type) { + case []byte: + return bytesToInt64(x), nil + + case string: + // msgpack may decode bytes as string + return bytesToInt64([]byte(x)), nil + + // Legacy format: integer types → convert to int64 + case int64: + return x, nil + + case uint64: + return int64(x), nil + + case int: + return int64(x), nil + + case uint: + return int64(x), nil + + case int8: + return int64(x), nil + + case int16: + return int64(x), nil + + case int32: + return int64(x), nil + + case uint8: + return int64(x), nil + + case uint16: + return int64(x), nil + + case uint32: + return int64(x), nil + + // Floating-point types (for backward compatibility with msgpack decoding) + case float32: + f := float64(x) + if f < math.MinInt64 || f > math.MaxInt64 { + return 0, fmt.Errorf("float32 out of int64 range: %f", f) + } + if f != math.Trunc(f) { + return 0, fmt.Errorf("float32 has fractional part: %f", f) + } + return int64(f), nil + + case float64: + if x < math.MinInt64 || x > math.MaxInt64 { + return 0, fmt.Errorf("float64 out of int64 range: %f", x) + } + if x != math.Trunc(x) { + return 0, fmt.Errorf("float64 has fractional part: %f", x) + } + return int64(x), nil + + default: + return 0, fmt.Errorf("unsupported block hash type: %T", v) + } +} + +// toBlockHashPtr converts a single block hash (can be nil) to *int64 +func toBlockHashPtr(v any) (*int64, error) { + if v == nil { + return nil, nil + } + hash, err := parseBlockHashToInt64(v) + if err != nil { + return nil, err + } + return &hash, nil +} + func toInt64Slice(v any) ([]int64, error) { raw, ok := v.([]interface{}) if !ok { diff --git 
a/pkg/cache/kvcache/msgpack_decoder_test.go b/pkg/cache/kvcache/msgpack_decoder_test.go index e3b37a9b9..d7b16ed65 100644 --- a/pkg/cache/kvcache/msgpack_decoder_test.go +++ b/pkg/cache/kvcache/msgpack_decoder_test.go @@ -21,6 +21,7 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + msgpack "github.com/vmihailenco/msgpack/v5" ) func TestBlockStoredEventEncodeDecode(t *testing.T) { @@ -225,3 +226,149 @@ func TestMultipleEventsInBatch_MixedEvents(t *testing.T) { } } } + +func TestBlockHashesAsBytesInDecodeEventBatch(t *testing.T) { + // Test DecodeEventBatch with BlockHashes as [][]byte format + // This simulates the new vLLM format where block hashes are sent as bytes + ts := time.Now().UTC() + + // Construct msgpack data manually with BlockHashes as [][]byte + // Format: [timestamp, [event_array]] + // event_array for BlockStored: ["block_stored", block_hashes, parent_hash, token_ids, block_size] + + // Create block hash bytes (8-byte big-endian int64) + hash1 := make([]byte, 8) + binary.BigEndian.PutUint64(hash1, uint64(12345)) + + hash2 := make([]byte, 8) + binary.BigEndian.PutUint64(hash2, uint64(67890)) + + parentHash := make([]byte, 8) + binary.BigEndian.PutUint64(parentHash, uint64(99999)) + + // Create a BlockStored event with bytes format + eventArray := []interface{}{ + "BlockStored", + []interface{}{hash1, hash2}, // block_hashes as [][]byte + parentHash, // parent_block_hash as []byte + []interface{}{uint32(1), uint32(2), uint32(3), uint32(4)}, // token_ids + int(2), // block_size + } + + batch := []interface{}{ + float64(ts.Unix()), // timestamp as float64 + []interface{}{eventArray}, // events + } + + data, err := msgpack.Marshal(batch) + require.NoError(t, err) + + // Decode + decoded, err := DecodeEventBatch(data, "test-model", "test-pod") + require.NoError(t, err) + require.Len(t, decoded.Events, 1) + + // Verify BlockStoredEvent + stored, ok := decoded.Events[0].(*BlockStoredEvent) + require.True(t, ok, 
"decoded event is not BlockStoredEvent") + + // Verify block hashes were correctly converted from []byte to int64 + assert.Equal(t, int64(12345), stored.BlockHashes[0]) + assert.Equal(t, int64(67890), stored.BlockHashes[1]) + + // Verify parent hash + require.NotNil(t, stored.ParentBlockHash) + assert.Equal(t, int64(99999), *stored.ParentBlockHash) + + // Verify token IDs + require.Len(t, stored.TokenIDs, 2) // 4 tokens / block_size(2) = 2 blocks + for i, block := range stored.TokenIDs { + for j := 0; j < len(block); j += 4 { + val := binary.BigEndian.Uint32(block[j : j+4]) + expectedVal := uint32(i*2 + j/4 + 1) // 1,2 for first block, 3,4 for second + assert.Equal(t, expectedVal, val, + "token mismatch at block %d index %d", i, j/4) + } + } + + // Verify metadata + assert.Equal(t, "test-model", stored.ModelName) + assert.Equal(t, "test-pod", stored.PodName) +} + +func TestBlockHashesAsSHA256BytesInDecodeEventBatch(t *testing.T) { + // Test DecodeEventBatch with 32-byte SHA-256 hashes + // The decoder should use the first 8 bytes + ts := time.Now().UTC() + + // Create a 32-byte SHA-256 hash + sha256Hash := make([]byte, 32) + for i := 0; i < 32; i++ { + sha256Hash[i] = byte(i) + } + + // Expected: first 8 bytes converted to int64 + expectedHash := int64(binary.BigEndian.Uint64(sha256Hash[:8])) + + eventArray := []interface{}{ + "BlockStored", + []interface{}{sha256Hash}, // 32-byte hash + nil, // no parent + []interface{}{uint32(1), uint32(2)}, // token_ids + int(2), // block_size + } + + batch := []interface{}{ + float64(ts.Unix()), // timestamp as float64 + []interface{}{eventArray}, + } + + data, err := msgpack.Marshal(batch) + require.NoError(t, err) + + decoded, err := DecodeEventBatch(data, "sha256-model", "sha256-pod") + require.NoError(t, err) + require.Len(t, decoded.Events, 1) + + stored, ok := decoded.Events[0].(*BlockStoredEvent) + require.True(t, ok) + + assert.Len(t, stored.BlockHashes, 1) + assert.Equal(t, expectedHash, stored.BlockHashes[0]) + 
assert.Nil(t, stored.ParentBlockHash) +} + +func TestBlockRemovedEventWithBytesHashes(t *testing.T) { + // Test BlockRemovedEvent with block hashes as bytes + ts := time.Now().UTC() + + hash1 := make([]byte, 8) + binary.BigEndian.PutUint64(hash1, uint64(111)) + + hash2 := make([]byte, 8) + binary.BigEndian.PutUint64(hash2, uint64(222)) + + eventArray := []interface{}{ + "BlockRemoved", + []interface{}{hash1, hash2}, // block_hashes as bytes + } + + batch := []interface{}{ + float64(ts.Unix()), // timestamp as float64 + []interface{}{eventArray}, + } + + data, err := msgpack.Marshal(batch) + require.NoError(t, err) + + decoded, err := DecodeEventBatch(data, "removed-model", "removed-pod") + require.NoError(t, err) + require.Len(t, decoded.Events, 1) + + removed, ok := decoded.Events[0].(*BlockRemovedEvent) + require.True(t, ok) + + assert.Equal(t, []int64{111, 222}, removed.BlockHashes) + assert.Equal(t, "removed-model", removed.ModelName) + assert.Equal(t, "removed-pod", removed.PodName) +} diff --git a/pkg/cache/kvcache/msgpack_encoder.go b/pkg/cache/kvcache/msgpack_encoder.go index c363c3a1d..4a832c7b8 100644 --- a/pkg/cache/kvcache/msgpack_encoder.go +++ b/pkg/cache/kvcache/msgpack_encoder.go @@ -75,7 +75,7 @@ func encodeEvent(event KVEvent) ([]interface{}, error) { arr := []interface{}{ string(e.Type), // tag e.BlockHashes, // block_hashes - e.ParentBlockHash, // parent_block_hash (nullable) + e.ParentBlockHash, // parent_block_hash (nullable *[]byte) tokenIDs, // flat token IDs blockSize, // block_size } diff --git a/pkg/cache/kvcache/zmq_client.go b/pkg/cache/kvcache/zmq_client.go index 22c3371ba..7fbada3f1 100644 --- a/pkg/cache/kvcache/zmq_client.go +++ b/pkg/cache/kvcache/zmq_client.go @@ -377,11 +377,13 @@ func (c *ZMQClient) requestReplay(fromSeq int64) error { } // Prepare replay request + // DEALER-ROUTER pattern requires: [empty_delimiter, payload] + // The DEALER socket will automatically prepend the identity frame reqData := make([]byte, 8) 
binary.BigEndian.PutUint64(reqData, uint64(fromSeq)) - // Send replay request - if _, err := socket.SendBytes(reqData, 0); err != nil { + // Send replay request as multipart message: [empty_delimiter, start_seq_bytes] + if _, err := socket.SendMessage([]byte{}, reqData); err != nil { return fmt.Errorf("failed to send replay request: %w", err) } diff --git a/pkg/cache/utils.go b/pkg/cache/utils.go index ae3124193..69df58aae 100644 --- a/pkg/cache/utils.go +++ b/pkg/cache/utils.go @@ -76,6 +76,69 @@ func buildMetricLabels(pod *Pod, engineType string, model string) ([]string, []s return labelNames, labelValues } +func mergeLabelPairs(primaryNames, primaryValues, secondaryNames, secondaryValues []string) ([]string, []string) { + pLen := len(primaryNames) + if len(primaryValues) < pLen { + klog.Warningf("primary labels length mismatch: names=%d, values=%d", pLen, len(primaryValues)) + pLen = len(primaryValues) + } + sLen := len(secondaryNames) + if len(secondaryValues) < sLen { + klog.Warningf("secondary labels length mismatch: names=%d, values=%d", sLen, len(secondaryValues)) + sLen = len(secondaryValues) + } + + secondaryMap := make(map[string]string, sLen) + secondaryOrder := make([]string, 0, sLen) + for i := 0; i < sLen; i++ { + n := secondaryNames[i] + if n == "" { + continue + } + if _, exists := secondaryMap[n]; !exists { + secondaryOrder = append(secondaryOrder, n) + } + secondaryMap[n] = secondaryValues[i] // last-wins + } + + outNames := make([]string, 0, pLen+len(secondaryOrder)) + outValues := make([]string, 0, pLen+len(secondaryOrder)) + seen := make(map[string]struct{}, pLen+len(secondaryOrder)) + + // primary first, but allow secondary override + for i := 0; i < pLen; i++ { + n := primaryNames[i] + if n == "" { + continue + } + if _, ok := seen[n]; ok { + continue + } + seen[n] = struct{}{} + v := primaryValues[i] + if sv, ok := secondaryMap[n]; ok { + v = sv + } + outNames = append(outNames, n) + outValues = append(outValues, v) + } + + // then add 
secondary-only labels (use map value to respect last-wins) + for _, n := range secondaryOrder { + if n == "" { + continue + } + if _, ok := seen[n]; ok { + continue + } + seen[n] = struct{}{} + outNames = append(outNames, n) + outValues = append(outValues, secondaryMap[n]) + } + + return outNames, outValues +} + func shouldSkipMetric(podName string, metricName string) bool { if strings.Contains(podName, "prefill") && isDecodeOnlyMetric(metricName) { return true diff --git a/pkg/cache/utils_test.go b/pkg/cache/utils_test.go new file mode 100644 index 000000000..71d910332 --- /dev/null +++ b/pkg/cache/utils_test.go @@ -0,0 +1,52 @@ +/* +Copyright 2025 The Aibrix Team. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package cache + +import ( + "testing" + + "github.com/prometheus/client_golang/prometheus" + "github.com/stretchr/testify/require" +) + +func TestMergeLabelPairs_DedupAndPreferSecondaryValue(t *testing.T) { + primaryNames := []string{"engine_type", "model_name"} + primaryValues := []string{"from_engine", "m1"} + secondaryNames := []string{"namespace", "engine_type", "pod"} + secondaryValues := []string{"ns1", "vllm", "p1"} + + mergedNames, mergedValues := mergeLabelPairs(primaryNames, primaryValues, secondaryNames, secondaryValues) + require.Equal(t, []string{"engine_type", "model_name", "namespace", "pod"}, mergedNames) + require.Equal(t, []string{"vllm", "m1", "ns1", "p1"}, mergedValues) + + seen := make(map[string]struct{}, len(mergedNames)) + for _, n := range mergedNames { + _, ok := seen[n] + require.False(t, ok) + seen[n] = struct{}{} + } + + descDup := prometheus.NewDesc("num_requests_running", "help", []string{"engine_type", "engine_type"}, nil) + require.Panics(t, func() { + _ = prometheus.MustNewConstMetric(descDup, prometheus.GaugeValue, 1, "a", "b") + }) + + descMerged := prometheus.NewDesc("num_requests_running", "help", mergedNames, nil) + require.NotPanics(t, func() { + _ = prometheus.MustNewConstMetric(descMerged, prometheus.GaugeValue, 1, mergedValues...) + }) +} diff --git a/pkg/constants/model.go b/pkg/constants/model.go index e2b1a3082..b988889d0 100644 --- a/pkg/constants/model.go +++ b/pkg/constants/model.go @@ -45,4 +45,9 @@ const ( // ModelAnnoRouterCustomPath is the anno for add PathPrefixes in httpRoute, split by comma // Example: "model.aibrix.ai/model-router-custom-paths": "/score,/version" ModelAnnoRouterCustomPath = "model.aibrix.ai/model-router-custom-paths" + + // ModelAnnoConfig is the annotation holding JSON model config with multiple profiles. + // Client selects profile at runtime via config-profile header or defaultProfile is selected. + // See docs/source/designs/model-config-profiles.rst for schema. 
+ ModelAnnoConfig = "model.aibrix.ai/config" ) diff --git a/pkg/kvevent/handler.go b/pkg/kvevent/handler.go index 1774c8b96..ec6aab47f 100644 --- a/pkg/kvevent/handler.go +++ b/pkg/kvevent/handler.go @@ -77,6 +77,7 @@ func (h *eventHandler) handleBlockStored(ctx context.Context, event *kvcache.Blo } // Convert to sync event + // Note: BlockHashes are already []int64 after msgpack decoding syncEvent := BlockStoredEvent{ BlockHashes: event.BlockHashes, ModelName: h.modelName, @@ -110,6 +111,7 @@ func (h *eventHandler) handleBlockRemoved(ctx context.Context, event *kvcache.Bl } // Convert to sync event + // Note: BlockHashes are already []int64 after msgpack decoding syncEvent := BlockRemovedEvent{ BlockHashes: event.BlockHashes, ModelName: h.modelName, diff --git a/pkg/kvevent/handler_test.go b/pkg/kvevent/handler_test.go index 702800fc9..331496482 100644 --- a/pkg/kvevent/handler_test.go +++ b/pkg/kvevent/handler_test.go @@ -26,6 +26,15 @@ import ( "github.com/vllm-project/aibrix/pkg/cache/kvcache" ) +// Helper function to convert int32 slice to bytes (big-endian) +func int32SliceToBytes(tokens []int32) []byte { + result := make([]byte, len(tokens)*4) + for i, token := range tokens { + binary.BigEndian.PutUint32(result[i*4:], uint32(token)) + } + return result +} + // mockSyncIndexerWithErrors allows simulating errors type mockSyncIndexerWithErrors struct { blockStoredErr error @@ -62,86 +71,6 @@ func (m *mockSyncProvider) GetSyncIndexer(ctx context.Context) (SyncIndexer, err return m.indexer, nil } -// Test tokenIDsToBytes conversion -func TestTokenIDsToBytes(t *testing.T) { - tests := []struct { - name string - tokenIDs []int32 - expected []byte - }{ - { - name: "empty tokens", - tokenIDs: []int32{}, - expected: []byte{}, - }, - { - name: "single token", - tokenIDs: []int32{12345}, - expected: []byte{0, 0, 48, 57}, // 12345 in big-endian - }, - { - name: "multiple tokens", - tokenIDs: []int32{1, 256, 65535}, - expected: []byte{ - 0, 0, 0, 1, // 1 - 0, 0, 1, 0, // 
256 - 0, 0, 255, 255, // 65535 - }, - }, - { - name: "negative token", - tokenIDs: []int32{-1}, - expected: []byte{255, 255, 255, 255}, // -1 in two's complement - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - result := tokenIDsToBytes(tt.tokenIDs) - if len(result) != len(tt.expected) { - t.Fatalf("Expected length %d, got %d", len(tt.expected), len(result)) - } - for i := range result { - if result[i] != tt.expected[i] { - t.Errorf("Byte %d: expected %d, got %d", i, tt.expected[i], result[i]) - } - } - }) - } -} - -// Test convertTokenIDs -func TestConvertTokenIDs(t *testing.T) { - input := [][]int32{ - {1, 2, 3}, - {}, - {12345}, - } - - result := convertTokenIDs(input) - - if len(result) != 3 { - t.Fatalf("Expected 3 results, got %d", len(result)) - } - - // Check first array - expected0 := []byte{0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 3} - if len(result[0]) != len(expected0) { - t.Errorf("Result[0] length mismatch") - } - - // Check second array (empty) - if len(result[1]) != 0 { - t.Errorf("Result[1] should be empty") - } - - // Check third array - expected2 := []byte{0, 0, 48, 57} - if len(result[2]) != len(expected2) { - t.Errorf("Result[2] length mismatch") - } -} - // Test HandleEvent with BlockStoredEvent func TestHandleBlockStoredEvent(t *testing.T) { syncIndexer := &mockSyncIndexerWithErrors{} @@ -165,7 +94,10 @@ func TestHandleBlockStoredEvent(t *testing.T) { event := &kvcache.BlockStoredEvent{ BlockHashes: []int64{1001, 1002, 1003}, ParentBlockHash: &[]int64{1000}[0], - TokenIDs: [][]int32{{1, 2, 3}, {4, 5, 6}}, + TokenIDs: [][]byte{ + int32SliceToBytes([]int32{1, 2, 3}), + int32SliceToBytes([]int32{4, 5, 6}), + }, } err := handler.HandleEvent(event) diff --git a/pkg/kvevent/integration_test.go b/pkg/kvevent/integration_test.go index 88f5f5b8a..59f94d735 100644 --- a/pkg/kvevent/integration_test.go +++ b/pkg/kvevent/integration_test.go @@ -19,6 +19,7 @@ package kvevent_test import ( "context" + "encoding/binary" "testing" v1 
"k8s.io/api/core/v1" @@ -31,6 +32,15 @@ import ( syncindexer "github.com/vllm-project/aibrix/pkg/utils/syncprefixcacheindexer" ) +// Helper function to convert int32 slice to bytes (big-endian) +func int32SliceToBytes(tokens []int32) []byte { + result := make([]byte, len(tokens)*4) + for i, token := range tokens { + binary.BigEndian.PutUint32(result[i*4:], uint32(token)) + } + return result +} + // TestIntegrationEventHandlerWithRealComponents tests event handling with real components // This ensures we're not just testing mocks but actual integration between components func TestIntegrationEventHandlerWithRealComponents(t *testing.T) { @@ -79,7 +89,11 @@ func TestIntegrationEventHandlerWithRealComponents(t *testing.T) { storedEvent := &kvcache.BlockStoredEvent{ BlockHashes: []int64{1001, 1002, 1003}, ParentBlockHash: &[]int64{1000}[0], - TokenIDs: [][]int32{{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}, + TokenIDs: [][]byte{ + int32SliceToBytes([]int32{1, 2, 3}), + int32SliceToBytes([]int32{4, 5, 6}), + int32SliceToBytes([]int32{7, 8, 9}), + }, } err := handler.HandleEvent(storedEvent) diff --git a/pkg/kvevent/manager.go b/pkg/kvevent/manager.go index 6779878ef..1b12f8b97 100644 --- a/pkg/kvevent/manager.go +++ b/pkg/kvevent/manager.go @@ -274,7 +274,7 @@ func isPodSubscribable(pod *v1.Pod) bool { func isSamePod(pod1 *v1.Pod, pod2 *v1.Pod) bool { // For now, we just check if PodIP is the same. Other conditions may be added if needed. 
- return pod1.Status.PodIP != pod2.Status.PodIP + return pod1.Status.PodIP == pod2.Status.PodIP } func (m *Manager) subscribeToPod(ctx context.Context, podKey string, podInfo *PodInfo) error { diff --git a/pkg/metrics/custom_metrics.go b/pkg/metrics/custom_metrics.go index 2646faa76..923159602 100644 --- a/pkg/metrics/custom_metrics.go +++ b/pkg/metrics/custom_metrics.go @@ -25,7 +25,6 @@ import ( "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/promauto" - "github.com/vllm-project/aibrix/pkg/constants" "github.com/vllm-project/aibrix/pkg/types" "github.com/vllm-project/aibrix/pkg/utils" v1 "k8s.io/api/core/v1" @@ -38,6 +37,7 @@ var ( customCountersMu sync.RWMutex customHistograms = make(map[string]*histogramCollector) customHistogramsMu sync.RWMutex + gatewayPodName = os.Getenv("POD_NAME") // Function variables that can be overridden for testing SetGaugeMetricFnForTest = defaultSetGaugeMetric @@ -73,7 +73,7 @@ func IncrementCounterMetric(name string, help string, value float64, labelNames IncrementCounterMetricFnForTest(name, help, value, labelNames, labelValues...) } -func EmitGaugeMetric(routingCtx *types.RoutingContext, pod *v1.Pod, name string, value float64, extras map[string]string) { +func emitGaugeMetric(routingCtx *types.RoutingContext, pod *v1.Pod, name string, value float64, extras map[string]string) { var model string if routingCtx == nil { model = "" @@ -81,10 +81,10 @@ func EmitGaugeMetric(routingCtx *types.RoutingContext, pod *v1.Pod, name string, model = routingCtx.Model } labelNames, labelValues := buildMetricLabels(pod, model, extras) - SetGaugeMetricFnForTest(name, GetMetricHelp(name), value, labelNames, labelValues...) + SetGaugeMetric(name, GetMetricHelp(name), value, labelNames, labelValues...) 
} -func EmitCounterMetric(routingCtx *types.RoutingContext, pod *v1.Pod, name string, value float64, extras map[string]string) { +func emitCounterMetric(routingCtx *types.RoutingContext, pod *v1.Pod, name string, value float64, extras map[string]string) { var model string if routingCtx == nil { model = "" @@ -267,18 +267,23 @@ func SetupCounterMetricsForTest(metricName string, labelNames []string) (*promet return testCounter, func() { IncrementCounterMetricFnForTest = originalFn } } -func EmitMetricToPrometheus(metricName string, metricValue MetricValue, labelNames []string, labelValues []string) { +func EmitMetricToPrometheus(routingCtx *types.RoutingContext, pod *v1.Pod, metricName string, metricValue MetricValue, extra map[string]string) { metricDef, exists := Metrics[metricName] if !exists { return } + var model string + if routingCtx != nil { + model = routingCtx.Model + } switch metricDef.MetricType.Raw { case Gauge: - SetGaugeMetric(metricName, GetMetricHelp(metricName), metricValue.GetSimpleValue(), labelNames, labelValues...) + emitGaugeMetric(routingCtx, pod, metricName, metricValue.GetSimpleValue(), extra) case Counter: - SetGaugeMetric(metricName, GetMetricHelp(metricName), metricValue.GetSimpleValue(), labelNames, labelValues...) + emitCounterMetric(routingCtx, pod, metricName, metricValue.GetSimpleValue(), extra) default: + labelNames, labelValues := buildMetricLabels(pod, model, extra) if hv := metricValue.GetHistogramValue(); hv != nil { SetHistogramMetric(metricName, GetMetricHelp(metricName), hv, labelNames, labelValues...) 
p50, _ := hv.GetPercentile(50) @@ -292,13 +297,24 @@ func EmitMetricToPrometheus(metricName string, metricValue MetricValue, labelNam } func buildMetricLabels(pod *v1.Pod, model string, extras map[string]string) ([]string, []string) { - labelNames, labelValues := generateDefaultMetricLabelsMap(pod, model) + defaultLabelMap := generateDefaultMetricLabelsMap(pod, model) + labelNames := make([]string, 0, len(defaultLabelMap)+len(extras)) + labelValues := make([]string, 0, len(defaultLabelMap)+len(extras)) + for k, v := range defaultLabelMap { + labelNames = append(labelNames, k) + labelValues = append(labelValues, v) + } + if len(extras) > 0 { keys := make([]string, 0, len(extras)) for k := range extras { - if k != "" { - keys = append(keys, k) + if k == "" { + continue } + if _, exist := defaultLabelMap[k]; exist { + continue + } + keys = append(keys, k) } sort.Strings(keys) for _, k := range keys { @@ -309,35 +325,21 @@ func buildMetricLabels(pod *v1.Pod, model string, extras map[string]string) ([]s return labelNames, labelValues } -func generateDefaultMetricLabelsMap(pod *v1.Pod, model string) (labelNames []string, labelValues []string) { - labelNames = []string{ - "namespace", - "pod", - "model", - "engine_type", - "roleset", - "role", - "role_replica_index", - "gateway_pod", - } - var namespace, podName, engineType, roleset, role, roleReplica string - if pod != nil { - namespace = pod.Namespace - podName = pod.Name - engineType = utils.GetLLMEngine(pod, constants.ModelLabelEngine, utils.DefaultLLMEngine) - roleset = utils.GetPodEnv(pod, "ROLESET_NAME", "") - role = utils.GetPodEnv(pod, "ROLE_NAME", "") - roleReplica = utils.GetPodEnv(pod, "ROLE_REPLICA_INDEX", "") +func generateDefaultMetricLabelsMap(pod *v1.Pod, model string) map[string]string { + if pod == nil { + return map[string]string{ + "model": model, + "gateway_pod": gatewayPodName, + } } - labelValues = []string{ - namespace, - podName, - model, - engineType, - roleset, - role, - roleReplica, - 
os.Getenv("POD_NAME"), // gateway-plugin pod + return map[string]string{ + "namespace": pod.Namespace, + "pod": pod.Name, + "model": model, + "engine_type": GetEngineType(*pod), + "roleset": utils.GetPodEnv(pod, "ROLESET_NAME", ""), + "role": utils.GetPodEnv(pod, "ROLE_NAME", ""), + "role_replica_index": utils.GetPodEnv(pod, "ROLE_REPLICA_INDEX", ""), + "gateway_pod": gatewayPodName, } - return labelNames, labelValues } diff --git a/pkg/metrics/engine_fetcher.go b/pkg/metrics/engine_fetcher.go index 827726621..ed0ac3468 100644 --- a/pkg/metrics/engine_fetcher.go +++ b/pkg/metrics/engine_fetcher.go @@ -295,11 +295,11 @@ func (ef *EngineMetricsFetcher) parseMetricFromFamily(allMetrics map[string]*dto if metric.MetricType.IsRawMetric() { switch metric.MetricType.Raw { case Gauge, Counter: - value, err := GetCounterGaugeValue(firstMetric, metricFamily.GetType()) + simpleValue, err := GetCounterGaugeValue(firstMetric, metricFamily.GetType()) if err != nil { return nil, fmt.Errorf("failed to parse counter/gauge metric %s: %v", rawMetricName, err) } - return &SimpleMetricValue{Value: value}, nil + return simpleValue, nil case Histogram: histValue, err := GetHistogramValue(firstMetric) @@ -355,7 +355,7 @@ func (ef *EngineMetricsFetcher) fetchAllMetricsFromURL(ctx context.Context, url resp, err := ef.client.Do(req) if err != nil { - EmitCounterMetric(nil, nil, LLMEngineMetricsQueryFail, 1.0, nil) + EmitMetricToPrometheus(nil, nil, LLMEngineMetricsQueryFail, &SimpleMetricValue{Value: 1.0}, nil) return nil, fmt.Errorf("failed to fetch metrics from %s: %v", url, err) } defer func() { diff --git a/pkg/metrics/metrics.go b/pkg/metrics/metrics.go index 1caf8cb82..1014a5a97 100644 --- a/pkg/metrics/metrics.go +++ b/pkg/metrics/metrics.go @@ -29,6 +29,7 @@ const ( E2ERequestLatencySeconds = "e2e_request_latency_seconds" RequestQueueTimeSeconds = "request_queue_time_seconds" RequestInferenceTimeSeconds = "request_inference_time_seconds" + PerStageReqLatencySeconds = 
"per_stage_req_latency_seconds" HTTPRequestDurationSeconds = "http_request_duration_seconds" HTTPRequestDurationHighRSeconds = "http_request_duration_highr_seconds" @@ -237,6 +238,17 @@ var ( }, Description: "Request inference time in seconds", }, + PerStageReqLatencySeconds: { + MetricScope: PodModelMetricScope, + MetricSource: PodRawMetrics, + MetricType: MetricType{ + Raw: Histogram, + }, + EngineMetricsNameMapping: map[string]string{ + "sglang": "sglang:per_stage_req_latency_seconds", + }, + Description: "Per-stage request latency in seconds", + }, HTTPRequestDurationSeconds: { MetricScope: PodMetricScope, MetricSource: PodRawMetrics, diff --git a/pkg/metrics/types.go b/pkg/metrics/types.go index 50852ff2c..a21b3299c 100644 --- a/pkg/metrics/types.go +++ b/pkg/metrics/types.go @@ -92,7 +92,7 @@ type MetricValue interface { GetSimpleValue() float64 GetHistogramValue() *HistogramMetricValue GetPrometheusResult() *model.Value - GetLabelValue() string + GetLabelValues() map[string]string } var _ MetricValue = (*SimpleMetricValue)(nil) @@ -102,7 +102,8 @@ var _ MetricValue = (*LabelValueMetricValue)(nil) // SimpleMetricValue represents simple metrics (e.g., gauge or counter). type SimpleMetricValue struct { - Value float64 + Value float64 + Labels map[string]string // Optional: Additional labels for the metric. } func (s *SimpleMetricValue) GetSimpleValue() float64 { @@ -117,8 +118,8 @@ func (s *SimpleMetricValue) GetPrometheusResult() *model.Value { return nil } -func (s *SimpleMetricValue) GetLabelValue() string { - return "" +func (s *SimpleMetricValue) GetLabelValues() map[string]string { + return s.Labels } // HistogramMetricValue represents a detailed histogram metric. @@ -126,6 +127,7 @@ type HistogramMetricValue struct { Sum float64 Count float64 Buckets map[string]float64 // e.g., {"0.1": 5, "0.5": 3, "1.0": 2} + Labels map[string]string // Optional: Additional labels for the histogram. 
} func (h *HistogramMetricValue) GetSimpleValue() float64 { @@ -230,8 +232,8 @@ func (h *HistogramMetricValue) GetPercentile(percentile float64) (float64, error return 0, fmt.Errorf("percentile not found") } -func (s *HistogramMetricValue) GetLabelValue() string { - return "" +func (h *HistogramMetricValue) GetLabelValues() map[string]string { + return h.Labels } // PrometheusMetricValue represents Prometheus query results. @@ -251,8 +253,8 @@ func (p *PrometheusMetricValue) GetPrometheusResult() *model.Value { return p.Result } -func (s *PrometheusMetricValue) GetLabelValue() string { - return "" +func (s *PrometheusMetricValue) GetLabelValues() map[string]string { + return map[string]string{} } // PrometheusMetricValue represents Prometheus query results. @@ -272,8 +274,8 @@ func (l *LabelValueMetricValue) GetPrometheusResult() *model.Value { return nil } -func (l *LabelValueMetricValue) GetLabelValue() string { - return l.Value +func (l *LabelValueMetricValue) GetLabelValues() map[string]string { + return map[string]string{"value": l.Value} } func ExtractNumericFromPromResult(r *model.Value) (float64, error) { diff --git a/pkg/metrics/utils.go b/pkg/metrics/utils.go index 6993fdc29..7d0d53029 100644 --- a/pkg/metrics/utils.go +++ b/pkg/metrics/utils.go @@ -181,18 +181,25 @@ func GetLabelValueForKey(metric *dto.Metric, key string) (string, error) { return "", fmt.Errorf("Label %s not found", key) } -func GetCounterGaugeValue(metric *dto.Metric, metricType dto.MetricType) (float64, error) { - if metricType == dto.MetricType_COUNTER { - return metric.GetCounter().GetValue(), nil - } else if metricType == dto.MetricType_GAUGE { - return metric.GetGauge().GetValue(), nil +func GetCounterGaugeValue(metric *dto.Metric, metricType dto.MetricType) (*SimpleMetricValue, error) { + labels := make(map[string]string) + for _, labelPair := range metric.Label { + labels[labelPair.GetName()] = labelPair.GetValue() + } + switch metricType { + case dto.MetricType_COUNTER: + return 
&SimpleMetricValue{Value: metric.GetCounter().GetValue(), Labels: labels}, nil + case dto.MetricType_GAUGE: + return &SimpleMetricValue{Value: metric.GetGauge().GetValue(), Labels: labels}, nil + default: + return nil, fmt.Errorf("Metric type not supported: %v", metricType) } - return 0, fmt.Errorf("Metric type not supported: %v", metricType) } func GetHistogramValue(metric *dto.Metric) (*HistogramMetricValue, error) { histogram := &HistogramMetricValue{ Buckets: make(map[string]float64), + Labels: make(map[string]string), } histogramMetric := metric.GetHistogram() if histogramMetric == nil { @@ -205,6 +212,9 @@ func GetHistogramValue(metric *dto.Metric) (*HistogramMetricValue, error) { bound := fmt.Sprintf("%f", bucket.GetUpperBound()) histogram.Buckets[bound] = float64(bucket.GetCumulativeCount()) } + for _, labelPair := range metric.Label { + histogram.Labels[labelPair.GetName()] = labelPair.GetValue() + } return histogram, nil } diff --git a/pkg/plugins/gateway/algorithms/least_kv_cache.go b/pkg/plugins/gateway/algorithms/least_kv_cache.go index fd2898a3e..3680680d0 100644 --- a/pkg/plugins/gateway/algorithms/least_kv_cache.go +++ b/pkg/plugins/gateway/algorithms/least_kv_cache.go @@ -57,7 +57,7 @@ func (r leastKvCacheRouter) Route(ctx *types.RoutingContext, readyPodList types. // Due to metric refactor (pull/543) to better support lora and multi models, // we change to use PodModelMetrics instead of PodMetrics in some scenarios. // This works but doesn't look very promising, we can revisit this part later. 
- gpuCache, err := r.cache.GetMetricValueByPodModel(pod.Name, pod.Namespace, ctx.Model, metrics.GPUCacheUsagePerc) + gpuCache, err := r.cache.GetMetricValueByPodModel(pod.Name, pod.Namespace, ctx.Model, metrics.KVCacheUsagePerc) if err != nil { klog.Error(err) continue diff --git a/pkg/plugins/gateway/algorithms/least_kv_cache_test.go b/pkg/plugins/gateway/algorithms/least_kv_cache_test.go index ac4420dde..bf12009d8 100644 --- a/pkg/plugins/gateway/algorithms/least_kv_cache_test.go +++ b/pkg/plugins/gateway/algorithms/least_kv_cache_test.go @@ -53,19 +53,19 @@ func TestLeastKvCache(t *testing.T) { }, podMetrics: map[string]map[string]metrics.MetricValue{ "p1": { - metrics.GPUCacheUsagePerc: &metrics.SimpleMetricValue{Value: 0.2}, + metrics.KVCacheUsagePerc: &metrics.SimpleMetricValue{Value: 0.2}, metrics.CPUCacheUsagePerc: &metrics.SimpleMetricValue{Value: 0.3}, }, "p2": { - metrics.GPUCacheUsagePerc: &metrics.SimpleMetricValue{Value: 0.1}, + metrics.KVCacheUsagePerc: &metrics.SimpleMetricValue{Value: 0.1}, metrics.CPUCacheUsagePerc: &metrics.SimpleMetricValue{Value: 0.5}, }, "p3": { - metrics.GPUCacheUsagePerc: &metrics.SimpleMetricValue{Value: 0.6}, + metrics.KVCacheUsagePerc: &metrics.SimpleMetricValue{Value: 0.6}, metrics.CPUCacheUsagePerc: &metrics.SimpleMetricValue{Value: 0.6}, }, "p4": { - metrics.GPUCacheUsagePerc: &metrics.SimpleMetricValue{Value: 0.6}, + metrics.KVCacheUsagePerc: &metrics.SimpleMetricValue{Value: 0.6}, metrics.CPUCacheUsagePerc: &metrics.SimpleMetricValue{Value: 0.8}, }, }, @@ -93,15 +93,15 @@ func TestLeastKvCache(t *testing.T) { }, podMetrics: map[string]map[string]metrics.MetricValue{ "p1": { - metrics.GPUCacheUsagePerc: &metrics.SimpleMetricValue{Value: 0.2}, + metrics.KVCacheUsagePerc: &metrics.SimpleMetricValue{Value: 0.2}, metrics.CPUCacheUsagePerc: &metrics.SimpleMetricValue{Value: 0.3}, }, "p2": { - metrics.GPUCacheUsagePerc: &metrics.SimpleMetricValue{Value: 0.5}, + metrics.KVCacheUsagePerc: &metrics.SimpleMetricValue{Value: 
0.5}, metrics.CPUCacheUsagePerc: &metrics.SimpleMetricValue{Value: 0.5}, }, "p3": { - metrics.GPUCacheUsagePerc: &metrics.SimpleMetricValue{Value: 0.3}, + metrics.KVCacheUsagePerc: &metrics.SimpleMetricValue{Value: 0.3}, metrics.CPUCacheUsagePerc: &metrics.SimpleMetricValue{Value: 0.2}, }, }, @@ -120,7 +120,7 @@ func TestLeastKvCache(t *testing.T) { }, podMetrics: map[string]map[string]metrics.MetricValue{ "p1": { - metrics.GPUCacheUsagePerc: &metrics.SimpleMetricValue{Value: 0.2}, + metrics.KVCacheUsagePerc: &metrics.SimpleMetricValue{Value: 0.2}, metrics.CPUCacheUsagePerc: &metrics.SimpleMetricValue{Value: 0.3}, }, }, diff --git a/pkg/plugins/gateway/algorithms/pd_disaggregation.go b/pkg/plugins/gateway/algorithms/pd_disaggregation.go index 4b631f210..ca4881352 100644 --- a/pkg/plugins/gateway/algorithms/pd_disaggregation.go +++ b/pkg/plugins/gateway/algorithms/pd_disaggregation.go @@ -33,6 +33,7 @@ import ( "github.com/vllm-project/aibrix/pkg/cache" "github.com/vllm-project/aibrix/pkg/constants" "github.com/vllm-project/aibrix/pkg/metrics" + "github.com/vllm-project/aibrix/pkg/plugins/gateway/configprofiles" "github.com/vllm-project/aibrix/pkg/types" "github.com/vllm-project/aibrix/pkg/utils" "github.com/vllm-project/aibrix/pkg/utils/prefixcacheindexer" @@ -50,11 +51,10 @@ const ( LLMEngineIdentifier string = constants.ModelLabelEngine PDRoleSetIdentifier string = "roleset-name" PDRoleIdentifier string = "role-name" - CombinedIdentifier string = "model.aibrix.ai/combined" RoleReplicaIndex string = "stormservice.orchestration.aibrix.ai/role-replica-index" PodGroupIndex string = "stormservice.orchestration.aibrix.ai/pod-group-index" - PromptMinLength string = "prompt-min-length" - PromptMaxLength string = "prompt-max-length" + PromptLenBucketMinLength string = "prompt-len-bucket-min-length" + PromptLenBucketMaxLength string = "prompt-len-bucket-max-length" defaultPrefillRequestTimeout int = 30 defaultMaxRequest float64 = 32 @@ -73,6 +73,9 @@ const ( // KV 
connector types for different backends KVConnectorTypeSHFS = "shfs" // Default - AIBrix SHFS/KVCacheManager (GPU) KVConnectorTypeNIXL = "nixl" // NIXL for Neuron (uses disagg_prefill_resp wrapper) + + HeaderPrefillTargetPodIP = "prefill-target-pod-ip" + HeaderPrefillTargetPod = "prefill-target-pod" ) var ( @@ -158,28 +161,33 @@ func (r *pdRouter) Route(ctx *types.RoutingContext, readyPodList types.PodList) // Validate engine consistency across all prefill pods llmEngine, err := validateAndGetLLMEngine(readyPodList.All()) if err != nil { - metrics.EmitCounterMetric(ctx, nil, metrics.GatewayPrefillRequestFailTotal, 1.0, + metrics.EmitMetricToPrometheus(ctx, nil, metrics.GatewayPrefillRequestFailTotal, &metrics.SimpleMetricValue{Value: 1.0}, map[string]string{"status": pdRouteValidateLLMEngineFail, "status_code": "400"}) return "", fmt.Errorf("engine validation failed for request %s: %w", ctx.RequestID, err) } prefillPod, decodePod, err := r.filterPrefillDecodePods(ctx, readyPodList.All()) if err != nil { - metrics.EmitCounterMetric(ctx, nil, metrics.GatewayPrefillRequestFailTotal, 1.0, + metrics.EmitMetricToPrometheus(ctx, nil, metrics.GatewayPrefillRequestFailTotal, &metrics.SimpleMetricValue{Value: 1.0}, map[string]string{"status": pdRouteFilterPrefillDecodePodsFail, "status_code": "400"}) return "", fmt.Errorf("failed to filter prefill/decode pods for request %s: %w", ctx.RequestID, err) } if prefillPod != nil { klog.InfoS("selected prefill/decode pods", "request_id", ctx.RequestID, "prefill_pod", prefillPod.Name, "decode_pod", decodePod.Name) + if ctx.RespHeaders == nil { + ctx.RespHeaders = make(map[string]string) + } + ctx.RespHeaders[HeaderPrefillTargetPod] = prefillPod.Name + ctx.RespHeaders[HeaderPrefillTargetPodIP] = prefillPod.Status.PodIP err = r.doPrefillRequest(ctx, prefillPod, llmEngine) if err != nil { - metrics.EmitCounterMetric(ctx, nil, metrics.GatewayPrefillRequestFailTotal, 1.0, + metrics.EmitMetricToPrometheus(ctx, nil, 
metrics.GatewayPrefillRequestFailTotal, &metrics.SimpleMetricValue{Value: 1.0}, map[string]string{"status": pdRoutePrefillRequestError, "status_code": "500"}) klog.ErrorS(err, pdRoutePrefillRequestError, "request_id", ctx.RequestID) return "", fmt.Errorf("prefill request failed for request %s: %w", ctx.RequestID, err) } - metrics.EmitCounterMetric(ctx, nil, metrics.GatewayPrefillRequestSuccessTotal, 1.0, + metrics.EmitMetricToPrometheus(ctx, nil, metrics.GatewayPrefillRequestSuccessTotal, &metrics.SimpleMetricValue{Value: 1.0}, map[string]string{"status": pdRoutePrefillRequestSuccess, "status_code": "200"}) } @@ -203,7 +211,7 @@ func (r *pdRouter) filterPrefillDecodePods(routingCtx *types.RoutingContext, rea klog.V(4).InfoS("prompt length based filtering enabled", "request_id", routingCtx.RequestID, "prompt_length", promptLength) } - prefillPods, decodePods, promptLengthBucketingPrefillPods, promptLengthBucketingDecodePods, combinedPods := r.collectAndBucketPods(readyPods, promptLength) + prefillPods, decodePods, promptLengthBucketingPrefillPods, promptLengthBucketingDecodePods, combinedPods := r.collectAndBucketPods(routingCtx, readyPods, promptLength) combinedAvailable := aibrixPromptLengthBucketing && len(combinedPods) > 0 if len(prefillPods) == 0 && !combinedAvailable { return nil, nil, fmt.Errorf("prefill pods are not ready: prefill=%d, decode=%d", len(prefillPods), len(decodePods)) @@ -509,8 +517,8 @@ func (r *pdRouter) finalPDScore(routingCtx *types.RoutingContext, r.selectionCounts[targetDecodePod.Name]++ r.countersMu.Unlock() - metrics.EmitCounterMetric(routingCtx, targetPrefillPod, metrics.PDSelectedPrefillPodTotal, 1.0, nil) - metrics.EmitCounterMetric(routingCtx, targetDecodePod, metrics.PDSelectedDecodePodTotal, 1.0, nil) + metrics.EmitMetricToPrometheus(routingCtx, targetPrefillPod, metrics.PDSelectedPrefillPodTotal, &metrics.SimpleMetricValue{Value: 1.0}, nil) + metrics.EmitMetricToPrometheus(routingCtx, targetDecodePod, 
metrics.PDSelectedDecodePodTotal, &metrics.SimpleMetricValue{Value: 1.0}, nil) return targetPrefillPod, targetDecodePod, nil } @@ -684,7 +692,7 @@ func (r *pdRouter) executeHTTPRequest(url string, routingCtx *types.RoutingConte resp, err := r.httpClient.Do(req) if err != nil { status, code := metrics.HttpFailureStatusCode(ctx, err, nil) - metrics.EmitCounterMetric(routingCtx, nil, metrics.GatewayPrefillRequestFailTotal, 1.0, + metrics.EmitMetricToPrometheus(routingCtx, nil, metrics.GatewayPrefillRequestFailTotal, &metrics.SimpleMetricValue{Value: 1.0}, map[string]string{"status": status, "status_code": code}) return nil, fmt.Errorf("failed to execute http prefill request: %w", err) } @@ -701,7 +709,7 @@ func (r *pdRouter) executeHTTPRequest(url string, routingCtx *types.RoutingConte // Check response status if resp.StatusCode != http.StatusOK { status, code := metrics.HttpFailureStatusCode(ctx, nil, resp) - metrics.EmitCounterMetric(routingCtx, nil, metrics.GatewayPrefillRequestFailTotal, 1.0, + metrics.EmitMetricToPrometheus(routingCtx, nil, metrics.GatewayPrefillRequestFailTotal, &metrics.SimpleMetricValue{Value: 1.0}, map[string]string{"status": status, "status_code": code}) return nil, fmt.Errorf("http prefill request failed with status %d: %s", resp.StatusCode, string(body)) } @@ -932,8 +940,12 @@ func (t *PrefillRequestTracker) GetPrefillRequestCountsForPod(podname string) in return int(countInterface.(*atomic.Int32).Load()) } -func (r *pdRouter) isPodSuitableForPromptLength(pod *v1.Pod, promptLength int) bool { - minLength, maxLength := r.getPodPromptRange(pod) +func (r *pdRouter) isPodSuitableForPromptLength(routingCtx *types.RoutingContext, pod *v1.Pod, promptLength int) bool { + profile := configprofiles.ResolveProfileFromPod(pod, routingCtx.ReqConfigProfile) + if profile == nil { + return false + } + minLength, maxLength := profile.PromptLenBucketMinLength, profile.PromptLenBucketMaxLength if minLength > maxLength { return false @@ -946,31 +958,15 @@ 
func (r *pdRouter) isPodSuitableForPromptLength(pod *v1.Pod, promptLength int) b return promptLength >= minLength && promptLength <= maxLength } -// getPodPromptRange retrieves the minimum and maximum prompt lengths from pod labels. -func (r *pdRouter) getPodPromptRange(pod *v1.Pod) (int, int) { - minLength := 0 - maxLength := math.MaxInt32 - - if val, ok := pod.Labels[PromptMinLength]; ok { - if parsed, err := strconv.Atoi(val); err == nil { - minLength = parsed - } - } - - if val, ok := pod.Labels[PromptMaxLength]; ok { - if parsed, err := strconv.Atoi(val); err == nil { - maxLength = parsed - } +func isCombinedPod(routingCtx *types.RoutingContext, pod *v1.Pod) bool { + profile := configprofiles.ResolveProfileFromPod(pod, routingCtx.ReqConfigProfile) + if profile == nil { + return false } - - return minLength, maxLength -} - -func isCombinedPod(pod *v1.Pod) bool { - return pod != nil && pod.Labels[CombinedIdentifier] == "true" + return profile.Combined } -func (r *pdRouter) collectAndBucketPods(readyPods []*v1.Pod, promptLength int) ([]*v1.Pod, []*v1.Pod, []*v1.Pod, []*v1.Pod, []*v1.Pod) { +func (r *pdRouter) collectAndBucketPods(routingCtx *types.RoutingContext, readyPods []*v1.Pod, promptLength int) ([]*v1.Pod, []*v1.Pod, []*v1.Pod, []*v1.Pod, []*v1.Pod) { prefillPods, decodePods := []*v1.Pod{}, []*v1.Pod{} promptLengthBucketingPrefillPods, promptLengthBucketingDecodePods, promptLengthBucketingCombinedPods := []*v1.Pod{}, []*v1.Pod{}, []*v1.Pod{} @@ -991,16 +987,16 @@ func (r *pdRouter) collectAndBucketPods(readyPods []*v1.Pod, promptLength int) ( switch pod.Labels[PDRoleIdentifier] { case "prefill": prefillPods = append(prefillPods, pod) - if aibrixPromptLengthBucketing && r.isPodSuitableForPromptLength(pod, promptLength) { + if aibrixPromptLengthBucketing && r.isPodSuitableForPromptLength(routingCtx, pod, promptLength) { promptLengthBucketingPrefillPods = append(promptLengthBucketingPrefillPods, pod) } case "decode": decodePods = append(decodePods, pod) - if 
aibrixPromptLengthBucketing && r.isPodSuitableForPromptLength(pod, promptLength) { + if aibrixPromptLengthBucketing && r.isPodSuitableForPromptLength(routingCtx, pod, promptLength) { promptLengthBucketingDecodePods = append(promptLengthBucketingDecodePods, pod) } default: - if aibrixPromptLengthBucketing && isCombinedPod(pod) && r.isPodSuitableForPromptLength(pod, promptLength) { + if aibrixPromptLengthBucketing && isCombinedPod(routingCtx, pod) && r.isPodSuitableForPromptLength(routingCtx, pod, promptLength) { promptLengthBucketingCombinedPods = append(promptLengthBucketingCombinedPods, pod) } } diff --git a/pkg/plugins/gateway/algorithms/pd_disaggregation_test.go b/pkg/plugins/gateway/algorithms/pd_disaggregation_test.go index 6279d32bb..fd1932b9e 100644 --- a/pkg/plugins/gateway/algorithms/pd_disaggregation_test.go +++ b/pkg/plugins/gateway/algorithms/pd_disaggregation_test.go @@ -19,9 +19,11 @@ package routingalgorithms import ( "context" "io" + "math" "net" "net/http" "net/http/httptest" + "strconv" "testing" "time" @@ -1524,88 +1526,62 @@ func TestLoadImbalanceSelectDecodePod(t *testing.T) { func TestIsPodSuitableForPromptLength(t *testing.T) { tests := []struct { name string - podLabels map[string]string + minLen int + maxLen int promptLength int expected bool }{ { - name: "no prompt length range configured", - podLabels: map[string]string{ - "roleset-name": "test", - "role-name": "prefill", - }, + name: "no prompt length range configured", + minLen: 0, + maxLen: math.MaxInt32, promptLength: 1000, expected: true, }, { - name: "prompt length exactly at min", - podLabels: map[string]string{ - "roleset-name": "test", - "role-name": "prefill", - "prompt-min-length": "1000", - "prompt-max-length": "2000", - }, + name: "prompt length exactly at min", + minLen: 1000, + maxLen: 2000, promptLength: 1000, expected: true, }, { - name: "prompt length exactly at max", - podLabels: map[string]string{ - "roleset-name": "test", - "role-name": "prefill", - 
"prompt-min-length": "1000", - "prompt-max-length": "2000", - }, + name: "prompt length exactly at max", + minLen: 1000, + maxLen: 2000, promptLength: 2000, expected: true, }, { - name: "prompt length in middle of range", - podLabels: map[string]string{ - "roleset-name": "test", - "role-name": "prefill", - "prompt-min-length": "1000", - "prompt-max-length": "2000", - }, + name: "prompt length in middle of range", + minLen: 1000, + maxLen: 2000, promptLength: 1500, expected: true, }, { - name: "prompt length below min", - podLabels: map[string]string{ - "roleset-name": "test", - "role-name": "prefill", - "prompt-min-length": "1000", - "prompt-max-length": "2000", - }, + name: "prompt length below min", + minLen: 1000, + maxLen: 2000, promptLength: 900, expected: false, }, { - name: "prompt length above max", - podLabels: map[string]string{ - "roleset-name": "test", - "role-name": "prefill", - "prompt-min-length": "1000", - "prompt-max-length": "2000", - }, + name: "prompt length above max", + minLen: 1000, + maxLen: 2000, promptLength: 2100, expected: false, }, { - name: "prompt length min larger than max", - podLabels: map[string]string{ - "roleset-name": "test", - "role-name": "prefill", - "prompt-min-length": "2000", - "prompt-max-length": "1000", - }, + name: "prompt length min larger than max", + minLen: 2000, + maxLen: 1000, promptLength: 1000, expected: false, }, } - // Create a router instance router := &pdRouter{ cache: cache.NewForTest(), tokenizer: tokenizer.NewCharacterTokenizer(), @@ -1613,28 +1589,27 @@ func TestIsPodSuitableForPromptLength(t *testing.T) { prefillRequestTracker: NewPrefillRequestTracker(), httpClient: &http.Client{}, } + ctx := types.NewRoutingContext(context.Background(), "pd", "test-model", "", "req", "user") for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - // Create test pod - pod := &v1.Pod{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-pod", - Labels: tt.podLabels, - }, - Status: v1.PodStatus{ - Conditions: 
[]v1.PodCondition{ - {Type: v1.PodReady, Status: v1.ConditionTrue}, - }, - }, - } - - result := router.isPodSuitableForPromptLength(pod, tt.promptLength) + config := pdConfigAnnotation(tt.minLen, tt.maxLen, false) + pod := &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "test-pod", Annotations: map[string]string{constants.ModelAnnoConfig: config}}} + result := router.isPodSuitableForPromptLength(ctx, pod, tt.promptLength) assert.Equal(t, tt.expected, result) }) } } +// pdConfigAnnotation returns model.aibrix.ai/config annotation JSON for prompt length bucketing. +func pdConfigAnnotation(minLen, maxLen int, combined bool) string { + combinedStr := "false" + if combined { + combinedStr = "true" + } + return `{"defaultProfile":"pd","profiles":{"pd":{"routingStrategy":"pd","promptLenBucketMinLength":` + strconv.Itoa(minLen) + `,"promptLenBucketMaxLength":` + strconv.Itoa(maxLen) + `,"combined":` + combinedStr + `}}}` +} + func TestFilterPrefillDecodePods_SelectCorrectBucketPods(t *testing.T) { aibrixPromptLengthBucketing = true @@ -1647,10 +1622,13 @@ func TestFilterPrefillDecodePods_SelectCorrectBucketPods(t *testing.T) { selectionCounts: map[string]int64{}, } - prefillOK := &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "prefill-ok", Labels: map[string]string{PDRoleSetIdentifier: "rs1", PDRoleIdentifier: "prefill", PromptMinLength: "0", PromptMaxLength: "1000000"}}} - prefillBlocked := &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "prefill-blocked", Labels: map[string]string{PDRoleSetIdentifier: "rs1", PDRoleIdentifier: "prefill", PromptMinLength: "1000000", PromptMaxLength: "2000000"}}} - decodeOK := &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "decode-ok", Labels: map[string]string{PDRoleSetIdentifier: "rs1", PDRoleIdentifier: "decode", PromptMinLength: "0", PromptMaxLength: "1000000"}}} - decodeBlocked := &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "decode-blocked", Labels: map[string]string{PDRoleSetIdentifier: "rs1", PDRoleIdentifier: "decode", PromptMinLength: "1000000", 
PromptMaxLength: "2000000"}}} + // Pods use model.aibrix.ai/config annotation (not labels) for prompt length bucketing. + configOK := pdConfigAnnotation(0, 1000000, false) + configBlocked := pdConfigAnnotation(1000000, 2000000, false) + prefillOK := &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "prefill-ok", Labels: map[string]string{PDRoleSetIdentifier: "rs1", PDRoleIdentifier: "prefill"}, Annotations: map[string]string{constants.ModelAnnoConfig: configOK}}} + prefillBlocked := &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "prefill-blocked", Labels: map[string]string{PDRoleSetIdentifier: "rs1", PDRoleIdentifier: "prefill"}, Annotations: map[string]string{constants.ModelAnnoConfig: configBlocked}}} + decodeOK := &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "decode-ok", Labels: map[string]string{PDRoleSetIdentifier: "rs1", PDRoleIdentifier: "decode"}, Annotations: map[string]string{constants.ModelAnnoConfig: configOK}}} + decodeBlocked := &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "decode-blocked", Labels: map[string]string{PDRoleSetIdentifier: "rs1", PDRoleIdentifier: "decode"}, Annotations: map[string]string{constants.ModelAnnoConfig: configBlocked}}} ctx := types.NewRoutingContext(context.Background(), "pd", "test-model", "short", "req-bucket", "user") prefill, decode, err := r.filterPrefillDecodePods(ctx, []*v1.Pod{prefillOK, prefillBlocked, decodeOK, decodeBlocked}) @@ -1662,7 +1640,6 @@ func TestFilterPrefillDecodePods_SelectCorrectBucketPods(t *testing.T) { } func TestFilterPrefillDecodePods_CombinedFallbackBucketing(t *testing.T) { - // os.Setenv("AIBRIX_PROMPT_LENGTH_BUCKETING", "true") aibrixPromptLengthBucketing = true r := pdRouter{ @@ -1671,11 +1648,16 @@ func TestFilterPrefillDecodePods_CombinedFallbackBucketing(t *testing.T) { prefixCacheIndexer: prefixcacheindexer.NewPrefixHashTable(), prefillRequestTracker: NewPrefillRequestTracker(), httpClient: &http.Client{}, + selectionCounts: map[string]int64{}, } - combined := &v1.Pod{ObjectMeta: 
metav1.ObjectMeta{Name: "combined-1", Labels: map[string]string{PDRoleSetIdentifier: "rs1", PDRoleIdentifier: "combined", CombinedIdentifier: "true", PromptMinLength: "0", PromptMaxLength: "1000000"}}} - prefillOK := &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "prefill-ok", Labels: map[string]string{PDRoleSetIdentifier: "rs1", PDRoleIdentifier: "prefill", PromptMinLength: "0", PromptMaxLength: "1"}}} - decodeOK := &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "decode-ok", Labels: map[string]string{PDRoleSetIdentifier: "rs1", PDRoleIdentifier: "decode", PromptMinLength: "0", PromptMaxLength: "1"}}} + // prefill/decode with 0-1 range: blocked for "say test" (prompt length > 1) + // combined with 0-1000000 + combined:true: suitable for fallback + configBlocked := pdConfigAnnotation(0, 1, false) + configCombined := pdConfigAnnotation(0, 1000000, true) + combined := &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "combined-1", Labels: map[string]string{PDRoleSetIdentifier: "rs1", PDRoleIdentifier: "combined"}, Annotations: map[string]string{constants.ModelAnnoConfig: configCombined}}} + prefillOK := &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "prefill-ok", Labels: map[string]string{PDRoleSetIdentifier: "rs1", PDRoleIdentifier: "prefill"}, Annotations: map[string]string{constants.ModelAnnoConfig: configBlocked}}} + decodeOK := &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "decode-ok", Labels: map[string]string{PDRoleSetIdentifier: "rs1", PDRoleIdentifier: "decode"}, Annotations: map[string]string{constants.ModelAnnoConfig: configBlocked}}} ctx := types.NewRoutingContext(context.Background(), "pd", "test-model", "say test", "req-combined", "user") prefill, decode, err := r.filterPrefillDecodePods(ctx, []*v1.Pod{prefillOK, decodeOK, combined}) @@ -1716,11 +1698,14 @@ func TestFilterPrefillDecodePods_CombinedPickImbalance(t *testing.T) { }, } + configPrefillDecode := pdConfigAnnotation(0, 1000000, false) + configCombined := pdConfigAnnotation(0, 1000000, true) + for _, tt := range 
tests { t.Run(tt.name, func(t *testing.T) { - prefill := &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "prefill-high", Namespace: "default", Labels: map[string]string{PDRoleSetIdentifier: "rs1", PDRoleIdentifier: "prefill", constants.ModelLabelName: "test-model", PromptMinLength: "0", PromptMaxLength: "1000000"}}} - decode := &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "decode-mid", Namespace: "default", Labels: map[string]string{PDRoleSetIdentifier: "rs1", PDRoleIdentifier: "decode", constants.ModelLabelName: "test-model", PromptMinLength: "0", PromptMaxLength: "1000000"}}} - combined := &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "combined-low", Namespace: "default", Labels: map[string]string{PDRoleSetIdentifier: "rs1", PDRoleIdentifier: "combined", constants.ModelLabelName: "test-model", CombinedIdentifier: "true", PromptMinLength: "0", PromptMaxLength: "1000000"}}} + prefill := &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "prefill-high", Namespace: "default", Labels: map[string]string{PDRoleSetIdentifier: "rs1", PDRoleIdentifier: "prefill", constants.ModelLabelName: "test-model"}, Annotations: map[string]string{constants.ModelAnnoConfig: configPrefillDecode}}} + decode := &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "decode-mid", Namespace: "default", Labels: map[string]string{PDRoleSetIdentifier: "rs1", PDRoleIdentifier: "decode", constants.ModelLabelName: "test-model"}, Annotations: map[string]string{constants.ModelAnnoConfig: configPrefillDecode}}} + combined := &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "combined-low", Namespace: "default", Labels: map[string]string{PDRoleSetIdentifier: "rs1", PDRoleIdentifier: "combined", constants.ModelLabelName: "test-model"}, Annotations: map[string]string{constants.ModelAnnoConfig: configCombined}}} metricsMap := map[string]map[string]metrics.MetricValue{} vecDrain100 := model.Vector{&model.Sample{Metric: model.Metric{"__name__": "drain_rate_1m"}, Value: model.SampleValue(100)}} diff --git 
a/pkg/plugins/gateway/algorithms/prefix_cache.go b/pkg/plugins/gateway/algorithms/prefix_cache.go index e225efcc0..f28f54b2d 100644 --- a/pkg/plugins/gateway/algorithms/prefix_cache.go +++ b/pkg/plugins/gateway/algorithms/prefix_cache.go @@ -388,14 +388,14 @@ func (p prefixCacheRouter) routeOriginal(ctx *types.RoutingContext, readyPodList for _, pod := range leastReqPodList { readyPodsMap[pod.Name] = struct{}{} } - klog.InfoS("prefix_cache_load_imbalanced", + klog.V(4).InfoS("prefix_cache_load_imbalanced", "request_id", ctx.RequestID, "pod_request_count", getRequestCounts(p.cache, readyPods), "target_pod_list", readyPodsMap) } // handle request with readyPodsMap from balanced or imbalanced filter matchedPods, prefixHashes = p.prefixCacheIndexer.MatchPrefix(tokens, ctx.Model, readyPodsMap) - klog.InfoS("prefix_hashes", "request_id", ctx.RequestID, "prefix_hashes", prefixHashes) + klog.V(4).InfoS("prefix_hashes", "request_id", ctx.RequestID, "prefix_hashes", prefixHashes) if len(matchedPods) > 0 { targetPod = getTargetPodFromMatchedPods(p.cache, readyPods, matchedPods) diff --git a/pkg/plugins/gateway/algorithms/router.go b/pkg/plugins/gateway/algorithms/router.go index ff9952d57..6a9f68a20 100644 --- a/pkg/plugins/gateway/algorithms/router.go +++ b/pkg/plugins/gateway/algorithms/router.go @@ -48,7 +48,7 @@ type RouterManager struct { func NewRouterManager() *RouterManager { rm := &RouterManager{} - rm.routerInited, rm.routerDoneInit = context.WithTimeout(context.Background(), 1*time.Second) + rm.routerInited, rm.routerDoneInit = context.WithTimeout(context.Background(), 5*time.Second) rm.routerFactory = make(map[types.RoutingAlgorithm]types.RouterProviderFunc) rm.routerConstructor = make(map[types.RoutingAlgorithm]types.RouterProviderRegistrationFunc) return rm diff --git a/pkg/plugins/gateway/algorithms/throughput.go b/pkg/plugins/gateway/algorithms/throughput.go index d3dc1d0b0..6d0e64d28 100644 --- a/pkg/plugins/gateway/algorithms/throughput.go +++ 
b/pkg/plugins/gateway/algorithms/throughput.go @@ -56,12 +56,12 @@ func (r throughputRouter) Route(ctx *types.RoutingContext, readyPodList types.Po readyPods := readyPodList.All() for _, pod := range readyPods { - promptThroughput, err := r.cache.GetMetricValueByPodModel(pod.Name, pod.Namespace, ctx.Model, metrics.AvgPromptThroughputToksPerS) + promptThroughput, err := r.cache.GetMetricValueByPodModel(pod.Name, pod.Namespace, ctx.Model, metrics.AvgPromptToksPerReq) if err != nil { klog.Error(err) continue } - generationThroughput, err := r.cache.GetMetricValueByPodModel(pod.Name, pod.Namespace, ctx.Model, metrics.AvgGenerationThroughputToksPerS) + generationThroughput, err := r.cache.GetMetricValueByPodModel(pod.Name, pod.Namespace, ctx.Model, metrics.AvgGenerationToksPerReq) if err != nil { klog.Error(err) continue diff --git a/pkg/plugins/gateway/algorithms/throughput_test.go b/pkg/plugins/gateway/algorithms/throughput_test.go index 032f1955d..a585a5622 100644 --- a/pkg/plugins/gateway/algorithms/throughput_test.go +++ b/pkg/plugins/gateway/algorithms/throughput_test.go @@ -53,20 +53,20 @@ func TestThroughput(t *testing.T) { }, podMetrics: map[string]map[string]metrics.MetricValue{ "p1": { - metrics.AvgPromptThroughputToksPerS: &metrics.SimpleMetricValue{Value: 1}, - metrics.AvgGenerationThroughputToksPerS: &metrics.SimpleMetricValue{Value: 2}, + metrics.AvgPromptToksPerReq: &metrics.SimpleMetricValue{Value: 1}, + metrics.AvgGenerationToksPerReq: &metrics.SimpleMetricValue{Value: 2}, }, "p2": { - metrics.AvgPromptThroughputToksPerS: &metrics.SimpleMetricValue{Value: 2}, - metrics.AvgGenerationThroughputToksPerS: &metrics.SimpleMetricValue{Value: 1}, + metrics.AvgPromptToksPerReq: &metrics.SimpleMetricValue{Value: 2}, + metrics.AvgGenerationToksPerReq: &metrics.SimpleMetricValue{Value: 1}, }, "p3": { - metrics.AvgPromptThroughputToksPerS: &metrics.SimpleMetricValue{Value: 3}, - metrics.AvgGenerationThroughputToksPerS: &metrics.SimpleMetricValue{Value: 3}, + 
metrics.AvgPromptToksPerReq: &metrics.SimpleMetricValue{Value: 3}, + metrics.AvgGenerationToksPerReq: &metrics.SimpleMetricValue{Value: 3}, }, "p4": { - metrics.AvgPromptThroughputToksPerS: &metrics.SimpleMetricValue{Value: 4}, - metrics.AvgGenerationThroughputToksPerS: &metrics.SimpleMetricValue{Value: 4}, + metrics.AvgPromptToksPerReq: &metrics.SimpleMetricValue{Value: 4}, + metrics.AvgGenerationToksPerReq: &metrics.SimpleMetricValue{Value: 4}, }, }, expectErr: false, @@ -93,16 +93,16 @@ func TestThroughput(t *testing.T) { }, podMetrics: map[string]map[string]metrics.MetricValue{ "p1": { - metrics.AvgPromptThroughputToksPerS: &metrics.SimpleMetricValue{Value: 1}, - metrics.AvgGenerationThroughputToksPerS: &metrics.SimpleMetricValue{Value: 4}, + metrics.AvgPromptToksPerReq: &metrics.SimpleMetricValue{Value: 1}, + metrics.AvgGenerationToksPerReq: &metrics.SimpleMetricValue{Value: 4}, }, "p2": { - metrics.AvgPromptThroughputToksPerS: &metrics.SimpleMetricValue{Value: 5}, - metrics.AvgGenerationThroughputToksPerS: &metrics.SimpleMetricValue{Value: 5}, + metrics.AvgPromptToksPerReq: &metrics.SimpleMetricValue{Value: 5}, + metrics.AvgGenerationToksPerReq: &metrics.SimpleMetricValue{Value: 5}, }, "p3": { - metrics.AvgPromptThroughputToksPerS: &metrics.SimpleMetricValue{Value: 2}, - metrics.AvgGenerationThroughputToksPerS: &metrics.SimpleMetricValue{Value: 2}, + metrics.AvgPromptToksPerReq: &metrics.SimpleMetricValue{Value: 2}, + metrics.AvgGenerationToksPerReq: &metrics.SimpleMetricValue{Value: 2}, }, }, expectErr: false, @@ -120,8 +120,8 @@ func TestThroughput(t *testing.T) { }, podMetrics: map[string]map[string]metrics.MetricValue{ "p1": { - metrics.AvgPromptThroughputToksPerS: &metrics.SimpleMetricValue{Value: 1}, - metrics.AvgGenerationThroughputToksPerS: &metrics.SimpleMetricValue{Value: 2}, + metrics.AvgPromptToksPerReq: &metrics.SimpleMetricValue{Value: 1}, + metrics.AvgGenerationToksPerReq: &metrics.SimpleMetricValue{Value: 2}, }, }, expectErr: false, diff --git 
a/pkg/plugins/gateway/configprofiles/configprofiles.go b/pkg/plugins/gateway/configprofiles/configprofiles.go
new file mode 100644
index 000000000..6a952ac1a
--- /dev/null
+++ b/pkg/plugins/gateway/configprofiles/configprofiles.go
@@ -0,0 +1,131 @@
+/*
+Copyright 2025 The Aibrix Team.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+// Package configprofiles parses the model.aibrix.ai/config annotation (or ConfigMap)
+// and supports multiple named profiles selectable at runtime via config-profile header.
+// See docs/source/designs/model-config-profiles.rst for the design.
+package configprofiles
+
+import (
+	"encoding/json"
+	"fmt"
+	"math"
+	"strings"
+
+	v1 "k8s.io/api/core/v1"
+	"k8s.io/klog/v2"
+
+	"github.com/vllm-project/aibrix/pkg/constants"
+)
+
+const (
+	// DefaultProfileName is used when defaultProfile is not set in the JSON.
+	DefaultProfileName = "default"
+)
+
+// ModelConfigProfile holds gateway options for a single profile.
+type ModelConfigProfile struct {
+	RoutingStrategy          string `json:"routingStrategy"`
+	PromptLenBucketMinLength int    `json:"promptLenBucketMinLength"`
+	PromptLenBucketMaxLength int    `json:"promptLenBucketMaxLength"`
+	Combined                 bool   `json:"combined"`
+}
+
+// ModelConfigProfiles is the root JSON structure from model.aibrix.ai/config.
+type ModelConfigProfiles struct {
+	DefaultProfile string                        `json:"defaultProfile"`
+	Profiles       map[string]ModelConfigProfile `json:"profiles"`
+}
+
+// GetProfile returns the profile for the given name, or the default profile.
+// Falls back to defaultProfile/"default" when the requested profile does not exist.
+// Returns nil only if the receiver is nil or no default profile exists.
+func (c *ModelConfigProfiles) GetProfile(name string) *ModelConfigProfile {
+	// Nil-safe: ParseModelConfig legally returns (nil, nil) for empty input,
+	// so callers may hold a nil *ModelConfigProfiles.
+	if c == nil {
+		return nil
+	}
+	if name != "" {
+		if p, ok := c.Profiles[name]; ok {
+			return &p
+		}
+	}
+	// Fall back to default
+	if name = c.DefaultProfile; name == "" {
+		name = DefaultProfileName
+	}
+	if p, ok := c.Profiles[name]; ok {
+		return &p
+	}
+	return nil
+}
+
+// ResolveProfile resolves the model config from pods (annotation),
+// then returns the profile selected by headerProfile (from config-profile).
+// The first pod carrying a resolvable annotation wins; returns nil when none does.
+func ResolveProfile(pods []*v1.Pod, headerProfile string) *ModelConfigProfile {
+	for _, pod := range pods {
+		if p := ResolveProfileFromPod(pod, headerProfile); p != nil {
+			return p
+		}
+	}
+	return nil
+}
+
+// ResolveProfileFromPod resolves the model config from a single pod annotation and returns the selected profile.
+// Returns nil for a nil pod, a missing/blank annotation, or an unparsable config.
+func ResolveProfileFromPod(pod *v1.Pod, headerProfile string) *ModelConfigProfile {
+	if pod == nil {
+		return nil
+	}
+	anno := pod.Annotations[constants.ModelAnnoConfig]
+	if anno == "" {
+		return nil
+	}
+	cfg, err := ParseModelConfig(anno)
+	if err != nil {
+		klog.V(4).InfoS("failed to parse model config from pod annotation", "pod", pod.Name, "err", err)
+		return nil
+	}
+	// cfg may be nil for a whitespace-only annotation; GetProfile is nil-safe,
+	// and it already handles an empty headerProfile by falling back to the default.
+	return cfg.GetProfile(headerProfile)
+}
+
+// ParseModelConfig parses the JSON from annotation data.
+// Returns (nil, nil) when jsonStr is empty/whitespace and a non-nil error when it is invalid.
+func ParseModelConfig(jsonStr string) (*ModelConfigProfiles, error) {
+	jsonStr = strings.TrimSpace(jsonStr)
+	if jsonStr == "" {
+		return nil, nil
+	}
+	var cfg ModelConfigProfiles
+	if err := json.Unmarshal([]byte(jsonStr), &cfg); err != nil {
+		return nil, fmt.Errorf("parse model config: %w", err)
+	}
+	if len(cfg.Profiles) == 0 {
+		return nil, fmt.Errorf("model config has no profiles")
+	}
+	// Default prompt bounds when not provided: min=0, max=MaxInt32
+	for name, p := range cfg.Profiles {
+		if p.PromptLenBucketMinLength < 0 {
+			p.PromptLenBucketMinLength = 0
+		}
+		if p.PromptLenBucketMaxLength == 0 {
+			p.PromptLenBucketMaxLength = math.MaxInt32
+		}
+		cfg.Profiles[name] = p
+	}
+	return &cfg, nil
+}
diff --git a/pkg/plugins/gateway/configprofiles/configprofiles_test.go b/pkg/plugins/gateway/configprofiles/configprofiles_test.go
new file mode 100644
index 000000000..69406ca98
--- /dev/null
+++ b/pkg/plugins/gateway/configprofiles/configprofiles_test.go
@@ -0,0 +1,240 @@
+/*
+Copyright 2025 The Aibrix Team.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package configprofiles
+
+import (
+	"math"
+	"testing"
+
+	v1 "k8s.io/api/core/v1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+
+	"github.com/vllm-project/aibrix/pkg/constants"
+)
+
+func TestParseModelConfig(t *testing.T) { // table-driven parse success/failure cases
+	tests := []struct {
+		name    string
+		json    string
+		wantErr bool
+	}{
+		{
+			name: "empty",
+			json: "",
+		},
+		{
+			name: "single profile",
+			json: `{"profiles":{"default":{"routingStrategy":"pd","promptLenBucketMinLength":0,"promptLenBucketMaxLength":2048}}}`,
+		},
+		{
+			name: "multiple profiles with defaultProfile",
+			json: `{"defaultProfile":"pd","profiles":{"default":{"routingStrategy":"random","promptLenBucketMinLength":0,"promptLenBucketMaxLength":4096},"pd":{"routingStrategy":"pd","promptLenBucketMinLength":0,"promptLenBucketMaxLength":2048}}}`,
+		},
+		{
+			name: "with combined field",
+			json: `{"profiles":{"default":{"routingStrategy":"pd","promptLenBucketMinLength":0,"promptLenBucketMaxLength":2048,"combined":true}}}`,
+		},
+		{
+			name:    "invalid json",
+			json:    `{`,
+			wantErr: true,
+		},
+		{
+			name:    "no profiles",
+			json:    `{}`,
+			wantErr: true,
+		},
+	}
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			cfg, err := ParseModelConfig(tt.json)
+			if tt.wantErr { // error cases must also return a nil config
+				if err == nil || cfg != nil {
+					t.Errorf("ParseModelConfig() expected error, got cfg=%v err=%v", cfg, err)
+				}
+				return
+			}
+			if err != nil {
+				t.Errorf("ParseModelConfig() err=%v", err)
+				return
+			}
+			if tt.json != "" && cfg == nil { // empty input legally yields (nil, nil)
+				t.Errorf("ParseModelConfig() expected config for non-empty input")
+			}
+		})
+	}
+}
+
+func TestParseModelConfig_DefaultValues(t *testing.T) {
+	// promptLenBucketMinLength negative → normalized to 0
+	// promptLenBucketMaxLength 0 or omitted → MaxInt32
+	json := `{"profiles":{"p1":{"routingStrategy":"pd","promptLenBucketMinLength":-5,"promptLenBucketMaxLength":0},"p2":{"routingStrategy":"random"}}}`
+
+	cfg, err := ParseModelConfig(json)
+	if err != nil || cfg == nil {
+		t.Fatalf("ParseModelConfig failed: %v", err)
+	}
+
+	p1 := cfg.GetProfile("p1")
+	if p1 == nil {
+		t.Fatal("GetProfile(p1) = nil")
+	}
+	if p1.PromptLenBucketMinLength != 0 {
+		t.Errorf("promptLenBucketMinLength -5 should be normalized to 0, got %d", p1.PromptLenBucketMinLength)
+	}
+	if p1.PromptLenBucketMaxLength != math.MaxInt32 {
+		t.Errorf("promptLenBucketMaxLength 0 should become MaxInt32, got %d", p1.PromptLenBucketMaxLength)
+	}
+
+	p2 := cfg.GetProfile("p2")
+	if p2 == nil {
+		t.Fatal("GetProfile(p2) = nil")
+	}
+	if p2.PromptLenBucketMaxLength != math.MaxInt32 {
+		t.Errorf("omitted promptLenBucketMaxLength should become MaxInt32, got %d", p2.PromptLenBucketMaxLength)
+	}
+}
+
+func TestGetProfile(t *testing.T) { // named lookup, defaultProfile fallback, explicit default
+	json := `{"defaultProfile":"pd","profiles":{"default":{"routingStrategy":"random","promptLenBucketMinLength":0,"promptLenBucketMaxLength":4096},"pd":{"routingStrategy":"pd","promptLenBucketMinLength":0,"promptLenBucketMaxLength":2048}}}`
+
+	cfg, err := ParseModelConfig(json)
+	if err != nil || cfg == nil {
+		t.Fatalf("ParseModelConfig failed: %v", err)
+	}
+
+	if p := cfg.GetProfile("pd"); p == nil || p.RoutingStrategy != "pd" {
+		t.Errorf("GetProfile(pd) = %v, want routingStrategy=pd", p)
+	}
+	if p := cfg.GetProfile(""); p == nil || p.RoutingStrategy != "pd" {
+		t.Errorf("GetProfile(\"\") should use defaultProfile, got %v", p)
+	}
+	if p := cfg.GetProfile("default"); p == nil || p.RoutingStrategy != "random" {
+		t.Errorf("GetProfile(default) = %v", p)
+	}
+	// nonexistent profile falls back to defaultProfile
+	if p := cfg.GetProfile("nonexistent"); p == nil || p.RoutingStrategy != "pd" {
+		t.Errorf("GetProfile(nonexistent) should fall back to default, got %v", p)
+	}
+}
+
+func TestGetProfile_NoDefault(t *testing.T) {
+	// No defaultProfile set; falls back to "default"
+	json := `{"profiles":{"default":{"routingStrategy":"random"},"pd":{"routingStrategy":"pd"}}}`
+
+	cfg, err := ParseModelConfig(json)
+	if err != nil || cfg == nil {
+		t.Fatalf("ParseModelConfig failed: %v", err)
+	}
+
+	// Empty/unknown name should use "default" (implied default)
+	if p := cfg.GetProfile(""); p == nil || p.RoutingStrategy != "random" {
+		t.Errorf("GetProfile(\"\") with no defaultProfile should use \"default\", got %v", p)
+	}
+}
+
+func TestResolveProfileFromPod(t *testing.T) { // annotation → profile resolution on a single pod
+	configJSON := `{"defaultProfile":"pd","profiles":{"default":{"routingStrategy":"random"},"pd":{"routingStrategy":"pd","promptLenBucketMaxLength":2048}}}`
+
+	podWithAnno := &v1.Pod{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:        "pod1",
+			Namespace:   "default",
+			Annotations: map[string]string{constants.ModelAnnoConfig: configJSON},
+		},
+	}
+	podNoAnno := &v1.Pod{
+		ObjectMeta: metav1.ObjectMeta{Name: "pod2", Namespace: "default"},
+	}
+
+	tests := []struct {
+		name          string
+		pod           *v1.Pod
+		headerProfile string
+		wantProfile   string // empty wantProfile ⇒ expect nil resolution
+	}{
+		{"nil pod", nil, "", ""},
+		{"pod without anno", podNoAnno, "", ""},
+		{"pod with anno, no header", podWithAnno, "", "pd"},
+		{"pod with anno, header pd", podWithAnno, "pd", "pd"},
+		{"pod with anno, header default", podWithAnno, "default", "random"},
+	}
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			p := ResolveProfileFromPod(tt.pod, tt.headerProfile)
+			if tt.wantProfile == "" {
+				if p != nil {
+					t.Errorf("ResolveProfileFromPod() = %v, want nil", p)
+				}
+				return
+			}
+			if p == nil {
+				t.Errorf("ResolveProfileFromPod() = nil, want profile with routingStrategy=%s", tt.wantProfile)
+				return
+			}
+			if p.RoutingStrategy != tt.wantProfile {
+				t.Errorf("ResolveProfileFromPod().RoutingStrategy = %s, want %s", p.RoutingStrategy, tt.wantProfile)
+			}
+		})
+	}
+}
+
+func TestResolveProfile(t *testing.T) { // multi-pod resolution: first resolvable annotation wins
+	configJSON := `{"defaultProfile":"pd","profiles":{"default":{"routingStrategy":"random","promptLenBucketMinLength":0,"promptLenBucketMaxLength":4096},"pd":{"routingStrategy":"pd","promptLenBucketMinLength":0,"promptLenBucketMaxLength":2048}}}`
+
+	podWithAnno := &v1.Pod{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      "pod1",
+			Namespace: "default",
+			Annotations: map[string]string{constants.ModelAnnoConfig: configJSON},
+		},
+	}
+	podNoAnno := &v1.Pod{
+		ObjectMeta: metav1.ObjectMeta{Name: "pod2", Namespace: "default"},
+	}
+
+	tests := []struct {
+		name          string
+		pods          []*v1.Pod
+		headerProfile string
+		wantProfile   string // empty wantProfile ⇒ expect nil resolution
+	}{
+		{"no pods", nil, "", ""},
+		{"pods without anno", []*v1.Pod{podNoAnno}, "", ""},
+		{"pods with anno, no header", []*v1.Pod{podWithAnno}, "", "pd"},
+		{"pods with anno, header pd", []*v1.Pod{podWithAnno}, "pd", "pd"},
+		{"pods with anno, header default", []*v1.Pod{podWithAnno}, "default", "random"},
+	}
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			p := ResolveProfile(tt.pods, tt.headerProfile)
+			if tt.wantProfile == "" {
+				if p != nil {
+					t.Errorf("ResolveProfile() = %v, want nil", p)
+				}
+				return
+			}
+			if p == nil {
+				t.Errorf("ResolveProfile() = nil, want profile with routingStrategy=%s", tt.wantProfile)
+				return
+			}
+			if p.RoutingStrategy != tt.wantProfile {
+				t.Errorf("ResolveProfile().RoutingStrategy = %s, want %s", p.RoutingStrategy, tt.wantProfile)
+			}
+		})
+	}
+}
diff --git a/pkg/plugins/gateway/gateway.go b/pkg/plugins/gateway/gateway.go
index bc020bb39..629dd4231 100644
--- a/pkg/plugins/gateway/gateway.go
+++ b/pkg/plugins/gateway/gateway.go
@@ -21,6 +21,7 @@ import (
 	"errors"
 	"fmt"
 	"io"
+	"os"
 	"strings"
 	"time"
@@ -82,6 +83,8 @@ type processState struct {
 	completed bool
 }
 
+var podName = os.Getenv("POD_NAME")
+
 func NewServer(redisClient *redis.Client, client kubernetes.Interface, gatewayClient gatewayapi.Interface) *Server {
 	c, err := cache.Get()
 	if err != nil {
@@ -273,6 +276,7 @@ func (s *Server) responseForResponseHeaderError(st *processState, resp *extProcP
 }
 
 func (s *Server) emitProcessMetrics(st *processState, resp *extProcPb.ProcessingResponse) {
+	s.emitGatewayRequestTotalMetric(resp, st.model)
 	if st.model == "" {
 		return
 	}
@@ -335,6 +339,7 @@ func (s *Server) selectTargetPod(ctx *types.RoutingContext, pods types.PodList,
ctx.SetTargetPod(readyPods[0]) return ctx.TargetAddress(), nil } + utils.CryptoShuffle(readyPods) return router.Route(ctx, &utils.PodArray{Pods: readyPods}) } @@ -433,8 +438,17 @@ func (s *Server) responseErrorProcessingWithHeaders(ctx context.Context, headers } func (s *Server) emitMetricsCounterHelper(metricName, model, status, statusCode string) { - labelNames, labelValues := buildGatewayPodMetricLabels(model, status, statusCode) - metrics.EmitMetricToPrometheus(metricName, &metrics.SimpleMetricValue{Value: 1.0}, labelNames, labelValues) + labels := buildGatewayPodMetricLabels(model, status, statusCode) + metrics.EmitMetricToPrometheus(&types.RoutingContext{Model: model}, nil, metricName, &metrics.SimpleMetricValue{Value: 1.0}, labels) +} + +func (s *Server) emitGatewayRequestTotalMetric(resp *extProcPb.ProcessingResponse, model string) { + statusCode := "200" + if resp.GetImmediateResponse() != nil { + statusCode = fmt.Sprintf("%d", int(resp.GetImmediateResponse().Status.GetCode())) + } + labels := buildGatewayPodMetricLabels(model, "gateway_request_handled", statusCode) + metrics.EmitMetricToPrometheus(&types.RoutingContext{Model: model}, nil, metrics.GatewayRequestTotal, &metrics.SimpleMetricValue{Value: 1.0}, labels) } func getMetricErr(resp *extProcPb.ImmediateResponse, metricLabel string) string { diff --git a/pkg/plugins/gateway/gateway_req_body.go b/pkg/plugins/gateway/gateway_req_body.go index 2ca7e1936..72501060c 100644 --- a/pkg/plugins/gateway/gateway_req_body.go +++ b/pkg/plugins/gateway/gateway_req_body.go @@ -40,12 +40,12 @@ func (s *Server) HandleRequestBody(ctx context.Context, requestID string, req *e routingCtx, _ := ctx.(*types.RoutingContext) requestPath := routingCtx.ReqPath - routingAlgorithm := routingCtx.Algorithm body := req.Request.(*extProcPb.ProcessingRequest_RequestBody) var model, message string var stream bool + var routingAlgorithm types.RoutingAlgorithm var errRes *extProcPb.ProcessingResponse // Check if this is a multipart 
request (audio endpoints) @@ -88,6 +88,19 @@ func (s *Server) HandleRequestBody(ctx context.Context, requestID string, req *e fmt.Sprintf("error on getting pods for model %s", model), ErrorCodeServiceUnavailable, ""), model, routingCtx, stream, term } + // Resolve model config profile from annotation and apply overrides + applyConfigProfile(routingCtx, podsArr.All()) + + // Derive and validate routing strategy (headers -> profile -> env); return 400 on invalid + if strategy, enabled := deriveRoutingStrategyFromContext(routingCtx); enabled { + var ok bool + if routingAlgorithm, ok = routing.Validate(strategy); !ok { + klog.ErrorS(nil, "incorrect routing strategy", "requestID", requestID, "routing-strategy", strategy) + return buildErrorResponse(envoyTypePb.StatusCode_BadRequest, fmt.Sprintf("incorrect routing strategy %s", strategy), "", "", HeaderErrorRouting, "true"), model, routingCtx, stream, term + } + routingCtx.Algorithm = routingAlgorithm + } + headers := []*configPb.HeaderValueOption{} // Path rewriting for image/video generation based on engine type @@ -108,19 +121,15 @@ func (s *Server) HandleRequestBody(ctx context.Context, requestID string, req *e targetPodIP, err := s.selectTargetPod(routingCtx, podsArr, externalFilter) if targetPodIP == "" || err != nil { klog.ErrorS(err, "failed to select target pod", "requestID", requestID, "routingStrategy", routingAlgorithm, "model", model, "routingDuration", routingCtx.GetRoutingDelay()) - return generateErrorResponse( - envoyTypePb.StatusCode_ServiceUnavailable, - []*configPb.HeaderValueOption{{Header: &configPb.HeaderValue{ - Key: HeaderErrorRouting, RawValue: []byte("true")}}}, - "error on selecting target pod", ErrorCodeServiceUnavailable, ""), model, routingCtx, stream, term + return buildErrorResponse(envoyTypePb.StatusCode_ServiceUnavailable, "error on selecting target pod", ErrorCodeServiceUnavailable, "", HeaderErrorRouting, "true"), model, routingCtx, stream, term } headers = 
buildEnvoyProxyHeaders(headers, HeaderRoutingStrategy, string(routingAlgorithm), HeaderTargetPod, targetPodIP, "content-length", strconv.Itoa(len(routingCtx.ReqBody)), "X-Request-Id", routingCtx.RequestID) - var targetPodName string - var targetNamespace string + + var targetPodName, targetNamespace string var request_count float64 if routingCtx.HasRouted() && routingCtx.TargetPod() != nil { targetPodName = routingCtx.TargetPod().Name @@ -128,7 +137,7 @@ func (s *Server) HandleRequestBody(ctx context.Context, requestID string, req *e request_count = getRunningRequestsByPod(s, targetPodName, targetNamespace) } klog.InfoS("request_start", "request_id", requestID, "request_path", requestPath, "model", model, "stream", stream, "routing_strategy", routingAlgorithm, - "target_pod", targetPodName, "target_pod_ip", targetPodIP, "outstanding_requests", request_count, "routing_duration", routingCtx.GetRoutingDelay()) + "target_pod", targetPodName, "target_pod_ip", targetPodIP, "outstanding_requests", request_count, "routing_time_taken", routingCtx.GetRoutingDelay()) } routingCtx.RequestEndTime = time.Now() diff --git a/pkg/plugins/gateway/gateway_req_body_test.go b/pkg/plugins/gateway/gateway_req_body_test.go index 3c1f97d16..37d0124bf 100644 --- a/pkg/plugins/gateway/gateway_req_body_test.go +++ b/pkg/plugins/gateway/gateway_req_body_test.go @@ -32,7 +32,6 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "github.com/vllm-project/aibrix/pkg/cache" - "github.com/vllm-project/aibrix/pkg/metrics" routingalgorithms "github.com/vllm-project/aibrix/pkg/plugins/gateway/algorithms" "github.com/vllm-project/aibrix/pkg/types" "github.com/vllm-project/aibrix/pkg/utils" @@ -269,7 +268,7 @@ func Test_handleRequestBody(t *testing.T) { }, }, { - name: "invalid routing strategy - should fallback to random router", + name: "invalid routing strategy - should return 400 BadRequest", requestBody: `{"model": "test-model", "messages": [{"role": "user", "content": "test"}]}`, user: 
utils.User{ Name: "test-user", @@ -285,64 +284,34 @@ func Test_handleRequestBody(t *testing.T) { Conditions: []v1.PodCondition{{Type: v1.PodReady, Status: v1.ConditionTrue}}, }, }, - { - Status: v1.PodStatus{ - PodIP: "5.6.7.8", - Conditions: []v1.PodCondition{{Type: v1.PodReady, Status: v1.ConditionTrue}}, - }, - }, }, } mockCache.On("ListPodsByModel", "test-model").Return(podList, nil) - mockCache.On("GetMetricValueByPod", mock.Anything, mock.Anything, metrics.RealtimeNumRequestsRunning).Return(&metrics.SimpleMetricValue{Value: 0}, nil) - mockCache.On("AddRequestCount", mock.Anything, mock.Anything, "test-model").Return(int64(1)) }, expected: testResponse{ - statusCode: envoyTypePb.StatusCode_OK, - headers: []*configPb.HeaderValueOption{ - { - Header: &configPb.HeaderValue{ - Key: HeaderRoutingStrategy, - RawValue: []byte("invalid-router"), - }, - }, - { - Header: &configPb.HeaderValue{ - Key: HeaderTargetPod, - RawValue: []byte("1.2.3.4:8000"), - }, - }, - }, + statusCode: envoyTypePb.StatusCode_BadRequest, model: "test-model", stream: false, - term: 1, + term: 0, routingCtx: &types.RoutingContext{}, }, validate: func(t *testing.T, tt *testCase, resp *extProcPb.ProcessingResponse, model string, routingCtx *types.RoutingContext, stream bool, term int64) { - assert.Equal(t, tt.expected.statusCode, envoyTypePb.StatusCode_OK) + assert.Equal(t, tt.expected.statusCode, resp.GetImmediateResponse().GetStatus().GetCode()) + // buildErrorResponse returns x-error-routing: true (no Content-Type in headers) + headers := resp.GetImmediateResponse().GetHeaders().GetSetHeaders() + assert.GreaterOrEqual(t, len(headers), 1) + foundErrorRouting := false + for _, h := range headers { + if h.Header.Key == HeaderErrorRouting && string(h.Header.RawValue) == "true" { + foundErrorRouting = true + break + } + } + assert.True(t, foundErrorRouting, "expected x-error-routing header") assert.Equal(t, tt.expected.model, model) assert.Equal(t, tt.expected.stream, stream) assert.Equal(t, 
tt.expected.term, term) assert.NotNil(t, routingCtx) - assert.Equal(t, tt.expected.model, routingCtx.Model) - assert.Equal(t, tt.routingAlgo, routingCtx.Algorithm) - // Verify both routing headers are set - foundRoutingStrategy := false - foundTargetPod := false - for _, header := range resp.GetRequestBody().GetResponse().GetHeaderMutation().GetSetHeaders() { - if header.Header.Key == HeaderRoutingStrategy { - foundRoutingStrategy = true - assert.Equal(t, "invalid-router", string(header.Header.RawValue)) - } - if header.Header.Key == HeaderTargetPod { - foundTargetPod = true - // Since this is a random router, accept either valid pod IP from the mock setup - targetPodIP := string(header.Header.RawValue) - assert.Contains(t, []string{"1.2.3.4:8000", "5.6.7.8:8000"}, targetPodIP, "Target pod IP should be one of the pod IPs from the mock setup") - } - } - assert.True(t, foundRoutingStrategy, "HeaderRoutingStrategy not found") - assert.True(t, foundTargetPod, "HeaderTargetPod not found") }, }, { @@ -680,10 +649,17 @@ func Test_handleRequestBody(t *testing.T) { // Call HandleRequestBody and validate the response routingCtx := types.NewRoutingContext(context.Background(), tt.routingAlgo, tt.expected.model, "", "test-request-id", tt.user.Name) - routingCtx.ReqPath = "/v1/chat/completions" + routingCtx.ReqPath = PathChatCompletions if tt.reqPath != "" { routingCtx.ReqPath = tt.reqPath } + // deriveRoutingStrategyFromContext reads from ReqHeaders, not Algorithm + if tt.routingAlgo != "" { + if routingCtx.ReqHeaders == nil { + routingCtx.ReqHeaders = make(map[string]string) + } + routingCtx.ReqHeaders[HeaderRoutingStrategy] = string(tt.routingAlgo) + } resp, model, routingCtx, stream, term := server.HandleRequestBody( routingCtx, "test-request-id", diff --git a/pkg/plugins/gateway/gateway_req_headers.go b/pkg/plugins/gateway/gateway_req_headers.go index 2fdbf2f2a..fb84f8171 100644 --- a/pkg/plugins/gateway/gateway_req_headers.go +++ 
b/pkg/plugins/gateway/gateway_req_headers.go @@ -26,7 +26,6 @@ import ( extProcPb "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3" envoyTypePb "github.com/envoyproxy/go-control-plane/envoy/type/v3" - routing "github.com/vllm-project/aibrix/pkg/plugins/gateway/algorithms" "github.com/vllm-project/aibrix/pkg/types" "github.com/vllm-project/aibrix/pkg/utils" ) @@ -46,6 +45,7 @@ func (s *Server) HandleRequestHeaders(ctx context.Context, requestID string, req var err error var errRes *extProcPb.ProcessingResponse var routingCtx *types.RoutingContext + var reqConfigProfile string h := req.Request.(*extProcPb.ProcessingRequest_RequestHeaders) reqHeaders := map[string]string{} @@ -61,20 +61,13 @@ func (s *Server) HandleRequestHeaders(ctx context.Context, requestID string, req reqHeaders[n.Key] = string(n.RawValue) case contentTypeKey: reqHeaders[n.Key] = string(n.RawValue) + case HeaderRoutingStrategy: + reqHeaders[n.Key] = string(n.RawValue) + case HeaderConfigProfile: + reqConfigProfile = strings.TrimSpace(string(n.RawValue)) } } - routingStrategy, routingStrategyEnabled := getRoutingStrategy(h.RequestHeaders.Headers.Headers) - routingAlgorithm, ok := routing.Validate(routingStrategy) - if routingStrategyEnabled && !ok { - klog.ErrorS(nil, "incorrect routing strategy", "requestID", requestID, "routing-strategy", routingStrategy) - return generateErrorResponse( - envoyTypePb.StatusCode_BadRequest, - []*configPb.HeaderValueOption{{Header: &configPb.HeaderValue{ - Key: HeaderErrorInvalidRouting, RawValue: []byte(routingStrategy), - }}}, "incorrect routing strategy", "", "routing-strategy"), utils.User{}, rpm, routingCtx - } - if username != "" { user, err = utils.GetUser(ctx, utils.User{Name: username}, s.redisClient) if err != nil { @@ -94,9 +87,10 @@ func (s *Server) HandleRequestHeaders(ctx context.Context, requestID string, req } } - routingCtx = types.NewRoutingContext(ctx, routingAlgorithm, "", "", requestID, user.Name) + routingCtx = 
types.NewRoutingContext(ctx, "", "", "", requestID, user.Name) routingCtx.ReqPath = requestPath routingCtx.ReqHeaders = reqHeaders + routingCtx.ReqConfigProfile = reqConfigProfile headers := []*configPb.HeaderValueOption{} headers = append(headers, &configPb.HeaderValueOption{ diff --git a/pkg/plugins/gateway/gateway_req_headers_test.go b/pkg/plugins/gateway/gateway_req_headers_test.go index 4b6b5541a..bb9ed0764 100644 --- a/pkg/plugins/gateway/gateway_req_headers_test.go +++ b/pkg/plugins/gateway/gateway_req_headers_test.go @@ -59,7 +59,7 @@ func Test_handleRequestHeaders(t *testing.T) { // Define test cases for different routing and error scenarios tests := []testCase{ { - name: "not found strategy - should return error", + name: "invalid strategy - passes through to request body (validation deferred)", requestHeaders: []*configPb.HeaderValue{ { Key: HeaderRoutingStrategy, @@ -67,26 +67,23 @@ func Test_handleRequestHeaders(t *testing.T) { }, }, expected: testResponse{ - statusCode: envoyTypePb.StatusCode_BadRequest, + statusCode: envoyTypePb.StatusCode_OK, headers: []*configPb.HeaderValueOption{ - {Header: &configPb.HeaderValue{Key: HeaderErrorInvalidRouting, RawValue: []byte("not-found-strategy")}}, - {Header: &configPb.HeaderValue{Key: "Content-Type", Value: "application/json"}}, + {Header: &configPb.HeaderValue{Key: HeaderWentIntoReqHeaders, RawValue: []byte("true")}}, }, - routingCtx: nil, - user: utils.User{}, - rpm: 0, + routingCtx: &types.RoutingContext{ + ReqHeaders: map[string]string{HeaderRoutingStrategy: "not-found-strategy"}, + }, + user: utils.User{}, + rpm: 0, }, validate: func(t *testing.T, tt *testCase, resp *extProcPb.ProcessingResponse, user utils.User, routingCtx *types.RoutingContext, rpm int64) { - // Validate request headers info - assert.Equal(t, tt.expected.statusCode, resp.GetImmediateResponse().GetStatus().GetCode()) - assert.Equal(t, tt.expected.headers, resp.GetImmediateResponse().GetHeaders().GetSetHeaders()) + assert.Equal(t, 
tt.expected.statusCode, envoyTypePb.StatusCode_OK) + assert.Equal(t, tt.expected.headers, resp.GetRequestHeaders().GetResponse().GetHeaderMutation().GetSetHeaders()) assert.Equal(t, tt.expected.user, user) - assert.Nil(t, routingCtx) + assert.NotNil(t, routingCtx) + assert.Equal(t, tt.expected.routingCtx.ReqHeaders, routingCtx.ReqHeaders) assert.Equal(t, tt.expected.rpm, rpm) - // Verify no special headers are set - for _, header := range resp.GetRequestHeaders().GetResponse().GetHeaderMutation().GetSetHeaders() { - assert.NotEqual(t, HeaderWentIntoReqHeaders, header.Header.Key) - } }, }, { @@ -151,13 +148,12 @@ func Test_handleRequestHeaders(t *testing.T) { }, routingCtx: &types.RoutingContext{ ReqPath: "test-path", - ReqHeaders: map[string]string{authorizationKey: "token:test-token"}, + ReqHeaders: map[string]string{authorizationKey: "token:test-token", HeaderRoutingStrategy: "random"}, }, user: utils.User{}, rpm: 0, }, validate: func(t *testing.T, tt *testCase, resp *extProcPb.ProcessingResponse, user utils.User, routingCtx *types.RoutingContext, rpm int64) { - // Validate request headers info assert.Equal(t, tt.expected.statusCode, envoyTypePb.StatusCode_OK) assert.Equal(t, tt.expected.headers, resp.GetRequestHeaders().GetResponse().GetHeaderMutation().GetSetHeaders()) assert.Equal(t, tt.expected.user, user) diff --git a/pkg/plugins/gateway/gateway_rsp_body.go b/pkg/plugins/gateway/gateway_rsp_body.go index 5cfb5f5f1..a10a1409d 100644 --- a/pkg/plugins/gateway/gateway_rsp_body.go +++ b/pkg/plugins/gateway/gateway_rsp_body.go @@ -21,6 +21,7 @@ import ( "context" "fmt" "io" + "math" "net/http" "strings" "time" @@ -133,13 +134,8 @@ func (s *Server) HandleResponseBody(ctx context.Context, requestID string, req * HeaderUpdateTPM, fmt.Sprintf("%d", tpm)) } - var targetPod *v1.Pod headers = buildEnvoyProxyHeaders(headers, HeaderRequestID, routerCtx.RequestID) - if routerCtx != nil && routerCtx.HasRouted() { - targetPod = routerCtx.TargetPod() - headers = 
buildEnvoyProxyHeaders(headers, HeaderTargetPod, routerCtx.TargetAddress()) - } - fields := s.requestEndHelper(routerCtx, targetPod, arrival, promptTokens, completionTokens, totalTokens) + fields := s.requestEndHelper(routerCtx, arrival, promptTokens, completionTokens, totalTokens) klog.InfoS("request_end", fields...) } else if b.ResponseBody.EndOfStream { complete = true @@ -234,10 +230,14 @@ func processLanguageResponse(requestID string, b *extProcPb.ProcessingRequest_Re return } -func (s *Server) requestEndHelper(routingCtx *types.RoutingContext, targetPod *v1.Pod, arrival time.Time, +func (s *Server) requestEndHelper(routingCtx *types.RoutingContext, arrival time.Time, promptTokens, completionTokens, totalTokens int64) []interface{} { requestID := routingCtx.RequestID model := routingCtx.Model + var targetPod *v1.Pod + if routingCtx.HasRouted() { + targetPod = routingCtx.TargetPod() + } fields := []interface{}{ "request_id", requestID, @@ -248,19 +248,20 @@ func (s *Server) requestEndHelper(routingCtx *types.RoutingContext, targetPod *v } pBucket := tokenBucketLabel(promptTokens) cBucket := tokenBucketLabel(completionTokens) - metrics.EmitCounterMetric(routingCtx, targetPod, metrics.GatewayPromptTokenBucketTotal, 1.0, map[string]string{"bucket": pBucket}) - metrics.EmitCounterMetric(routingCtx, targetPod, metrics.GatewayCompletionTokenBucketTotal, 1.0, map[string]string{"bucket": cBucket}) + metrics.EmitMetricToPrometheus(routingCtx, targetPod, metrics.GatewayPromptTokenBucketTotal, &metrics.SimpleMetricValue{Value: 1.0}, map[string]string{"bucket": pBucket}) + metrics.EmitMetricToPrometheus(routingCtx, targetPod, metrics.GatewayCompletionTokenBucketTotal, &metrics.SimpleMetricValue{Value: 1.0}, map[string]string{"bucket": cBucket}) if targetPod != nil { + outstandingRequestCount := math.Max(0, getRunningRequestsByPod(s, targetPod.Name, targetPod.Namespace)-1) fields = append(fields, "target_pod", targetPod.Name, - "outstanding_request_count", 
getRunningRequestsByPod(s, targetPod.Name, targetPod.Namespace)) + "outstanding_request_count", outstandingRequestCount) } ttft := arrival.Sub(routingCtx.RequestTime) if routingCtx.Stream { ttftBucket := durationBucketLabel(ttft) - metrics.EmitCounterMetric(routingCtx, targetPod, metrics.GatewayTTFTBucketTotal, 1.0, map[string]string{"bucket": ttftBucket}) + metrics.EmitMetricToPrometheus(routingCtx, targetPod, metrics.GatewayTTFTBucketTotal, &metrics.SimpleMetricValue{Value: 1.0}, map[string]string{"bucket": ttftBucket}) } if routingCtx.Algorithm == "pd" { @@ -275,12 +276,12 @@ func (s *Server) requestEndHelper(routingCtx *types.RoutingContext, targetPod *v "ttft", ttft, "decode_time_taken", decodeTime, ) - metrics.EmitCounterMetric(routingCtx, targetPod, metrics.GatewayRoutingTimeBucketTotal, 1.0, map[string]string{"bucket": durationBucketLabel(routingTime)}) - metrics.EmitCounterMetric(routingCtx, targetPod, metrics.GatewayPrefillTimeBucketTotal, 1.0, map[string]string{"bucket": durationBucketLabel(prefillTime)}) - metrics.EmitCounterMetric(routingCtx, targetPod, metrics.GatewayKVTransferTimeBucketTotal, 1.0, map[string]string{"bucket": durationBucketLabel(kvTransferTime)}) - metrics.EmitCounterMetric(routingCtx, targetPod, metrics.GatewayDecodeTimeBucketTotal, 1.0, map[string]string{"bucket": durationBucketLabel(decodeTime)}) + metrics.EmitMetricToPrometheus(routingCtx, targetPod, metrics.GatewayRoutingTimeBucketTotal, &metrics.SimpleMetricValue{Value: 1.0}, map[string]string{"bucket": durationBucketLabel(routingTime)}) + metrics.EmitMetricToPrometheus(routingCtx, targetPod, metrics.GatewayPrefillTimeBucketTotal, &metrics.SimpleMetricValue{Value: 1.0}, map[string]string{"bucket": durationBucketLabel(prefillTime)}) + metrics.EmitMetricToPrometheus(routingCtx, targetPod, metrics.GatewayKVTransferTimeBucketTotal, &metrics.SimpleMetricValue{Value: 1.0}, map[string]string{"bucket": durationBucketLabel(kvTransferTime)}) + metrics.EmitMetricToPrometheus(routingCtx, 
targetPod, metrics.GatewayDecodeTimeBucketTotal, &metrics.SimpleMetricValue{Value: 1.0}, map[string]string{"bucket": durationBucketLabel(decodeTime)}) if ttft > ttftThreshold { - metrics.EmitCounterMetric(routingCtx, nil, metrics.GatewayFirstTokenDelayOver1sTotal, 1.0, map[string]string{ + metrics.EmitMetricToPrometheus(routingCtx, nil, metrics.GatewayFirstTokenDelayOver1sTotal, &metrics.SimpleMetricValue{Value: 1.0}, map[string]string{ "request_id": requestID, "p_bucket": pBucket, "c_bucket": cBucket, "routing_time_taken": fmt.Sprintf("%v", routingTime), @@ -290,15 +291,11 @@ func (s *Server) requestEndHelper(routingCtx *types.RoutingContext, targetPod *v "decode_time_taken": fmt.Sprintf("%v", decodeTime), }) } - } else { - fields = append(fields, - "routing_time_taken", routingCtx.RequestEndTime.Sub(routingCtx.RequestTime), - ) + } else if routingCtx.Algorithm != "" { + fields = append(fields, "routing_time_taken", routingCtx.GetRoutingDelay()) } fields = append(fields, "total_time_taken", routingCtx.Elapsed(time.Now())) - metrics.EmitCounterMetric(routingCtx, targetPod, metrics.GatewayTotalTimeBucketTotal, 1.0, map[string]string{ - "bucket": durationBucketLabel(routingCtx.Elapsed(time.Now())), - }) + metrics.EmitMetricToPrometheus(routingCtx, targetPod, metrics.GatewayTotalTimeBucketTotal, &metrics.SimpleMetricValue{Value: 1.0}, map[string]string{"bucket": durationBucketLabel(routingCtx.Elapsed(time.Now()))}) return fields } diff --git a/pkg/plugins/gateway/gateway_rsp_body_test.go b/pkg/plugins/gateway/gateway_rsp_body_test.go new file mode 100644 index 000000000..39ab39c92 --- /dev/null +++ b/pkg/plugins/gateway/gateway_rsp_body_test.go @@ -0,0 +1,516 @@ +/* +Copyright 2024 The Aibrix Team. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package gateway + +import ( + "context" + "errors" + "testing" + "time" + + extProcPb "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/mock" + "github.com/vllm-project/aibrix/pkg/cache" + "github.com/vllm-project/aibrix/pkg/types" + "github.com/vllm-project/aibrix/pkg/utils" +) + +// mockRateLimiter implements ratelimiter.RateLimiter for testing +type mockRateLimiter struct { + mock.Mock +} + +func (m *mockRateLimiter) Get(ctx context.Context, key string) (int64, error) { + args := m.Called(ctx, key) + return args.Get(0).(int64), args.Error(1) +} + +func (m *mockRateLimiter) GetLimit(ctx context.Context, key string) (int64, error) { + args := m.Called(ctx, key) + return args.Get(0).(int64), args.Error(1) +} + +func (m *mockRateLimiter) Incr(ctx context.Context, key string, val int64) (int64, error) { + args := m.Called(ctx, key, val) + return args.Get(0).(int64), args.Error(1) +} + +func TestIsLanguageRequest(t *testing.T) { + tests := []struct { + name string + requestPath string + want bool + }{ + { + name: "chat completions is language", + requestPath: "/v1/chat/completions", + want: true, + }, + { + name: "completions is language", + requestPath: "/v1/completions", + want: true, + }, + { + name: "embeddings is language", + requestPath: "/v1/embeddings", + want: true, + }, + { + name: "images generations is not language", + requestPath: "/v1/images/generations", + want: false, + }, + { + name: "video generations is not language", + requestPath: "/v1/video/generations", + 
want: false, + }, + { + name: "audio transcriptions is not language", + requestPath: "/v1/audio/transcriptions", + want: false, + }, + { + name: "audio translations is not language", + requestPath: "/v1/audio/translations", + want: false, + }, + { + name: "empty path is language", + requestPath: "", + want: true, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := isLanguageRequest(tt.requestPath) + assert.Equal(t, tt.want, got) + }) + } +} + +func TestTokenBucketLabel(t *testing.T) { + tests := []struct { + name string + tokens int64 + want string + }{ + {"zero", 0, "0-256"}, + {"small", 100, "0-256"}, + {"boundary 256", 256, "256-512"}, + {"mid range", 500, "256-512"}, + {"boundary 512", 512, "512-1024"}, + {"1024", 1024, "1024-2048"}, + {"2048", 2048, "2048-4096"}, + {"4096", 4096, "4096-8192"}, + {"8192", 8192, "8192-16384"}, + {"16384", 16384, "16384-32768"}, + {"32768", 32768, "32768+"}, + {"large", 100000, "32768+"}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := tokenBucketLabel(tt.tokens) + assert.Equal(t, tt.want, got) + }) + } +} + +func TestDurationBucketLabel(t *testing.T) { + tests := []struct { + name string + d time.Duration + want string + }{ + {"zero", 0, "0-1ms"}, + {"sub millisecond", 500 * time.Microsecond, "0-1ms"}, + {"1ms", time.Millisecond, "1-2ms"}, + {"2ms", 2 * time.Millisecond, "2-5ms"}, + {"5ms", 5 * time.Millisecond, "5-10ms"}, + {"10ms", 10 * time.Millisecond, "10-20ms"}, + {"50ms", 50 * time.Millisecond, "50-100ms"}, + {"100ms", 100 * time.Millisecond, "100-200ms"}, + {"500ms", 500 * time.Millisecond, "500-1000ms"}, + {"1s", time.Second, "1000-2000ms"}, + {"5s", 5 * time.Second, "5000ms+"}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := durationBucketLabel(tt.d) + assert.Equal(t, tt.want, got) + }) + } +} + +func TestProcessLanguageResponse_PartialChunk(t *testing.T) { + requestID := "test-partial-" + time.Now().Format("150405.000") + 
body := []byte(`{"model": "test-model", "usage": {"prompt_tokens": 10, "completion_tokens": 5, "total_tokens": 15}}`) + + req := &extProcPb.ProcessingRequest_ResponseBody{ + ResponseBody: &extProcPb.HttpBody{ + Body: body, + EndOfStream: false, + }, + } + + res, complete, promptTokens, completionTokens, totalTokens := processLanguageResponse(requestID, req) + + assert.False(t, complete) + assert.Equal(t, int64(0), promptTokens) + assert.Equal(t, int64(0), completionTokens) + assert.Equal(t, int64(0), totalTokens) + assert.NotNil(t, res) + assert.NotNil(t, res.GetResponseBody()) + assert.NotNil(t, res.GetResponseBody().GetResponse()) +} + +func TestProcessLanguageResponse_ValidFullResponse(t *testing.T) { + requestID := "test-valid-" + time.Now().Format("150405.000") + body := []byte(`{"model": "test-model", "usage": {"prompt_tokens": 10, "completion_tokens": 5, "total_tokens": 15}}`) + + req := &extProcPb.ProcessingRequest_ResponseBody{ + ResponseBody: &extProcPb.HttpBody{ + Body: body, + EndOfStream: true, + }, + } + + res, complete, promptTokens, completionTokens, totalTokens := processLanguageResponse(requestID, req) + + // processLanguageResponse returns complete=false for valid case (no early return) + assert.False(t, complete) + assert.Equal(t, int64(10), promptTokens) + assert.Equal(t, int64(5), completionTokens) + assert.Equal(t, int64(15), totalTokens) + assert.Nil(t, res) // No error response for valid case +} + +func TestProcessLanguageResponse_InvalidJSON(t *testing.T) { + requestID := "test-invalid-json-" + time.Now().Format("150405.000") + body := []byte(`{invalid json}`) + + req := &extProcPb.ProcessingRequest_ResponseBody{ + ResponseBody: &extProcPb.HttpBody{ + Body: body, + EndOfStream: true, + }, + } + + res, complete, _, _, _ := processLanguageResponse(requestID, req) + + assert.True(t, complete) + assert.NotNil(t, res) + // buildErrorResponse returns ImmediateResponse, not ResponseBody + immResp := res.GetImmediateResponse() + assert.NotNil(t, 
immResp) + headers := immResp.GetHeaders().GetSetHeaders() + found := false + for _, h := range headers { + if h.Header.Key == HeaderErrorResponseUnmarshal { + found = true + break + } + } + assert.True(t, found, "expected HeaderErrorResponseUnmarshal in response") +} + +func TestProcessLanguageResponse_EmptyModel(t *testing.T) { + requestID := "test-empty-model-" + time.Now().Format("150405.000") + body := []byte(`{"model": "", "usage": {"prompt_tokens": 10, "completion_tokens": 5, "total_tokens": 15}}`) + + req := &extProcPb.ProcessingRequest_ResponseBody{ + ResponseBody: &extProcPb.HttpBody{ + Body: body, + EndOfStream: true, + }, + } + + res, complete, _, _, _ := processLanguageResponse(requestID, req) + + assert.True(t, complete) + assert.NotNil(t, res) + immResp := res.GetImmediateResponse() + assert.NotNil(t, immResp) + headers := immResp.GetHeaders().GetSetHeaders() + found := false + for _, h := range headers { + if h.Header.Key == HeaderErrorResponseUnknown { + found = true + break + } + } + assert.True(t, found, "expected HeaderErrorResponseUnknown in response") +} + +func TestProcessLanguageResponse_ChunkedAccumulation(t *testing.T) { + requestID := "test-chunked-" + time.Now().Format("150405.000") + + // First chunk - partial + chunk1 := &extProcPb.ProcessingRequest_ResponseBody{ + ResponseBody: &extProcPb.HttpBody{ + Body: []byte(`{"model": "test-model", "usage": {"prompt_tokens": `), + EndOfStream: false, + }, + } + res1, complete1, _, _, _ := processLanguageResponse(requestID, chunk1) + assert.False(t, complete1) + assert.NotNil(t, res1) + + // Second chunk - complete + chunk2 := &extProcPb.ProcessingRequest_ResponseBody{ + ResponseBody: &extProcPb.HttpBody{ + Body: []byte(`10, "completion_tokens": 5, "total_tokens": 15}}`), + EndOfStream: true, + }, + } + _, complete2, promptTokens, completionTokens, totalTokens := processLanguageResponse(requestID, chunk2) + assert.False(t, complete2) // processLanguageResponse returns complete=false for valid 
case + assert.Equal(t, int64(10), promptTokens) + assert.Equal(t, int64(5), completionTokens) + assert.Equal(t, int64(15), totalTokens) +} + +func TestHandleResponseBody_NonStreamNoTokens(t *testing.T) { + mockCache := &MockCache{Cache: cache.NewForTest()} + // DoneRequestTrace(ctx, requestID, model, inputTokens, outputTokens, traceTerm) + mockCache.On("DoneRequestTrace", mock.Anything, "test-req-id", "test-model", int64(10), int64(5), int64(0)).Maybe() + + server := &Server{ + cache: mockCache, + } + + routerCtx := types.NewRoutingContext(context.Background(), "random", "test-model", "", "test-req-id", "") + routerCtx.ReqPath = PathChatCompletions + routerCtx.RequestTime = time.Now() + + req := &extProcPb.ProcessingRequest{ + Request: &extProcPb.ProcessingRequest_ResponseBody{ + ResponseBody: &extProcPb.HttpBody{ + Body: []byte(`{"model": "test-model", "usage": {"prompt_tokens": 10, "completion_tokens": 5, "total_tokens": 15}}`), + EndOfStream: true, + }, + }, + } + + resp, complete := server.HandleResponseBody(routerCtx, "test-req-id", req, utils.User{}, 0, "test-model", false, 0, false) + + assert.True(t, complete) + assert.NotNil(t, resp) + assert.NotNil(t, resp.GetResponseBody()) +} + +func TestHandleResponseBody_WithUserAndTPM(t *testing.T) { + mockCache := &MockCache{Cache: cache.NewForTest()} + // DoneRequestTrace(ctx, requestID, model, term, inputTokens, outputTokens) - use mock.Anything for dynamic requestID + mockCache.On("DoneRequestTrace", mock.Anything, mock.Anything, "test-model", int64(10), int64(5), int64(0)).Maybe() + + mockRL := &mockRateLimiter{} + mockRL.On("Incr", mock.Anything, "test-user_TPM_CURRENT", int64(15)).Return(int64(100), nil) + + server := &Server{ + cache: mockCache, + ratelimiter: mockRL, + } + + requestID := "test-req-tpm-" + time.Now().Format("150405.000") + routerCtx := types.NewRoutingContext(context.Background(), "random", "test-model", "", requestID, "test-user") + routerCtx.ReqPath = PathChatCompletions + 
routerCtx.RequestTime = time.Now() + + req := &extProcPb.ProcessingRequest{ + Request: &extProcPb.ProcessingRequest_ResponseBody{ + ResponseBody: &extProcPb.HttpBody{ + Body: []byte(`{"model": "test-model", "usage": {"prompt_tokens": 10, "completion_tokens": 5, "total_tokens": 15}}`), + EndOfStream: true, + }, + }, + } + + resp, complete := server.HandleResponseBody(routerCtx, requestID, req, utils.User{Name: "test-user"}, 42, "test-model", false, 0, false) + + assert.True(t, complete) + assert.NotNil(t, resp) + headers := resp.GetResponseBody().GetResponse().GetHeaderMutation().GetSetHeaders() + foundTPM := false + foundRPM := false + foundReqID := false + for _, h := range headers { + switch h.Header.Key { + case HeaderUpdateTPM: + foundTPM = true + assert.Equal(t, []byte("100"), h.Header.RawValue) + case HeaderUpdateRPM: + foundRPM = true + assert.Equal(t, []byte("42"), h.Header.RawValue) + case HeaderRequestID: + foundReqID = true + assert.Equal(t, []byte(requestID), h.Header.RawValue) + } + } + assert.True(t, foundTPM, "expected HeaderUpdateTPM in response") + assert.True(t, foundRPM, "expected HeaderUpdateRPM in response") + assert.True(t, foundReqID, "expected request-id in response") + mockRL.AssertExpectations(t) +} + +func TestHandleResponseBody_NonLanguageRequest(t *testing.T) { + mockCache := &MockCache{Cache: cache.NewForTest()} + // Non-language request: no tokens from processLanguageResponse, EndOfStream triggers complete + mockCache.On("DoneRequestTrace", mock.Anything, "test-req-id", "test-model", int64(0), int64(0), int64(0)).Maybe() + + server := &Server{ + cache: mockCache, + } + + routerCtx := types.NewRoutingContext(context.Background(), "random", "test-model", "", "test-req-id", "") + routerCtx.ReqPath = "/v1/images/generations" + routerCtx.RequestTime = time.Now() + + req := &extProcPb.ProcessingRequest{ + Request: &extProcPb.ProcessingRequest_ResponseBody{ + ResponseBody: &extProcPb.HttpBody{ + Body: []byte(`{"model": "test-model"}`), + 
EndOfStream: true, + }, + }, + } + + resp, complete := server.HandleResponseBody(routerCtx, "test-req-id", req, utils.User{}, 0, "test-model", false, 0, false) + + // Non-language request with EndOfStream sets complete=true + assert.True(t, complete) + assert.NotNil(t, resp) +} + +func TestHandleResponseBody_EndOfStreamNoTokens(t *testing.T) { + mockCache := &MockCache{Cache: cache.NewForTest()} + // Body {} parses but has empty model - returns error, DoneRequestTrace called with 0,0,0 + mockCache.On("DoneRequestTrace", mock.Anything, "test-req-id", "test-model", int64(0), int64(0), int64(0)).Maybe() + + server := &Server{ + cache: mockCache, + } + + routerCtx := types.NewRoutingContext(context.Background(), "random", "test-model", "", "test-req-id", "") + routerCtx.ReqPath = PathChatCompletions + routerCtx.RequestTime = time.Now() + + req := &extProcPb.ProcessingRequest{ + Request: &extProcPb.ProcessingRequest_ResponseBody{ + ResponseBody: &extProcPb.HttpBody{ + Body: []byte(`{}`), + EndOfStream: true, + }, + }, + } + + resp, complete := server.HandleResponseBody(routerCtx, "test-req-id", req, utils.User{}, 0, "test-model", false, 0, false) + + assert.True(t, complete) + assert.NotNil(t, resp) +} + +func TestHandleResponseBody_TPMIncrError(t *testing.T) { + mockCache := &MockCache{Cache: cache.NewForTest()} + mockCache.On("DoneRequestTrace", mock.Anything, mock.Anything, "test-model", int64(10), int64(5), int64(0)).Maybe() + + mockRL := &mockRateLimiter{} + mockRL.On("Incr", mock.Anything, "test-user_TPM_CURRENT", int64(15)).Return(int64(0), errors.New("mock error")) + server := &Server{ + cache: mockCache, + ratelimiter: mockRL, + } + + requestID := "test-req-tpm-err-" + time.Now().Format("150405.000") + routerCtx := types.NewRoutingContext(context.Background(), "random", "test-model", "", requestID, "test-user") + routerCtx.ReqPath = PathChatCompletions + routerCtx.RequestTime = time.Now() + + req := &extProcPb.ProcessingRequest{ + Request: 
&extProcPb.ProcessingRequest_ResponseBody{ + ResponseBody: &extProcPb.HttpBody{ + Body: []byte(`{"model": "test-model", "usage": {"prompt_tokens": 10, "completion_tokens": 5, "total_tokens": 15}}`), + EndOfStream: true, + }, + }, + } + + resp, complete := server.HandleResponseBody(routerCtx, requestID, req, utils.User{Name: "test-user"}, 0, "test-model", false, 0, false) + + assert.True(t, complete) + assert.NotNil(t, resp) + // Error response uses ImmediateResponse + immResp := resp.GetImmediateResponse() + assert.NotNil(t, immResp) + headers := immResp.GetHeaders().GetSetHeaders() + found := false + for _, h := range headers { + if h.Header.Key == HeaderErrorIncrTPM { + found = true + break + } + } + assert.True(t, found, "expected HeaderErrorIncrTPM in response") + mockRL.AssertExpectations(t) +} + +func TestHandleResponseBody_LanguagePartialResponse(t *testing.T) { + mockCache := &MockCache{Cache: cache.NewForTest()} + server := &Server{cache: mockCache} + + routerCtx := types.NewRoutingContext(context.Background(), "random", "m", "", "rid-partial", "") + routerCtx.ReqPath = PathChatCompletions + routerCtx.RequestTime = time.Now() + + req := &extProcPb.ProcessingRequest{ + Request: &extProcPb.ProcessingRequest_ResponseBody{ + ResponseBody: &extProcPb.HttpBody{ + Body: []byte(`{"model":"m","usage":{"prompt_tokens":1}}`), + EndOfStream: false, + }, + }, + } + + resp, complete := server.HandleResponseBody(routerCtx, "rid-partial", req, utils.User{}, 0, "m", false, 0, false) + assert.False(t, complete) + assert.NotNil(t, resp) + assert.NotNil(t, resp.GetResponseBody().GetResponse()) +} + +func TestHandleResponseBody_DoesNotDuplicateTrace(t *testing.T) { + mockCache := &MockCache{Cache: cache.NewForTest()} + server := &Server{cache: mockCache} + + routerCtx := types.NewRoutingContext(context.Background(), "random", "m", "", "rid", "") + routerCtx.ReqPath = PathChatCompletions + routerCtx.RequestTime = time.Now() + + req := &extProcPb.ProcessingRequest{ + Request: 
&extProcPb.ProcessingRequest_ResponseBody{ + ResponseBody: &extProcPb.HttpBody{ + Body: []byte(`{"model":"m","usage":{"prompt_tokens":10,"completion_tokens":5,"total_tokens":15}}`), + EndOfStream: true, + }, + }, + } + + _, complete := server.HandleResponseBody(routerCtx, "rid", req, utils.User{}, 0, "m", false, 0, true) + assert.True(t, complete) + mockCache.AssertNotCalled(t, "DoneRequestTrace", mock.Anything, mock.Anything, mock.Anything, mock.Anything, mock.Anything, mock.Anything) +} diff --git a/pkg/plugins/gateway/gateway_rsp_headers.go b/pkg/plugins/gateway/gateway_rsp_headers.go index 9d88ee8db..68dd0f6ef 100644 --- a/pkg/plugins/gateway/gateway_rsp_headers.go +++ b/pkg/plugins/gateway/gateway_rsp_headers.go @@ -44,7 +44,10 @@ func (s *Server) HandleResponseHeaders(ctx context.Context, requestID string, mo headers := []*configPb.HeaderValueOption{} headers = buildEnvoyProxyHeaders(headers, HeaderWentIntoReqHeaders, "true", HeaderRequestID, requestID) if routerCtx != nil && routerCtx.HasRouted() { - headers = buildEnvoyProxyHeaders(headers, HeaderTargetPod, routerCtx.TargetAddress()) + headers = buildEnvoyProxyHeaders(headers, + HeaderRoutingStrategy, string(routerCtx.Algorithm), + HeaderTargetPod, routerCtx.TargetPod().Name, + HeaderTargetPodIP, routerCtx.TargetAddress()) } if routerCtx != nil && routerCtx.RespHeaders != nil { diff --git a/pkg/plugins/gateway/gateway_rsp_headers_test.go b/pkg/plugins/gateway/gateway_rsp_headers_test.go index 41c68fe12..8c945baa5 100644 --- a/pkg/plugins/gateway/gateway_rsp_headers_test.go +++ b/pkg/plugins/gateway/gateway_rsp_headers_test.go @@ -84,7 +84,9 @@ func Test_HandleResponseHeaders(t *testing.T) { headers: []*configPb.HeaderValueOption{ {Header: &configPb.HeaderValue{Key: HeaderWentIntoReqHeaders, RawValue: []byte("true")}}, {Header: &configPb.HeaderValue{Key: HeaderRequestID, RawValue: []byte("test-req-id")}}, - {Header: &configPb.HeaderValue{Key: HeaderTargetPod, RawValue: []byte("10.0.0.1:8000")}}, + {Header: 
&configPb.HeaderValue{Key: "routing-strategy", RawValue: []byte("random")}}, + {Header: &configPb.HeaderValue{Key: HeaderTargetPod, RawValue: []byte("test-pod")}}, + {Header: &configPb.HeaderValue{Key: HeaderTargetPodIP, RawValue: []byte("10.0.0.1:8000")}}, {Header: &configPb.HeaderValue{Key: "X-Custom", RawValue: []byte("value")}}, {Header: &configPb.HeaderValue{Key: ":status", RawValue: []byte("200")}}, }, @@ -100,7 +102,9 @@ func Test_HandleResponseHeaders(t *testing.T) { headers: []*configPb.HeaderValueOption{ {Header: &configPb.HeaderValue{Key: HeaderWentIntoReqHeaders, RawValue: []byte("true")}}, {Header: &configPb.HeaderValue{Key: HeaderRequestID, RawValue: []byte("test-req-id")}}, - {Header: &configPb.HeaderValue{Key: HeaderTargetPod, RawValue: []byte("10.0.0.1:8000")}}, + {Header: &configPb.HeaderValue{Key: "routing-strategy", RawValue: []byte("random")}}, + {Header: &configPb.HeaderValue{Key: HeaderTargetPod, RawValue: []byte("test-pod")}}, + {Header: &configPb.HeaderValue{Key: HeaderTargetPodIP, RawValue: []byte("10.0.0.1:8000")}}, {Header: &configPb.HeaderValue{Key: ":status", RawValue: []byte("500")}}, }, }, diff --git a/pkg/plugins/gateway/gateway_test.go b/pkg/plugins/gateway/gateway_test.go index 000dfe255..123110087 100644 --- a/pkg/plugins/gateway/gateway_test.go +++ b/pkg/plugins/gateway/gateway_test.go @@ -21,7 +21,6 @@ import ( "encoding/json" "errors" "fmt" - "os" "testing" configPb "github.com/envoyproxy/go-control-plane/envoy/config/core/v3" @@ -78,70 +77,6 @@ func Test_ValidateRoutingStrategy(t *testing.T) { } } -func TestGetRoutingStrategy(t *testing.T) { - var tests = []struct { - headers []*configPb.HeaderValue - setEnvRoutingStrategy bool - envRoutingStrategy string - expectedStrategy string - expectedEnabled bool - message string - }{ - { - headers: []*configPb.HeaderValue{}, - setEnvRoutingStrategy: false, - expectedStrategy: "", - expectedEnabled: false, - message: "no routing strategy in headers or environment variable", - }, - { - 
headers: []*configPb.HeaderValue{ - {Key: "routing-strategy", RawValue: []byte("random")}, - }, - setEnvRoutingStrategy: false, - expectedStrategy: "random", - expectedEnabled: true, - message: "routing strategy from headers", - }, - { - headers: []*configPb.HeaderValue{}, - setEnvRoutingStrategy: true, - envRoutingStrategy: "random", - expectedStrategy: "random", - expectedEnabled: true, - message: "routing strategy from environment variable", - }, - { - headers: []*configPb.HeaderValue{ - {Key: "routing-strategy", RawValue: []byte("random")}, - }, - setEnvRoutingStrategy: true, - envRoutingStrategy: "least-request", - expectedStrategy: "random", - expectedEnabled: true, - message: "header routing strategy takes priority over environment variable", - }, - } - - for _, tt := range tests { - if tt.setEnvRoutingStrategy { - _ = os.Setenv("ROUTING_ALGORITHM", tt.envRoutingStrategy) - } else { - _ = os.Unsetenv("ROUTING_ALGORITHM") - } - - // refresh default values, the process won't modify this environment variable during normal running - defaultRoutingStrategy, defaultRoutingStrategyEnabled = utils.LookupEnv(EnvRoutingAlgorithm) - - routingStrategy, enabled := getRoutingStrategy(tt.headers) - assert.Equal(t, tt.expectedStrategy, routingStrategy, tt.message) - assert.Equal(t, tt.expectedEnabled, enabled, tt.message) - - // Cleanup environment variable for next test - _ = os.Unsetenv("ROUTING_ALGORITHM") - } -} - func Test_buildEnvoyProxyHeaders(t *testing.T) { headers := []*configPb.HeaderValueOption{} diff --git a/pkg/plugins/gateway/gateway_test_helpers.go b/pkg/plugins/gateway/gateway_test_helpers.go index 20941c89b..e37d74997 100644 --- a/pkg/plugins/gateway/gateway_test_helpers.go +++ b/pkg/plugins/gateway/gateway_test_helpers.go @@ -71,8 +71,8 @@ func (m *MockCache) DoneRequestCount(ctx *types.RoutingContext, requestID string m.Called(ctx, requestID, model, term) } -func (m *MockCache) DoneRequestTrace(ctx *types.RoutingContext, requestID string, model string, 
term int64, inputTokens int64, outputTokens int64) { - m.Called(ctx, requestID, model, term, inputTokens, outputTokens) +func (m *MockCache) DoneRequestTrace(ctx *types.RoutingContext, requestID string, model string, inputTokens int64, outputTokens int64, traceTerm int64) { + m.Called(ctx, requestID, model, inputTokens, outputTokens, traceTerm) } func (m *MockCache) AddSubscriber(subscriber metrics.MetricSubscriber) { diff --git a/pkg/plugins/gateway/types.go b/pkg/plugins/gateway/types.go index 1be2f1067..bea058d46 100644 --- a/pkg/plugins/gateway/types.go +++ b/pkg/plugins/gateway/types.go @@ -45,11 +45,13 @@ const ( // Request & Target Headers HeaderWentIntoReqHeaders = "x-went-into-req-headers" + HeaderTargetPodIP = "target-pod-ip" HeaderTargetPod = "target-pod" HeaderRoutingStrategy = "routing-strategy" HeaderRequestID = "request-id" HeaderModel = "model" HeaderExternalFilter = "external-filter" + HeaderConfigProfile = "config-profile" // RPM & TPM Update Errors HeaderUpdateTPM = "x-update-tpm" diff --git a/pkg/plugins/gateway/util.go b/pkg/plugins/gateway/util.go index a0632d0b0..d0bf54626 100644 --- a/pkg/plugins/gateway/util.go +++ b/pkg/plugins/gateway/util.go @@ -33,9 +33,11 @@ import ( envoyTypePb "github.com/envoyproxy/go-control-plane/envoy/type/v3" "github.com/openai/openai-go" "github.com/openai/openai-go/packages/param" - "k8s.io/klog/v2" - + "github.com/vllm-project/aibrix/pkg/plugins/gateway/configprofiles" + "github.com/vllm-project/aibrix/pkg/types" "github.com/vllm-project/aibrix/pkg/utils" + v1 "k8s.io/api/core/v1" + "k8s.io/klog/v2" ) var ( @@ -316,19 +318,47 @@ func validateStreamOptions(requestID string, user utils.User, stream *bool, stre return nil } +// applyConfigProfile resolves the model config from pod annotation (model.aibrix.ai/config) +// and applies the selected profile: sets ConfigProfile on routingCtx. +// - If the client provides config-profile, use that profile name. 
+// - If not provided or not found, fall back to defaultProfile (or "default") in the JSON. +func applyConfigProfile(routingCtx *types.RoutingContext, pods []*v1.Pod) { + headerProfile := routingCtx.ReqConfigProfile + profile := configprofiles.ResolveProfile(pods, headerProfile) + if profile == nil { + return + } + routingCtx.ConfigProfile = &types.ResolvedConfigProfile{ + RoutingStrategy: profile.RoutingStrategy, + PromptLenBucketMinLength: profile.PromptLenBucketMinLength, + PromptLenBucketMaxLength: profile.PromptLenBucketMaxLength, + Combined: profile.Combined, + } +} + var defaultRoutingStrategy, defaultRoutingStrategyEnabled = utils.LookupEnv(EnvRoutingAlgorithm) -// getRoutingStrategy retrieves the routing strategy from the headers or environment variable -// It returns the routing strategy value and whether custom routing strategy is enabled. -func getRoutingStrategy(headers []*configPb.HeaderValue) (string, bool) { - // Check headers for routing strategy - for _, header := range headers { - if strings.ToLower(header.Key) == HeaderRoutingStrategy { - return string(header.RawValue), true +// deriveRoutingStrategyFromContext retrieves routing strategy from headers or resolved profile, falling back to env defaults. 
+func deriveRoutingStrategyFromContext(routingCtx *types.RoutingContext) (string, bool) { + // Check request headers (case-insensitive key match) + if routingCtx != nil && routingCtx.ReqHeaders != nil { + for k, v := range routingCtx.ReqHeaders { + if strings.ToLower(k) == HeaderRoutingStrategy { + if strings.TrimSpace(v) != "" { + return v, true + } + break + } } } - - // If header not set, use default routing strategy from environment variable + // Fallback to resolved profile on routing context + if routingCtx != nil && routingCtx.ConfigProfile != nil { + s := strings.TrimSpace(routingCtx.ConfigProfile.RoutingStrategy) + if s != "" { + return s, true + } + } + // Fallback to environment default return defaultRoutingStrategy, defaultRoutingStrategyEnabled } @@ -589,20 +619,13 @@ func validateTokenInputs(tokenArrays [][]int64) error { return nil } -func buildGatewayPodMetricLabels(model, status, statusCode string) ([]string, []string) { - labelNames := []string{ - "model", - "status", - "status_code", - "pod_name", - } - labelValues := []string{ - model, - status, - statusCode, - POD_NAME, +func buildGatewayPodMetricLabels(model, status, statusCode string) map[string]string { + return map[string]string{ + "model": GetModelTag(model), + "status": status, + "status_code": statusCode, + "pod_name": POD_NAME, } - return labelNames, labelValues } func GetModelTag(model string) string { diff --git a/pkg/types/router_context.go b/pkg/types/router_context.go index d994677c6..21d221b3a 100644 --- a/pkg/types/router_context.go +++ b/pkg/types/router_context.go @@ -41,6 +41,16 @@ const ( type RequestFeatures []float64 +// ResolvedConfigProfile holds the resolved model config profile for a request. +// Populated from model.aibrix.ai/config annotation based on config-profile header or defaultProfile. 
+// Nil when no config is present.
+type ResolvedConfigProfile struct {
+	RoutingStrategy          string
+	PromptLenBucketMinLength int
+	PromptLenBucketMaxLength int
+	Combined                 bool
+}
+
 // RoutingAlgorithm defines the routing algorithms
 type RoutingAlgorithm string
 
@@ -60,9 +70,10 @@ type RoutingContext struct {
 	TraceTerm    int64     // Trace term identifier, available after AddRequestCount call.
 	RoutedTime   time.Time // Time consumed during routing.
 
-	ReqHeaders map[string]string
-	ReqBody    []byte
-	ReqPath    string
+	ReqHeaders       map[string]string
+	ReqBody          []byte
+	ReqPath          string
+	ReqConfigProfile string
 
 	PrefillStartTime time.Time // Time when prefill request is started.
 	PrefillEndTime   time.Time // Time consumed during prefill.
@@ -74,6 +85,11 @@ type RoutingContext struct {
 	// during the Route() call.
 	RespHeaders map[string]string
 
+	// ConfigProfile holds the resolved model config profile for this request.
+	// Set in HandleRequestBody from model.aibrix.ai/config (annotation)
+	// based on config-profile header. Nil when no config is present.
+	ConfigProfile *ResolvedConfigProfile
+
 	targetPodSet chan struct{}
 	targetPod    atomic.Pointer[v1.Pod]
 	targetPort   atomic.Int32
@@ -306,12 +322,14 @@ func (r *RoutingContext) reset(ctx context.Context, algorithms RoutingAlgorithm,
 
 	r.ReqHeaders = map[string]string{}
 	r.ReqPath = ""
+	r.ReqConfigProfile = ""
 	r.ReqBody = []byte{}
 	r.PrefillStartTime = time.Time{}
 	r.PrefillEndTime = time.Time{}
 	// RoutedTime will not be reset, it must before ReqeustTime at this time.
r.RespHeaders = map[string]string{} + r.ConfigProfile = nil r.targetPodSet = make(chan struct{}) // Initialize channel r.targetPod.Store(nilPod) r.lastError.Store(nil) diff --git a/samples/disaggregation/sglang/pd-bucketing.yaml b/samples/disaggregation/sglang/pd-bucketing.yaml index f50ba3045..01db63dfb 100644 --- a/samples/disaggregation/sglang/pd-bucketing.yaml +++ b/samples/disaggregation/sglang/pd-bucketing.yaml @@ -25,13 +25,22 @@ spec: prometheus.io/scrape: "true" prometheus.io/port: "30000" prometheus.io/path: "/metrics" + model.aibrix.ai/config: | + { + "defaultProfile": "pd", + "profiles": { + "pd": { + "routingStrategy": "pd", + "promptLenBucketMinLength": 0, + "promptLenBucketMaxLength": 2048 + } + } + } labels: model.aibrix.ai/name: qwen3-8B model.aibrix.ai/port: "30000" model.aibrix.ai/metric-port: "30000" model.aibrix.ai/engine: sglang - prompt-min-length: "0" - prompt-max-length: "2048" spec: nodeSelector: kubernetes.io/hostname: 10.0.151.187 @@ -155,13 +164,22 @@ spec: prometheus.io/scrape: "true" prometheus.io/port: "30000" prometheus.io/path: "/metrics" + model.aibrix.ai/config: | + { + "defaultProfile": "pd", + "profiles": { + "pd": { + "routingStrategy": "pd", + "promptLenBucketMinLength": 0, + "promptLenBucketMaxLength": 2048 + } + } + } labels: model.aibrix.ai/name: qwen3-8B model.aibrix.ai/port: "30000" model.aibrix.ai/metric-port: "30000" model.aibrix.ai/engine: sglang - prompt-min-length: "0" - prompt-max-length: "2048" spec: nodeSelector: kubernetes.io/hostname: 10.0.151.187 @@ -304,13 +322,22 @@ spec: prometheus.io/scrape: "true" prometheus.io/port: "30000" prometheus.io/path: "/metrics" + model.aibrix.ai/config: | + { + "defaultProfile": "pd", + "profiles": { + "pd": { + "routingStrategy": "pd", + "promptLenBucketMinLength": 2049, + "promptLenBucketMaxLength": 4096 + } + } + } labels: model.aibrix.ai/name: qwen3-8B model.aibrix.ai/port: "30000" model.aibrix.ai/metric-port: "30000" model.aibrix.ai/engine: sglang - prompt-min-length: 
"2049"
-        prompt-max-length: "4096"
    spec:
      nodeSelector:
        kubernetes.io/hostname: 10.0.151.187
@@ -434,13 +461,22 @@ spec:
        prometheus.io/scrape: "true"
        prometheus.io/port: "30000"
        prometheus.io/path: "/metrics"
+        model.aibrix.ai/config: |
+          {
+            "defaultProfile": "pd",
+            "profiles": {
+              "pd": {
+                "routingStrategy": "pd",
+                "promptLenBucketMinLength": 2049,
+                "promptLenBucketMaxLength": 4096
+              }
+            }
+          }
      labels:
        model.aibrix.ai/name: qwen3-8B
        model.aibrix.ai/port: "30000"
        model.aibrix.ai/metric-port: "30000"
        model.aibrix.ai/engine: sglang
-        prompt-min-length: "2049"
-        prompt-max-length: "4096"
    spec:
      nodeSelector:
        kubernetes.io/hostname: 10.0.151.187
@@ -584,13 +620,22 @@ spec:
        prometheus.io/scrape: "true"
        prometheus.io/port: "30000"
        prometheus.io/path: "/metrics"
+        model.aibrix.ai/config: |
+          {
+            "defaultProfile": "pd",
+            "profiles": {
+              "pd": {
+                "routingStrategy": "pd",
+                "promptLenBucketMinLength": 0,
+                "combined": true
+              }
+            }
+          }
      labels:
        model.aibrix.ai/name: qwen3-8B
        model.aibrix.ai/port: "30000"
        model.aibrix.ai/metric-port: "30000"
        model.aibrix.ai/engine: sglang
-        prompt-min-length: "0"
-        model.aibrix.ai/combined: "true"
    spec:
      nodeSelector:
        kubernetes.io/hostname: 10.1.9.155
diff --git a/test/e2e/routing_config_profile_test.go b/test/e2e/routing_config_profile_test.go
new file mode 100644
index 000000000..5564698bd
--- /dev/null
+++ b/test/e2e/routing_config_profile_test.go
@@ -0,0 +1,89 @@
+/*
+Copyright 2025 The Aibrix Team.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/ + +package e2e + +import ( + "context" + "net/http" + "testing" + + "github.com/openai/openai-go" + "github.com/openai/openai-go/option" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +// TestConfigProfileRoutingStrategy verifies that passing config-profile as a header +// causes the gateway plugin to select the correct routing-strategy from the model's +// config (model.aibrix.ai/config annotation). +// +// The config is defined in development/app/config/mock/config-profile.yaml +// (deployment mock-qwen3-8b, annotation model.aibrix.ai/config): +// - defaultProfile: "least-request" +// - profiles: "least-request" (routingStrategy: least-request), "throughput" (routingStrategy: throughput) +// +// The gateway resolves config-profile header -> ResolveProfile -> deriveRoutingStrategyFromContext +// and sets routing-strategy in the response headers. +func TestConfigProfileRoutingStrategy(t *testing.T) { + msg := "config-profile routing test message" + + t.Run("no_config_profile_uses_default", func(t *testing.T) { + var dst *http.Response + client := createOpenAIClientWithConfigProfile(gatewayURL, apiKey, "", option.WithResponseInto(&dst)) + + _, err := client.Chat.Completions.New(context.TODO(), openai.ChatCompletionNewParams{ + Messages: []openai.ChatCompletionMessageParamUnion{openai.UserMessage(msg)}, + Model: modelNameQwen3, + }) + require.NoError(t, err) + + // No config-profile header -> defaultProfile "least-request" is used + got := dst.Header.Get("routing-strategy") + assert.Equal(t, "least-request", got, + "without config-profile header, gateway should use defaultProfile least-request") + }) + + t.Run("config_profile_least_request", func(t *testing.T) { + var dst *http.Response + client := createOpenAIClientWithConfigProfile(gatewayURL, apiKey, "least-request", option.WithResponseInto(&dst)) + + _, err := client.Chat.Completions.New(context.TODO(), openai.ChatCompletionNewParams{ + Messages: 
[]openai.ChatCompletionMessageParamUnion{openai.UserMessage(msg)}, + Model: modelNameQwen3, + }) + require.NoError(t, err) + + got := dst.Header.Get("routing-strategy") + assert.Equal(t, "least-request", got, + "config-profile: least-request should select routing-strategy least-request") + }) + + t.Run("config_profile_throughput", func(t *testing.T) { + var dst *http.Response + client := createOpenAIClientWithConfigProfile(gatewayURL, apiKey, "throughput", option.WithResponseInto(&dst)) + + _, err := client.Chat.Completions.New(context.TODO(), openai.ChatCompletionNewParams{ + Messages: []openai.ChatCompletionMessageParamUnion{openai.UserMessage(msg)}, + Model: modelNameQwen3, + }) + require.NoError(t, err) + + got := dst.Header.Get("routing-strategy") + assert.Equal(t, "throughput", got, + "config-profile: throughput should select routing-strategy throughput") + }) +} diff --git a/test/e2e/util.go b/test/e2e/util.go index 4419d9704..7f7f40664 100644 --- a/test/e2e/util.go +++ b/test/e2e/util.go @@ -41,11 +41,12 @@ import ( ) const ( - gatewayURL = "http://localhost:8888" - engineURL = "http://localhost:8000" - apiKey = "test-key-1234567890" - modelName = "llama2-7b" - namespace = "aibrix-system" + gatewayURL = "http://localhost:8888" + engineURL = "http://localhost:8000" + apiKey = "test-key-1234567890" + modelName = "llama2-7b" + modelNameQwen3 = "qwen3-8b" + namespace = "aibrix-system" ) func initializeClient(ctx context.Context, t *testing.T) (*kubernetes.Clientset, *v1alpha1.Clientset) { @@ -129,6 +130,36 @@ func createOpenAIClientWithRoutingStrategy(baseURL, apiKey, routingStrategy stri ) } +// createOpenAIClientWithConfigProfile creates a client that sends config-profile header. +// The gateway plugin selects routing-strategy from the model's config profile (model.aibrix.ai/config) +// based on this header, rather than from the routing-strategy header. 
+func createOpenAIClientWithConfigProfile(baseURL, apiKey, configProfile string, + respOpt option.RequestOption) openai.Client { + transport := &http.Transport{ + DisableKeepAlives: true, + MaxIdleConns: 0, + } + + opts := []option.RequestOption{ + option.WithBaseURL(baseURL), + option.WithAPIKey(apiKey), + option.WithHTTPClient(&http.Client{Transport: transport}), + option.WithMiddleware(func(r *http.Request, mn option.MiddlewareNext) (*http.Response, error) { + r.URL.Path = "/v1" + r.URL.Path + return mn(r) + }), + option.WithMaxRetries(0), + } + if configProfile != "" { + opts = append(opts, option.WithHeader("config-profile", configProfile)) + } + if respOpt != nil { + opts = append(opts, respOpt) + } + + return openai.NewClient(opts...) +} + func validateInference(t *testing.T, modelName string) { client := createOpenAIClient(gatewayURL, apiKey) validateInferenceWithClient(t, client, modelName)