diff --git a/development/app/config/mock/config-profile.yaml b/development/app/config/mock/config-profile.yaml new file mode 100644 index 000000000..bbedb0d3e --- /dev/null +++ b/development/app/config/mock/config-profile.yaml @@ -0,0 +1,44 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: mock-qwen3-8b + labels: + model.aibrix.ai/name: "qwen3-8b" + model.aibrix.ai/port: "8000" + adapter.model.aibrix.ai/enabled: "true" +spec: + replicas: 1 + selector: + matchLabels: + adapter.model.aibrix.ai/enabled: "true" + model.aibrix.ai/name: "qwen3-8b" + app: "mock-qwen3-8b" + template: + metadata: + labels: + adapter.model.aibrix.ai/enabled: "true" + model.aibrix.ai/name: "qwen3-8b" + app: "mock-qwen3-8b" + annotations: + model.aibrix.ai/config: | + { + "defaultProfile": "least-request", + "profiles": { + "least-request": { + "routingStrategy": "least-request" + }, + "throughput": { + "routingStrategy": "throughput" + } + } + } + spec: + serviceAccountName: mocked-app-sa + containers: + - name: llm-engine + image: aibrix/vllm-mock:nightly + command: + - python3 + - app.py + - --api_key + - test-key-1234567890 \ No newline at end of file diff --git a/development/app/config/mock/kustomization.yaml b/development/app/config/mock/kustomization.yaml index 412c56f20..5a6f6f8f4 100644 --- a/development/app/config/mock/kustomization.yaml +++ b/development/app/config/mock/kustomization.yaml @@ -1,6 +1,7 @@ resources: - ../templates/deployment - components.yaml + - config-profile.yaml # enable following patch when we test lora + api-key patches: diff --git a/docs/source/designs/model-config-profiles.rst b/docs/source/designs/model-config-profiles.rst new file mode 100644 index 000000000..6c22c346f --- /dev/null +++ b/docs/source/designs/model-config-profiles.rst @@ -0,0 +1,162 @@ +.. 
_model_config_profiles: + +========================= +Model Config and Profiles +========================= + +This design describes how to supply **model/gateway configuration** (routing strategy, PD bucket bounds, combined mode, etc.) via a **single annotation** (or ConfigMap), with support for **multiple named profiles** selectable at **runtime** by the client. + +Motivation +---------- + +Today, options are encoded as many pod labels (e.g. ``model.aibrix.ai/name``, ``model.aibrix.ai/port``, ``model.aibrix.ai/routing-strategy``, ``prompt-min-length``, etc.). Adding new options requires new labels and gateway changes to read them. This does not scale. Using a single structured annotation with **multiple profiles** allows: + +* One place to add new options (extend the JSON schema). +* Different configurations for the same model (e.g. ``default``, ``pd``, ``low-latency``) selectable per request via a header. + +Overview +-------- + +* **Annotation** (on the pod): ``model.aibrix.ai/config`` holds a JSON object with a ``profiles`` map. Each profile is a set of gateway options: ``routingStrategy``, ``promptLenBucketMinLength``, ``promptLenBucketMaxLength``, ``combined``. +* **Runtime selection**: Client sends header ``config-profile: `` (e.g. ``pd``, ``low-latency``). If omitted, the ``defaultProfile`` (or ``"default"``) is used. + +JSON Schema (Implementation) +---------------------------- + +The implementation parses the following structure. Extra fields (e.g. ``name``, ``port``, ``engine``) in the JSON are ignored. + +Root object: + +* ``defaultProfile`` (string, optional): Profile name to use when header is empty or profile not found. Default: ``"default"``. +* ``profiles`` (object, required): Map of profile name → profile object. + +Profile object (``ModelConfigProfile``): + +* ``routingStrategy`` (string): e.g. ``random``, ``pd``, ``least-latency``. +* ``promptLenBucketMinLength`` (int, optional): Lower bound for bucketing. Default: ``0``. 
If negative, normalized to ``0``. +* ``promptLenBucketMaxLength`` (int, optional): Upper bound for bucketing. Default: ``math.MaxInt32`` when ``0`` or omitted. +* ``combined`` (bool, optional): When true, indicates combined prefill/decode pod for PD routing. + +Single profile (backward compatible): + +.. code-block:: json + + { + "profiles": { + "default": { + "routingStrategy": "pd", + "promptLenBucketMinLength": 0, + "promptLenBucketMaxLength": 2048 + } + } + } + +Multiple profiles with default: + +.. code-block:: json + + { + "defaultProfile": "pd", + "profiles": { + "default": { + "routingStrategy": "random", + "promptLenBucketMinLength": 0, + "promptLenBucketMaxLength": 4096 + }, + "pd": { + "routingStrategy": "pd", + "promptLenBucketMinLength": 0, + "promptLenBucketMaxLength": 2048 + }, + "low-latency": { + "routingStrategy": "least-latency", + "promptLenBucketMinLength": 0, + "promptLenBucketMaxLength": 2048 + } + } + } + +Runtime Behavior +---------------- + +1. Gateway resolves config from pod annotation ``model.aibrix.ai/config``. ConfigMap lookup is not yet implemented. If no annotation, fall back to existing label-based resolution. +2. Gateway reads ``config-profile`` from request headers. If missing, use ``defaultProfile`` from the JSON, or ``"default"``. +3. Gateway selects the profile via ``GetProfile(profileName)``: exact match first, then fallback to ``defaultProfile``, then ``"default"``. +4. The resolved profile is stored on ``RoutingContext.ConfigProfile`` (``ResolvedConfigProfile``) for the request. +5. Routing strategy is derived from: request headers → ``ConfigProfile.RoutingStrategy`` → env ``ROUTING_ALGORITHM``. +6. PD router uses ``ResolveProfileFromPod(pod, routingCtx.ReqConfigProfile)`` with fallback to the default profile; prompt bounds and ``combined`` are read from the selected profile. + +Annotation Example (StormService pod template) +---------------------------------------------- + +.. 
code-block:: yaml + + template: + metadata: + labels: + app: sglang-qwen3-8b-1p1d-0-2k + model.aibrix.ai/name: qwen3-8B + annotations: + prometheus.io/scrape: "true" + prometheus.io/port: "30000" + prometheus.io/path: "/metrics" + model.aibrix.ai/config: | + { + "defaultProfile": "pd", + "profiles": { + "default": { + "routingStrategy": "random", + "promptLenBucketMinLength": 0, + "promptLenBucketMaxLength": 4096 + }, + "pd": { + "routingStrategy": "pd", + "promptLenBucketMinLength": 0, + "promptLenBucketMaxLength": 2048 + } + } + } + +Client Usage +------------ + +* Use default profile: do not set any header (or set ``config-profile: default``). +* Use a specific profile: set header ``config-profile: pd`` or ``config-profile: low-latency``. + +Implementation +------------- + +Package: ``pkg/plugins/gateway/configprofiles/`` + +* ``ModelConfigProfile``: struct with ``RoutingStrategy``, ``PromptLenBucketMinLength``, ``PromptLenBucketMaxLength``, ``Combined``. +* ``ModelConfigProfiles``: struct with ``DefaultProfile``, ``Profiles map[string]ModelConfigProfile``. +* ``ParseModelConfig(jsonStr)``: parses JSON; normalizes ``promptLenBucketMinLength`` (≥0) and ``promptLenBucketMaxLength`` (0→MaxInt32). +* ``GetProfile(name)``: returns profile by name; falls back to ``defaultProfile`` then ``"default"``. +* ``ResolveProfile(pods, headerProfile)``: iterates pods, returns first non-nil from ``ResolveProfileFromPod``. +* ``ResolveProfileFromPod(pod, headerProfile)``: reads ``model.aibrix.ai/config`` from pod, parses, returns ``GetProfile(headerProfile)``. +* Prompt length bounds normalization occurs in ``ParseModelConfig``: ``promptLenBucketMinLength`` (<0 → 0), ``promptLenBucketMaxLength`` (0 → ``math.MaxInt32``). + +Constants: ``ModelAnnoConfig`` (pkg/constants/model.go), ``HeaderConfigProfile`` (pkg/plugins/gateway/types.go). + +Gateway flow: + +* ``HandleRequestHeaders``: captures ``config-profile`` into ``ReqConfigProfile``. 
+* ``HandleRequestBody``: calls ``applyConfigProfile`` which resolves config from pod annotation, sets ``routingCtx.ConfigProfile``, and provides routing strategy to ``deriveRoutingStrategyFromContext``.
+* ``deriveRoutingStrategyFromContext``: chooses the routing strategy for the request using this precedence: (1) request header ``routing-strategy`` if present and non-empty; (2) ``routingCtx.ConfigProfile.RoutingStrategy`` from the resolved profile (config-profile + pod annotation); (3) environment default. Returns the strategy and whether it was explicitly set (used to validate and set ``routingCtx.Algorithm`` in ``HandleRequestBody``).
+
+PD router:
+
+* ``isPodSuitableForPromptLength(routingCtx, pod, promptLength)``: uses ``ResolveProfileFromPod(pod, routingCtx.ReqConfigProfile)`` for ``promptLenBucketMinLength``/``promptLenBucketMaxLength``.
+* ``isCombinedPod(routingCtx, pod)``: uses ``ResolveProfileFromPod(pod, routingCtx.ReqConfigProfile)`` for ``combined``.
+
+Backward Compatibility
+----------------------
+
+If no annotation is present, ``ResolveProfile`` returns nil. Gateway continues to use existing pod labels and env for routing strategy, port, engine, etc.
+
+Future Work
+-----------
+
+* ConfigMap lookup (wire when gateway config supports it).
+* Extend profile schema: ``port``, ``metricPort``, ``engine``, ``name`` for full parity with labels.
+* Use request-level ``ConfigProfile`` (from ``config-profile``) for PD bucketing instead of per-pod ``"pd"`` profile.
diff --git a/observability/grafana/AIBrix_Envoy_Gateway_Plugins_Dashboard.json b/observability/grafana/AIBrix_Envoy_Gateway_Plugins_Dashboard.json index d0ddbb9e1..a35d5a67d 100644 --- a/observability/grafana/AIBrix_Envoy_Gateway_Plugins_Dashboard.json +++ b/observability/grafana/AIBrix_Envoy_Gateway_Plugins_Dashboard.json @@ -4,8 +4,8 @@ { "builtIn": 1, "datasource": { - "type": "grafana", - "uid": "-- Grafana --" + "type": "datasource", + "uid": "grafana" }, "enable": true, "hide": true, @@ -18,44 +18,40 @@ "editable": true, "fiscalYearStartMonth": 0, "graphTooltip": 0, - "id": null, + "id": 4, "links": [], - "liveNow": false, "panels": [ { "collapsed": false, - "datasource": { - "type": "prometheus", - "uid": "prometheus" - }, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 }, - "id": 10, + "id": 34, "panels": [], - "title": "Router: vtc-basic", + "title": "Priority 0 - Service Health & Availability", "type": "row" }, { "datasource": { "type": "prometheus", - "uid": "prometheus" + "uid": "ff75su0268000c" }, - "description": "Shows whether the adaptive bucket size stays stable or jumps around. 
Big jumps = the algorithm is reacting too quickly.", "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { + "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", - "axisLabel": "Bucket Size", + "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, + "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none", @@ -64,8 +60,9 @@ "tooltip": false, "viz": false }, - "lineInterpolation": "smooth", - "lineWidth": 2, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, "pointSize": 5, "scaleDistribution": { "type": "linear" @@ -77,7 +74,7 @@ "mode": "none" }, "thresholdsStyle": { - "mode": "area" + "mode": "off" } }, "mappings": [], @@ -85,93 +82,305 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" + }, + { + "color": "red", + "value": 80 } ] }, - "unit": "none" + "unit": "short" }, - "overrides": [] + "overrides": [ + { + "__systemRef": "hideSeriesFrom", + "matcher": { + "id": "byNames", + "options": { + "mode": "exclude", + "names": [ + "qwen3-8B-200" + ], + "prefix": "All except:", + "readOnly": true + } + }, + "properties": [ + { + "id": "custom.hideFrom", + "value": { + "legend": false, + "tooltip": false, + "viz": true + } + } + ] + } + ] }, "gridPos": { - "h": 12, - "w": 16, + "h": 8, + "w": 24, "x": 0, "y": 1 }, - "id": 1, + "id": 10, "options": { + "alertThreshold": true, "legend": { - "calcs": ["min", "max", "mean", "stdDev"], + "calcs": [ + "lastNotNull", + "max" + ], "displayMode": "table", - "placement": "bottom", + "placement": "right", "showLegend": true }, "tooltip": { + "hideZeros": false, "mode": "multi", "sort": "none" } }, + "pluginVersion": "12.0.2", "targets": [ { "datasource": { "type": "prometheus", - "uid": "prometheus" + "uid": "ff75su0268000c" }, "editorMode": "code", - "expr": "vtc_bucket_size_active", - "legendFormat": "{{exported_pod}} ({{model}}) - {{namespace}}", + "expr": "sum by (model, 
code) (rate(gateway_request_total[5m]))", + "hide": false, + "instant": false, + "legendFormat": "{{model}}-{{code}}", "range": true, "refId": "A" } ], - "title": "VTC Bucket Size", + "title": "gateway_request_rate-nodata", "type": "timeseries" }, { "datasource": { "type": "prometheus", - "uid": "prometheus" + "uid": "ff75su0268000c" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] }, - "description": "How to read the VTC bucket size metric", "gridPos": { - "h": 12, - "w": 8, - "x": 16, - "y": 1 + "h": 8, + "w": 24, + "x": 0, + "y": 9 }, - "id": 2, + "id": 32, "options": { - "code": { - "language": "plaintext", - "showLineNumbers": false, - "showMiniMap": false + "alertThreshold": true, + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true }, - "content": "## VTC Bucket Size Metric\n\n**Metric**: `vtc_bucket_size_active` (gauge with pod,model labels)\n\n**Why we track it**: Shows whether the adaptive bucket size stays stable or jumps around. 
Big jumps indicate the algorithm is reacting too quickly.\n\n**How to read / act**:\n- **Smooth, gradual slope** → Algorithm is working well\n- **Saw-tooth jumps** → Algorithm needs tuning (increase minimum bucket size or lengthen adjustment window)\n\n**What to look for**:\n- **Stability**: Look for consistent patterns across pods\n- **Oscillations**: Watch for rapid up/down movements\n- **Correlation**: Check if changes correlate with load patterns\n\n**Configuration**:\n See env `AIBRIX_ROUTER_VTC_*`", - "mode": "markdown" + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } }, - "pluginVersion": "10.0.3", - "title": "Interpretation Guide", - "type": "text" + "pluginVersion": "12.0.2", + "targets": [ + { + "aggregator": "max", + "alias": "", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "currentTagKey": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "disableDownsampling": false, + "disableTextWrap": false, + "downsampleAggregator": "max", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "downsampleInterval": "", + "editorMode": "code", + "explicitTags": true, + "expr": "rate(num_requests_running[5m])", + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "role", + "type": "literal_or" + }, + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "literal_or" + }, + { + "filter": "*", + "groupBy": true, + "tagk": "pod", + "type": "literal_or" + } + ], + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "isCounter": false, + "legendFormat": "running-{{engine_type}}-{{roleset}}", + "metric": "inf.aibrix.num_requests_running", + "range": true, + "rateDownsampleType": "before_downsample", + "refId": "RUNNING", + "shouldComputeDelta": false, + "shouldComputeRate": false, + "shouldComputeTopK": false, + "tenant": 
"default", + "useBackend": false + }, + { + "aggregator": "max", + "alias": "", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "currentTagKey": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "disableTextWrap": false, + "downsampleAggregator": "avg", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "downsampleInterval": "", + "editorMode": "code", + "explicitTags": true, + "expr": "rate(num_requests_waiting[5m])", + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "role", + "type": "literal_or" + }, + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "literal_or" + }, + { + "filter": "*", + "groupBy": true, + "tagk": "pod", + "type": "literal_or" + } + ], + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "isCounter": false, + "legendFormat": "waiting-{{engine_type}}-{{roleset}}", + "metric": "inf.aibrix.num_requests_waiting", + "range": true, + "rateDownsampleType": "before_downsample", + "refId": "WAITING", + "shouldComputeDelta": false, + "shouldComputeRate": false, + "tenant": "default", + "useBackend": false + } + ], + "title": "requests_running/waiting_rate", + "type": "timeseries" }, { "datasource": { "type": "prometheus", - "uid": "prometheus" + "uid": "ff75su0268000c" }, - "description": "Shows how quickly the bucket size is changing. 
Large spikes indicate rapid adjustments that might need tuning.", "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { - "axisCenteredZero": true, + "axisBorderShow": false, + "axisCenteredZero": false, "axisColorMode": "text", - "axisLabel": "Change Rate", + "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, + "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none", @@ -180,8 +389,9 @@ "tooltip": false, "viz": false }, - "lineInterpolation": "smooth", - "lineWidth": 2, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, "pointSize": 5, "scaleDistribution": { "type": "linear" @@ -193,7 +403,7 @@ "mode": "none" }, "thresholdsStyle": { - "mode": "area" + "mode": "off" } }, "mappings": [], @@ -201,56 +411,93 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" + }, + { + "color": "red", + "value": 80 } ] }, - "unit": "none" + "unit": "short" }, "overrides": [] }, "gridPos": { - "h": 12, - "w": 24, + "h": 8, + "w": 12, "x": 0, - "y": 13 + "y": 17 }, - "id": 3, + "id": 24, "options": { + "alertThreshold": true, "legend": { - "calcs": ["min", "max", "mean", "stdDev"], + "calcs": [ + "lastNotNull" + ], "displayMode": "table", "placement": "bottom", "showLegend": true }, "tooltip": { + "hideZeros": false, "mode": "multi", "sort": "none" } }, + "pluginVersion": "12.0.2", "targets": [ { + "aggregator": "max", + "alias": "$tag_model:$tag_pod:awake", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", "datasource": { "type": "prometheus", - "uid": "prometheus" + "uid": "ff75su0268000c" }, + "disableDownsampling": true, + "downsampleAggregator": "max", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "downsampleInterval": "", "editorMode": "code", - "expr": "abs(deriv(vtc_bucket_size_active[1m]))", - 
"legendFormat": "{{exported_pod}} ({{model}}) - {{namespace}}", + "explicitTags": true, + "expr": "up{pod=~\"sglang-.*\"}", + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "literal_or" + }, + { + "filter": "*", + "groupBy": true, + "tagk": "pod", + "type": "literal_or" + } + ], + "legendFormat": "{{pod}}", + "metric": "inf.aibrix.engine_sleep_state", "range": true, - "refId": "A" + "rateDownsampleType": "before_downsample", + "refId": "A", + "tenant": "default" } ], - "title": "VTC Bucket Size Rate of Change", + "title": "Engine Sleep State", "type": "timeseries" }, { "datasource": { "type": "prometheus", - "uid": "prometheus" + "uid": "ff75su0268000c" }, - "description": "Time-to-First-Token latency by model, showing impact of router changes on user experience", "fieldConfig": { "defaults": { "color": { @@ -260,7 +507,7 @@ "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", - "axisLabel": "Latency (s)", + "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "barWidthFactor": 0.6, @@ -272,8 +519,9 @@ "tooltip": false, "viz": false }, - "lineInterpolation": "smooth", - "lineWidth": 2, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, "pointSize": 5, "scaleDistribution": { "type": "linear" @@ -285,7 +533,7 @@ "mode": "none" }, "thresholdsStyle": { - "mode": "area" + "mode": "off" } }, "mappings": [], @@ -293,92 +541,6553 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" + }, + { + "color": "red", + "value": 80 } ] }, - "unit": "s" + "unit": "short" }, "overrides": [] }, "gridPos": { - "h": 12, - "w": 16, - "x": 0, - "y": 25 + "h": 8, + "w": 12, + "x": 12, + "y": 17 }, - "id": 4, + "id": 64, "options": { + "alertThreshold": true, "legend": { - "calcs": ["min", "max", "mean", "stdDev"], + "calcs": [ + "lastNotNull" + ], "displayMode": "table", - "placement": "bottom", + "placement": "right", "showLegend": true }, "tooltip": { + 
"hideZeros": false, "mode": "multi", "sort": "none" } }, + "pluginVersion": "12.0.2", "targets": [ { + "aggregator": "max", + "alias": "$tag_model-pdqueue_threshold", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", "datasource": { "type": "prometheus", - "uid": "prometheus" + "uid": "ff75su0268000c" }, + "downsampleAggregator": "avg", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, "editorMode": "code", - "expr": "histogram_quantile(0.99, sum by(le, model_name) (rate(vllm:time_to_first_token_seconds_bucket[$__rate_interval])))", - "legendFormat": "P99 - {{model_name}}", + "expr": "pd_queue_exceeds_threshold", + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "iliteral_or" + }, + { + "filter": "*", + "groupBy": true, + "tagk": "pod_name", + "type": "literal_or" + } + ], + "hide": false, + "legendFormat": "__auto", + "metric": "inf.aibrix.pd_queue_exceeds_threshold", "range": true, - "refId": "A" + "rateDownsampleType": "before_downsample", + "refId": "pd_threshold", + "tenant": "default" + } + ], + "title": "pd_queue_threshold", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 25 + }, + "id": 68, + "panels": [], + "title": "Request Metrics", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 
5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 26 + }, + "id": 66, + "options": { + "alertThreshold": true, + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "12.0.2", + "targets": [ { + "aggregator": "sum", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", "datasource": { "type": "prometheus", - "uid": "prometheus" + "uid": "ff75su0268000c" }, + "disableDownsampling": true, + "disableTextWrap": false, + "downsampleAggregator": "avg", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, "editorMode": "code", - "expr": "histogram_quantile(0.50, sum by(le, model_name) (rate(vllm:time_to_first_token_seconds_bucket[$__rate_interval])))", - "legendFormat": "P50 - {{model_name}}", + "explicitTags": true, + "expr": "rate(sglang:prompt_tokens_total[5m])", + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "wildcard" + }, + { + "filter": "*", + "groupBy": true, + "tagk": "bucket", + "type": "literal_or" + } + ], + "fullMetaSearch": false, + "includeNullMetadata": true, + "legendFormat": "{{pod}}", + "metric": "inf.aibrix.gateway_prompt_token_bucket_total", "range": true, - "refId": "B" + "rateDownsampleType": "before_downsample", + "refId": "A", + "tenant": "default", + "useBackend": false } ], - "title": "Time to First Token by 
Model", + "title": "prompt_token_rate", "type": "timeseries" }, { "datasource": { "type": "prometheus", - "uid": "prometheus" + "uid": "ff75su0268000c" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] }, - "description": "How to interpret TTFT patterns and their correlation with router changes", "gridPos": { - "h": 12, - "w": 8, - "x": 16, - "y": 25 + "h": 8, + "w": 12, + "x": 12, + "y": 26 }, - "id": 5, + "id": 75, "options": { - "code": { - "language": "plaintext", - "showLineNumbers": false, - "showMiniMap": false + "alertThreshold": true, + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true }, - "content": "## TTFT Monitoring Guide\n\n**What to look for**:\n- **Healthy Pattern**: P99 close to P50\n- **Warning Sign**: P99 widens significantly\n\n**Correlation with Router**:\n- If P99 spikes when bucket size changes → router needs tuning\n- If P99 stays stable during changes → router is working well\n\n**Model Differences**:\n- Different models may show different latency characteristics\n- Compare models to identify performance differences\n\n**Action 
Items**:\n- If P99 widens: Check bucket size changes\n- If model differences grow: Review routing fairness", - "mode": "markdown" + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } }, - "pluginVersion": "10.0.3", - "title": "TTFT Interpretation Guide", - "type": "text" + "pluginVersion": "12.0.2", + "targets": [ + { + "aggregator": "sum", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "disableDownsampling": true, + "disableTextWrap": false, + "downsampleAggregator": "avg", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "editorMode": "code", + "explicitTags": true, + "expr": "rate(gateway_completion_token_bucket_total[1m])", + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "wildcard" + }, + { + "filter": "*", + "groupBy": true, + "tagk": "bucket", + "type": "literal_or" + } + ], + "fullMetaSearch": false, + "includeNullMetadata": true, + "legendFormat": "{{roleset}}-{{role}}-{{bucket}} tokens", + "metric": "inf.aibrix.gateway_completion_token_bucket_total", + "range": true, + "rateDownsampleType": "before_downsample", + "refId": "A", + "tenant": "default", + "useBackend": false + } + ], + "title": "generation_token rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", 
+ "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 34 + }, + "id": 80, + "options": { + "alertThreshold": true, + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "12.0.2", + "targets": [ + { + "aggregator": "sum", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "disableDownsampling": true, + "disableTextWrap": false, + "downsampleAggregator": "avg", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "editorMode": "code", + "explicitTags": true, + "expr": "pd_selected_prefill_pod_total", + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "wildcard" + }, + { + "filter": "*", + "groupBy": true, + "tagk": "bucket", + "type": "literal_or" + } + ], + "fullMetaSearch": false, + "includeNullMetadata": true, + "legendFormat": "{{model}}-prefill-roleset:{{roleset}}", + "metric": "inf.aibrix.gateway_prompt_token_bucket_total", + "range": true, + "rateDownsampleType": "before_downsample", + "refId": "A", + "tenant": "default", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "editorMode": "code", + "expr": "pd_selected_decode_pod_total", + "hide": false, + "instant": 
false, + "legendFormat": "{{model}}-decode-roleset:{{roleset}}", + "range": true, + "refId": "B" + } + ], + "title": "pd_request_counter", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 34 + }, + "id": 77, + "options": { + "alertThreshold": true, + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "12.0.2", + "targets": [ + { + "aggregator": "sum", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "disableDownsampling": true, + "downsampleAggregator": "avg", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "editorMode": "code", + "explicitTags": true, + "expr": 
"gateway_routing_time_bucket_total", + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "wildcard" + }, + { + "filter": "*", + "groupBy": true, + "tagk": "bucket", + "type": "literal_or" + } + ], + "legendFormat": "{{model}}-range[{{bucket}}]", + "metric": "inf.aibrix.gateway_routing_time_bucket_total", + "range": true, + "rateDownsampleType": "before_downsample", + "refId": "A", + "tenant": "default" + } + ], + "title": "routing_time_taken", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 42 + }, + "id": 81, + "options": { + "alertThreshold": true, + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "12.0.2", + "targets": [ + { + "aggregator": "sum", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + 
"currentFilterValue": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "disableDownsampling": true, + "disableTextWrap": false, + "downsampleAggregator": "avg", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "editorMode": "code", + "explicitTags": true, + "expr": "rate(gateway_prefill_time_bucket_total[5m])", + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "wildcard" + }, + { + "filter": "*", + "groupBy": true, + "tagk": "bucket", + "type": "literal_or" + } + ], + "fullMetaSearch": false, + "includeNullMetadata": true, + "legendFormat": "{{model}}-prefill-roleset:{{roleset}}-[{{bucket}}]", + "metric": "inf.aibrix.gateway_prompt_token_bucket_total", + "range": true, + "rateDownsampleType": "before_downsample", + "refId": "A", + "tenant": "default", + "useBackend": false + } + ], + "title": "prefill_time bucket ", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 
12, + "x": 12, + "y": 42 + }, + "id": 82, + "options": { + "alertThreshold": true, + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "12.0.2", + "targets": [ + { + "aggregator": "sum", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "disableDownsampling": true, + "disableTextWrap": false, + "downsampleAggregator": "avg", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "editorMode": "code", + "explicitTags": true, + "expr": "rate(gateway_decode_time_bucket_total[5m])", + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "wildcard" + }, + { + "filter": "*", + "groupBy": true, + "tagk": "bucket", + "type": "literal_or" + } + ], + "fullMetaSearch": false, + "includeNullMetadata": true, + "legendFormat": "{{model}}-decode-roleset:{{roleset}}-[{{bucket}}]", + "metric": "inf.aibrix.gateway_prompt_token_bucket_total", + "range": true, + "rateDownsampleType": "before_downsample", + "refId": "A", + "tenant": "default", + "useBackend": false + } + ], + "title": "decode_time bucket", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation":
"linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 50 + }, + "id": 70, + "options": { + "alertThreshold": true, + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "12.0.2", + "targets": [ + { + "aggregator": "sum", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "disableDownsampling": true, + "disableTextWrap": false, + "downsampleAggregator": "avg", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "editorMode": "code", + "explicitTags": true, + "expr": "sglang:kv_transfer_latency_ms", + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "wildcard" + }, + { + "filter": "*", + "groupBy": true, + "tagk": "bucket", + "type": "literal_or" + } + ], + "fullMetaSearch": false, + "includeNullMetadata": true, + "legendFormat": "{{pod}}", + "metric": "inf.aibrix.gateway_kv_transfer_time_bucket_total", + "range": true, + "rateDownsampleType": "before_downsample", + "refId": "A", + "tenant": "default", + "useBackend": false + } + ], + "title": "kv_transfer_time", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "fieldConfig": { + "defaults": { + "color": { + 
"mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 50 + }, + "id": 71, + "options": { + "alertThreshold": true, + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.0.2", + "targets": [ + { + "aggregator": "sum", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "disableDownsampling": true, + "disableTextWrap": false, + "downsampleAggregator": "avg", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "editorMode": "code", + "explicitTags": true, + "expr": "time_to_first_token_seconds_p99", + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "wildcard" + }, + { + "filter": "*", + "groupBy": true, + "tagk": "bucket", + "type": "literal_or" + } + ], + "fullMetaSearch": false, + "includeNullMetadata": true, + 
"legendFormat": "{{model}}-{{engine_type}}-p99", + "metric": "inf.aibrix.gateway_ttft_bucket_total", + "range": true, + "rateDownsampleType": "before_downsample", + "refId": "A", + "tenant": "default", + "useBackend": false + }, + { + "aggregator": "sum", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "disableDownsampling": true, + "disableTextWrap": false, + "downsampleAggregator": "avg", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "editorMode": "code", + "explicitTags": true, + "expr": "time_to_first_token_seconds_p90", + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "wildcard" + }, + { + "filter": "*", + "groupBy": true, + "tagk": "bucket", + "type": "literal_or" + } + ], + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "legendFormat": "{{model}}-{{engine_type}}-p90", + "metric": "inf.aibrix.gateway_ttft_bucket_total", + "range": true, + "rateDownsampleType": "before_downsample", + "refId": "B", + "tenant": "default", + "useBackend": false + }, + { + "aggregator": "sum", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "disableDownsampling": true, + "disableTextWrap": false, + "downsampleAggregator": "avg", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "editorMode": "code", + "explicitTags": true, + "expr": "time_to_first_token_seconds_p50", + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "wildcard" + }, + { + "filter": "*", + "groupBy": true, + "tagk": "bucket", + "type": "literal_or" + } + ], + "fullMetaSearch": false, + 
"hide": false, + "includeNullMetadata": true, + "legendFormat": "{{model}}-{{engine_type}}-p50", + "metric": "inf.aibrix.gateway_ttft_bucket_total", + "range": true, + "rateDownsampleType": "before_downsample", + "refId": "C", + "tenant": "default", + "useBackend": false + } + ], + "title": "ttft_time", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 58 + }, + "id": 83, + "options": { + "alertThreshold": true, + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.0.2", + "targets": [ + { + "aggregator": "sum", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "disableDownsampling": true, + "disableTextWrap": false, + "downsampleAggregator": "avg", + 
"downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "editorMode": "code", + "explicitTags": true, + "expr": "time_per_output_token_seconds_p99", + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "wildcard" + }, + { + "filter": "*", + "groupBy": true, + "tagk": "bucket", + "type": "literal_or" + } + ], + "fullMetaSearch": false, + "includeNullMetadata": true, + "legendFormat": "{{model}}-{{engine_type}}-p99", + "metric": "inf.aibrix.gateway_ttft_bucket_total", + "range": true, + "rateDownsampleType": "before_downsample", + "refId": "A", + "tenant": "default", + "useBackend": false + }, + { + "aggregator": "sum", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "disableDownsampling": true, + "disableTextWrap": false, + "downsampleAggregator": "avg", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "editorMode": "code", + "explicitTags": true, + "expr": "time_per_output_token_seconds_p90", + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "wildcard" + }, + { + "filter": "*", + "groupBy": true, + "tagk": "bucket", + "type": "literal_or" + } + ], + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "legendFormat": "{{model}}-{{engine_type}}-p90", + "metric": "inf.aibrix.gateway_ttft_bucket_total", + "range": true, + "rateDownsampleType": "before_downsample", + "refId": "B", + "tenant": "default", + "useBackend": false + }, + { + "aggregator": "sum", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "disableDownsampling": true, + "disableTextWrap": false, + 
"downsampleAggregator": "avg", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "editorMode": "code", + "explicitTags": true, + "expr": "time_per_output_token_seconds_p50", + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "wildcard" + }, + { + "filter": "*", + "groupBy": true, + "tagk": "bucket", + "type": "literal_or" + } + ], + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "legendFormat": "{{model}}-{{engine_type}}-p50", + "metric": "inf.aibrix.gateway_ttft_bucket_total", + "range": true, + "rateDownsampleType": "before_downsample", + "refId": "C", + "tenant": "default", + "useBackend": false + } + ], + "title": "tpot_time", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 58 + }, + "id": 73, + "options": { + "alertThreshold": true, + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "right", + 
"showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "12.0.2", + "targets": [ + { + "aggregator": "sum", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "disableDownsampling": true, + "downsampleAggregator": "avg", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "editorMode": "code", + "explicitTags": true, + "expr": "gateway_total_time_bucket_total", + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "wildcard" + }, + { + "filter": "*", + "groupBy": true, + "tagk": "bucket", + "type": "literal_or" + } + ], + "legendFormat": "{{model}}-{{bucket}}", + "metric": "inf.aibrix.gateway_total_time_bucket_total", + "range": true, + "rateDownsampleType": "before_downsample", + "refId": "A", + "tenant": "default" + } + ], + "title": "total_time", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 66 + }, + "id": 36, + "panels": [], + "title": "Priority 1 - End-to-End Latency (User-visible SLOs)", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { 
+ "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 67 + }, + "id": 21, + "options": { + "alertThreshold": true, + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "12.0.2", + "targets": [ + { + "aggregator": "max", + "alias": "$tag_model-$tag_role-p99", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "currentTagKey": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "disableDownsampling": true, + "disableTextWrap": false, + "downsampleAggregator": "max", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "downsampleInterval": "", + "editorMode": "builder", + "explicitTags": true, + "expr": "e2e_request_latency_seconds_p99", + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "literal_or" + }, + { + "filter": "*", + "groupBy": true, + "tagk": "role", + "type": "literal_or" + } + ], + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "isCounter": true, + "legendFormat": "{{model}}-{{engine_type}}-p99", + "metric": "inf.aibrix.e2e_request_latency_seconds_p99", + "range": true, + "rateDownsampleType": "before_downsample", + "refId": "e2e_request_latency_p99", + "shouldComputeDelta": false, + "shouldComputeRate": false, + "tenant": "default", + "useBackend": false + }, + { + "aggregator": "max", + "alias": "$tag_model-$tag_role-p90", + "currentFilterGroupBy": false, + "currentFilterKey": "", + 
"currentFilterType": "literal_or", + "currentFilterValue": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "disableDownsampling": true, + "disableTextWrap": false, + "downsampleAggregator": "avg", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "editorMode": "builder", + "expr": "e2e_request_latency_seconds_p90", + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "iliteral_or" + }, + { + "filter": "*", + "groupBy": true, + "tagk": "role", + "type": "literal_or" + } + ], + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "legendFormat": "{{model}}-{{engine_type}}-p90", + "metric": "inf.aibrix.e2e_request_latency_seconds_p90", + "range": true, + "rateDownsampleType": "before_downsample", + "refId": "e2e_request_latency_p90", + "tenant": "default", + "useBackend": false + }, + { + "aggregator": "max", + "alias": "$tag_model-$tag_role-p50", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "currentTagKey": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "disableDownsampling": true, + "disableTextWrap": false, + "downsampleAggregator": "max", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "editorMode": "builder", + "explicitTags": true, + "expr": "e2e_request_latency_seconds_p50", + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "literal_or" + }, + { + "filter": "*", + "groupBy": true, + "tagk": "role", + "type": "literal_or" + } + ], + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "isCounter": true, + "legendFormat": "{{model}}-{{engine_type}}-p50", + "metric": "inf.aibrix.e2e_request_latency_seconds_p50", + "range": true, + "rateDownsampleType": "before_downsample", + 
"refId": "e2e_request_latency_p50", + "shouldComputeDelta": false, + "shouldComputeRate": false, + "tenant": "default", + "useBackend": false + } + ], + "title": "e2e_request_latency_seconds", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 67 + }, + "id": 9, + "options": { + "alertThreshold": true, + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "12.0.2", + "targets": [ + { + "aggregator": "avg", + "alias": "$tag_model_name:$tag_role:$tag_pod", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "currentTagKey": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "disableDownsampling": true, + "downsampleAggregator": "count", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", 
+ "downsampleFillPolicyDisabled": false, + "editorMode": "code", + "explicitTags": true, + "expr": "sum(up{pod=~\".*prefill.*\"})", + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model_name", + "type": "literal_or" + }, + { + "filter": "*", + "groupBy": true, + "tagk": "role", + "type": "literal_or" + }, + { + "filter": "*", + "groupBy": true, + "tagk": "pod", + "type": "literal_or" + } + ], + "legendFormat": "prefill", + "metric": "inf.aibrix.model_replicas", + "range": true, + "rateDownsampleType": "before_downsample", + "refId": "A", + "shouldComputeDelta": false, + "shouldComputeRate": false, + "tenant": "default" + }, + { + "aggregator": "avg", + "alias": "$tag_model_name:$tag_role:$tag_pod", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "currentTagKey": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "disableDownsampling": true, + "downsampleAggregator": "count", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "editorMode": "code", + "explicitTags": true, + "expr": "sum(up{pod=~\".*decode.*\"})", + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model_name", + "type": "literal_or" + }, + { + "filter": "*", + "groupBy": true, + "tagk": "role", + "type": "literal_or" + }, + { + "filter": "*", + "groupBy": true, + "tagk": "pod", + "type": "literal_or" + } + ], + "hide": false, + "legendFormat": "decode", + "metric": "inf.aibrix.model_replicas", + "range": true, + "rateDownsampleType": "before_downsample", + "refId": "B", + "shouldComputeDelta": false, + "shouldComputeRate": false, + "tenant": "default" + } + ], + "title": "P/D Replica Count", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { +
"axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 75 + }, + "id": 31, + "options": { + "alertThreshold": true, + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "12.0.2", + "targets": [ + { + "aggregator": "sum", + "alias": "$tag_model-p50", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "currentTagKey": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "downsampleAggregator": "max", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "downsampleInterval": "", + "editorMode": "code", + "expr": "request_inference_time_seconds_p50", + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "literal_or" + } + ], + "hide": true, + "isCounter": true, + "legendFormat": "__auto", + "metric": "inf.aibrix.request_inference_time_seconds_p50", + "range": true, + "rateDownsampleType": "before_downsample", + "refId": "A", + 
"shouldComputeDelta": false, + "shouldComputeRate": false, + "tenant": "default" + }, + { + "aggregator": "sum", + "alias": "$tag_model-p90", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "currentTagKey": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "downsampleAggregator": "max", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "downsampleInterval": "", + "editorMode": "code", + "expr": "request_inference_time_seconds_p90", + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "literal_or" + } + ], + "hide": true, + "isCounter": true, + "legendFormat": "__auto", + "metric": "inf.aibrix.request_inference_time_seconds_p90", + "range": true, + "rateDownsampleType": "before_downsample", + "refId": "B", + "shouldComputeDelta": false, + "shouldComputeRate": false, + "tenant": "default" + }, + { + "aggregator": "max", + "alias": "$tag_model-$tag_role-p99", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "currentTagKey": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "disableDownsampling": true, + "downsampleAggregator": "max", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "downsampleInterval": "", + "editorMode": "code", + "explicitTags": true, + "expr": "request_inference_time_seconds_p99", + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "literal_or" + }, + { + "filter": "*", + "groupBy": true, + "tagk": "role", + "type": "literal_or" + } + ], + "hide": false, + "isCounter": true, + "legendFormat": "__auto", + "metric": "inf.aibrix.request_inference_time_seconds_p99", + "range": true, + "rateDownsampleType": "before_downsample", + "refId": "P99", + 
"shouldComputeDelta": false, + "shouldComputeRate": false, + "tenant": "default" + } + ], + "title": "request inference time", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 75 + }, + "id": 29, + "options": { + "alertThreshold": true, + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "12.0.2", + "targets": [ + { + "aggregator": "max", + "alias": "$tag_model-$tag_role-p50", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "disableDownsampling": true, + "disableTextWrap": false, + "downsampleAggregator": "avg", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "downsampleInterval": "", + 
"editorMode": "code", + "explicitTags": true, + "expr": "histogram_quantile(0.50, sum(rate(sglang:queue_time_seconds_bucket[5m])) by (le, model_name))", + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "iliteral_or" + }, + { + "filter": "*", + "groupBy": true, + "tagk": "role", + "type": "literal_or" + } + ], + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "legendFormat": "{{model_name}}-p50", + "metric": "inf.aibrix.request_queue_time_seconds_p50", + "range": true, + "rateDownsampleType": "before_downsample", + "refId": "A", + "shouldComputeRate": false, + "tenant": "default", + "useBackend": false + }, + { + "aggregator": "max", + "alias": "$tag_model-$tag_role-p90", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "disableDownsampling": true, + "downsampleAggregator": "avg", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "downsampleInterval": "", + "editorMode": "code", + "explicitTags": true, + "expr": "histogram_quantile(0.90, sum(rate(sglang:queue_time_seconds_bucket[5m])) by (le, model_name))", + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "iliteral_or" + }, + { + "filter": "*", + "groupBy": true, + "tagk": "role", + "type": "literal_or" + } + ], + "hide": false, + "legendFormat": "{{model_name}}-p90", + "metric": "inf.aibrix.request_queue_time_seconds_p90", + "range": true, + "rateDownsampleType": "before_downsample", + "refId": "B", + "shouldComputeRate": false, + "tenant": "default" + }, + { + "aggregator": "max", + "alias": "$tag_model-$tag_role-p99", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "datasource": { + "type": "prometheus", + "uid": 
"ff75su0268000c" + }, + "disableDownsampling": true, + "downsampleAggregator": "avg", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "downsampleInterval": "", + "editorMode": "code", + "explicitTags": true, + "expr": "histogram_quantile(0.99, sum(rate(sglang:queue_time_seconds_bucket[5m])) by (le, model_name))", + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "iliteral_or" + }, + { + "filter": "*", + "groupBy": true, + "tagk": "role", + "type": "literal_or" + } + ], + "hide": false, + "legendFormat": "{{model_name}}-p99", + "metric": "inf.aibrix.request_queue_time_seconds_p99", + "range": true, + "rateDownsampleType": "before_downsample", + "refId": "P99", + "shouldComputeRate": false, + "tenant": "default" + } + ], + "title": "Request queue time ", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 83 + }, + "id": 37, + "options": { + "alertThreshold": true, + "legend": { 
+ "calcs": [ + "lastNotNull" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "12.0.2", + "targets": [ + { + "aggregator": "max", + "alias": "", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "currentTagKey": "model", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "disableDownsampling": true, + "downsampleAggregator": "count", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "downsampleInterval": "", + "editorMode": "code", + "explicitTags": true, + "expr": "http_request_duration_seconds_p50", + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "role", + "type": "literal_or" + } + ], + "legendFormat": "__auto", + "metric": "inf.aibrix.http_request_duration_seconds_p50", + "range": true, + "rateDownsampleType": "before_downsample", + "refId": "A", + "shouldComputeDelta": false, + "shouldComputeRate": false, + "tenant": "default" + }, + { + "aggregator": "max", + "alias": "", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "currentTagKey": "model", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "disableDownsampling": true, + "downsampleAggregator": "count", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "downsampleInterval": "", + "editorMode": "code", + "explicitTags": true, + "expr": "http_request_duration_seconds_p90", + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "role", + "type": "literal_or" + } + ], + "hide": false, + "legendFormat": "__auto", + "metric": "inf.aibrix.http_request_duration_seconds_p90", + "range": true, + "rateDownsampleType": 
"before_downsample", + "refId": "B", + "shouldComputeDelta": false, + "shouldComputeRate": false, + "tenant": "default" + }, + { + "aggregator": "max", + "alias": "", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "currentTagKey": "model", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "disableDownsampling": true, + "downsampleAggregator": "count", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "downsampleInterval": "", + "editorMode": "code", + "explicitTags": true, + "expr": "http_request_duration_seconds_p99", + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "role", + "type": "literal_or" + } + ], + "hide": false, + "legendFormat": "__auto", + "metric": "inf.aibrix.http_request_duration_seconds_p99", + "range": true, + "rateDownsampleType": "before_downsample", + "refId": "C", + "shouldComputeDelta": false, + "shouldComputeRate": false, + "tenant": "default" + } + ], + "title": "HTTPRequestDurationSeconds", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ 
+ { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 83 + }, + "id": 38, + "options": { + "alertThreshold": true, + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "12.0.2", + "targets": [ + { + "aggregator": "max", + "alias": "", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "currentTagKey": "model", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "downsampleAggregator": "count", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "downsampleInterval": "", + "editorMode": "code", + "explicitTags": true, + "expr": "http_request_duration_highr_seconds_p50", + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "role", + "type": "literal_or" + } + ], + "legendFormat": "__auto", + "metric": "inf.aibrix.http_request_duration_highr_seconds_p50", + "range": true, + "rateDownsampleType": "before_downsample", + "refId": "A", + "shouldComputeDelta": false, + "shouldComputeRate": false, + "tenant": "default" + }, + { + "aggregator": "max", + "alias": "", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "currentTagKey": "model", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "downsampleAggregator": "count", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "downsampleInterval": "", + "editorMode": "code", + "explicitTags": true, + "expr": "http_request_duration_highr_seconds_p90", + "filters": [ + { + "filter": "*", + "groupBy": 
true, + "tagk": "role", + "type": "literal_or" + } + ], + "hide": false, + "legendFormat": "__auto", + "metric": "inf.aibrix.http_request_duration_highr_seconds_p90", + "range": true, + "rateDownsampleType": "before_downsample", + "refId": "B", + "shouldComputeDelta": false, + "shouldComputeRate": false, + "tenant": "default" + }, + { + "aggregator": "max", + "alias": "", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "currentTagKey": "model", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "downsampleAggregator": "count", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "downsampleInterval": "", + "editorMode": "code", + "explicitTags": true, + "expr": "http_request_duration_highr_seconds_p99", + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "role", + "type": "literal_or" + } + ], + "hide": false, + "legendFormat": "__auto", + "metric": "inf.aibrix.http_request_duration_highr_seconds_p99", + "range": true, + "rateDownsampleType": "before_downsample", + "refId": "C", + "shouldComputeDelta": false, + "shouldComputeRate": false, + "tenant": "default" + } + ], + "title": "HTTPRequestDurationHighRSeconds", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 91 + }, + "id": 17, + "panels": [], + "title": "Priority 2 - Time To First Token (Prefill Performance)", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + 
"tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 92 + }, + "id": 14, + "options": { + "alertThreshold": true, + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "12.0.2", + "targets": [ + { + "aggregator": "max", + "alias": "$tag_model-p50", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "disableDownsampling": true, + "downsampleAggregator": "avg", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "editorMode": "code", + "explicitTags": true, + "expr": "time_to_first_token_seconds_p50", + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "iliteral_or" + } + ], + "hide": false, + "legendFormat": "{{model}}-{{roleset}}-{{role}}-p50", + "metric": "inf.aibrix.time_to_first_token_seconds_p50", + "range": true, + "rateDownsampleType": "before_downsample", + "refId": "P50", + "tenant": "default" + }, + { + "aggregator": "max", + "alias": "$tag_model-p50", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "datasource": { + "type": "prometheus", + "uid": 
"ff75su0268000c" + }, + "disableDownsampling": true, + "downsampleAggregator": "avg", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "editorMode": "code", + "explicitTags": true, + "expr": "time_to_first_token_seconds_p90", + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "iliteral_or" + } + ], + "hide": false, + "legendFormat": "{{model}}-{{roleset}}-{{role}}-p90", + "metric": "inf.aibrix.time_to_first_token_seconds_p90", + "range": true, + "rateDownsampleType": "before_downsample", + "refId": "A", + "tenant": "default" + }, + { + "aggregator": "max", + "alias": "$tag_model-p99", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "disableDownsampling": true, + "downsampleAggregator": "avg", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "editorMode": "code", + "explicitTags": true, + "expr": "time_to_first_token_seconds_p99", + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "iliteral_or" + } + ], + "hide": false, + "legendFormat": "{{model}}-{{roleset}}-{{role}}-p99", + "metric": "inf.aibrix.time_to_first_token_seconds_p99", + "range": true, + "rateDownsampleType": "before_downsample", + "refId": "B", + "tenant": "default" + } + ], + "title": "time_to_first_token_seconds", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + 
"hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 92 + }, + "id": 19, + "options": { + "alertThreshold": true, + "legend": { + "calcs": [ + "mean", + "max", + "min" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "12.0.2", + "targets": [ + { + "aggregator": "avg", + "alias": "$tag_model-p99", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "currentTagKey": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "downsampleAggregator": "max", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "editorMode": "code", + "expr": "request_prefill_time_seconds_p50", + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "literal_or" + } + ], + "isCounter": true, + "legendFormat": "__auto", + "metric": "inf.aibrix.request_prefill_time_seconds_p99", + "range": true, + "rateDownsampleType": "before_downsample", + "refId": "A", + "shouldComputeDelta": false, + "shouldComputeRate": false, + "tenant": "default" + }, + { + "aggregator": "avg", + "alias": "$tag_model-p90", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + 
"currentTagKey": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "downsampleAggregator": "max", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "editorMode": "code", + "expr": "request_prefill_time_seconds_p90", + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "literal_or" + } + ], + "hide": false, + "isCounter": true, + "legendFormat": "__auto", + "metric": "inf.aibrix.request_prefill_time_seconds_p90", + "range": true, + "rateDownsampleType": "before_downsample", + "refId": "B", + "shouldComputeDelta": false, + "shouldComputeRate": false, + "tenant": "default" + }, + { + "aggregator": "avg", + "alias": "$tag_model-p50", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "currentTagKey": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "downsampleAggregator": "max", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "editorMode": "code", + "expr": "request_prefill_time_seconds_p99", + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "literal_or" + } + ], + "hide": false, + "isCounter": true, + "legendFormat": "__auto", + "metric": "inf.aibrix.request_prefill_time_seconds_p50", + "range": true, + "rateDownsampleType": "before_downsample", + "refId": "C", + "shouldComputeDelta": false, + "shouldComputeRate": false, + "tenant": "default" + } + ], + "title": "request_prefill_time_seconds", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, 
+ "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 100 + }, + "id": 42, + "options": { + "alertThreshold": true, + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "12.0.2", + "targets": [ + { + "aggregator": "max", + "alias": "$tag_model-p50", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "currentTagKey": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "disableDownsampling": true, + "downsampleAggregator": "max", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "editorMode": "code", + "explicitTags": true, + "expr": "rate(gateway_prompt_token_bucket_total[5m])", + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "literal_or" + } + ], + "hide": false, + "isCounter": true, + "legendFormat": "{{model}}-{{role}}-{{roleset}}-{{bucket}}", + "metric": "inf.aibrix.request_prompt_tokens_p50", + "range": true, + "rateDownsampleType": "before_downsample", + "refId": "A", + "shouldComputeDelta": false, + "shouldComputeRate": false, + "tenant": 
"default" + } + ], + "title": "request_prompt_tokens rate", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 108 + }, + "id": 40, + "panels": [], + "title": "Priority 2 - Time To First Token (Decode Performance)", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 109 + }, + "id": 84, + "options": { + "alertThreshold": true, + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "12.0.2", + "targets": [ + { + "aggregator": "max", + "alias": "$tag_model-p50", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "disableDownsampling": true, + "downsampleAggregator": "avg", + "downsampleAggregatorDisabled": false, + 
"downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "editorMode": "code", + "explicitTags": true, + "expr": "time_per_output_token_seconds_p50", + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "iliteral_or" + } + ], + "hide": false, + "legendFormat": "{{model}}-{{roleset}}-{{role}}-p50", + "metric": "inf.aibrix.time_per_output_token_seconds_p50", + "range": true, + "rateDownsampleType": "before_downsample", + "refId": "P50", + "tenant": "default" + }, + { + "aggregator": "max", + "alias": "$tag_model-p90", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "disableDownsampling": true, + "downsampleAggregator": "avg", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "editorMode": "code", + "explicitTags": true, + "expr": "time_per_output_token_seconds_p90", + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "iliteral_or" + } + ], + "hide": false, + "legendFormat": "{{model}}-{{roleset}}-{{role}}-p90", + "metric": "inf.aibrix.time_per_output_token_seconds_p90", + "range": true, + "rateDownsampleType": "before_downsample", + "refId": "A", + "tenant": "default" + }, + { + "aggregator": "max", + "alias": "$tag_model-p99", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "disableDownsampling": true, + "downsampleAggregator": "avg", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "editorMode": "code", + "explicitTags": true, + "expr": "time_per_output_token_seconds_p99", + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": 
"iliteral_or" + } + ], + "hide": false, + "legendFormat": "{{model}}-{{roleset}}-{{role}}-p99", + "metric": "inf.aibrix.time_per_output_token_seconds_p99", + "range": true, + "rateDownsampleType": "before_downsample", + "refId": "B", + "tenant": "default" + } + ], + "title": "time_per_output_token_seconds", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 109 + }, + "id": 18, + "options": { + "alertThreshold": true, + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.0.2", + "targets": [ + { + "aggregator": "avg", + "alias": "$tag_model-p99", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "currentTagKey": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "downsampleAggregator": "avg", + "downsampleAggregatorDisabled": false, + 
"downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "editorMode": "code", + "expr": "request_decode_time_seconds_p99", + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "literal_or" + } + ], + "isCounter": true, + "legendFormat": "__auto", + "metric": "inf.aibrix.request_decode_time_seconds_p99", + "range": true, + "rateDownsampleType": "before_downsample", + "refId": "A", + "shouldComputeDelta": false, + "shouldComputeRate": false, + "tenant": "default" + }, + { + "aggregator": "avg", + "alias": "$tag_model-p90", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "currentTagKey": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "downsampleAggregator": "avg", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "downsampleInterval": "", + "editorMode": "code", + "expr": "request_decode_time_seconds_p90", + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "literal_or" + } + ], + "hide": false, + "isCounter": true, + "legendFormat": "__auto", + "metric": "inf.aibrix.request_decode_time_seconds_p90", + "range": true, + "rateDownsampleType": "before_downsample", + "refId": "B", + "shouldComputeDelta": false, + "shouldComputeRate": false, + "tenant": "default" + }, + { + "aggregator": "avg", + "alias": "$tag_model-p50", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "currentTagKey": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "downsampleAggregator": "avg", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "editorMode": "code", + "expr": "request_decode_time_seconds_p50", + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "literal_or" + } + ], + "hide": false, + "isCounter": true, + "legendFormat": "__auto", + "metric": "inf.aibrix.request_decode_time_seconds_p50", + "range": true, + "rateDownsampleType": "before_downsample", + "refId": "C", + 
"shouldComputeDelta": false, + "shouldComputeRate": false, + "tenant": "default" + } + ], + "title": "request_decode_time_seconds", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 117 + }, + "id": 85, + "options": { + "alertThreshold": true, + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "12.0.2", + "targets": [ + { + "aggregator": "max", + "alias": "$tag_model-p50", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "currentTagKey": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "disableDownsampling": true, + "downsampleAggregator": "max", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "editorMode": "code", + "explicitTags": 
true, + "expr": "avg_generation_throughput_toks_per_s", + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "literal_or" + } + ], + "hide": false, + "isCounter": true, + "legendFormat": "{{model}}-{{role}}-{{roleset}}", + "metric": "inf.aibrix.request_prompt_tokens_p50", + "range": true, + "rateDownsampleType": "before_downsample", + "refId": "A", + "shouldComputeDelta": false, + "shouldComputeRate": false, + "tenant": "default" + } + ], + "title": "generation_token_rate/s", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 117 + }, + "id": 45, + "options": { + "alertThreshold": true, + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.0.2", + "targets": [ + { + "aggregator": "sum", + "alias": "$tag_model-p50", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + 
"currentFilterValue": "", + "currentTagKey": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "downsampleAggregator": "max", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "editorMode": "code", + "expr": "iteration_tokens_total_p50", + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "literal_or" + } + ], + "hide": false, + "isCounter": true, + "legendFormat": "__auto", + "metric": "inf.aibrix.iteration_tokens_total_p50", + "range": true, + "rateDownsampleType": "before_downsample", + "refId": "C", + "shouldComputeDelta": false, + "shouldComputeRate": false, + "tenant": "default" + }, + { + "aggregator": "sum", + "alias": "$tag_model-p90", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "currentTagKey": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "downsampleAggregator": "max", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "editorMode": "code", + "expr": "iteration_tokens_total_p90", + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "literal_or" + } + ], + "hide": false, + "isCounter": true, + "legendFormat": "__auto", + "metric": "inf.aibrix.iteration_tokens_total_p90", + "range": true, + "rateDownsampleType": "before_downsample", + "refId": "B", + "shouldComputeDelta": false, + "shouldComputeRate": false, + "tenant": "default" + }, + { + "aggregator": "sum", + "alias": "$tag_model-p99", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "currentTagKey": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "downsampleAggregator": "max", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + 
"downsampleFillPolicyDisabled": false, + "editorMode": "code", + "expr": "iteration_tokens_total_p99", + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "literal_or" + } + ], + "isCounter": true, + "legendFormat": "__auto", + "metric": "inf.aibrix.iteration_tokens_total_p99", + "range": true, + "rateDownsampleType": "before_downsample", + "refId": "A", + "shouldComputeDelta": false, + "shouldComputeRate": false, + "tenant": "default" + } + ], + "title": "iteration_tokens_total", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 125 + }, + "id": 44, + "options": { + "alertThreshold": true, + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.0.2", + "targets": [ + { + "aggregator": "sum", + "alias": "$tag_model-p50", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": 
"literal_or", + "currentFilterValue": "", + "currentTagKey": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "downsampleAggregator": "max", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "literal_or" + } + ], + "hide": false, + "isCounter": true, + "metric": "inf.aibrix.request_max_num_generation_tokens_p50", + "rateDownsampleType": "before_downsample", + "refId": "C", + "shouldComputeDelta": false, + "shouldComputeRate": false, + "tenant": "default" + }, + { + "aggregator": "sum", + "alias": "$tag_model-p90", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "currentTagKey": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "downsampleAggregator": "max", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "literal_or" + } + ], + "hide": false, + "isCounter": true, + "metric": "inf.aibrix.request_max_num_generation_tokens_p90", + "rateDownsampleType": "before_downsample", + "refId": "B", + "shouldComputeDelta": false, + "shouldComputeRate": false, + "tenant": "default" + }, + { + "aggregator": "sum", + "alias": "$tag_model-p99", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "currentTagKey": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "downsampleAggregator": "max", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "literal_or" + } + ], + "isCounter": true, + "metric": 
"inf.aibrix.request_max_num_generation_tokens_p99", + "rateDownsampleType": "before_downsample", + "refId": "A", + "shouldComputeDelta": false, + "shouldComputeRate": false, + "tenant": "default" + } + ], + "title": "request_max_num_generation_tokens", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 133 + }, + "id": 47, + "panels": [], + "title": "KVCache + NIXL", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 134 + }, + "id": 51, + "options": { + "alertThreshold": true, + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.0.2", + "targets": [ + { + "aggregator": "max", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + 
"datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "disableDownsampling": true, + "downsampleAggregator": "avg", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "editorMode": "code", + "explicitTags": true, + "expr": "kv_cache_usage_perc", + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "iliteral_or" + }, + { + "filter": "*", + "groupBy": true, + "tagk": "role", + "type": "literal_or" + } + ], + "legendFormat": "{{model}}-{{role}}-{{roleset}}", + "metric": "inf.aibrix.kv_cache_usage_perc", + "range": true, + "rateDownsampleType": "before_downsample", + "refId": "A", + "tenant": "default" + } + ], + "title": "KVCacheUsagePerc", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 134 + }, + "id": 54, + "options": { + "alertThreshold": true, + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": 
false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.0.2", + "targets": [ + { + "aggregator": "max", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "disableDownsampling": true, + "downsampleAggregator": "avg", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "editorMode": "code", + "explicitTags": true, + "expr": "kv", + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "iliteral_or" + }, + { + "filter": "*", + "groupBy": true, + "tagk": "role", + "type": "literal_or" + } + ], + "legendFormat": "__auto", + "metric": "inf.aibrix.nixl_num_failed_notifications_total", + "range": true, + "rateDownsampleType": "before_downsample", + "refId": "A", + "tenant": "default" + } + ], + "title": "NixlNumFailedNotifications", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + 
"h": 8, + "w": 12, + "x": 0, + "y": 142 + }, + "id": 56, + "options": { + "alertThreshold": true, + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.0.2", + "targets": [ + { + "aggregator": "max", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "disableDownsampling": true, + "downsampleAggregator": "avg", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "editorMode": "code", + "explicitTags": false, + "expr": "", + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "literal_or" + }, + { + "filter": "*", + "groupBy": true, + "tagk": "role", + "type": "literal_or" + } + ], + "isCounter": true, + "legendFormat": "__auto", + "metric": "inf.aibrix.prefix_cache_hits_total", + "range": true, + "rateDownsampleType": "before_downsample", + "refId": "A", + "shouldComputeDelta": false, + "shouldComputeRate": false, + "tenant": "default" + } + ], + "title": "PrefixCacheHitTotal", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, 
+ "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 142 + }, + "id": 55, + "options": { + "alertThreshold": true, + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.0.2", + "targets": [ + { + "aggregator": "max", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "disableDownsampling": true, + "downsampleAggregator": "avg", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "explicitTags": true, + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "iliteral_or" + }, + { + "filter": "*", + "groupBy": true, + "tagk": "role", + "type": "literal_or" + } + ], + "metric": "inf.aibrix.prefix_cache_queries_total", + "rateDownsampleType": "before_downsample", + "refId": "A", + "tenant": "default" + } + ], + "title": "PrefixCacheQueriesTotal", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": 
"linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 150 + }, + "id": 52, + "options": { + "alertThreshold": true, + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.0.2", + "targets": [ + { + "aggregator": "max", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "disableDownsampling": true, + "downsampleAggregator": "avg", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "explicitTags": true, + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "iliteral_or" + }, + { + "filter": "*", + "groupBy": true, + "tagk": "role", + "type": "literal_or" + } + ], + "metric": "inf.aibrix.nixl_num_failed_transfers_total", + "rateDownsampleType": "before_downsample", + "refId": "A", + "tenant": "default" + } + ], + "title": "NixlNumFailedTransfers", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + 
"gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 150 + }, + "id": 60, + "options": { + "alertThreshold": true, + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.0.2", + "targets": [ + { + "aggregator": "max", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "disableDownsampling": true, + "downsampleAggregator": "avg", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "explicitTags": true, + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "iliteral_or" + }, + { + "filter": "*", + "groupBy": true, + "tagk": "role", + "type": "literal_or" + } + ], + "metric": "inf.aibrix.external_prefix_cache_hits_total", + "rateDownsampleType": "before_downsample", + "refId": "A", + "tenant": "default" + } + ], + "title": "ExternalPrefilCacheHitsTotal", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + 
"axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 158 + }, + "id": 59, + "options": { + "alertThreshold": true, + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.0.2", + "targets": [ + { + "aggregator": "avg", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "disableDownsampling": true, + "downsampleAggregator": "avg", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "explicitTags": true, + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "iliteral_or" + }, + { + "filter": "*", + "groupBy": true, + "tagk": "role", + "type": "literal_or" + } + ], + "metric": "inf.aibrix.external_prefix_cache_hits_total", + "rateDownsampleType": "before_downsample", + "refId": "A", + "shouldComputeRate": false, + "tenant": "default" + } + ], + "title": "ExternalPrefilCacheHitsTotal", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + 
"uid": "ff75su0268000c" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 158 + }, + "id": 61, + "options": { + "alertThreshold": true, + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.0.2", + "targets": [ + { + "aggregator": "max", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "disableDownsampling": true, + "downsampleAggregator": "avg", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "explicitTags": true, + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "iliteral_or" + }, + { + "filter": "decode", + "groupBy": true, + "tagk": "role", + "type": "literal_or" + } + ], + "metric": "inf.aibrix.nixl_post_time_seconds_p50", + "rateDownsampleType": "before_downsample", + "refId": "P50", + 
"tenant": "default" + }, + { + "aggregator": "max", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "disableDownsampling": true, + "downsampleAggregator": "avg", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "explicitTags": true, + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "iliteral_or" + }, + { + "filter": "decode", + "groupBy": true, + "tagk": "role", + "type": "literal_or" + } + ], + "hide": false, + "metric": "inf.aibrix.nixl_post_time_seconds_p90", + "rateDownsampleType": "before_downsample", + "refId": "P90", + "tenant": "default" + }, + { + "aggregator": "max", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "disableDownsampling": true, + "downsampleAggregator": "avg", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "explicitTags": true, + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "iliteral_or" + }, + { + "filter": "decode", + "groupBy": true, + "tagk": "role", + "type": "literal_or" + } + ], + "hide": false, + "metric": "inf.aibrix.nixl_post_time_seconds_p99", + "rateDownsampleType": "before_downsample", + "refId": "P99", + "tenant": "default" + } + ], + "title": "NixlPostTimeSeconds", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + 
"barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 166 + }, + "id": 58, + "options": { + "alertThreshold": true, + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "12.0.2", + "targets": [ + { + "aggregator": "max", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "disableDownsampling": true, + "downsampleAggregator": "avg", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "explicitTags": true, + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "iliteral_or" + }, + { + "filter": "decode", + "groupBy": true, + "tagk": "role", + "type": "literal_or" + } + ], + "metric": "inf.aibrix.nixl_xfer_time_seconds_p50", + "rateDownsampleType": "before_downsample", + "refId": "P50", + "tenant": "default" + }, + { + "aggregator": "max", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "disableDownsampling": 
true, + "downsampleAggregator": "avg", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "explicitTags": true, + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "iliteral_or" + }, + { + "filter": "decode", + "groupBy": true, + "tagk": "role", + "type": "literal_or" + } + ], + "hide": false, + "metric": "inf.aibrix.nixl_xfer_time_seconds_p90", + "rateDownsampleType": "before_downsample", + "refId": "P90", + "tenant": "default" + }, + { + "aggregator": "max", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "disableDownsampling": true, + "downsampleAggregator": "avg", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "explicitTags": true, + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "iliteral_or" + }, + { + "filter": "decode", + "groupBy": true, + "tagk": "role", + "type": "literal_or" + } + ], + "hide": false, + "metric": "inf.aibrix.nixl_xfer_time_seconds_p99", + "rateDownsampleType": "before_downsample", + "refId": "P99", + "tenant": "default" + } + ], + "title": "NixlXferTimeSeconds", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 174 + }, + "id": 5, + "panels": [], + "title": "machines metrics", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + 
"legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 175 + }, + "id": 2, + "options": { + "alertThreshold": true, + "legend": { + "calcs": [ + "lastNotNull", + "max", + "min" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "12.0.2", + "targets": [ + { + "aggregator": "avg", + "alias": "$tag_cluster-$tag_container_name", + "currentField": "", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "currentTagKey": "", + "currentTagValue": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "downsampleAggregator": "avg", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "downsampleInterval": "30s", + "editorMode": "code", + "expr": "sum by (pod) (\n rate(container_cpu_usage_seconds_total{pod=~\".*aibrix.*\", container!=\"\", container!=\"POD\"}[5m])\n)\n/\nsum by (pod) (\n kube_pod_container_resource_limits{pod=~\".*aibrix.*\", resource=\"cpu\", unit=\"core\"}\n)", + "fields": [ + "usage_ratio" + ], + "filters": [], + "legendFormat": "__auto", + "metric": "tce.container.cpu_usage.mt", + "range": true, + "rateDownsampleType": "before_downsample", + "refId": "A", + "shouldComputeMulti": true, + "tags": { + "_psm": "inf.aibrix.gateway", + 
"cluster": "echo*", + "container_name": "gateway-plugin|envoy" + }, + "tenant": "computation.tce" + } + ], + "title": "aibrix component cpu usage ratio", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 175 + }, + "id": 3, + "options": { + "alertThreshold": true, + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "12.0.2", + "targets": [ + { + "aggregator": "avg", + "alias": "$tag_cluster-$tag_container_name", + "currentField": "", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "currentTagKey": "", + "currentTagValue": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "downsampleAggregator": "avg", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + 
"downsampleFillPolicyDisabled": false, + "downsampleInterval": "30s", + "editorMode": "code", + "expr": "sum by (pod) (\n container_memory_working_set_bytes{pod=~\".*aibrix.*\", container!=\"\", container!=\"POD\"}\n)\n/\nsum by (pod) (\n kube_pod_container_resource_limits{pod=~\".*aibrix.*\", resource=\"memory\", unit=\"byte\"}\n)", + "fields": [ + "utilization" + ], + "filters": [], + "legendFormat": "__auto", + "metric": "tce.container.mem_usage.mt", + "range": true, + "rateDownsampleType": "before_downsample", + "refId": "A", + "shouldComputeMulti": true, + "tags": { + "_psm": "inf.aibrix.gateway", + "cluster": "echo*", + "container_name": "gateway-plugin|envoy" + }, + "tenant": "computation.tce" + } + ], + "title": "gateway memory utilization", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "decbytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 183 + }, + "id": 6, + "options": { + "alertThreshold": true, + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "table", + "placement": "right", + "showLegend": 
true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "12.0.2", + "targets": [ + { + "aggregator": "avg", + "alias": "$tag_cluster-$tag_container_name", + "currentField": "", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "currentTagKey": "", + "currentTagValue": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "downsampleAggregator": "avg", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "downsampleInterval": "30s", + "editorMode": "code", + "expr": "sum by (pod) (\n container_memory_working_set_bytes{pod=~\".*aibrix.*\", container!=\"\", container!=\"POD\"}\n)", + "fields": [ + "used" + ], + "filters": [], + "legendFormat": "__auto", + "metric": "tce.container.mem_usage.mt", + "range": true, + "rateDownsampleType": "before_downsample", + "refId": "A", + "shouldComputeMulti": true, + "tags": { + "_psm": "inf.aibrix.gateway", + "cluster": "echo*", + "container_name": "gateway-plugin|envoy" + }, + "tenant": "computation.tce" + } + ], + "title": "aibrix component memory used", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + 
"thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "decbytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 183 + }, + "id": 7, + "options": { + "alertThreshold": true, + "legend": { + "calcs": [ + "lastNotNull", + "max", + "min" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "12.0.2", + "targets": [ + { + "aggregator": "avg", + "alias": "$tag_cluster-$tag_container_name-rx-bytes", + "currentField": "", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "currentTagKey": "", + "currentTagValue": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "downsampleAggregator": "avg", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "downsampleInterval": "30s", + "editorMode": "code", + "expr": "sum by (pod) (\n rate(container_network_receive_bytes_total{pod=~\".*aibrix.*\"}[5m])\n)", + "fields": [ + "rx_bytes" + ], + "filters": [], + "legendFormat": "{{pod}}-receive-bytes", + "metric": "tce.container.net_tcp.mt", + "range": true, + "rateDownsampleType": "before_downsample", + "refId": "A", + "shouldComputeMulti": true, + "tags": { + "_psm": "inf.aibrix.gateway", + "cluster": "echo*", + "container_name": "gateway-plugin|envoy" + }, + "tenant": "computation.tce" + }, + { + "aggregator": "avg", + "alias": "$tag_cluster-tx-bytes", + "currentField": "", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "currentTagKey": "", + "currentTagValue": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + 
}, + "downsampleAggregator": "avg", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "downsampleInterval": "30s", + "editorMode": "code", + "expr": "sum by (pod) (\n rate(container_network_transmit_bytes_total{pod=~\".*aibrix.*\"}[5m])\n)", + "fields": [ + "tx_bytes" + ], + "filters": [], + "hide": false, + "legendFormat": "{{pod}}-send-bytes", + "metric": "tce.container.net_tcp.mt", + "range": true, + "rateDownsampleType": "before_downsample", + "refId": "B", + "shouldComputeMulti": true, + "tags": { + "_psm": "inf.aibrix.gateway", + "cluster": "echo*" + }, + "tenant": "computation.tce" + } + ], + "title": "component network", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "decbytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 191 + }, + "id": 62, + "options": { + "alertThreshold": true, + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": 
"multi", + "sort": "none" + } + }, + "pluginVersion": "12.0.2", + "targets": [ + { + "aggregator": "max", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "disableDownsampling": true, + "downsampleAggregator": "avg", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "explicitTags": true, + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "iliteral_or" + }, + { + "filter": "decode", + "groupBy": true, + "tagk": "role", + "type": "literal_or" + } + ], + "metric": "inf.aibrix.nixl_bytes_transferred_p50", + "rateDownsampleType": "before_downsample", + "refId": "P50", + "tenant": "default" + }, + { + "aggregator": "max", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "disableDownsampling": true, + "downsampleAggregator": "avg", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "explicitTags": true, + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "iliteral_or" + }, + { + "filter": "decode", + "groupBy": true, + "tagk": "role", + "type": "literal_or" + } + ], + "hide": false, + "metric": "inf.aibrix.nixl_bytes_transferred_p90", + "rateDownsampleType": "before_downsample", + "refId": "P90", + "tenant": "default" + }, + { + "aggregator": "max", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "disableDownsampling": true, + "downsampleAggregator": "avg", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + 
"downsampleFillPolicyDisabled": false, + "explicitTags": true, + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "iliteral_or" + }, + { + "filter": "decode", + "groupBy": true, + "tagk": "role", + "type": "literal_or" + } + ], + "hide": false, + "metric": "inf.aibrix.nixl_bytes_transferred_p99", + "rateDownsampleType": "before_downsample", + "refId": "P99", + "tenant": "default" + } + ], + "title": "NixlBytesTransferred", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 191 + }, + "id": 63, + "options": { + "alertThreshold": true, + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "12.0.2", + "targets": [ + { + "aggregator": "max", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "datasource": { 
+ "type": "prometheus", + "uid": "ff75su0268000c" + }, + "disableDownsampling": true, + "downsampleAggregator": "avg", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "explicitTags": true, + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "iliteral_or" + }, + { + "filter": "decode", + "groupBy": true, + "tagk": "role", + "type": "literal_or" + } + ], + "metric": "inf.aibrix.nixl_num_descriptors_p50", + "rateDownsampleType": "before_downsample", + "refId": "P50", + "tenant": "default" + }, + { + "aggregator": "max", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "disableDownsampling": true, + "downsampleAggregator": "avg", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "explicitTags": true, + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "iliteral_or" + }, + { + "filter": "decode", + "groupBy": true, + "tagk": "role", + "type": "literal_or" + } + ], + "hide": false, + "metric": "inf.aibrix.nixl_num_descriptors_p90", + "rateDownsampleType": "before_downsample", + "refId": "P90", + "tenant": "default" + }, + { + "aggregator": "max", + "currentFilterGroupBy": false, + "currentFilterKey": "", + "currentFilterType": "literal_or", + "currentFilterValue": "", + "datasource": { + "type": "prometheus", + "uid": "ff75su0268000c" + }, + "disableDownsampling": true, + "downsampleAggregator": "avg", + "downsampleAggregatorDisabled": false, + "downsampleFillPolicy": "none", + "downsampleFillPolicyDisabled": false, + "explicitTags": true, + "filters": [ + { + "filter": "*", + "groupBy": true, + "tagk": "model", + "type": "iliteral_or" + }, + { + "filter": "decode", + "groupBy": true, + "tagk": "role", + "type": "literal_or" 
+ } + ], + "hide": false, + "metric": "inf.aibrix.nixl_num_descriptors_p99", + "rateDownsampleType": "before_downsample", + "refId": "P99", + "tenant": "default" + } + ], + "title": "NixlNumDescriptors", + "type": "timeseries" } ], - "refresh": "5s", - "schemaVersion": 38, - "style": "dark", - "tags": ["vtc", "metrics", "gateway", "router"], + "preload": false, + "refresh": "1m", + "schemaVersion": 41, + "tags": [], "templating": { "list": [] }, @@ -388,8 +7097,7 @@ }, "timepicker": {}, "timezone": "", - "title": "AIBrix Envoy Gateway Plugins Dashboard", - "uid": "aibrix-envoy-gateway-plugins", - "version": 1, - "weekStart": "" -} + "title": "AIBrix gateway", + "uid": "5rRLZ0zDz", + "version": 46 +} \ No newline at end of file diff --git a/pkg/cache/cache_init.go b/pkg/cache/cache_init.go index e4ba517cb..fccb081bc 100644 --- a/pkg/cache/cache_init.go +++ b/pkg/cache/cache_init.go @@ -17,6 +17,7 @@ limitations under the License. package cache import ( + "container/list" "context" "errors" "fmt" @@ -113,6 +114,9 @@ type Store struct { // KV event management - optional enhancement kvEventManager *KVEventManager + + // Prometheus event queue + promqlJobs chan *Pod } // Get retrieves the cache instance @@ -379,6 +383,7 @@ func InitWithOptions(config *rest.Config, stopCh <-chan struct{}, opts InitOptio // stopCh: Stop signal channel func initMetricsCache(store *Store, stopCh <-chan struct{}) { ticker := time.NewTicker(podMetricRefreshInterval) + store.initPromQLWorker(stopCh) go func() { for { select { @@ -599,3 +604,102 @@ func (s *Store) Close() { // Other cleanup can be added here in the future } + +func (c *Store) enqueuePromQL(pod *Pod) { + if c.promqlJobs == nil { + return + } + // Non-blocking enqueue so slow PromQL queries do not affect the main path. + select { + case c.promqlJobs <- pod: + default: + // Drop when the queue is full (the next pod refresh cycle will enqueue again). 
+ klog.V(5).InfoS("PromQL queue full, dropping promql job", "pod", pod.Name) + } +} + +func (c *Store) initPromQLWorker(stopCh <-chan struct{}) { + if c.prometheusApi == nil { + klog.InfoS("Prometheus API is nil, skip initializing PromQL worker") + return + } + c.promqlJobs = make(chan *Pod, 2*c.podMetricsWorkerCount) + go c.promQueryLoop(stopCh) +} + +func (c *Store) promQueryLoop(stopCh <-chan struct{}) { + ticker := time.NewTicker(promQueryInterval) + defer ticker.Stop() + + // pendingPods keeps at most one pending job per pod key (ns/name). + // If the same pod is enqueued multiple times, we overwrite with the latest *Pod. + pendingPods := make(map[string]*Pod) + + // fifoKeys records the processing order of pending pod keys. + // A key is appended only when it is first seen in pendingPods. + fifoKeys := list.New() + + // Build stable key for dedupe/order. + podKey := func(p *Pod) string { + ns := p.Namespace + if ns == "" && p.Pod != nil { + ns = p.Pod.Namespace + } + return ns + "/" + p.Name + } + + // Helper: enqueue into (pendingPods + fifoKeys) with dedupe. + enqueuePending := func(key string, p *Pod) { + if _, exists := pendingPods[key]; !exists { + fifoKeys.PushBack(key) // first time seen: record order + } + pendingPods[key] = p // always keep latest pod pointer + } + + for { + select { + case <-stopCh: + return + + // Accept pods from worker and deduplicate. + case p := <-c.promqlJobs: + if p == nil || p.Pod == nil || !utils.FilterReadyPod(p.Pod) { + continue + } + key := podKey(p) + if key == "" || key == "/" { + continue + } + enqueuePending(key, p) + + // Every tick, process exactly one pending pod to cap QPS. + case <-ticker.C: + if fifoKeys.Len() == 0 { + continue + } + + // Pop head key (FIFO). + element := fifoKeys.Front() + key := element.Value.(string) + fifoKeys.Remove(element) + + // Get latest pod pointer and mark it as dequeued. + p := pendingPods[key] + delete(pendingPods, key) + + // Pod may become unready while waiting in queue. 
+ if p == nil || p.Pod == nil || !utils.FilterReadyPod(p.Pod) { + continue + } + + ctx, cancel := context.WithTimeout(context.Background(), promQueryTimeout) + err := c.updateMetricFromPromQL(ctx, p) + cancel() + + if err != nil { + // Best-effort retry: put it back to the tail. + enqueuePending(key, p) + } + } + } +} diff --git a/pkg/cache/cache_metrics.go b/pkg/cache/cache_metrics.go index 2b4be2554..482cc1d88 100644 --- a/pkg/cache/cache_metrics.go +++ b/pkg/cache/cache_metrics.go @@ -42,6 +42,8 @@ const ( defaultEngineLabelValue = "vllm" defaultPodMetricRefreshIntervalInMS = 50 defaultPodMetricsWorkerCount = 10 + defaultPromQueryIntervalInMS = 200 + defaultPromQueryTimeoutInMS = 2000 ) var ( @@ -82,6 +84,7 @@ var ( metrics.RequestDecodeTimeSeconds, metrics.RequestPrefillTimeSeconds, metrics.HTTPRequestDurationSeconds, + metrics.PerStageReqLatencySeconds, metrics.HTTPRequestDurationHighRSeconds, metrics.RequestPromptTokens, metrics.RequestGenerationTokens, @@ -113,6 +116,8 @@ var ( metrics.RunningLoraAdapters, } podMetricRefreshInterval = time.Duration(utils.LoadEnvInt("AIBRIX_POD_METRIC_REFRESH_INTERVAL_MS", defaultPodMetricRefreshIntervalInMS)) * time.Millisecond + promQueryInterval = time.Duration(utils.LoadEnvInt("AIBRIX_PROMETHEUS_QUERY_INTERVAL_MS", defaultPromQueryIntervalInMS)) * time.Millisecond + promQueryTimeout = time.Duration(utils.LoadEnvInt("AIBRIX_PROMETHEUS_QUERY_TIMEOUT_MS", defaultPromQueryTimeoutInMS)) * time.Millisecond ) // MetricSnapshot represents a metric value at a specific timestamp @@ -247,12 +252,11 @@ func (c *Store) worker(jobs <-chan *Pod) { continue } - podLabelNames, podLabelValues := buildMetricLabels(pod, engineType, "") for metricName, metricValue := range result.Metrics { if shouldSkipMetric(pod.Name, metricName) { continue } - metrics.EmitMetricToPrometheus(metricName, metricValue, podLabelNames, podLabelValues) + metrics.EmitMetricToPrometheus(&types.RoutingContext{Model: ""}, pod.Pod, metricName, metricValue, 
metricValue.GetLabelValues()) } for metricName, metricValue := range result.ModelMetrics { @@ -267,8 +271,6 @@ func (c *Store) worker(jobs <-chan *Pod) { continue } - labelNames, labelValues := buildMetricLabels(pod, engineType, model) - var rateMetricName string if strings.Contains(pod.Name, "prefill") && metric == metrics.PromptTokenTotal { rateMetricName = metrics.AvgPromptThroughputToksPerS @@ -279,20 +281,19 @@ func (c *Store) worker(jobs <-chan *Pod) { perSecRate := c.calculatePerSecondRate(pod, model, metric, metricValue.GetSimpleValue()) if perSecRate >= 0 { rateValue := &metrics.SimpleMetricValue{Value: perSecRate} - metrics.SetGaugeMetric(rateMetricName, metrics.GetMetricHelp(rateMetricName), rateValue.GetSimpleValue(), labelNames, labelValues...) + metrics.EmitMetricToPrometheus(&types.RoutingContext{Model: model}, pod.Pod, rateMetricName, rateValue, metricValue.GetLabelValues()) _ = c.updatePodRecord(pod, model, rateMetricName, metrics.PodModelMetricScope, rateValue) klog.V(4).InfoS("get metric per sec rate", "metric", rateMetricName, "raw_value", metricValue.GetSimpleValue(), "per_sec_rate", rateValue.GetSimpleValue()) } } - - metrics.EmitMetricToPrometheus(metric, metricValue, labelNames, labelValues) + metrics.EmitMetricToPrometheus(&types.RoutingContext{Model: model}, pod.Pod, metric, metricValue, metricValue.GetLabelValues()) } // Update pod metrics using typed results c.updatePodMetricsFromTypedResult(pod, result) // Handle Prometheus-based metrics separately (these require PromQL queries) if c.prometheusApi != nil { - c.updateMetricFromPromQL(ctx, pod) + c.enqueuePromQL(pod) } else { klog.V(4).InfoS("Prometheus API not initialized, skipping PromQL metrics", "pod", pod.Name) } @@ -308,7 +309,7 @@ func (c *Store) worker(jobs <-chan *Pod) { } } -func (c *Store) updateMetricFromPromQL(ctx context.Context, pod *Pod) { +func (c *Store) updateMetricFromPromQL(ctx context.Context, pod *Pod) (queryErr error) { podName := pod.Name podMetricPort := 
getPodMetricPort(pod) for _, metricName := range prometheusMetricNames { @@ -325,6 +326,9 @@ func (c *Store) updateMetricFromPromQL(ctx context.Context, pod *Pod) { err := c.queryUpdatePromQLMetrics(ctx, metric, queryLabels, pod, "", metricName, podMetricPort) if err != nil { klog.V(4).Infof("Failed to query and update PromQL metrics: %v", err) + if queryErr == nil { + queryErr = err + } continue } } else if scope == metrics.PodModelMetricScope { @@ -334,6 +338,9 @@ func (c *Store) updateMetricFromPromQL(ctx context.Context, pod *Pod) { err := c.queryUpdatePromQLMetrics(ctx, metric, queryLabels, pod, modelName, metricName, podMetricPort) if err != nil { klog.V(4).Infof("Failed to query and update PromQL metrics: %v", err) + if queryErr == nil { + queryErr = err + } continue } } @@ -344,6 +351,7 @@ func (c *Store) updateMetricFromPromQL(ctx context.Context, pod *Pod) { klog.V(4).Infof("Scope %v is not supported", scope) } } + return queryErr } func (c *Store) queryUpdatePromQLMetrics(ctx context.Context, metric metrics.Metric, queryLabels map[string]string, pod *Pod, modelName string, metricName string, podMetricPort int) error { @@ -352,7 +360,7 @@ func (c *Store) queryUpdatePromQLMetrics(ctx context.Context, metric metrics.Met // Querying metrics result, warnings, err := c.prometheusApi.Query(ctx, query, time.Now()) if err != nil { - metrics.EmitCounterMetric(&types.RoutingContext{Model: modelName}, pod.Pod, metrics.PrometheusQueryFail, 1.0, nil) + metrics.EmitMetricToPrometheus(&types.RoutingContext{Model: modelName}, pod.Pod, metrics.PrometheusQueryFail, &metrics.SimpleMetricValue{Value: 1.0}, nil) // Skip this model fetching if an error is thrown return fmt.Errorf("error executing query: %v", err) } diff --git a/pkg/cache/cache_metrics_test.go b/pkg/cache/cache_metrics_test.go index 650848f02..43143e3b3 100644 --- a/pkg/cache/cache_metrics_test.go +++ b/pkg/cache/cache_metrics_test.go @@ -26,6 +26,7 @@ import ( "github.com/prometheus/client_golang/prometheus" 
"github.com/stretchr/testify/require" "github.com/vllm-project/aibrix/pkg/metrics" + "github.com/vllm-project/aibrix/pkg/types" v1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/client-go/rest" @@ -153,10 +154,13 @@ func TestEmitMetricToPrometheus_GaugeAndCounter(t *testing.T) { }{name: name, value: value}) } - labels := []string{"pod"} - values := []string{"p1"} - - metrics.EmitMetricToPrometheus(metrics.NumRequestsRunning, &metrics.SimpleMetricValue{Value: 3}, labels, values) + pod := &v1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "p1", + Namespace: "ns1", + }, + } + metrics.EmitMetricToPrometheus(&types.RoutingContext{Model: ""}, pod, metrics.NumRequestsRunning, &metrics.SimpleMetricValue{Value: 3}, nil) require.Len(t, gaugeCalls, 1) require.Equal(t, metrics.NumRequestsRunning, gaugeCalls[0].name) require.Equal(t, 3.0, gaugeCalls[0].value) @@ -189,7 +193,13 @@ func TestEmitMetricToPrometheus_HistogramAlsoEmitsQuantiles(t *testing.T) { "+Inf": 2, }, } - metrics.EmitMetricToPrometheus(metrics.TimeToFirstTokenSeconds, hv, []string{"pod"}, []string{"p1"}) + pod := &v1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "p1", + Namespace: "ns1", + }, + } + metrics.EmitMetricToPrometheus(&types.RoutingContext{Model: ""}, pod, metrics.TimeToFirstTokenSeconds, hv, nil) require.Contains(t, gaugeMetricNames, metrics.TimeToFirstTokenSeconds+"_p50") require.Contains(t, gaugeMetricNames, metrics.TimeToFirstTokenSeconds+"_p90") diff --git a/pkg/cache/kvcache/event_types.go b/pkg/cache/kvcache/event_types.go index a69beb519..6d7f21c1e 100644 --- a/pkg/cache/kvcache/event_types.go +++ b/pkg/cache/kvcache/event_types.go @@ -82,11 +82,16 @@ type KVEvent interface { // ------------------------------------------------------------ // // lora_id and medium are unused for now. 
+// +// Note: BlockHashes are converted at decode time: +// - vLLM legacy format (int64) → stored as-is +// - vLLM new format (32-byte SHA-256 from PR #23673) → first 8 bytes converted to int64 +// This ensures internal consistency and compatibility with existing code. type BlockStoredEvent struct { _ struct{} `msgpack:",array"` // msgspec array encoding Type EventType `msgpack:"-"` - BlockHashes []int64 - ParentBlockHash *int64 + BlockHashes []int64 // Decoded from vLLM, supports both old and new formats + ParentBlockHash *int64 // Decoded from vLLM, supports both old and new formats TokenIDs [][]byte // NOTE: These are NOT part of msgpack @@ -110,10 +115,14 @@ func (e *BlockStoredEvent) setPodName(name string) { e.PodName = name } // ------------------------------------------------------------ // // lora_id is unused for now. +// +// Note: BlockHashes are converted at decode time: +// - vLLM legacy format (int64) → stored as-is +// - vLLM new format (32-byte SHA-256 from PR #23673) → first 8 bytes converted to int64 type BlockRemovedEvent struct { _ struct{} `msgpack:",array"` Type EventType `msgpack:"-"` - BlockHashes []int64 + BlockHashes []int64 // Decoded from vLLM, supports both old and new formats // NOTE: These are NOT part of msgpack Timestamp time.Time `msgpack:"-"` diff --git a/pkg/cache/kvcache/msgpack_decoder.go b/pkg/cache/kvcache/msgpack_decoder.go index 471c315af..5cc72e374 100644 --- a/pkg/cache/kvcache/msgpack_decoder.go +++ b/pkg/cache/kvcache/msgpack_decoder.go @@ -21,6 +21,7 @@ import ( "time" msgpack "github.com/vmihailenco/msgpack/v5" + "k8s.io/klog/v2" ) // DecodeEventBatch parses a raw msgpack batch of events. 
@@ -35,7 +36,15 @@ func DecodeEventBatch( if err := msgpack.Unmarshal(data, &rawBatch); err != nil { return nil, fmt.Errorf("failed to unmarshal event batch: %w", err) } - if len(rawBatch) != 2 { + // if size of rawBatch is 3, the third element is the data parallel rank + // data_parallel_rank is not used in aibrix now + if len(rawBatch) == 3 { + if data_parallel_rank, err := parseInt(rawBatch[2]); err != nil { + return nil, fmt.Errorf("data_parallel_rank is not an int: %T", rawBatch[2]) + } else { + klog.V(4).Infof("event has data_parallel_rank: %d", data_parallel_rank) + } + } else if len(rawBatch) != 2 { return nil, fmt.Errorf("expected 2 elements in batch (ts, events), got %d", len(rawBatch)) } @@ -97,13 +106,13 @@ func parseEventArray(arr []interface{}) (KVEvent, error) { } // 1: block_hashes - blockHashes, err := toInt64Slice(arr[1]) + blockHashes, err := toBlockHashSlice(arr[1]) if err != nil { return nil, fmt.Errorf("invalid block_hashes: %w", err) } // 2: parent_block_hash - parentHash, err := toInt64Ptr(arr[2]) + parentHash, err := toBlockHashPtr(arr[2]) if err != nil { return nil, fmt.Errorf("invalid parent_block_hash: %w", err) } @@ -148,7 +157,7 @@ func parseEventArray(arr []interface{}) (KVEvent, error) { return nil, fmt.Errorf("BlockRemoved expects ≥2 fields, got %d", len(arr)) } - blockHashes, err := toInt64Slice(arr[1]) + blockHashes, err := toBlockHashSlice(arr[1]) if err != nil { return nil, fmt.Errorf("invalid block_hashes: %w", err) } @@ -190,6 +199,128 @@ func applyBatchMetadata(evt KVEvent, ts time.Time, model, pod string) { } } +// toBlockHashSlice converts block_hashes field to []int64. +// Supports both legacy int64 format and new bytes format from vLLM PR #23673. +// This function handles the conversion at decode time, keeping the rest of the codebase simple. 
+func toBlockHashSlice(v any) ([]int64, error) { + raw, ok := v.([]interface{}) + if !ok { + return nil, fmt.Errorf("expected []interface{}, got %T", v) + } + + out := make([]int64, len(raw)) + for i, x := range raw { + hash, err := parseBlockHashToInt64(x) + if err != nil { + return nil, fmt.Errorf("block_hashes[%d]: %w", i, err) + } + out[i] = hash + } + return out, nil +} + +// bytesToInt64 converts a byte array to int64 using big-endian encoding. +// If the byte array is shorter than 8 bytes, it pads with leading zeros. +func bytesToInt64(b []byte) int64 { + if len(b) >= 8 { + // Use first 8 bytes for both 8-byte and 32-byte formats + return int64(binary.BigEndian.Uint64(b[:8])) + } + // Unexpected short byte array: pad with leading zeros for big-endian + padded := make([]byte, 8) + copy(padded[8-len(b):], b) + return int64(binary.BigEndian.Uint64(padded)) +} + +// parseBlockHashToInt64 parses a single block hash and converts it to int64. +// Supports: +// 1. int64 types (legacy format from old vLLM) → used directly +// 2. []byte (new format from vLLM PR #23673): +// - 8 bytes: big-endian int64 +// - 32 bytes: SHA-256, uses first 8 bytes +// +// 3. 
string (msgpack may decode bytes as string) → same as []byte +// +// Using the first 8 bytes of SHA-256 provides sufficient uniqueness: +// - Collision probability ≈ 1/2^64 ≈ 10^-19 (extremely low) +// - In typical scenarios (thousands to millions of blocks), collisions are virtually impossible +func parseBlockHashToInt64(v any) (int64, error) { + switch x := v.(type) { + case []byte: + return bytesToInt64(x), nil + + case string: + // msgpack may decode bytes as string + return bytesToInt64([]byte(x)), nil + + // Legacy format: integer types → convert to int64 + case int64: + return x, nil + + case uint64: + return int64(x), nil + + case int: + return int64(x), nil + + case uint: + return int64(x), nil + + case int8: + return int64(x), nil + + case int16: + return int64(x), nil + + case int32: + return int64(x), nil + + case uint8: + return int64(x), nil + + case uint16: + return int64(x), nil + + case uint32: + return int64(x), nil + + // Floating-point types (for backward compatibility with msgpack decoding) + case float32: + f := float64(x) + if f < math.MinInt64 || f > math.MaxInt64 { + return 0, fmt.Errorf("float32 out of int64 range: %f", f) + } + if f != math.Trunc(f) { + return 0, fmt.Errorf("float32 has fractional part: %f", f) + } + return int64(f), nil + + case float64: + if x < math.MinInt64 || x > math.MaxInt64 { + return 0, fmt.Errorf("float64 out of int64 range: %f", x) + } + if x != math.Trunc(x) { + return 0, fmt.Errorf("float64 has fractional part: %f", x) + } + return int64(x), nil + + default: + return 0, fmt.Errorf("unsupported block hash type: %T", v) + } +} + +// toBlockHashPtr converts a single block hash (can be nil) to *int64 +func toBlockHashPtr(v any) (*int64, error) { + if v == nil { + return nil, nil + } + hash, err := parseBlockHashToInt64(v) + if err != nil { + return nil, err + } + return &hash, nil +} + func toInt64Slice(v any) ([]int64, error) { raw, ok := v.([]interface{}) if !ok { diff --git 
a/pkg/cache/kvcache/msgpack_decoder_test.go b/pkg/cache/kvcache/msgpack_decoder_test.go index e3b37a9b9..d7b16ed65 100644 --- a/pkg/cache/kvcache/msgpack_decoder_test.go +++ b/pkg/cache/kvcache/msgpack_decoder_test.go @@ -21,6 +21,7 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + msgpack "github.com/vmihailenco/msgpack/v5" ) func TestBlockStoredEventEncodeDecode(t *testing.T) { @@ -225,3 +226,149 @@ func TestMultipleEventsInBatch_MixedEvents(t *testing.T) { } } } + +func TestBlockHashesAsBytesInDecodeEventBatch(t *testing.T) { + // Test DecodeEventBatch with BlockHashes as [][]byte format + // This simulates the new vLLM format where block hashes are sent as bytes + ts := time.Now().UTC() + + // Construct msgpack data manually with BlockHashes as [][]byte + // Format: [timestamp, [event_array]] + // event_array for BlockStored: ["block_stored", block_hashes, parent_hash, token_ids, block_size] + + // Create block hash bytes (8-byte big-endian int64) + hash1 := make([]byte, 8) + binary.BigEndian.PutUint64(hash1, uint64(12345)) + + hash2 := make([]byte, 8) + binary.BigEndian.PutUint64(hash2, uint64(67890)) + + parentHash := make([]byte, 8) + binary.BigEndian.PutUint64(parentHash, uint64(99999)) + + // Create a BlockStored event with bytes format + eventArray := []interface{}{ + "BlockStored", + []interface{}{hash1, hash2}, // block_hashes as [][]byte + parentHash, // parent_block_hash as []byte + []interface{}{uint32(1), uint32(2), uint32(3), uint32(4)}, // token_ids + int(2), // block_size + } + + batch := []interface{}{ + float64(ts.Unix()), // timestamp as float64 + []interface{}{eventArray}, // events + } + + data, err := msgpack.Marshal(batch) + require.NoError(t, err) + + // Decode + decoded, err := DecodeEventBatch(data, "test-model", "test-pod") + require.NoError(t, err) + require.Len(t, decoded.Events, 1) + + // Verify BlockStoredEvent + stored, ok := decoded.Events[0].(*BlockStoredEvent) + require.True(t, ok, 
"decoded event is not BlockStoredEvent") + + // Verify block hashes were correctly converted from []byte to int64 + assert.Equal(t, int64(12345), stored.BlockHashes[0]) + assert.Equal(t, int64(67890), stored.BlockHashes[1]) + + // Verify parent hash + require.NotNil(t, stored.ParentBlockHash) + assert.Equal(t, int64(99999), *stored.ParentBlockHash) + + // Verify token IDs + require.Len(t, stored.TokenIDs, 2) // 4 tokens / block_size(2) = 2 blocks + for i, block := range stored.TokenIDs { + for j := 0; j < len(block); j += 4 { + val := binary.BigEndian.Uint32(block[j : j+4]) + expectedVal := uint32(i*2 + j/4 + 1) // 1,2 for first block, 3,4 for second + assert.Equal(t, expectedVal, val, + "token mismatch at block %d index %d", i, j/4) + } + } + + // Verify metadata + assert.Equal(t, "test-model", stored.ModelName) + assert.Equal(t, "test-pod", stored.PodName) +} + +func TestBlockHashesAsSHA256BytesInDecodeEventBatch(t *testing.T) { + // Test DecodeEventBatch with 32-byte SHA-256 hashes + // The decoder should use the first 8 bytes + ts := time.Now().UTC() + + // Create a 32-byte SHA-256 hash + sha256Hash := make([]byte, 32) + for i := 0; i < 32; i++ { + sha256Hash[i] = byte(i) + } + + // Expected: first 8 bytes converted to int64 + expectedHash := int64(binary.BigEndian.Uint64(sha256Hash[:8])) + + eventArray := []interface{}{ + "BlockStored", + []interface{}{sha256Hash}, // 32-byte hash + nil, // no parent + []interface{}{uint32(1), uint32(2)}, // token_ids + int(2), // block_size + } + + batch := []interface{}{ + float64(ts.Unix()), // timestamp as float64 + []interface{}{eventArray}, + } + + data, err := msgpack.Marshal(batch) + require.NoError(t, err) + + decoded, err := DecodeEventBatch(data, "sha256-model", "sha256-pod") + require.NoError(t, err) + require.Len(t, decoded.Events, 1) + + stored, ok := decoded.Events[0].(*BlockStoredEvent) + require.True(t, ok) + + assert.Len(t, stored.BlockHashes, 1) + assert.Equal(t, expectedHash, stored.BlockHashes[0]) + 
assert.Nil(t, stored.ParentBlockHash) +} + +func TestBlockRemovedEventWithBytesHashes(t *testing.T) { + // Test BlockRemovedEvent with block hashes as bytes + ts := time.Now().UTC() + + hash1 := make([]byte, 8) + binary.BigEndian.PutUint64(hash1, uint64(111)) + + hash2 := make([]byte, 8) + binary.BigEndian.PutUint64(hash2, uint64(222)) + + eventArray := []interface{}{ + "BlockRemoved", + []interface{}{hash1, hash2}, // block_hashes as bytes + } + + batch := []interface{}{ + float64(ts.Unix()), // timestamp as float64 + []interface{}{eventArray}, + } + + data, err := msgpack.Marshal(batch) + require.NoError(t, err) + + decoded, err := DecodeEventBatch(data, "removed-model", "removed-pod") + require.NoError(t, err) + require.Len(t, decoded.Events, 1) + + removed, ok := decoded.Events[0].(*BlockRemovedEvent) + require.True(t, ok) + + assert.Equal(t, []int64{111, 222}, removed.BlockHashes) + assert.Equal(t, "removed-model", removed.ModelName) + assert.Equal(t, "removed-pod", removed.PodName) +} diff --git a/pkg/cache/kvcache/msgpack_encoder.go b/pkg/cache/kvcache/msgpack_encoder.go index c363c3a1d..4a832c7b8 100644 --- a/pkg/cache/kvcache/msgpack_encoder.go +++ b/pkg/cache/kvcache/msgpack_encoder.go @@ -75,7 +75,7 @@ func encodeEvent(event KVEvent) ([]interface{}, error) { arr := []interface{}{ string(e.Type), // tag e.BlockHashes, // block_hashes - e.ParentBlockHash, // parent_block_hash (nullable) + e.ParentBlockHash, // parent_block_hash (nullable *[]byte) tokenIDs, // flat token IDs blockSize, // block_size } diff --git a/pkg/cache/kvcache/zmq_client.go b/pkg/cache/kvcache/zmq_client.go index 22c3371ba..7fbada3f1 100644 --- a/pkg/cache/kvcache/zmq_client.go +++ b/pkg/cache/kvcache/zmq_client.go @@ -377,11 +377,13 @@ func (c *ZMQClient) requestReplay(fromSeq int64) error { } // Prepare replay request + // DEALER-ROUTER pattern requires: [empty_delimiter, payload] + // The DEALER socket will automatically prepend the identity frame reqData := make([]byte, 8) 
binary.BigEndian.PutUint64(reqData, uint64(fromSeq)) - // Send replay request - if _, err := socket.SendBytes(reqData, 0); err != nil { + // Send replay request as multipart message: [empty_delimiter, start_seq_bytes] + if _, err := socket.SendMessage([]byte{}, reqData); err != nil { return fmt.Errorf("failed to send replay request: %w", err) } diff --git a/pkg/cache/utils.go b/pkg/cache/utils.go index ae3124193..69df58aae 100644 --- a/pkg/cache/utils.go +++ b/pkg/cache/utils.go @@ -76,6 +76,69 @@ func buildMetricLabels(pod *Pod, engineType string, model string) ([]string, []s return labelNames, labelValues } +func mergeLabelPairs(primaryNames, primaryValues, secondaryNames, secondaryValues []string) ([]string, []string) { + pLen := len(primaryNames) + if len(primaryValues) < pLen { + klog.Warningf("primary labels length mismatch: names=%d, values=%d", pLen, len(primaryValues)) + pLen = len(primaryValues) + } + sLen := len(secondaryNames) + if len(secondaryValues) < sLen { + klog.Warningf("secondary labels length mismatch: names=%d, values=%d", sLen, len(secondaryValues)) + sLen = len(secondaryValues) + } + + secondaryMap := make(map[string]string, sLen) + secondaryOrder := make([]string, 0, sLen) + for i := 0; i < sLen; i++ { + n := secondaryNames[i] + if n == "" { + continue + } + if _, exists := secondaryMap[n]; !exists { + secondaryOrder = append(secondaryOrder, n) + } + secondaryMap[n] = secondaryValues[i] // last-wins + } + + outNames := make([]string, 0, pLen+len(secondaryOrder)) + outValues := make([]string, 0, pLen+len(secondaryOrder)) + seen := make(map[string]struct{}, pLen+len(secondaryOrder)) + + // primary first, but allow secondary override + for i := 0; i < pLen; i++ { + n := primaryNames[i] + if n == "" { + continue + } + if _, ok := seen[n]; ok { + continue + } + seen[n] = struct{}{} + v := primaryValues[i] + if sv, ok := secondaryMap[n]; ok { + v = sv + } + outNames = append(outNames, n) + outValues = append(outValues, v) + } + + // then add 
secondary-only labels (use map value to respect last-wins) + for _, n := range secondaryOrder { + if n == "" { + continue + } + if _, ok := seen[n]; ok { + continue + } + seen[n] = struct{}{} + outNames = append(outNames, n) + outValues = append(outValues, secondaryMap[n]) + } + + return outNames, outValues +} + func shouldSkipMetric(podName string, metricName string) bool { if strings.Contains(podName, "prefill") && isDecodeOnlyMetric(metricName) { return true diff --git a/pkg/cache/utils_test.go b/pkg/cache/utils_test.go new file mode 100644 index 000000000..71d910332 --- /dev/null +++ b/pkg/cache/utils_test.go @@ -0,0 +1,52 @@ +/* +Copyright 2025 The Aibrix Team. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package cache + +import ( + "testing" + + "github.com/prometheus/client_golang/prometheus" + "github.com/stretchr/testify/require" +) + +func TestMergeLabelPairs_DedupAndPreferSecondaryValue(t *testing.T) { + primaryNames := []string{"engine_type", "model_name"} + primaryValues := []string{"from_engine", "m1"} + secondaryNames := []string{"namespace", "engine_type", "pod"} + secondaryValues := []string{"ns1", "vllm", "p1"} + + mergedNames, mergedValues := mergeLabelPairs(primaryNames, primaryValues, secondaryNames, secondaryValues) + require.Equal(t, []string{"engine_type", "model_name", "namespace", "pod"}, mergedNames) + require.Equal(t, []string{"vllm", "m1", "ns1", "p1"}, mergedValues) + + seen := make(map[string]struct{}, len(mergedNames)) + for _, n := range mergedNames { + _, ok := seen[n] + require.False(t, ok) + seen[n] = struct{}{} + } + + descDup := prometheus.NewDesc("num_requests_running", "help", []string{"engine_type", "engine_type"}, nil) + require.Panics(t, func() { + _ = prometheus.MustNewConstMetric(descDup, prometheus.GaugeValue, 1, "a", "b") + }) + + descMerged := prometheus.NewDesc("num_requests_running", "help", mergedNames, nil) + require.NotPanics(t, func() { + _ = prometheus.MustNewConstMetric(descMerged, prometheus.GaugeValue, 1, mergedValues...) + }) +} diff --git a/pkg/constants/model.go b/pkg/constants/model.go index e2b1a3082..b988889d0 100644 --- a/pkg/constants/model.go +++ b/pkg/constants/model.go @@ -45,4 +45,9 @@ const ( // ModelAnnoRouterCustomPath is the anno for add PathPrefixes in httpRoute, split by comma // Example: "model.aibrix.ai/model-router-custom-paths": "/score,/version" ModelAnnoRouterCustomPath = "model.aibrix.ai/model-router-custom-paths" + + // ModelAnnoConfig is the annotation holding JSON model config with multiple profiles. + // Client selects profile at runtime via config-profile header or defaultProfile is selected. + // See docs/source/designs/model-config-profiles.rst for schema. 
+ ModelAnnoConfig = "model.aibrix.ai/config" ) diff --git a/pkg/kvevent/handler.go b/pkg/kvevent/handler.go index 1774c8b96..ec6aab47f 100644 --- a/pkg/kvevent/handler.go +++ b/pkg/kvevent/handler.go @@ -77,6 +77,7 @@ func (h *eventHandler) handleBlockStored(ctx context.Context, event *kvcache.Blo } // Convert to sync event + // Note: BlockHashes are already []int64 after msgpack decoding syncEvent := BlockStoredEvent{ BlockHashes: event.BlockHashes, ModelName: h.modelName, @@ -110,6 +111,7 @@ func (h *eventHandler) handleBlockRemoved(ctx context.Context, event *kvcache.Bl } // Convert to sync event + // Note: BlockHashes are already []int64 after msgpack decoding syncEvent := BlockRemovedEvent{ BlockHashes: event.BlockHashes, ModelName: h.modelName, diff --git a/pkg/kvevent/handler_test.go b/pkg/kvevent/handler_test.go index 702800fc9..331496482 100644 --- a/pkg/kvevent/handler_test.go +++ b/pkg/kvevent/handler_test.go @@ -26,6 +26,15 @@ import ( "github.com/vllm-project/aibrix/pkg/cache/kvcache" ) +// Helper function to convert int32 slice to bytes (big-endian) +func int32SliceToBytes(tokens []int32) []byte { + result := make([]byte, len(tokens)*4) + for i, token := range tokens { + binary.BigEndian.PutUint32(result[i*4:], uint32(token)) + } + return result +} + // mockSyncIndexerWithErrors allows simulating errors type mockSyncIndexerWithErrors struct { blockStoredErr error @@ -62,86 +71,6 @@ func (m *mockSyncProvider) GetSyncIndexer(ctx context.Context) (SyncIndexer, err return m.indexer, nil } -// Test tokenIDsToBytes conversion -func TestTokenIDsToBytes(t *testing.T) { - tests := []struct { - name string - tokenIDs []int32 - expected []byte - }{ - { - name: "empty tokens", - tokenIDs: []int32{}, - expected: []byte{}, - }, - { - name: "single token", - tokenIDs: []int32{12345}, - expected: []byte{0, 0, 48, 57}, // 12345 in big-endian - }, - { - name: "multiple tokens", - tokenIDs: []int32{1, 256, 65535}, - expected: []byte{ - 0, 0, 0, 1, // 1 - 0, 0, 1, 0, // 
256 - 0, 0, 255, 255, // 65535 - }, - }, - { - name: "negative token", - tokenIDs: []int32{-1}, - expected: []byte{255, 255, 255, 255}, // -1 in two's complement - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - result := tokenIDsToBytes(tt.tokenIDs) - if len(result) != len(tt.expected) { - t.Fatalf("Expected length %d, got %d", len(tt.expected), len(result)) - } - for i := range result { - if result[i] != tt.expected[i] { - t.Errorf("Byte %d: expected %d, got %d", i, tt.expected[i], result[i]) - } - } - }) - } -} - -// Test convertTokenIDs -func TestConvertTokenIDs(t *testing.T) { - input := [][]int32{ - {1, 2, 3}, - {}, - {12345}, - } - - result := convertTokenIDs(input) - - if len(result) != 3 { - t.Fatalf("Expected 3 results, got %d", len(result)) - } - - // Check first array - expected0 := []byte{0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 3} - if len(result[0]) != len(expected0) { - t.Errorf("Result[0] length mismatch") - } - - // Check second array (empty) - if len(result[1]) != 0 { - t.Errorf("Result[1] should be empty") - } - - // Check third array - expected2 := []byte{0, 0, 48, 57} - if len(result[2]) != len(expected2) { - t.Errorf("Result[2] length mismatch") - } -} - // Test HandleEvent with BlockStoredEvent func TestHandleBlockStoredEvent(t *testing.T) { syncIndexer := &mockSyncIndexerWithErrors{} @@ -165,7 +94,10 @@ func TestHandleBlockStoredEvent(t *testing.T) { event := &kvcache.BlockStoredEvent{ BlockHashes: []int64{1001, 1002, 1003}, ParentBlockHash: &[]int64{1000}[0], - TokenIDs: [][]int32{{1, 2, 3}, {4, 5, 6}}, + TokenIDs: [][]byte{ + int32SliceToBytes([]int32{1, 2, 3}), + int32SliceToBytes([]int32{4, 5, 6}), + }, } err := handler.HandleEvent(event) diff --git a/pkg/kvevent/integration_test.go b/pkg/kvevent/integration_test.go index 88f5f5b8a..59f94d735 100644 --- a/pkg/kvevent/integration_test.go +++ b/pkg/kvevent/integration_test.go @@ -19,6 +19,7 @@ package kvevent_test import ( "context" + "encoding/binary" "testing" v1 
"k8s.io/api/core/v1" @@ -31,6 +32,15 @@ import ( syncindexer "github.com/vllm-project/aibrix/pkg/utils/syncprefixcacheindexer" ) +// Helper function to convert int32 slice to bytes (big-endian) +func int32SliceToBytes(tokens []int32) []byte { + result := make([]byte, len(tokens)*4) + for i, token := range tokens { + binary.BigEndian.PutUint32(result[i*4:], uint32(token)) + } + return result +} + // TestIntegrationEventHandlerWithRealComponents tests event handling with real components // This ensures we're not just testing mocks but actual integration between components func TestIntegrationEventHandlerWithRealComponents(t *testing.T) { @@ -79,7 +89,11 @@ func TestIntegrationEventHandlerWithRealComponents(t *testing.T) { storedEvent := &kvcache.BlockStoredEvent{ BlockHashes: []int64{1001, 1002, 1003}, ParentBlockHash: &[]int64{1000}[0], - TokenIDs: [][]int32{{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}, + TokenIDs: [][]byte{ + int32SliceToBytes([]int32{1, 2, 3}), + int32SliceToBytes([]int32{4, 5, 6}), + int32SliceToBytes([]int32{7, 8, 9}), + }, } err := handler.HandleEvent(storedEvent) diff --git a/pkg/kvevent/manager.go b/pkg/kvevent/manager.go index 6779878ef..1b12f8b97 100644 --- a/pkg/kvevent/manager.go +++ b/pkg/kvevent/manager.go @@ -274,7 +274,7 @@ func isPodSubscribable(pod *v1.Pod) bool { func isSamePod(pod1 *v1.Pod, pod2 *v1.Pod) bool { // For now, we just check if PodIP is the same. Other conditions may be added if needed. 
- return pod1.Status.PodIP != pod2.Status.PodIP + return pod1.Status.PodIP == pod2.Status.PodIP } func (m *Manager) subscribeToPod(ctx context.Context, podKey string, podInfo *PodInfo) error { diff --git a/pkg/metrics/custom_metrics.go b/pkg/metrics/custom_metrics.go index 2646faa76..923159602 100644 --- a/pkg/metrics/custom_metrics.go +++ b/pkg/metrics/custom_metrics.go @@ -25,7 +25,6 @@ import ( "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/promauto" - "github.com/vllm-project/aibrix/pkg/constants" "github.com/vllm-project/aibrix/pkg/types" "github.com/vllm-project/aibrix/pkg/utils" v1 "k8s.io/api/core/v1" @@ -38,6 +37,7 @@ var ( customCountersMu sync.RWMutex customHistograms = make(map[string]*histogramCollector) customHistogramsMu sync.RWMutex + gatewayPodName = os.Getenv("POD_NAME") // Function variables that can be overridden for testing SetGaugeMetricFnForTest = defaultSetGaugeMetric @@ -73,7 +73,7 @@ func IncrementCounterMetric(name string, help string, value float64, labelNames IncrementCounterMetricFnForTest(name, help, value, labelNames, labelValues...) } -func EmitGaugeMetric(routingCtx *types.RoutingContext, pod *v1.Pod, name string, value float64, extras map[string]string) { +func emitGaugeMetric(routingCtx *types.RoutingContext, pod *v1.Pod, name string, value float64, extras map[string]string) { var model string if routingCtx == nil { model = "" @@ -81,10 +81,10 @@ func EmitGaugeMetric(routingCtx *types.RoutingContext, pod *v1.Pod, name string, model = routingCtx.Model } labelNames, labelValues := buildMetricLabels(pod, model, extras) - SetGaugeMetricFnForTest(name, GetMetricHelp(name), value, labelNames, labelValues...) + SetGaugeMetric(name, GetMetricHelp(name), value, labelNames, labelValues...) 
} -func EmitCounterMetric(routingCtx *types.RoutingContext, pod *v1.Pod, name string, value float64, extras map[string]string) { +func emitCounterMetric(routingCtx *types.RoutingContext, pod *v1.Pod, name string, value float64, extras map[string]string) { var model string if routingCtx == nil { model = "" @@ -267,18 +267,23 @@ func SetupCounterMetricsForTest(metricName string, labelNames []string) (*promet return testCounter, func() { IncrementCounterMetricFnForTest = originalFn } } -func EmitMetricToPrometheus(metricName string, metricValue MetricValue, labelNames []string, labelValues []string) { +func EmitMetricToPrometheus(routingCtx *types.RoutingContext, pod *v1.Pod, metricName string, metricValue MetricValue, extra map[string]string) { metricDef, exists := Metrics[metricName] if !exists { return } + var model string + if routingCtx != nil { + model = routingCtx.Model + } switch metricDef.MetricType.Raw { case Gauge: - SetGaugeMetric(metricName, GetMetricHelp(metricName), metricValue.GetSimpleValue(), labelNames, labelValues...) + emitGaugeMetric(routingCtx, pod, metricName, metricValue.GetSimpleValue(), extra) case Counter: - SetGaugeMetric(metricName, GetMetricHelp(metricName), metricValue.GetSimpleValue(), labelNames, labelValues...) + emitCounterMetric(routingCtx, pod, metricName, metricValue.GetSimpleValue(), extra) default: + labelNames, labelValues := buildMetricLabels(pod, model, extra) if hv := metricValue.GetHistogramValue(); hv != nil { SetHistogramMetric(metricName, GetMetricHelp(metricName), hv, labelNames, labelValues...) 
p50, _ := hv.GetPercentile(50) @@ -292,13 +297,24 @@ func EmitMetricToPrometheus(metricName string, metricValue MetricValue, labelNam } func buildMetricLabels(pod *v1.Pod, model string, extras map[string]string) ([]string, []string) { - labelNames, labelValues := generateDefaultMetricLabelsMap(pod, model) + defaultLabelMap := generateDefaultMetricLabelsMap(pod, model) + labelNames := make([]string, 0, len(defaultLabelMap)+len(extras)) + labelValues := make([]string, 0, len(defaultLabelMap)+len(extras)) + for k, v := range defaultLabelMap { + labelNames = append(labelNames, k) + labelValues = append(labelValues, v) + } + if len(extras) > 0 { keys := make([]string, 0, len(extras)) for k := range extras { - if k != "" { - keys = append(keys, k) + if k == "" { + continue } + if _, exist := defaultLabelMap[k]; exist { + continue + } + keys = append(keys, k) } sort.Strings(keys) for _, k := range keys { @@ -309,35 +325,21 @@ func buildMetricLabels(pod *v1.Pod, model string, extras map[string]string) ([]s return labelNames, labelValues } -func generateDefaultMetricLabelsMap(pod *v1.Pod, model string) (labelNames []string, labelValues []string) { - labelNames = []string{ - "namespace", - "pod", - "model", - "engine_type", - "roleset", - "role", - "role_replica_index", - "gateway_pod", - } - var namespace, podName, engineType, roleset, role, roleReplica string - if pod != nil { - namespace = pod.Namespace - podName = pod.Name - engineType = utils.GetLLMEngine(pod, constants.ModelLabelEngine, utils.DefaultLLMEngine) - roleset = utils.GetPodEnv(pod, "ROLESET_NAME", "") - role = utils.GetPodEnv(pod, "ROLE_NAME", "") - roleReplica = utils.GetPodEnv(pod, "ROLE_REPLICA_INDEX", "") +func generateDefaultMetricLabelsMap(pod *v1.Pod, model string) map[string]string { + if pod == nil { + return map[string]string{ + "model": model, + "gateway_pod": gatewayPodName, + } } - labelValues = []string{ - namespace, - podName, - model, - engineType, - roleset, - role, - roleReplica, - 
os.Getenv("POD_NAME"), // gateway-plugin pod + return map[string]string{ + "namespace": pod.Namespace, + "pod": pod.Name, + "model": model, + "engine_type": GetEngineType(*pod), + "roleset": utils.GetPodEnv(pod, "ROLESET_NAME", ""), + "role": utils.GetPodEnv(pod, "ROLE_NAME", ""), + "role_replica_index": utils.GetPodEnv(pod, "ROLE_REPLICA_INDEX", ""), + "gateway_pod": gatewayPodName, } - return labelNames, labelValues } diff --git a/pkg/metrics/engine_fetcher.go b/pkg/metrics/engine_fetcher.go index 827726621..ed0ac3468 100644 --- a/pkg/metrics/engine_fetcher.go +++ b/pkg/metrics/engine_fetcher.go @@ -295,11 +295,11 @@ func (ef *EngineMetricsFetcher) parseMetricFromFamily(allMetrics map[string]*dto if metric.MetricType.IsRawMetric() { switch metric.MetricType.Raw { case Gauge, Counter: - value, err := GetCounterGaugeValue(firstMetric, metricFamily.GetType()) + simpleValue, err := GetCounterGaugeValue(firstMetric, metricFamily.GetType()) if err != nil { return nil, fmt.Errorf("failed to parse counter/gauge metric %s: %v", rawMetricName, err) } - return &SimpleMetricValue{Value: value}, nil + return simpleValue, nil case Histogram: histValue, err := GetHistogramValue(firstMetric) @@ -355,7 +355,7 @@ func (ef *EngineMetricsFetcher) fetchAllMetricsFromURL(ctx context.Context, url resp, err := ef.client.Do(req) if err != nil { - EmitCounterMetric(nil, nil, LLMEngineMetricsQueryFail, 1.0, nil) + EmitMetricToPrometheus(nil, nil, LLMEngineMetricsQueryFail, &SimpleMetricValue{Value: 1.0}, nil) return nil, fmt.Errorf("failed to fetch metrics from %s: %v", url, err) } defer func() { diff --git a/pkg/metrics/metrics.go b/pkg/metrics/metrics.go index 1caf8cb82..1014a5a97 100644 --- a/pkg/metrics/metrics.go +++ b/pkg/metrics/metrics.go @@ -29,6 +29,7 @@ const ( E2ERequestLatencySeconds = "e2e_request_latency_seconds" RequestQueueTimeSeconds = "request_queue_time_seconds" RequestInferenceTimeSeconds = "request_inference_time_seconds" + PerStageReqLatencySeconds = 
"per_stage_req_latency_seconds" HTTPRequestDurationSeconds = "http_request_duration_seconds" HTTPRequestDurationHighRSeconds = "http_request_duration_highr_seconds" @@ -237,6 +238,17 @@ var ( }, Description: "Request inference time in seconds", }, + PerStageReqLatencySeconds: { + MetricScope: PodModelMetricScope, + MetricSource: PodRawMetrics, + MetricType: MetricType{ + Raw: Histogram, + }, + EngineMetricsNameMapping: map[string]string{ + "sglang": "sglang:per_stage_req_latency_seconds", + }, + Description: "Per-stage request latency in seconds", + }, HTTPRequestDurationSeconds: { MetricScope: PodMetricScope, MetricSource: PodRawMetrics, diff --git a/pkg/metrics/types.go b/pkg/metrics/types.go index 50852ff2c..a21b3299c 100644 --- a/pkg/metrics/types.go +++ b/pkg/metrics/types.go @@ -92,7 +92,7 @@ type MetricValue interface { GetSimpleValue() float64 GetHistogramValue() *HistogramMetricValue GetPrometheusResult() *model.Value - GetLabelValue() string + GetLabelValues() map[string]string } var _ MetricValue = (*SimpleMetricValue)(nil) @@ -102,7 +102,8 @@ var _ MetricValue = (*LabelValueMetricValue)(nil) // SimpleMetricValue represents simple metrics (e.g., gauge or counter). type SimpleMetricValue struct { - Value float64 + Value float64 + Labels map[string]string // Optional: Additional labels for the metric. } func (s *SimpleMetricValue) GetSimpleValue() float64 { @@ -117,8 +118,8 @@ func (s *SimpleMetricValue) GetPrometheusResult() *model.Value { return nil } -func (s *SimpleMetricValue) GetLabelValue() string { - return "" +func (s *SimpleMetricValue) GetLabelValues() map[string]string { + return s.Labels } // HistogramMetricValue represents a detailed histogram metric. @@ -126,6 +127,7 @@ type HistogramMetricValue struct { Sum float64 Count float64 Buckets map[string]float64 // e.g., {"0.1": 5, "0.5": 3, "1.0": 2} + Labels map[string]string // Optional: Additional labels for the histogram. 
} func (h *HistogramMetricValue) GetSimpleValue() float64 { @@ -230,8 +232,8 @@ func (h *HistogramMetricValue) GetPercentile(percentile float64) (float64, error return 0, fmt.Errorf("percentile not found") } -func (s *HistogramMetricValue) GetLabelValue() string { - return "" +func (h *HistogramMetricValue) GetLabelValues() map[string]string { + return h.Labels } // PrometheusMetricValue represents Prometheus query results. @@ -251,8 +253,8 @@ func (p *PrometheusMetricValue) GetPrometheusResult() *model.Value { return p.Result } -func (s *PrometheusMetricValue) GetLabelValue() string { - return "" +func (s *PrometheusMetricValue) GetLabelValues() map[string]string { + return map[string]string{} } // PrometheusMetricValue represents Prometheus query results. @@ -272,8 +274,8 @@ func (l *LabelValueMetricValue) GetPrometheusResult() *model.Value { return nil } -func (l *LabelValueMetricValue) GetLabelValue() string { - return l.Value +func (l *LabelValueMetricValue) GetLabelValues() map[string]string { + return map[string]string{"value": l.Value} } func ExtractNumericFromPromResult(r *model.Value) (float64, error) { diff --git a/pkg/metrics/utils.go b/pkg/metrics/utils.go index 6993fdc29..7d0d53029 100644 --- a/pkg/metrics/utils.go +++ b/pkg/metrics/utils.go @@ -181,18 +181,25 @@ func GetLabelValueForKey(metric *dto.Metric, key string) (string, error) { return "", fmt.Errorf("Label %s not found", key) } -func GetCounterGaugeValue(metric *dto.Metric, metricType dto.MetricType) (float64, error) { - if metricType == dto.MetricType_COUNTER { - return metric.GetCounter().GetValue(), nil - } else if metricType == dto.MetricType_GAUGE { - return metric.GetGauge().GetValue(), nil +func GetCounterGaugeValue(metric *dto.Metric, metricType dto.MetricType) (*SimpleMetricValue, error) { + labels := make(map[string]string) + for _, labelPair := range metric.Label { + labels[labelPair.GetName()] = labelPair.GetValue() + } + switch metricType { + case dto.MetricType_COUNTER: + return 
&SimpleMetricValue{Value: metric.GetCounter().GetValue(), Labels: labels}, nil + case dto.MetricType_GAUGE: + return &SimpleMetricValue{Value: metric.GetGauge().GetValue(), Labels: labels}, nil + default: + return nil, fmt.Errorf("Metric type not supported: %v", metricType) } - return 0, fmt.Errorf("Metric type not supported: %v", metricType) } func GetHistogramValue(metric *dto.Metric) (*HistogramMetricValue, error) { histogram := &HistogramMetricValue{ Buckets: make(map[string]float64), + Labels: make(map[string]string), } histogramMetric := metric.GetHistogram() if histogramMetric == nil { @@ -205,6 +212,9 @@ func GetHistogramValue(metric *dto.Metric) (*HistogramMetricValue, error) { bound := fmt.Sprintf("%f", bucket.GetUpperBound()) histogram.Buckets[bound] = float64(bucket.GetCumulativeCount()) } + for _, labelPair := range metric.Label { + histogram.Labels[labelPair.GetName()] = labelPair.GetValue() + } return histogram, nil } diff --git a/pkg/plugins/gateway/algorithms/least_kv_cache.go b/pkg/plugins/gateway/algorithms/least_kv_cache.go index fd2898a3e..3680680d0 100644 --- a/pkg/plugins/gateway/algorithms/least_kv_cache.go +++ b/pkg/plugins/gateway/algorithms/least_kv_cache.go @@ -57,7 +57,7 @@ func (r leastKvCacheRouter) Route(ctx *types.RoutingContext, readyPodList types. // Due to metric refactor (pull/543) to better support lora and multi models, // we change to use PodModelMetrics instead of PodMetrics in some scenarios. // This works but doesn't look very promising, we can revisit this part later. 
- gpuCache, err := r.cache.GetMetricValueByPodModel(pod.Name, pod.Namespace, ctx.Model, metrics.GPUCacheUsagePerc) + gpuCache, err := r.cache.GetMetricValueByPodModel(pod.Name, pod.Namespace, ctx.Model, metrics.KVCacheUsagePerc) if err != nil { klog.Error(err) continue diff --git a/pkg/plugins/gateway/algorithms/least_kv_cache_test.go b/pkg/plugins/gateway/algorithms/least_kv_cache_test.go index ac4420dde..bf12009d8 100644 --- a/pkg/plugins/gateway/algorithms/least_kv_cache_test.go +++ b/pkg/plugins/gateway/algorithms/least_kv_cache_test.go @@ -53,19 +53,19 @@ func TestLeastKvCache(t *testing.T) { }, podMetrics: map[string]map[string]metrics.MetricValue{ "p1": { - metrics.GPUCacheUsagePerc: &metrics.SimpleMetricValue{Value: 0.2}, + metrics.KVCacheUsagePerc: &metrics.SimpleMetricValue{Value: 0.2}, metrics.CPUCacheUsagePerc: &metrics.SimpleMetricValue{Value: 0.3}, }, "p2": { - metrics.GPUCacheUsagePerc: &metrics.SimpleMetricValue{Value: 0.1}, + metrics.KVCacheUsagePerc: &metrics.SimpleMetricValue{Value: 0.1}, metrics.CPUCacheUsagePerc: &metrics.SimpleMetricValue{Value: 0.5}, }, "p3": { - metrics.GPUCacheUsagePerc: &metrics.SimpleMetricValue{Value: 0.6}, + metrics.KVCacheUsagePerc: &metrics.SimpleMetricValue{Value: 0.6}, metrics.CPUCacheUsagePerc: &metrics.SimpleMetricValue{Value: 0.6}, }, "p4": { - metrics.GPUCacheUsagePerc: &metrics.SimpleMetricValue{Value: 0.6}, + metrics.KVCacheUsagePerc: &metrics.SimpleMetricValue{Value: 0.6}, metrics.CPUCacheUsagePerc: &metrics.SimpleMetricValue{Value: 0.8}, }, }, @@ -93,15 +93,15 @@ func TestLeastKvCache(t *testing.T) { }, podMetrics: map[string]map[string]metrics.MetricValue{ "p1": { - metrics.GPUCacheUsagePerc: &metrics.SimpleMetricValue{Value: 0.2}, + metrics.KVCacheUsagePerc: &metrics.SimpleMetricValue{Value: 0.2}, metrics.CPUCacheUsagePerc: &metrics.SimpleMetricValue{Value: 0.3}, }, "p2": { - metrics.GPUCacheUsagePerc: &metrics.SimpleMetricValue{Value: 0.5}, + metrics.KVCacheUsagePerc: &metrics.SimpleMetricValue{Value: 
0.5}, metrics.CPUCacheUsagePerc: &metrics.SimpleMetricValue{Value: 0.5}, }, "p3": { - metrics.GPUCacheUsagePerc: &metrics.SimpleMetricValue{Value: 0.3}, + metrics.KVCacheUsagePerc: &metrics.SimpleMetricValue{Value: 0.3}, metrics.CPUCacheUsagePerc: &metrics.SimpleMetricValue{Value: 0.2}, }, }, @@ -120,7 +120,7 @@ func TestLeastKvCache(t *testing.T) { }, podMetrics: map[string]map[string]metrics.MetricValue{ "p1": { - metrics.GPUCacheUsagePerc: &metrics.SimpleMetricValue{Value: 0.2}, + metrics.KVCacheUsagePerc: &metrics.SimpleMetricValue{Value: 0.2}, metrics.CPUCacheUsagePerc: &metrics.SimpleMetricValue{Value: 0.3}, }, }, diff --git a/pkg/plugins/gateway/algorithms/pd_disaggregation.go b/pkg/plugins/gateway/algorithms/pd_disaggregation.go index 4b631f210..ca4881352 100644 --- a/pkg/plugins/gateway/algorithms/pd_disaggregation.go +++ b/pkg/plugins/gateway/algorithms/pd_disaggregation.go @@ -33,6 +33,7 @@ import ( "github.com/vllm-project/aibrix/pkg/cache" "github.com/vllm-project/aibrix/pkg/constants" "github.com/vllm-project/aibrix/pkg/metrics" + "github.com/vllm-project/aibrix/pkg/plugins/gateway/configprofiles" "github.com/vllm-project/aibrix/pkg/types" "github.com/vllm-project/aibrix/pkg/utils" "github.com/vllm-project/aibrix/pkg/utils/prefixcacheindexer" @@ -50,11 +51,10 @@ const ( LLMEngineIdentifier string = constants.ModelLabelEngine PDRoleSetIdentifier string = "roleset-name" PDRoleIdentifier string = "role-name" - CombinedIdentifier string = "model.aibrix.ai/combined" RoleReplicaIndex string = "stormservice.orchestration.aibrix.ai/role-replica-index" PodGroupIndex string = "stormservice.orchestration.aibrix.ai/pod-group-index" - PromptMinLength string = "prompt-min-length" - PromptMaxLength string = "prompt-max-length" + PromptLenBucketMinLength string = "prompt-len-bucket-min-length" + PromptLenBucketMaxLength string = "prompt-len-bucket-max-length" defaultPrefillRequestTimeout int = 30 defaultMaxRequest float64 = 32 @@ -73,6 +73,9 @@ const ( // KV 
connector types for different backends KVConnectorTypeSHFS = "shfs" // Default - AIBrix SHFS/KVCacheManager (GPU) KVConnectorTypeNIXL = "nixl" // NIXL for Neuron (uses disagg_prefill_resp wrapper) + + HeaderPrefillTargetPodIP = "prefill-target-pod-ip" + HeaderPrefillTargetPod = "prefill-target-pod" ) var ( @@ -158,28 +161,33 @@ func (r *pdRouter) Route(ctx *types.RoutingContext, readyPodList types.PodList) // Validate engine consistency across all prefill pods llmEngine, err := validateAndGetLLMEngine(readyPodList.All()) if err != nil { - metrics.EmitCounterMetric(ctx, nil, metrics.GatewayPrefillRequestFailTotal, 1.0, + metrics.EmitMetricToPrometheus(ctx, nil, metrics.GatewayPrefillRequestFailTotal, &metrics.SimpleMetricValue{Value: 1.0}, map[string]string{"status": pdRouteValidateLLMEngineFail, "status_code": "400"}) return "", fmt.Errorf("engine validation failed for request %s: %w", ctx.RequestID, err) } prefillPod, decodePod, err := r.filterPrefillDecodePods(ctx, readyPodList.All()) if err != nil { - metrics.EmitCounterMetric(ctx, nil, metrics.GatewayPrefillRequestFailTotal, 1.0, + metrics.EmitMetricToPrometheus(ctx, nil, metrics.GatewayPrefillRequestFailTotal, &metrics.SimpleMetricValue{Value: 1.0}, map[string]string{"status": pdRouteFilterPrefillDecodePodsFail, "status_code": "400"}) return "", fmt.Errorf("failed to filter prefill/decode pods for request %s: %w", ctx.RequestID, err) } if prefillPod != nil { klog.InfoS("selected prefill/decode pods", "request_id", ctx.RequestID, "prefill_pod", prefillPod.Name, "decode_pod", decodePod.Name) + if ctx.RespHeaders == nil { + ctx.RespHeaders = make(map[string]string) + } + ctx.RespHeaders[HeaderPrefillTargetPod] = prefillPod.Name + ctx.RespHeaders[HeaderPrefillTargetPodIP] = prefillPod.Status.PodIP err = r.doPrefillRequest(ctx, prefillPod, llmEngine) if err != nil { - metrics.EmitCounterMetric(ctx, nil, metrics.GatewayPrefillRequestFailTotal, 1.0, + metrics.EmitMetricToPrometheus(ctx, nil, 
metrics.GatewayPrefillRequestFailTotal, &metrics.SimpleMetricValue{Value: 1.0}, map[string]string{"status": pdRoutePrefillRequestError, "status_code": "500"}) klog.ErrorS(err, pdRoutePrefillRequestError, "request_id", ctx.RequestID) return "", fmt.Errorf("prefill request failed for request %s: %w", ctx.RequestID, err) } - metrics.EmitCounterMetric(ctx, nil, metrics.GatewayPrefillRequestSuccessTotal, 1.0, + metrics.EmitMetricToPrometheus(ctx, nil, metrics.GatewayPrefillRequestSuccessTotal, &metrics.SimpleMetricValue{Value: 1.0}, map[string]string{"status": pdRoutePrefillRequestSuccess, "status_code": "200"}) } @@ -203,7 +211,7 @@ func (r *pdRouter) filterPrefillDecodePods(routingCtx *types.RoutingContext, rea klog.V(4).InfoS("prompt length based filtering enabled", "request_id", routingCtx.RequestID, "prompt_length", promptLength) } - prefillPods, decodePods, promptLengthBucketingPrefillPods, promptLengthBucketingDecodePods, combinedPods := r.collectAndBucketPods(readyPods, promptLength) + prefillPods, decodePods, promptLengthBucketingPrefillPods, promptLengthBucketingDecodePods, combinedPods := r.collectAndBucketPods(routingCtx, readyPods, promptLength) combinedAvailable := aibrixPromptLengthBucketing && len(combinedPods) > 0 if len(prefillPods) == 0 && !combinedAvailable { return nil, nil, fmt.Errorf("prefill pods are not ready: prefill=%d, decode=%d", len(prefillPods), len(decodePods)) @@ -509,8 +517,8 @@ func (r *pdRouter) finalPDScore(routingCtx *types.RoutingContext, r.selectionCounts[targetDecodePod.Name]++ r.countersMu.Unlock() - metrics.EmitCounterMetric(routingCtx, targetPrefillPod, metrics.PDSelectedPrefillPodTotal, 1.0, nil) - metrics.EmitCounterMetric(routingCtx, targetDecodePod, metrics.PDSelectedDecodePodTotal, 1.0, nil) + metrics.EmitMetricToPrometheus(routingCtx, targetPrefillPod, metrics.PDSelectedPrefillPodTotal, &metrics.SimpleMetricValue{Value: 1.0}, nil) + metrics.EmitMetricToPrometheus(routingCtx, targetDecodePod, 
metrics.PDSelectedDecodePodTotal, &metrics.SimpleMetricValue{Value: 1.0}, nil) return targetPrefillPod, targetDecodePod, nil } @@ -684,7 +692,7 @@ func (r *pdRouter) executeHTTPRequest(url string, routingCtx *types.RoutingConte resp, err := r.httpClient.Do(req) if err != nil { status, code := metrics.HttpFailureStatusCode(ctx, err, nil) - metrics.EmitCounterMetric(routingCtx, nil, metrics.GatewayPrefillRequestFailTotal, 1.0, + metrics.EmitMetricToPrometheus(routingCtx, nil, metrics.GatewayPrefillRequestFailTotal, &metrics.SimpleMetricValue{Value: 1.0}, map[string]string{"status": status, "status_code": code}) return nil, fmt.Errorf("failed to execute http prefill request: %w", err) } @@ -701,7 +709,7 @@ func (r *pdRouter) executeHTTPRequest(url string, routingCtx *types.RoutingConte // Check response status if resp.StatusCode != http.StatusOK { status, code := metrics.HttpFailureStatusCode(ctx, nil, resp) - metrics.EmitCounterMetric(routingCtx, nil, metrics.GatewayPrefillRequestFailTotal, 1.0, + metrics.EmitMetricToPrometheus(routingCtx, nil, metrics.GatewayPrefillRequestFailTotal, &metrics.SimpleMetricValue{Value: 1.0}, map[string]string{"status": status, "status_code": code}) return nil, fmt.Errorf("http prefill request failed with status %d: %s", resp.StatusCode, string(body)) } @@ -932,8 +940,12 @@ func (t *PrefillRequestTracker) GetPrefillRequestCountsForPod(podname string) in return int(countInterface.(*atomic.Int32).Load()) } -func (r *pdRouter) isPodSuitableForPromptLength(pod *v1.Pod, promptLength int) bool { - minLength, maxLength := r.getPodPromptRange(pod) +func (r *pdRouter) isPodSuitableForPromptLength(routingCtx *types.RoutingContext, pod *v1.Pod, promptLength int) bool { + profile := configprofiles.ResolveProfileFromPod(pod, routingCtx.ReqConfigProfile) + if profile == nil { + return false + } + minLength, maxLength := profile.PromptLenBucketMinLength, profile.PromptLenBucketMaxLength if minLength > maxLength { return false @@ -946,31 +958,15 @@ 
func (r *pdRouter) isPodSuitableForPromptLength(pod *v1.Pod, promptLength int) b return promptLength >= minLength && promptLength <= maxLength } -// getPodPromptRange retrieves the minimum and maximum prompt lengths from pod labels. -func (r *pdRouter) getPodPromptRange(pod *v1.Pod) (int, int) { - minLength := 0 - maxLength := math.MaxInt32 - - if val, ok := pod.Labels[PromptMinLength]; ok { - if parsed, err := strconv.Atoi(val); err == nil { - minLength = parsed - } - } - - if val, ok := pod.Labels[PromptMaxLength]; ok { - if parsed, err := strconv.Atoi(val); err == nil { - maxLength = parsed - } +func isCombinedPod(routingCtx *types.RoutingContext, pod *v1.Pod) bool { + profile := configprofiles.ResolveProfileFromPod(pod, routingCtx.ReqConfigProfile) + if profile == nil { + return false } - - return minLength, maxLength -} - -func isCombinedPod(pod *v1.Pod) bool { - return pod != nil && pod.Labels[CombinedIdentifier] == "true" + return profile.Combined } -func (r *pdRouter) collectAndBucketPods(readyPods []*v1.Pod, promptLength int) ([]*v1.Pod, []*v1.Pod, []*v1.Pod, []*v1.Pod, []*v1.Pod) { +func (r *pdRouter) collectAndBucketPods(routingCtx *types.RoutingContext, readyPods []*v1.Pod, promptLength int) ([]*v1.Pod, []*v1.Pod, []*v1.Pod, []*v1.Pod, []*v1.Pod) { prefillPods, decodePods := []*v1.Pod{}, []*v1.Pod{} promptLengthBucketingPrefillPods, promptLengthBucketingDecodePods, promptLengthBucketingCombinedPods := []*v1.Pod{}, []*v1.Pod{}, []*v1.Pod{} @@ -991,16 +987,16 @@ func (r *pdRouter) collectAndBucketPods(readyPods []*v1.Pod, promptLength int) ( switch pod.Labels[PDRoleIdentifier] { case "prefill": prefillPods = append(prefillPods, pod) - if aibrixPromptLengthBucketing && r.isPodSuitableForPromptLength(pod, promptLength) { + if aibrixPromptLengthBucketing && r.isPodSuitableForPromptLength(routingCtx, pod, promptLength) { promptLengthBucketingPrefillPods = append(promptLengthBucketingPrefillPods, pod) } case "decode": decodePods = append(decodePods, pod) - if 
aibrixPromptLengthBucketing && r.isPodSuitableForPromptLength(pod, promptLength) { + if aibrixPromptLengthBucketing && r.isPodSuitableForPromptLength(routingCtx, pod, promptLength) { promptLengthBucketingDecodePods = append(promptLengthBucketingDecodePods, pod) } default: - if aibrixPromptLengthBucketing && isCombinedPod(pod) && r.isPodSuitableForPromptLength(pod, promptLength) { + if aibrixPromptLengthBucketing && isCombinedPod(routingCtx, pod) && r.isPodSuitableForPromptLength(routingCtx, pod, promptLength) { promptLengthBucketingCombinedPods = append(promptLengthBucketingCombinedPods, pod) } } diff --git a/pkg/plugins/gateway/algorithms/pd_disaggregation_test.go b/pkg/plugins/gateway/algorithms/pd_disaggregation_test.go index 6279d32bb..fd1932b9e 100644 --- a/pkg/plugins/gateway/algorithms/pd_disaggregation_test.go +++ b/pkg/plugins/gateway/algorithms/pd_disaggregation_test.go @@ -19,9 +19,11 @@ package routingalgorithms import ( "context" "io" + "math" "net" "net/http" "net/http/httptest" + "strconv" "testing" "time" @@ -1524,88 +1526,62 @@ func TestLoadImbalanceSelectDecodePod(t *testing.T) { func TestIsPodSuitableForPromptLength(t *testing.T) { tests := []struct { name string - podLabels map[string]string + minLen int + maxLen int promptLength int expected bool }{ { - name: "no prompt length range configured", - podLabels: map[string]string{ - "roleset-name": "test", - "role-name": "prefill", - }, + name: "no prompt length range configured", + minLen: 0, + maxLen: math.MaxInt32, promptLength: 1000, expected: true, }, { - name: "prompt length exactly at min", - podLabels: map[string]string{ - "roleset-name": "test", - "role-name": "prefill", - "prompt-min-length": "1000", - "prompt-max-length": "2000", - }, + name: "prompt length exactly at min", + minLen: 1000, + maxLen: 2000, promptLength: 1000, expected: true, }, { - name: "prompt length exactly at max", - podLabels: map[string]string{ - "roleset-name": "test", - "role-name": "prefill", - 
"prompt-min-length": "1000", - "prompt-max-length": "2000", - }, + name: "prompt length exactly at max", + minLen: 1000, + maxLen: 2000, promptLength: 2000, expected: true, }, { - name: "prompt length in middle of range", - podLabels: map[string]string{ - "roleset-name": "test", - "role-name": "prefill", - "prompt-min-length": "1000", - "prompt-max-length": "2000", - }, + name: "prompt length in middle of range", + minLen: 1000, + maxLen: 2000, promptLength: 1500, expected: true, }, { - name: "prompt length below min", - podLabels: map[string]string{ - "roleset-name": "test", - "role-name": "prefill", - "prompt-min-length": "1000", - "prompt-max-length": "2000", - }, + name: "prompt length below min", + minLen: 1000, + maxLen: 2000, promptLength: 900, expected: false, }, { - name: "prompt length above max", - podLabels: map[string]string{ - "roleset-name": "test", - "role-name": "prefill", - "prompt-min-length": "1000", - "prompt-max-length": "2000", - }, + name: "prompt length above max", + minLen: 1000, + maxLen: 2000, promptLength: 2100, expected: false, }, { - name: "prompt length min larger than max", - podLabels: map[string]string{ - "roleset-name": "test", - "role-name": "prefill", - "prompt-min-length": "2000", - "prompt-max-length": "1000", - }, + name: "prompt length min larger than max", + minLen: 2000, + maxLen: 1000, promptLength: 1000, expected: false, }, } - // Create a router instance router := &pdRouter{ cache: cache.NewForTest(), tokenizer: tokenizer.NewCharacterTokenizer(), @@ -1613,28 +1589,27 @@ func TestIsPodSuitableForPromptLength(t *testing.T) { prefillRequestTracker: NewPrefillRequestTracker(), httpClient: &http.Client{}, } + ctx := types.NewRoutingContext(context.Background(), "pd", "test-model", "", "req", "user") for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - // Create test pod - pod := &v1.Pod{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-pod", - Labels: tt.podLabels, - }, - Status: v1.PodStatus{ - Conditions: 
[]v1.PodCondition{ - {Type: v1.PodReady, Status: v1.ConditionTrue}, - }, - }, - } - - result := router.isPodSuitableForPromptLength(pod, tt.promptLength) + config := pdConfigAnnotation(tt.minLen, tt.maxLen, false) + pod := &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "test-pod", Annotations: map[string]string{constants.ModelAnnoConfig: config}}} + result := router.isPodSuitableForPromptLength(ctx, pod, tt.promptLength) assert.Equal(t, tt.expected, result) }) } } +// pdConfigAnnotation returns model.aibrix.ai/config annotation JSON for prompt length bucketing. +func pdConfigAnnotation(minLen, maxLen int, combined bool) string { + combinedStr := "false" + if combined { + combinedStr = "true" + } + return `{"defaultProfile":"pd","profiles":{"pd":{"routingStrategy":"pd","promptLenBucketMinLength":` + strconv.Itoa(minLen) + `,"promptLenBucketMaxLength":` + strconv.Itoa(maxLen) + `,"combined":` + combinedStr + `}}}` +} + func TestFilterPrefillDecodePods_SelectCorrectBucketPods(t *testing.T) { aibrixPromptLengthBucketing = true @@ -1647,10 +1622,13 @@ func TestFilterPrefillDecodePods_SelectCorrectBucketPods(t *testing.T) { selectionCounts: map[string]int64{}, } - prefillOK := &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "prefill-ok", Labels: map[string]string{PDRoleSetIdentifier: "rs1", PDRoleIdentifier: "prefill", PromptMinLength: "0", PromptMaxLength: "1000000"}}} - prefillBlocked := &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "prefill-blocked", Labels: map[string]string{PDRoleSetIdentifier: "rs1", PDRoleIdentifier: "prefill", PromptMinLength: "1000000", PromptMaxLength: "2000000"}}} - decodeOK := &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "decode-ok", Labels: map[string]string{PDRoleSetIdentifier: "rs1", PDRoleIdentifier: "decode", PromptMinLength: "0", PromptMaxLength: "1000000"}}} - decodeBlocked := &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "decode-blocked", Labels: map[string]string{PDRoleSetIdentifier: "rs1", PDRoleIdentifier: "decode", PromptMinLength: "1000000", 
PromptMaxLength: "2000000"}}} + // Pods use model.aibrix.ai/config annotation (not labels) for prompt length bucketing. + configOK := pdConfigAnnotation(0, 1000000, false) + configBlocked := pdConfigAnnotation(1000000, 2000000, false) + prefillOK := &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "prefill-ok", Labels: map[string]string{PDRoleSetIdentifier: "rs1", PDRoleIdentifier: "prefill"}, Annotations: map[string]string{constants.ModelAnnoConfig: configOK}}} + prefillBlocked := &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "prefill-blocked", Labels: map[string]string{PDRoleSetIdentifier: "rs1", PDRoleIdentifier: "prefill"}, Annotations: map[string]string{constants.ModelAnnoConfig: configBlocked}}} + decodeOK := &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "decode-ok", Labels: map[string]string{PDRoleSetIdentifier: "rs1", PDRoleIdentifier: "decode"}, Annotations: map[string]string{constants.ModelAnnoConfig: configOK}}} + decodeBlocked := &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "decode-blocked", Labels: map[string]string{PDRoleSetIdentifier: "rs1", PDRoleIdentifier: "decode"}, Annotations: map[string]string{constants.ModelAnnoConfig: configBlocked}}} ctx := types.NewRoutingContext(context.Background(), "pd", "test-model", "short", "req-bucket", "user") prefill, decode, err := r.filterPrefillDecodePods(ctx, []*v1.Pod{prefillOK, prefillBlocked, decodeOK, decodeBlocked}) @@ -1662,7 +1640,6 @@ func TestFilterPrefillDecodePods_SelectCorrectBucketPods(t *testing.T) { } func TestFilterPrefillDecodePods_CombinedFallbackBucketing(t *testing.T) { - // os.Setenv("AIBRIX_PROMPT_LENGTH_BUCKETING", "true") aibrixPromptLengthBucketing = true r := pdRouter{ @@ -1671,11 +1648,16 @@ func TestFilterPrefillDecodePods_CombinedFallbackBucketing(t *testing.T) { prefixCacheIndexer: prefixcacheindexer.NewPrefixHashTable(), prefillRequestTracker: NewPrefillRequestTracker(), httpClient: &http.Client{}, + selectionCounts: map[string]int64{}, } - combined := &v1.Pod{ObjectMeta: 
metav1.ObjectMeta{Name: "combined-1", Labels: map[string]string{PDRoleSetIdentifier: "rs1", PDRoleIdentifier: "combined", CombinedIdentifier: "true", PromptMinLength: "0", PromptMaxLength: "1000000"}}} - prefillOK := &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "prefill-ok", Labels: map[string]string{PDRoleSetIdentifier: "rs1", PDRoleIdentifier: "prefill", PromptMinLength: "0", PromptMaxLength: "1"}}} - decodeOK := &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "decode-ok", Labels: map[string]string{PDRoleSetIdentifier: "rs1", PDRoleIdentifier: "decode", PromptMinLength: "0", PromptMaxLength: "1"}}} + // prefill/decode with 0-1 range: blocked for "say test" (prompt length > 1) + // combined with 0-1000000 + combined:true: suitable for fallback + configBlocked := pdConfigAnnotation(0, 1, false) + configCombined := pdConfigAnnotation(0, 1000000, true) + combined := &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "combined-1", Labels: map[string]string{PDRoleSetIdentifier: "rs1", PDRoleIdentifier: "combined"}, Annotations: map[string]string{constants.ModelAnnoConfig: configCombined}}} + prefillOK := &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "prefill-ok", Labels: map[string]string{PDRoleSetIdentifier: "rs1", PDRoleIdentifier: "prefill"}, Annotations: map[string]string{constants.ModelAnnoConfig: configBlocked}}} + decodeOK := &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "decode-ok", Labels: map[string]string{PDRoleSetIdentifier: "rs1", PDRoleIdentifier: "decode"}, Annotations: map[string]string{constants.ModelAnnoConfig: configBlocked}}} ctx := types.NewRoutingContext(context.Background(), "pd", "test-model", "say test", "req-combined", "user") prefill, decode, err := r.filterPrefillDecodePods(ctx, []*v1.Pod{prefillOK, decodeOK, combined}) @@ -1716,11 +1698,14 @@ func TestFilterPrefillDecodePods_CombinedPickImbalance(t *testing.T) { }, } + configPrefillDecode := pdConfigAnnotation(0, 1000000, false) + configCombined := pdConfigAnnotation(0, 1000000, true) + for _, tt := range 
tests { t.Run(tt.name, func(t *testing.T) { - prefill := &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "prefill-high", Namespace: "default", Labels: map[string]string{PDRoleSetIdentifier: "rs1", PDRoleIdentifier: "prefill", constants.ModelLabelName: "test-model", PromptMinLength: "0", PromptMaxLength: "1000000"}}} - decode := &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "decode-mid", Namespace: "default", Labels: map[string]string{PDRoleSetIdentifier: "rs1", PDRoleIdentifier: "decode", constants.ModelLabelName: "test-model", PromptMinLength: "0", PromptMaxLength: "1000000"}}} - combined := &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "combined-low", Namespace: "default", Labels: map[string]string{PDRoleSetIdentifier: "rs1", PDRoleIdentifier: "combined", constants.ModelLabelName: "test-model", CombinedIdentifier: "true", PromptMinLength: "0", PromptMaxLength: "1000000"}}} + prefill := &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "prefill-high", Namespace: "default", Labels: map[string]string{PDRoleSetIdentifier: "rs1", PDRoleIdentifier: "prefill", constants.ModelLabelName: "test-model"}, Annotations: map[string]string{constants.ModelAnnoConfig: configPrefillDecode}}} + decode := &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "decode-mid", Namespace: "default", Labels: map[string]string{PDRoleSetIdentifier: "rs1", PDRoleIdentifier: "decode", constants.ModelLabelName: "test-model"}, Annotations: map[string]string{constants.ModelAnnoConfig: configPrefillDecode}}} + combined := &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "combined-low", Namespace: "default", Labels: map[string]string{PDRoleSetIdentifier: "rs1", PDRoleIdentifier: "combined", constants.ModelLabelName: "test-model"}, Annotations: map[string]string{constants.ModelAnnoConfig: configCombined}}} metricsMap := map[string]map[string]metrics.MetricValue{} vecDrain100 := model.Vector{&model.Sample{Metric: model.Metric{"__name__": "drain_rate_1m"}, Value: model.SampleValue(100)}} diff --git 
a/pkg/plugins/gateway/algorithms/prefix_cache.go b/pkg/plugins/gateway/algorithms/prefix_cache.go index e225efcc0..f28f54b2d 100644 --- a/pkg/plugins/gateway/algorithms/prefix_cache.go +++ b/pkg/plugins/gateway/algorithms/prefix_cache.go @@ -388,14 +388,14 @@ func (p prefixCacheRouter) routeOriginal(ctx *types.RoutingContext, readyPodList for _, pod := range leastReqPodList { readyPodsMap[pod.Name] = struct{}{} } - klog.InfoS("prefix_cache_load_imbalanced", + klog.V(4).InfoS("prefix_cache_load_imbalanced", "request_id", ctx.RequestID, "pod_request_count", getRequestCounts(p.cache, readyPods), "target_pod_list", readyPodsMap) } // handle request with readyPodsMap from balanced or imbalanced filter matchedPods, prefixHashes = p.prefixCacheIndexer.MatchPrefix(tokens, ctx.Model, readyPodsMap) - klog.InfoS("prefix_hashes", "request_id", ctx.RequestID, "prefix_hashes", prefixHashes) + klog.V(4).InfoS("prefix_hashes", "request_id", ctx.RequestID, "prefix_hashes", prefixHashes) if len(matchedPods) > 0 { targetPod = getTargetPodFromMatchedPods(p.cache, readyPods, matchedPods) diff --git a/pkg/plugins/gateway/algorithms/router.go b/pkg/plugins/gateway/algorithms/router.go index ff9952d57..6a9f68a20 100644 --- a/pkg/plugins/gateway/algorithms/router.go +++ b/pkg/plugins/gateway/algorithms/router.go @@ -48,7 +48,7 @@ type RouterManager struct { func NewRouterManager() *RouterManager { rm := &RouterManager{} - rm.routerInited, rm.routerDoneInit = context.WithTimeout(context.Background(), 1*time.Second) + rm.routerInited, rm.routerDoneInit = context.WithTimeout(context.Background(), 5*time.Second) rm.routerFactory = make(map[types.RoutingAlgorithm]types.RouterProviderFunc) rm.routerConstructor = make(map[types.RoutingAlgorithm]types.RouterProviderRegistrationFunc) return rm diff --git a/pkg/plugins/gateway/algorithms/throughput.go b/pkg/plugins/gateway/algorithms/throughput.go index d3dc1d0b0..6d0e64d28 100644 --- a/pkg/plugins/gateway/algorithms/throughput.go +++ 
b/pkg/plugins/gateway/algorithms/throughput.go @@ -56,12 +56,12 @@ func (r throughputRouter) Route(ctx *types.RoutingContext, readyPodList types.Po readyPods := readyPodList.All() for _, pod := range readyPods { - promptThroughput, err := r.cache.GetMetricValueByPodModel(pod.Name, pod.Namespace, ctx.Model, metrics.AvgPromptThroughputToksPerS) + promptThroughput, err := r.cache.GetMetricValueByPodModel(pod.Name, pod.Namespace, ctx.Model, metrics.AvgPromptToksPerReq) if err != nil { klog.Error(err) continue } - generationThroughput, err := r.cache.GetMetricValueByPodModel(pod.Name, pod.Namespace, ctx.Model, metrics.AvgGenerationThroughputToksPerS) + generationThroughput, err := r.cache.GetMetricValueByPodModel(pod.Name, pod.Namespace, ctx.Model, metrics.AvgGenerationToksPerReq) if err != nil { klog.Error(err) continue diff --git a/pkg/plugins/gateway/algorithms/throughput_test.go b/pkg/plugins/gateway/algorithms/throughput_test.go index 032f1955d..a585a5622 100644 --- a/pkg/plugins/gateway/algorithms/throughput_test.go +++ b/pkg/plugins/gateway/algorithms/throughput_test.go @@ -53,20 +53,20 @@ func TestThroughput(t *testing.T) { }, podMetrics: map[string]map[string]metrics.MetricValue{ "p1": { - metrics.AvgPromptThroughputToksPerS: &metrics.SimpleMetricValue{Value: 1}, - metrics.AvgGenerationThroughputToksPerS: &metrics.SimpleMetricValue{Value: 2}, + metrics.AvgPromptToksPerReq: &metrics.SimpleMetricValue{Value: 1}, + metrics.AvgGenerationToksPerReq: &metrics.SimpleMetricValue{Value: 2}, }, "p2": { - metrics.AvgPromptThroughputToksPerS: &metrics.SimpleMetricValue{Value: 2}, - metrics.AvgGenerationThroughputToksPerS: &metrics.SimpleMetricValue{Value: 1}, + metrics.AvgPromptToksPerReq: &metrics.SimpleMetricValue{Value: 2}, + metrics.AvgGenerationToksPerReq: &metrics.SimpleMetricValue{Value: 1}, }, "p3": { - metrics.AvgPromptThroughputToksPerS: &metrics.SimpleMetricValue{Value: 3}, - metrics.AvgGenerationThroughputToksPerS: &metrics.SimpleMetricValue{Value: 3}, + 
metrics.AvgPromptToksPerReq: &metrics.SimpleMetricValue{Value: 3}, + metrics.AvgGenerationToksPerReq: &metrics.SimpleMetricValue{Value: 3}, }, "p4": { - metrics.AvgPromptThroughputToksPerS: &metrics.SimpleMetricValue{Value: 4}, - metrics.AvgGenerationThroughputToksPerS: &metrics.SimpleMetricValue{Value: 4}, + metrics.AvgPromptToksPerReq: &metrics.SimpleMetricValue{Value: 4}, + metrics.AvgGenerationToksPerReq: &metrics.SimpleMetricValue{Value: 4}, }, }, expectErr: false, @@ -93,16 +93,16 @@ func TestThroughput(t *testing.T) { }, podMetrics: map[string]map[string]metrics.MetricValue{ "p1": { - metrics.AvgPromptThroughputToksPerS: &metrics.SimpleMetricValue{Value: 1}, - metrics.AvgGenerationThroughputToksPerS: &metrics.SimpleMetricValue{Value: 4}, + metrics.AvgPromptToksPerReq: &metrics.SimpleMetricValue{Value: 1}, + metrics.AvgGenerationToksPerReq: &metrics.SimpleMetricValue{Value: 4}, }, "p2": { - metrics.AvgPromptThroughputToksPerS: &metrics.SimpleMetricValue{Value: 5}, - metrics.AvgGenerationThroughputToksPerS: &metrics.SimpleMetricValue{Value: 5}, + metrics.AvgPromptToksPerReq: &metrics.SimpleMetricValue{Value: 5}, + metrics.AvgGenerationToksPerReq: &metrics.SimpleMetricValue{Value: 5}, }, "p3": { - metrics.AvgPromptThroughputToksPerS: &metrics.SimpleMetricValue{Value: 2}, - metrics.AvgGenerationThroughputToksPerS: &metrics.SimpleMetricValue{Value: 2}, + metrics.AvgPromptToksPerReq: &metrics.SimpleMetricValue{Value: 2}, + metrics.AvgGenerationToksPerReq: &metrics.SimpleMetricValue{Value: 2}, }, }, expectErr: false, @@ -120,8 +120,8 @@ func TestThroughput(t *testing.T) { }, podMetrics: map[string]map[string]metrics.MetricValue{ "p1": { - metrics.AvgPromptThroughputToksPerS: &metrics.SimpleMetricValue{Value: 1}, - metrics.AvgGenerationThroughputToksPerS: &metrics.SimpleMetricValue{Value: 2}, + metrics.AvgPromptToksPerReq: &metrics.SimpleMetricValue{Value: 1}, + metrics.AvgGenerationToksPerReq: &metrics.SimpleMetricValue{Value: 2}, }, }, expectErr: false, diff --git 
a/pkg/plugins/gateway/configprofiles/configprofiles.go b/pkg/plugins/gateway/configprofiles/configprofiles.go
new file mode 100644
index 000000000..6a952ac1a
--- /dev/null
+++ b/pkg/plugins/gateway/configprofiles/configprofiles.go
@@ -0,0 +1,131 @@
+/*
+Copyright 2025 The Aibrix Team.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+// Package configprofiles parses the model.aibrix.ai/config annotation (or ConfigMap)
+// and supports multiple named profiles selectable at runtime via config-profile header.
+// See docs/source/designs/model-config-profiles.rst for the design.
+package configprofiles
+
+import (
+	"encoding/json"
+	"fmt"
+	"math"
+	"strings"
+
+	v1 "k8s.io/api/core/v1"
+	"k8s.io/klog/v2"
+
+	"github.com/vllm-project/aibrix/pkg/constants"
+)
+
+const (
+	// DefaultProfileName is used when defaultProfile is not set in the JSON.
+	DefaultProfileName = "default"
+)
+
+// ModelConfigProfile holds gateway options for a single profile.
+type ModelConfigProfile struct {
+	RoutingStrategy          string `json:"routingStrategy"`
+	PromptLenBucketMinLength int    `json:"promptLenBucketMinLength"`
+	PromptLenBucketMaxLength int    `json:"promptLenBucketMaxLength"`
+	Combined                 bool   `json:"combined"`
+}
+
+// ModelConfigProfiles is the root JSON structure from model.aibrix.ai/config.
+type ModelConfigProfiles struct {
+	DefaultProfile string                        `json:"defaultProfile"`
+	Profiles       map[string]ModelConfigProfile `json:"profiles"`
+}
+
+// GetProfile returns the profile for the given name, or the default profile.
+// Falls back to defaultProfile/"default" when the requested profile does not exist.
+// Returns nil only if the receiver is nil or no default profile exists.
+func (c *ModelConfigProfiles) GetProfile(name string) *ModelConfigProfile {
+	// Nil-safe: ParseModelConfig legally returns (nil, nil) for empty input,
+	// so callers may hold a nil *ModelConfigProfiles.
+	if c == nil {
+		return nil
+	}
+	if name != "" {
+		if p, ok := c.Profiles[name]; ok {
+			return &p
+		}
+	}
+	// Fall back to default
+	if name = c.DefaultProfile; name == "" {
+		name = DefaultProfileName
+	}
+	if p, ok := c.Profiles[name]; ok {
+		return &p
+	}
+	return nil
+}
+
+// ResolveProfile resolves the model config from pods (annotation),
+// then returns the profile selected by headerProfile (from config-profile).
+// The first pod carrying a resolvable annotation wins; returns nil when none does.
+func ResolveProfile(pods []*v1.Pod, headerProfile string) *ModelConfigProfile {
+	for _, pod := range pods {
+		if p := ResolveProfileFromPod(pod, headerProfile); p != nil {
+			return p
+		}
+	}
+	return nil
+}
+
+// ResolveProfileFromPod resolves the model config from a single pod annotation and returns the selected profile.
+// Returns nil for a nil pod, a missing/blank annotation, or an unparsable config.
+func ResolveProfileFromPod(pod *v1.Pod, headerProfile string) *ModelConfigProfile {
+	if pod == nil {
+		return nil
+	}
+	anno := pod.Annotations[constants.ModelAnnoConfig]
+	if anno == "" {
+		return nil
+	}
+	cfg, err := ParseModelConfig(anno)
+	if err != nil {
+		klog.V(4).InfoS("failed to parse model config from pod annotation", "pod", pod.Name, "err", err)
+		return nil
+	}
+	// cfg may be nil for a whitespace-only annotation; GetProfile is nil-safe,
+	// and it already handles an empty headerProfile by falling back to the default.
+	return cfg.GetProfile(headerProfile)
+}
+
+// ParseModelConfig parses the JSON from annotation data.
+// Returns (nil, nil) when jsonStr is empty/whitespace and a non-nil error when it is invalid.
+func ParseModelConfig(jsonStr string) (*ModelConfigProfiles, error) {
+	jsonStr = strings.TrimSpace(jsonStr)
+	if jsonStr == "" {
+		return nil, nil
+	}
+	var cfg ModelConfigProfiles
+	if err := json.Unmarshal([]byte(jsonStr), &cfg); err != nil {
+		return nil, fmt.Errorf("parse model config: %w", err)
+	}
+	if len(cfg.Profiles) == 0 {
+		return nil, fmt.Errorf("model config has no profiles")
+	}
+	// Default prompt bounds when not provided: min=0, max=MaxInt32
+	for name, p := range cfg.Profiles {
+		if p.PromptLenBucketMinLength < 0 {
+			p.PromptLenBucketMinLength = 0
+		}
+		if p.PromptLenBucketMaxLength == 0 {
+			p.PromptLenBucketMaxLength = math.MaxInt32
+		}
+		cfg.Profiles[name] = p
+	}
+	return &cfg, nil
+}
diff --git a/pkg/plugins/gateway/configprofiles/configprofiles_test.go b/pkg/plugins/gateway/configprofiles/configprofiles_test.go
new file mode 100644
index 000000000..69406ca98
--- /dev/null
+++ b/pkg/plugins/gateway/configprofiles/configprofiles_test.go
@@ -0,0 +1,240 @@
+/*
+Copyright 2025 The Aibrix Team.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package configprofiles
+
+import (
+	"math"
+	"testing"
+
+	v1 "k8s.io/api/core/v1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+
+	"github.com/vllm-project/aibrix/pkg/constants"
+)
+
+func TestParseModelConfig(t *testing.T) { // table-driven parse success/failure cases
+	tests := []struct {
+		name    string
+		json    string
+		wantErr bool
+	}{
+		{
+			name: "empty",
+			json: "",
+		},
+		{
+			name: "single profile",
+			json: `{"profiles":{"default":{"routingStrategy":"pd","promptLenBucketMinLength":0,"promptLenBucketMaxLength":2048}}}`,
+		},
+		{
+			name: "multiple profiles with defaultProfile",
+			json: `{"defaultProfile":"pd","profiles":{"default":{"routingStrategy":"random","promptLenBucketMinLength":0,"promptLenBucketMaxLength":4096},"pd":{"routingStrategy":"pd","promptLenBucketMinLength":0,"promptLenBucketMaxLength":2048}}}`,
+		},
+		{
+			name: "with combined field",
+			json: `{"profiles":{"default":{"routingStrategy":"pd","promptLenBucketMinLength":0,"promptLenBucketMaxLength":2048,"combined":true}}}`,
+		},
+		{
+			name:    "invalid json",
+			json:    `{`,
+			wantErr: true,
+		},
+		{
+			name:    "no profiles",
+			json:    `{}`,
+			wantErr: true,
+		},
+	}
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			cfg, err := ParseModelConfig(tt.json)
+			if tt.wantErr { // error cases must also return a nil config
+				if err == nil || cfg != nil {
+					t.Errorf("ParseModelConfig() expected error, got cfg=%v err=%v", cfg, err)
+				}
+				return
+			}
+			if err != nil {
+				t.Errorf("ParseModelConfig() err=%v", err)
+				return
+			}
+			if tt.json != "" && cfg == nil { // empty input legally yields (nil, nil)
+				t.Errorf("ParseModelConfig() expected config for non-empty input")
+			}
+		})
+	}
+}
+
+func TestParseModelConfig_DefaultValues(t *testing.T) {
+	// promptLenBucketMinLength negative → normalized to 0
+	// promptLenBucketMaxLength 0 or omitted → MaxInt32
+	json := `{"profiles":{"p1":{"routingStrategy":"pd","promptLenBucketMinLength":-5,"promptLenBucketMaxLength":0},"p2":{"routingStrategy":"random"}}}`
+
+	cfg, err := ParseModelConfig(json)
+	if err != nil || cfg == nil {
+		t.Fatalf("ParseModelConfig failed: %v", err)
+	}
+
+	p1 := cfg.GetProfile("p1")
+	if p1 == nil {
+		t.Fatal("GetProfile(p1) = nil")
+	}
+	if p1.PromptLenBucketMinLength != 0 {
+		t.Errorf("promptLenBucketMinLength -5 should be normalized to 0, got %d", p1.PromptLenBucketMinLength)
+	}
+	if p1.PromptLenBucketMaxLength != math.MaxInt32 {
+		t.Errorf("promptLenBucketMaxLength 0 should become MaxInt32, got %d", p1.PromptLenBucketMaxLength)
+	}
+
+	p2 := cfg.GetProfile("p2")
+	if p2 == nil {
+		t.Fatal("GetProfile(p2) = nil")
+	}
+	if p2.PromptLenBucketMaxLength != math.MaxInt32 {
+		t.Errorf("omitted promptLenBucketMaxLength should become MaxInt32, got %d", p2.PromptLenBucketMaxLength)
+	}
+}
+
+func TestGetProfile(t *testing.T) { // named lookup, defaultProfile fallback, explicit default
+	json := `{"defaultProfile":"pd","profiles":{"default":{"routingStrategy":"random","promptLenBucketMinLength":0,"promptLenBucketMaxLength":4096},"pd":{"routingStrategy":"pd","promptLenBucketMinLength":0,"promptLenBucketMaxLength":2048}}}`
+
+	cfg, err := ParseModelConfig(json)
+	if err != nil || cfg == nil {
+		t.Fatalf("ParseModelConfig failed: %v", err)
+	}
+
+	if p := cfg.GetProfile("pd"); p == nil || p.RoutingStrategy != "pd" {
+		t.Errorf("GetProfile(pd) = %v, want routingStrategy=pd", p)
+	}
+	if p := cfg.GetProfile(""); p == nil || p.RoutingStrategy != "pd" {
+		t.Errorf("GetProfile(\"\") should use defaultProfile, got %v", p)
+	}
+	if p := cfg.GetProfile("default"); p == nil || p.RoutingStrategy != "random" {
+		t.Errorf("GetProfile(default) = %v", p)
+	}
+	// nonexistent profile falls back to defaultProfile
+	if p := cfg.GetProfile("nonexistent"); p == nil || p.RoutingStrategy != "pd" {
+		t.Errorf("GetProfile(nonexistent) should fall back to default, got %v", p)
+	}
+}
+
+func TestGetProfile_NoDefault(t *testing.T) {
+	// No defaultProfile set; falls back to "default"
+	json := `{"profiles":{"default":{"routingStrategy":"random"},"pd":{"routingStrategy":"pd"}}}`
+
+	cfg, err := ParseModelConfig(json)
+	if err != nil || cfg == nil {
+		t.Fatalf("ParseModelConfig failed: %v", err)
+	}
+
+	// Empty/unknown name should use "default" (implied default)
+	if p := cfg.GetProfile(""); p == nil || p.RoutingStrategy != "random" {
+		t.Errorf("GetProfile(\"\") with no defaultProfile should use \"default\", got %v", p)
+	}
+}
+
+func TestResolveProfileFromPod(t *testing.T) { // annotation → profile resolution on a single pod
+	configJSON := `{"defaultProfile":"pd","profiles":{"default":{"routingStrategy":"random"},"pd":{"routingStrategy":"pd","promptLenBucketMaxLength":2048}}}`
+
+	podWithAnno := &v1.Pod{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:        "pod1",
+			Namespace:   "default",
+			Annotations: map[string]string{constants.ModelAnnoConfig: configJSON},
+		},
+	}
+	podNoAnno := &v1.Pod{
+		ObjectMeta: metav1.ObjectMeta{Name: "pod2", Namespace: "default"},
+	}
+
+	tests := []struct {
+		name          string
+		pod           *v1.Pod
+		headerProfile string
+		wantProfile   string // empty wantProfile ⇒ expect nil resolution
+	}{
+		{"nil pod", nil, "", ""},
+		{"pod without anno", podNoAnno, "", ""},
+		{"pod with anno, no header", podWithAnno, "", "pd"},
+		{"pod with anno, header pd", podWithAnno, "pd", "pd"},
+		{"pod with anno, header default", podWithAnno, "default", "random"},
+	}
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			p := ResolveProfileFromPod(tt.pod, tt.headerProfile)
+			if tt.wantProfile == "" {
+				if p != nil {
+					t.Errorf("ResolveProfileFromPod() = %v, want nil", p)
+				}
+				return
+			}
+			if p == nil {
+				t.Errorf("ResolveProfileFromPod() = nil, want profile with routingStrategy=%s", tt.wantProfile)
+				return
+			}
+			if p.RoutingStrategy != tt.wantProfile {
+				t.Errorf("ResolveProfileFromPod().RoutingStrategy = %s, want %s", p.RoutingStrategy, tt.wantProfile)
+			}
+		})
+	}
+}
+
+func TestResolveProfile(t *testing.T) { // multi-pod resolution: first resolvable annotation wins
+	configJSON := `{"defaultProfile":"pd","profiles":{"default":{"routingStrategy":"random","promptLenBucketMinLength":0,"promptLenBucketMaxLength":4096},"pd":{"routingStrategy":"pd","promptLenBucketMinLength":0,"promptLenBucketMaxLength":2048}}}`
+
+	podWithAnno := &v1.Pod{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      "pod1",
+			Namespace: "default",
+			Annotations: map[string]string{constants.ModelAnnoConfig: configJSON},
+		},
+	}
+	podNoAnno := &v1.Pod{
+		ObjectMeta: metav1.ObjectMeta{Name: "pod2", Namespace: "default"},
+	}
+
+	tests := []struct {
+		name          string
+		pods          []*v1.Pod
+		headerProfile string
+		wantProfile   string // empty wantProfile ⇒ expect nil resolution
+	}{
+		{"no pods", nil, "", ""},
+		{"pods without anno", []*v1.Pod{podNoAnno}, "", ""},
+		{"pods with anno, no header", []*v1.Pod{podWithAnno}, "", "pd"},
+		{"pods with anno, header pd", []*v1.Pod{podWithAnno}, "pd", "pd"},
+		{"pods with anno, header default", []*v1.Pod{podWithAnno}, "default", "random"},
+	}
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			p := ResolveProfile(tt.pods, tt.headerProfile)
+			if tt.wantProfile == "" {
+				if p != nil {
+					t.Errorf("ResolveProfile() = %v, want nil", p)
+				}
+				return
+			}
+			if p == nil {
+				t.Errorf("ResolveProfile() = nil, want profile with routingStrategy=%s", tt.wantProfile)
+				return
+			}
+			if p.RoutingStrategy != tt.wantProfile {
+				t.Errorf("ResolveProfile().RoutingStrategy = %s, want %s", p.RoutingStrategy, tt.wantProfile)
+			}
+		})
+	}
+}
diff --git a/pkg/plugins/gateway/gateway.go b/pkg/plugins/gateway/gateway.go
index bc020bb39..629dd4231 100644
--- a/pkg/plugins/gateway/gateway.go
+++ b/pkg/plugins/gateway/gateway.go
@@ -21,6 +21,7 @@ import (
 	"errors"
 	"fmt"
 	"io"
+	"os"
 	"strings"
 	"time"
@@ -82,6 +83,8 @@ type processState struct {
 	completed bool
 }
 
+var podName = os.Getenv("POD_NAME")
+
 func NewServer(redisClient *redis.Client, client kubernetes.Interface, gatewayClient gatewayapi.Interface) *Server {
 	c, err := cache.Get()
 	if err != nil {
@@ -273,6 +276,7 @@ func (s *Server) responseForResponseHeaderError(st *processState, resp *extProcP
 }
 
 func (s *Server) emitProcessMetrics(st *processState, resp *extProcPb.ProcessingResponse) {
+	s.emitGatewayRequestTotalMetric(resp, st.model)
 	if st.model == "" {
 		return
 	}
@@ -335,6 +339,7 @@ func (s *Server) selectTargetPod(ctx *types.RoutingContext, pods types.PodList,
ctx.SetTargetPod(readyPods[0]) return ctx.TargetAddress(), nil } + utils.CryptoShuffle(readyPods) return router.Route(ctx, &utils.PodArray{Pods: readyPods}) } @@ -433,8 +438,17 @@ func (s *Server) responseErrorProcessingWithHeaders(ctx context.Context, headers } func (s *Server) emitMetricsCounterHelper(metricName, model, status, statusCode string) { - labelNames, labelValues := buildGatewayPodMetricLabels(model, status, statusCode) - metrics.EmitMetricToPrometheus(metricName, &metrics.SimpleMetricValue{Value: 1.0}, labelNames, labelValues) + labels := buildGatewayPodMetricLabels(model, status, statusCode) + metrics.EmitMetricToPrometheus(&types.RoutingContext{Model: model}, nil, metricName, &metrics.SimpleMetricValue{Value: 1.0}, labels) +} + +func (s *Server) emitGatewayRequestTotalMetric(resp *extProcPb.ProcessingResponse, model string) { + statusCode := "200" + if resp.GetImmediateResponse() != nil { + statusCode = fmt.Sprintf("%d", int(resp.GetImmediateResponse().Status.GetCode())) + } + labels := buildGatewayPodMetricLabels(model, "gateway_request_handled", statusCode) + metrics.EmitMetricToPrometheus(&types.RoutingContext{Model: model}, nil, metrics.GatewayRequestTotal, &metrics.SimpleMetricValue{Value: 1.0}, labels) } func getMetricErr(resp *extProcPb.ImmediateResponse, metricLabel string) string { diff --git a/pkg/plugins/gateway/gateway_req_body.go b/pkg/plugins/gateway/gateway_req_body.go index 2ca7e1936..72501060c 100644 --- a/pkg/plugins/gateway/gateway_req_body.go +++ b/pkg/plugins/gateway/gateway_req_body.go @@ -40,12 +40,12 @@ func (s *Server) HandleRequestBody(ctx context.Context, requestID string, req *e routingCtx, _ := ctx.(*types.RoutingContext) requestPath := routingCtx.ReqPath - routingAlgorithm := routingCtx.Algorithm body := req.Request.(*extProcPb.ProcessingRequest_RequestBody) var model, message string var stream bool + var routingAlgorithm types.RoutingAlgorithm var errRes *extProcPb.ProcessingResponse // Check if this is a multipart 
request (audio endpoints) @@ -88,6 +88,19 @@ func (s *Server) HandleRequestBody(ctx context.Context, requestID string, req *e fmt.Sprintf("error on getting pods for model %s", model), ErrorCodeServiceUnavailable, ""), model, routingCtx, stream, term } + // Resolve model config profile from annotation and apply overrides + applyConfigProfile(routingCtx, podsArr.All()) + + // Derive and validate routing strategy (headers -> profile -> env); return 400 on invalid + if strategy, enabled := deriveRoutingStrategyFromContext(routingCtx); enabled { + var ok bool + if routingAlgorithm, ok = routing.Validate(strategy); !ok { + klog.ErrorS(nil, "incorrect routing strategy", "requestID", requestID, "routing-strategy", strategy) + return buildErrorResponse(envoyTypePb.StatusCode_BadRequest, fmt.Sprintf("incorrect routing strategy %s", strategy), "", "", HeaderErrorRouting, "true"), model, routingCtx, stream, term + } + routingCtx.Algorithm = routingAlgorithm + } + headers := []*configPb.HeaderValueOption{} // Path rewriting for image/video generation based on engine type @@ -108,19 +121,15 @@ func (s *Server) HandleRequestBody(ctx context.Context, requestID string, req *e targetPodIP, err := s.selectTargetPod(routingCtx, podsArr, externalFilter) if targetPodIP == "" || err != nil { klog.ErrorS(err, "failed to select target pod", "requestID", requestID, "routingStrategy", routingAlgorithm, "model", model, "routingDuration", routingCtx.GetRoutingDelay()) - return generateErrorResponse( - envoyTypePb.StatusCode_ServiceUnavailable, - []*configPb.HeaderValueOption{{Header: &configPb.HeaderValue{ - Key: HeaderErrorRouting, RawValue: []byte("true")}}}, - "error on selecting target pod", ErrorCodeServiceUnavailable, ""), model, routingCtx, stream, term + return buildErrorResponse(envoyTypePb.StatusCode_ServiceUnavailable, "error on selecting target pod", ErrorCodeServiceUnavailable, "", HeaderErrorRouting, "true"), model, routingCtx, stream, term } headers = 
buildEnvoyProxyHeaders(headers, HeaderRoutingStrategy, string(routingAlgorithm), HeaderTargetPod, targetPodIP, "content-length", strconv.Itoa(len(routingCtx.ReqBody)), "X-Request-Id", routingCtx.RequestID) - var targetPodName string - var targetNamespace string + + var targetPodName, targetNamespace string var request_count float64 if routingCtx.HasRouted() && routingCtx.TargetPod() != nil { targetPodName = routingCtx.TargetPod().Name @@ -128,7 +137,7 @@ func (s *Server) HandleRequestBody(ctx context.Context, requestID string, req *e request_count = getRunningRequestsByPod(s, targetPodName, targetNamespace) } klog.InfoS("request_start", "request_id", requestID, "request_path", requestPath, "model", model, "stream", stream, "routing_strategy", routingAlgorithm, - "target_pod", targetPodName, "target_pod_ip", targetPodIP, "outstanding_requests", request_count, "routing_duration", routingCtx.GetRoutingDelay()) + "target_pod", targetPodName, "target_pod_ip", targetPodIP, "outstanding_requests", request_count, "routing_time_taken", routingCtx.GetRoutingDelay()) } routingCtx.RequestEndTime = time.Now() diff --git a/pkg/plugins/gateway/gateway_req_body_test.go b/pkg/plugins/gateway/gateway_req_body_test.go index 3c1f97d16..37d0124bf 100644 --- a/pkg/plugins/gateway/gateway_req_body_test.go +++ b/pkg/plugins/gateway/gateway_req_body_test.go @@ -32,7 +32,6 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "github.com/vllm-project/aibrix/pkg/cache" - "github.com/vllm-project/aibrix/pkg/metrics" routingalgorithms "github.com/vllm-project/aibrix/pkg/plugins/gateway/algorithms" "github.com/vllm-project/aibrix/pkg/types" "github.com/vllm-project/aibrix/pkg/utils" @@ -269,7 +268,7 @@ func Test_handleRequestBody(t *testing.T) { }, }, { - name: "invalid routing strategy - should fallback to random router", + name: "invalid routing strategy - should return 400 BadRequest", requestBody: `{"model": "test-model", "messages": [{"role": "user", "content": "test"}]}`, user: 
utils.User{ Name: "test-user", @@ -285,64 +284,34 @@ func Test_handleRequestBody(t *testing.T) { Conditions: []v1.PodCondition{{Type: v1.PodReady, Status: v1.ConditionTrue}}, }, }, - { - Status: v1.PodStatus{ - PodIP: "5.6.7.8", - Conditions: []v1.PodCondition{{Type: v1.PodReady, Status: v1.ConditionTrue}}, - }, - }, }, } mockCache.On("ListPodsByModel", "test-model").Return(podList, nil) - mockCache.On("GetMetricValueByPod", mock.Anything, mock.Anything, metrics.RealtimeNumRequestsRunning).Return(&metrics.SimpleMetricValue{Value: 0}, nil) - mockCache.On("AddRequestCount", mock.Anything, mock.Anything, "test-model").Return(int64(1)) }, expected: testResponse{ - statusCode: envoyTypePb.StatusCode_OK, - headers: []*configPb.HeaderValueOption{ - { - Header: &configPb.HeaderValue{ - Key: HeaderRoutingStrategy, - RawValue: []byte("invalid-router"), - }, - }, - { - Header: &configPb.HeaderValue{ - Key: HeaderTargetPod, - RawValue: []byte("1.2.3.4:8000"), - }, - }, - }, + statusCode: envoyTypePb.StatusCode_BadRequest, model: "test-model", stream: false, - term: 1, + term: 0, routingCtx: &types.RoutingContext{}, }, validate: func(t *testing.T, tt *testCase, resp *extProcPb.ProcessingResponse, model string, routingCtx *types.RoutingContext, stream bool, term int64) { - assert.Equal(t, tt.expected.statusCode, envoyTypePb.StatusCode_OK) + assert.Equal(t, tt.expected.statusCode, resp.GetImmediateResponse().GetStatus().GetCode()) + // buildErrorResponse returns x-error-routing: true (no Content-Type in headers) + headers := resp.GetImmediateResponse().GetHeaders().GetSetHeaders() + assert.GreaterOrEqual(t, len(headers), 1) + foundErrorRouting := false + for _, h := range headers { + if h.Header.Key == HeaderErrorRouting && string(h.Header.RawValue) == "true" { + foundErrorRouting = true + break + } + } + assert.True(t, foundErrorRouting, "expected x-error-routing header") assert.Equal(t, tt.expected.model, model) assert.Equal(t, tt.expected.stream, stream) assert.Equal(t, 
tt.expected.term, term) assert.NotNil(t, routingCtx) - assert.Equal(t, tt.expected.model, routingCtx.Model) - assert.Equal(t, tt.routingAlgo, routingCtx.Algorithm) - // Verify both routing headers are set - foundRoutingStrategy := false - foundTargetPod := false - for _, header := range resp.GetRequestBody().GetResponse().GetHeaderMutation().GetSetHeaders() { - if header.Header.Key == HeaderRoutingStrategy { - foundRoutingStrategy = true - assert.Equal(t, "invalid-router", string(header.Header.RawValue)) - } - if header.Header.Key == HeaderTargetPod { - foundTargetPod = true - // Since this is a random router, accept either valid pod IP from the mock setup - targetPodIP := string(header.Header.RawValue) - assert.Contains(t, []string{"1.2.3.4:8000", "5.6.7.8:8000"}, targetPodIP, "Target pod IP should be one of the pod IPs from the mock setup") - } - } - assert.True(t, foundRoutingStrategy, "HeaderRoutingStrategy not found") - assert.True(t, foundTargetPod, "HeaderTargetPod not found") }, }, { @@ -680,10 +649,17 @@ func Test_handleRequestBody(t *testing.T) { // Call HandleRequestBody and validate the response routingCtx := types.NewRoutingContext(context.Background(), tt.routingAlgo, tt.expected.model, "", "test-request-id", tt.user.Name) - routingCtx.ReqPath = "/v1/chat/completions" + routingCtx.ReqPath = PathChatCompletions if tt.reqPath != "" { routingCtx.ReqPath = tt.reqPath } + // deriveRoutingStrategyFromContext reads from ReqHeaders, not Algorithm + if tt.routingAlgo != "" { + if routingCtx.ReqHeaders == nil { + routingCtx.ReqHeaders = make(map[string]string) + } + routingCtx.ReqHeaders[HeaderRoutingStrategy] = string(tt.routingAlgo) + } resp, model, routingCtx, stream, term := server.HandleRequestBody( routingCtx, "test-request-id", diff --git a/pkg/plugins/gateway/gateway_req_headers.go b/pkg/plugins/gateway/gateway_req_headers.go index 2fdbf2f2a..fb84f8171 100644 --- a/pkg/plugins/gateway/gateway_req_headers.go +++ 
b/pkg/plugins/gateway/gateway_req_headers.go @@ -26,7 +26,6 @@ import ( extProcPb "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3" envoyTypePb "github.com/envoyproxy/go-control-plane/envoy/type/v3" - routing "github.com/vllm-project/aibrix/pkg/plugins/gateway/algorithms" "github.com/vllm-project/aibrix/pkg/types" "github.com/vllm-project/aibrix/pkg/utils" ) @@ -46,6 +45,7 @@ func (s *Server) HandleRequestHeaders(ctx context.Context, requestID string, req var err error var errRes *extProcPb.ProcessingResponse var routingCtx *types.RoutingContext + var reqConfigProfile string h := req.Request.(*extProcPb.ProcessingRequest_RequestHeaders) reqHeaders := map[string]string{} @@ -61,20 +61,13 @@ func (s *Server) HandleRequestHeaders(ctx context.Context, requestID string, req reqHeaders[n.Key] = string(n.RawValue) case contentTypeKey: reqHeaders[n.Key] = string(n.RawValue) + case HeaderRoutingStrategy: + reqHeaders[n.Key] = string(n.RawValue) + case HeaderConfigProfile: + reqConfigProfile = strings.TrimSpace(string(n.RawValue)) } } - routingStrategy, routingStrategyEnabled := getRoutingStrategy(h.RequestHeaders.Headers.Headers) - routingAlgorithm, ok := routing.Validate(routingStrategy) - if routingStrategyEnabled && !ok { - klog.ErrorS(nil, "incorrect routing strategy", "requestID", requestID, "routing-strategy", routingStrategy) - return generateErrorResponse( - envoyTypePb.StatusCode_BadRequest, - []*configPb.HeaderValueOption{{Header: &configPb.HeaderValue{ - Key: HeaderErrorInvalidRouting, RawValue: []byte(routingStrategy), - }}}, "incorrect routing strategy", "", "routing-strategy"), utils.User{}, rpm, routingCtx - } - if username != "" { user, err = utils.GetUser(ctx, utils.User{Name: username}, s.redisClient) if err != nil { @@ -94,9 +87,10 @@ func (s *Server) HandleRequestHeaders(ctx context.Context, requestID string, req } } - routingCtx = types.NewRoutingContext(ctx, routingAlgorithm, "", "", requestID, user.Name) + routingCtx = 
types.NewRoutingContext(ctx, "", "", "", requestID, user.Name) routingCtx.ReqPath = requestPath routingCtx.ReqHeaders = reqHeaders + routingCtx.ReqConfigProfile = reqConfigProfile headers := []*configPb.HeaderValueOption{} headers = append(headers, &configPb.HeaderValueOption{ diff --git a/pkg/plugins/gateway/gateway_req_headers_test.go b/pkg/plugins/gateway/gateway_req_headers_test.go index 4b6b5541a..bb9ed0764 100644 --- a/pkg/plugins/gateway/gateway_req_headers_test.go +++ b/pkg/plugins/gateway/gateway_req_headers_test.go @@ -59,7 +59,7 @@ func Test_handleRequestHeaders(t *testing.T) { // Define test cases for different routing and error scenarios tests := []testCase{ { - name: "not found strategy - should return error", + name: "invalid strategy - passes through to request body (validation deferred)", requestHeaders: []*configPb.HeaderValue{ { Key: HeaderRoutingStrategy, @@ -67,26 +67,23 @@ func Test_handleRequestHeaders(t *testing.T) { }, }, expected: testResponse{ - statusCode: envoyTypePb.StatusCode_BadRequest, + statusCode: envoyTypePb.StatusCode_OK, headers: []*configPb.HeaderValueOption{ - {Header: &configPb.HeaderValue{Key: HeaderErrorInvalidRouting, RawValue: []byte("not-found-strategy")}}, - {Header: &configPb.HeaderValue{Key: "Content-Type", Value: "application/json"}}, + {Header: &configPb.HeaderValue{Key: HeaderWentIntoReqHeaders, RawValue: []byte("true")}}, }, - routingCtx: nil, - user: utils.User{}, - rpm: 0, + routingCtx: &types.RoutingContext{ + ReqHeaders: map[string]string{HeaderRoutingStrategy: "not-found-strategy"}, + }, + user: utils.User{}, + rpm: 0, }, validate: func(t *testing.T, tt *testCase, resp *extProcPb.ProcessingResponse, user utils.User, routingCtx *types.RoutingContext, rpm int64) { - // Validate request headers info - assert.Equal(t, tt.expected.statusCode, resp.GetImmediateResponse().GetStatus().GetCode()) - assert.Equal(t, tt.expected.headers, resp.GetImmediateResponse().GetHeaders().GetSetHeaders()) + assert.Equal(t, 
tt.expected.statusCode, envoyTypePb.StatusCode_OK) + assert.Equal(t, tt.expected.headers, resp.GetRequestHeaders().GetResponse().GetHeaderMutation().GetSetHeaders()) assert.Equal(t, tt.expected.user, user) - assert.Nil(t, routingCtx) + assert.NotNil(t, routingCtx) + assert.Equal(t, tt.expected.routingCtx.ReqHeaders, routingCtx.ReqHeaders) assert.Equal(t, tt.expected.rpm, rpm) - // Verify no special headers are set - for _, header := range resp.GetRequestHeaders().GetResponse().GetHeaderMutation().GetSetHeaders() { - assert.NotEqual(t, HeaderWentIntoReqHeaders, header.Header.Key) - } }, }, { @@ -151,13 +148,12 @@ func Test_handleRequestHeaders(t *testing.T) { }, routingCtx: &types.RoutingContext{ ReqPath: "test-path", - ReqHeaders: map[string]string{authorizationKey: "token:test-token"}, + ReqHeaders: map[string]string{authorizationKey: "token:test-token", HeaderRoutingStrategy: "random"}, }, user: utils.User{}, rpm: 0, }, validate: func(t *testing.T, tt *testCase, resp *extProcPb.ProcessingResponse, user utils.User, routingCtx *types.RoutingContext, rpm int64) { - // Validate request headers info assert.Equal(t, tt.expected.statusCode, envoyTypePb.StatusCode_OK) assert.Equal(t, tt.expected.headers, resp.GetRequestHeaders().GetResponse().GetHeaderMutation().GetSetHeaders()) assert.Equal(t, tt.expected.user, user) diff --git a/pkg/plugins/gateway/gateway_rsp_body.go b/pkg/plugins/gateway/gateway_rsp_body.go index 5cfb5f5f1..a10a1409d 100644 --- a/pkg/plugins/gateway/gateway_rsp_body.go +++ b/pkg/plugins/gateway/gateway_rsp_body.go @@ -21,6 +21,7 @@ import ( "context" "fmt" "io" + "math" "net/http" "strings" "time" @@ -133,13 +134,8 @@ func (s *Server) HandleResponseBody(ctx context.Context, requestID string, req * HeaderUpdateTPM, fmt.Sprintf("%d", tpm)) } - var targetPod *v1.Pod headers = buildEnvoyProxyHeaders(headers, HeaderRequestID, routerCtx.RequestID) - if routerCtx != nil && routerCtx.HasRouted() { - targetPod = routerCtx.TargetPod() - headers = 
buildEnvoyProxyHeaders(headers, HeaderTargetPod, routerCtx.TargetAddress()) - } - fields := s.requestEndHelper(routerCtx, targetPod, arrival, promptTokens, completionTokens, totalTokens) + fields := s.requestEndHelper(routerCtx, arrival, promptTokens, completionTokens, totalTokens) klog.InfoS("request_end", fields...) } else if b.ResponseBody.EndOfStream { complete = true @@ -234,10 +230,14 @@ func processLanguageResponse(requestID string, b *extProcPb.ProcessingRequest_Re return } -func (s *Server) requestEndHelper(routingCtx *types.RoutingContext, targetPod *v1.Pod, arrival time.Time, +func (s *Server) requestEndHelper(routingCtx *types.RoutingContext, arrival time.Time, promptTokens, completionTokens, totalTokens int64) []interface{} { requestID := routingCtx.RequestID model := routingCtx.Model + var targetPod *v1.Pod + if routingCtx.HasRouted() { + targetPod = routingCtx.TargetPod() + } fields := []interface{}{ "request_id", requestID, @@ -248,19 +248,20 @@ func (s *Server) requestEndHelper(routingCtx *types.RoutingContext, targetPod *v } pBucket := tokenBucketLabel(promptTokens) cBucket := tokenBucketLabel(completionTokens) - metrics.EmitCounterMetric(routingCtx, targetPod, metrics.GatewayPromptTokenBucketTotal, 1.0, map[string]string{"bucket": pBucket}) - metrics.EmitCounterMetric(routingCtx, targetPod, metrics.GatewayCompletionTokenBucketTotal, 1.0, map[string]string{"bucket": cBucket}) + metrics.EmitMetricToPrometheus(routingCtx, targetPod, metrics.GatewayPromptTokenBucketTotal, &metrics.SimpleMetricValue{Value: 1.0}, map[string]string{"bucket": pBucket}) + metrics.EmitMetricToPrometheus(routingCtx, targetPod, metrics.GatewayCompletionTokenBucketTotal, &metrics.SimpleMetricValue{Value: 1.0}, map[string]string{"bucket": cBucket}) if targetPod != nil { + outstandingRequestCount := math.Max(0, getRunningRequestsByPod(s, targetPod.Name, targetPod.Namespace)-1) fields = append(fields, "target_pod", targetPod.Name, - "outstanding_request_count", 
getRunningRequestsByPod(s, targetPod.Name, targetPod.Namespace)) + "outstanding_request_count", outstandingRequestCount) } ttft := arrival.Sub(routingCtx.RequestTime) if routingCtx.Stream { ttftBucket := durationBucketLabel(ttft) - metrics.EmitCounterMetric(routingCtx, targetPod, metrics.GatewayTTFTBucketTotal, 1.0, map[string]string{"bucket": ttftBucket}) + metrics.EmitMetricToPrometheus(routingCtx, targetPod, metrics.GatewayTTFTBucketTotal, &metrics.SimpleMetricValue{Value: 1.0}, map[string]string{"bucket": ttftBucket}) } if routingCtx.Algorithm == "pd" { @@ -275,12 +276,12 @@ func (s *Server) requestEndHelper(routingCtx *types.RoutingContext, targetPod *v "ttft", ttft, "decode_time_taken", decodeTime, ) - metrics.EmitCounterMetric(routingCtx, targetPod, metrics.GatewayRoutingTimeBucketTotal, 1.0, map[string]string{"bucket": durationBucketLabel(routingTime)}) - metrics.EmitCounterMetric(routingCtx, targetPod, metrics.GatewayPrefillTimeBucketTotal, 1.0, map[string]string{"bucket": durationBucketLabel(prefillTime)}) - metrics.EmitCounterMetric(routingCtx, targetPod, metrics.GatewayKVTransferTimeBucketTotal, 1.0, map[string]string{"bucket": durationBucketLabel(kvTransferTime)}) - metrics.EmitCounterMetric(routingCtx, targetPod, metrics.GatewayDecodeTimeBucketTotal, 1.0, map[string]string{"bucket": durationBucketLabel(decodeTime)}) + metrics.EmitMetricToPrometheus(routingCtx, targetPod, metrics.GatewayRoutingTimeBucketTotal, &metrics.SimpleMetricValue{Value: 1.0}, map[string]string{"bucket": durationBucketLabel(routingTime)}) + metrics.EmitMetricToPrometheus(routingCtx, targetPod, metrics.GatewayPrefillTimeBucketTotal, &metrics.SimpleMetricValue{Value: 1.0}, map[string]string{"bucket": durationBucketLabel(prefillTime)}) + metrics.EmitMetricToPrometheus(routingCtx, targetPod, metrics.GatewayKVTransferTimeBucketTotal, &metrics.SimpleMetricValue{Value: 1.0}, map[string]string{"bucket": durationBucketLabel(kvTransferTime)}) + metrics.EmitMetricToPrometheus(routingCtx, 
targetPod, metrics.GatewayDecodeTimeBucketTotal, &metrics.SimpleMetricValue{Value: 1.0}, map[string]string{"bucket": durationBucketLabel(decodeTime)}) if ttft > ttftThreshold { - metrics.EmitCounterMetric(routingCtx, nil, metrics.GatewayFirstTokenDelayOver1sTotal, 1.0, map[string]string{ + metrics.EmitMetricToPrometheus(routingCtx, nil, metrics.GatewayFirstTokenDelayOver1sTotal, &metrics.SimpleMetricValue{Value: 1.0}, map[string]string{ "request_id": requestID, "p_bucket": pBucket, "c_bucket": cBucket, "routing_time_taken": fmt.Sprintf("%v", routingTime), @@ -290,15 +291,11 @@ func (s *Server) requestEndHelper(routingCtx *types.RoutingContext, targetPod *v "decode_time_taken": fmt.Sprintf("%v", decodeTime), }) } - } else { - fields = append(fields, - "routing_time_taken", routingCtx.RequestEndTime.Sub(routingCtx.RequestTime), - ) + } else if routingCtx.Algorithm != "" { + fields = append(fields, "routing_time_taken", routingCtx.GetRoutingDelay()) } fields = append(fields, "total_time_taken", routingCtx.Elapsed(time.Now())) - metrics.EmitCounterMetric(routingCtx, targetPod, metrics.GatewayTotalTimeBucketTotal, 1.0, map[string]string{ - "bucket": durationBucketLabel(routingCtx.Elapsed(time.Now())), - }) + metrics.EmitMetricToPrometheus(routingCtx, targetPod, metrics.GatewayTotalTimeBucketTotal, &metrics.SimpleMetricValue{Value: 1.0}, map[string]string{"bucket": durationBucketLabel(routingCtx.Elapsed(time.Now()))}) return fields } diff --git a/pkg/plugins/gateway/gateway_rsp_body_test.go b/pkg/plugins/gateway/gateway_rsp_body_test.go new file mode 100644 index 000000000..39ab39c92 --- /dev/null +++ b/pkg/plugins/gateway/gateway_rsp_body_test.go @@ -0,0 +1,516 @@ +/* +Copyright 2024 The Aibrix Team. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package gateway + +import ( + "context" + "errors" + "testing" + "time" + + extProcPb "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/mock" + "github.com/vllm-project/aibrix/pkg/cache" + "github.com/vllm-project/aibrix/pkg/types" + "github.com/vllm-project/aibrix/pkg/utils" +) + +// mockRateLimiter implements ratelimiter.RateLimiter for testing +type mockRateLimiter struct { + mock.Mock +} + +func (m *mockRateLimiter) Get(ctx context.Context, key string) (int64, error) { + args := m.Called(ctx, key) + return args.Get(0).(int64), args.Error(1) +} + +func (m *mockRateLimiter) GetLimit(ctx context.Context, key string) (int64, error) { + args := m.Called(ctx, key) + return args.Get(0).(int64), args.Error(1) +} + +func (m *mockRateLimiter) Incr(ctx context.Context, key string, val int64) (int64, error) { + args := m.Called(ctx, key, val) + return args.Get(0).(int64), args.Error(1) +} + +func TestIsLanguageRequest(t *testing.T) { + tests := []struct { + name string + requestPath string + want bool + }{ + { + name: "chat completions is language", + requestPath: "/v1/chat/completions", + want: true, + }, + { + name: "completions is language", + requestPath: "/v1/completions", + want: true, + }, + { + name: "embeddings is language", + requestPath: "/v1/embeddings", + want: true, + }, + { + name: "images generations is not language", + requestPath: "/v1/images/generations", + want: false, + }, + { + name: "video generations is not language", + requestPath: "/v1/video/generations", + 
want: false, + }, + { + name: "audio transcriptions is not language", + requestPath: "/v1/audio/transcriptions", + want: false, + }, + { + name: "audio translations is not language", + requestPath: "/v1/audio/translations", + want: false, + }, + { + name: "empty path is language", + requestPath: "", + want: true, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := isLanguageRequest(tt.requestPath) + assert.Equal(t, tt.want, got) + }) + } +} + +func TestTokenBucketLabel(t *testing.T) { + tests := []struct { + name string + tokens int64 + want string + }{ + {"zero", 0, "0-256"}, + {"small", 100, "0-256"}, + {"boundary 256", 256, "256-512"}, + {"mid range", 500, "256-512"}, + {"boundary 512", 512, "512-1024"}, + {"1024", 1024, "1024-2048"}, + {"2048", 2048, "2048-4096"}, + {"4096", 4096, "4096-8192"}, + {"8192", 8192, "8192-16384"}, + {"16384", 16384, "16384-32768"}, + {"32768", 32768, "32768+"}, + {"large", 100000, "32768+"}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := tokenBucketLabel(tt.tokens) + assert.Equal(t, tt.want, got) + }) + } +} + +func TestDurationBucketLabel(t *testing.T) { + tests := []struct { + name string + d time.Duration + want string + }{ + {"zero", 0, "0-1ms"}, + {"sub millisecond", 500 * time.Microsecond, "0-1ms"}, + {"1ms", time.Millisecond, "1-2ms"}, + {"2ms", 2 * time.Millisecond, "2-5ms"}, + {"5ms", 5 * time.Millisecond, "5-10ms"}, + {"10ms", 10 * time.Millisecond, "10-20ms"}, + {"50ms", 50 * time.Millisecond, "50-100ms"}, + {"100ms", 100 * time.Millisecond, "100-200ms"}, + {"500ms", 500 * time.Millisecond, "500-1000ms"}, + {"1s", time.Second, "1000-2000ms"}, + {"5s", 5 * time.Second, "5000ms+"}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := durationBucketLabel(tt.d) + assert.Equal(t, tt.want, got) + }) + } +} + +func TestProcessLanguageResponse_PartialChunk(t *testing.T) { + requestID := "test-partial-" + time.Now().Format("150405.000") + 
body := []byte(`{"model": "test-model", "usage": {"prompt_tokens": 10, "completion_tokens": 5, "total_tokens": 15}}`) + + req := &extProcPb.ProcessingRequest_ResponseBody{ + ResponseBody: &extProcPb.HttpBody{ + Body: body, + EndOfStream: false, + }, + } + + res, complete, promptTokens, completionTokens, totalTokens := processLanguageResponse(requestID, req) + + assert.False(t, complete) + assert.Equal(t, int64(0), promptTokens) + assert.Equal(t, int64(0), completionTokens) + assert.Equal(t, int64(0), totalTokens) + assert.NotNil(t, res) + assert.NotNil(t, res.GetResponseBody()) + assert.NotNil(t, res.GetResponseBody().GetResponse()) +} + +func TestProcessLanguageResponse_ValidFullResponse(t *testing.T) { + requestID := "test-valid-" + time.Now().Format("150405.000") + body := []byte(`{"model": "test-model", "usage": {"prompt_tokens": 10, "completion_tokens": 5, "total_tokens": 15}}`) + + req := &extProcPb.ProcessingRequest_ResponseBody{ + ResponseBody: &extProcPb.HttpBody{ + Body: body, + EndOfStream: true, + }, + } + + res, complete, promptTokens, completionTokens, totalTokens := processLanguageResponse(requestID, req) + + // processLanguageResponse returns complete=false for valid case (no early return) + assert.False(t, complete) + assert.Equal(t, int64(10), promptTokens) + assert.Equal(t, int64(5), completionTokens) + assert.Equal(t, int64(15), totalTokens) + assert.Nil(t, res) // No error response for valid case +} + +func TestProcessLanguageResponse_InvalidJSON(t *testing.T) { + requestID := "test-invalid-json-" + time.Now().Format("150405.000") + body := []byte(`{invalid json}`) + + req := &extProcPb.ProcessingRequest_ResponseBody{ + ResponseBody: &extProcPb.HttpBody{ + Body: body, + EndOfStream: true, + }, + } + + res, complete, _, _, _ := processLanguageResponse(requestID, req) + + assert.True(t, complete) + assert.NotNil(t, res) + // buildErrorResponse returns ImmediateResponse, not ResponseBody + immResp := res.GetImmediateResponse() + assert.NotNil(t, 
immResp) + headers := immResp.GetHeaders().GetSetHeaders() + found := false + for _, h := range headers { + if h.Header.Key == HeaderErrorResponseUnmarshal { + found = true + break + } + } + assert.True(t, found, "expected HeaderErrorResponseUnmarshal in response") +} + +func TestProcessLanguageResponse_EmptyModel(t *testing.T) { + requestID := "test-empty-model-" + time.Now().Format("150405.000") + body := []byte(`{"model": "", "usage": {"prompt_tokens": 10, "completion_tokens": 5, "total_tokens": 15}}`) + + req := &extProcPb.ProcessingRequest_ResponseBody{ + ResponseBody: &extProcPb.HttpBody{ + Body: body, + EndOfStream: true, + }, + } + + res, complete, _, _, _ := processLanguageResponse(requestID, req) + + assert.True(t, complete) + assert.NotNil(t, res) + immResp := res.GetImmediateResponse() + assert.NotNil(t, immResp) + headers := immResp.GetHeaders().GetSetHeaders() + found := false + for _, h := range headers { + if h.Header.Key == HeaderErrorResponseUnknown { + found = true + break + } + } + assert.True(t, found, "expected HeaderErrorResponseUnknown in response") +} + +func TestProcessLanguageResponse_ChunkedAccumulation(t *testing.T) { + requestID := "test-chunked-" + time.Now().Format("150405.000") + + // First chunk - partial + chunk1 := &extProcPb.ProcessingRequest_ResponseBody{ + ResponseBody: &extProcPb.HttpBody{ + Body: []byte(`{"model": "test-model", "usage": {"prompt_tokens": `), + EndOfStream: false, + }, + } + res1, complete1, _, _, _ := processLanguageResponse(requestID, chunk1) + assert.False(t, complete1) + assert.NotNil(t, res1) + + // Second chunk - complete + chunk2 := &extProcPb.ProcessingRequest_ResponseBody{ + ResponseBody: &extProcPb.HttpBody{ + Body: []byte(`10, "completion_tokens": 5, "total_tokens": 15}}`), + EndOfStream: true, + }, + } + _, complete2, promptTokens, completionTokens, totalTokens := processLanguageResponse(requestID, chunk2) + assert.False(t, complete2) // processLanguageResponse returns complete=false for valid 
case + assert.Equal(t, int64(10), promptTokens) + assert.Equal(t, int64(5), completionTokens) + assert.Equal(t, int64(15), totalTokens) +} + +func TestHandleResponseBody_NonStreamNoTokens(t *testing.T) { + mockCache := &MockCache{Cache: cache.NewForTest()} + // DoneRequestTrace(ctx, requestID, model, inputTokens, outputTokens, traceTerm) + mockCache.On("DoneRequestTrace", mock.Anything, "test-req-id", "test-model", int64(10), int64(5), int64(0)).Maybe() + + server := &Server{ + cache: mockCache, + } + + routerCtx := types.NewRoutingContext(context.Background(), "random", "test-model", "", "test-req-id", "") + routerCtx.ReqPath = PathChatCompletions + routerCtx.RequestTime = time.Now() + + req := &extProcPb.ProcessingRequest{ + Request: &extProcPb.ProcessingRequest_ResponseBody{ + ResponseBody: &extProcPb.HttpBody{ + Body: []byte(`{"model": "test-model", "usage": {"prompt_tokens": 10, "completion_tokens": 5, "total_tokens": 15}}`), + EndOfStream: true, + }, + }, + } + + resp, complete := server.HandleResponseBody(routerCtx, "test-req-id", req, utils.User{}, 0, "test-model", false, 0, false) + + assert.True(t, complete) + assert.NotNil(t, resp) + assert.NotNil(t, resp.GetResponseBody()) +} + +func TestHandleResponseBody_WithUserAndTPM(t *testing.T) { + mockCache := &MockCache{Cache: cache.NewForTest()} + // DoneRequestTrace(ctx, requestID, model, term, inputTokens, outputTokens) - use mock.Anything for dynamic requestID + mockCache.On("DoneRequestTrace", mock.Anything, mock.Anything, "test-model", int64(10), int64(5), int64(0)).Maybe() + + mockRL := &mockRateLimiter{} + mockRL.On("Incr", mock.Anything, "test-user_TPM_CURRENT", int64(15)).Return(int64(100), nil) + + server := &Server{ + cache: mockCache, + ratelimiter: mockRL, + } + + requestID := "test-req-tpm-" + time.Now().Format("150405.000") + routerCtx := types.NewRoutingContext(context.Background(), "random", "test-model", "", requestID, "test-user") + routerCtx.ReqPath = PathChatCompletions + 
routerCtx.RequestTime = time.Now() + + req := &extProcPb.ProcessingRequest{ + Request: &extProcPb.ProcessingRequest_ResponseBody{ + ResponseBody: &extProcPb.HttpBody{ + Body: []byte(`{"model": "test-model", "usage": {"prompt_tokens": 10, "completion_tokens": 5, "total_tokens": 15}}`), + EndOfStream: true, + }, + }, + } + + resp, complete := server.HandleResponseBody(routerCtx, requestID, req, utils.User{Name: "test-user"}, 42, "test-model", false, 0, false) + + assert.True(t, complete) + assert.NotNil(t, resp) + headers := resp.GetResponseBody().GetResponse().GetHeaderMutation().GetSetHeaders() + foundTPM := false + foundRPM := false + foundReqID := false + for _, h := range headers { + switch h.Header.Key { + case HeaderUpdateTPM: + foundTPM = true + assert.Equal(t, []byte("100"), h.Header.RawValue) + case HeaderUpdateRPM: + foundRPM = true + assert.Equal(t, []byte("42"), h.Header.RawValue) + case HeaderRequestID: + foundReqID = true + assert.Equal(t, []byte(requestID), h.Header.RawValue) + } + } + assert.True(t, foundTPM, "expected HeaderUpdateTPM in response") + assert.True(t, foundRPM, "expected HeaderUpdateRPM in response") + assert.True(t, foundReqID, "expected request-id in response") + mockRL.AssertExpectations(t) +} + +func TestHandleResponseBody_NonLanguageRequest(t *testing.T) { + mockCache := &MockCache{Cache: cache.NewForTest()} + // Non-language request: no tokens from processLanguageResponse, EndOfStream triggers complete + mockCache.On("DoneRequestTrace", mock.Anything, "test-req-id", "test-model", int64(0), int64(0), int64(0)).Maybe() + + server := &Server{ + cache: mockCache, + } + + routerCtx := types.NewRoutingContext(context.Background(), "random", "test-model", "", "test-req-id", "") + routerCtx.ReqPath = "/v1/images/generations" + routerCtx.RequestTime = time.Now() + + req := &extProcPb.ProcessingRequest{ + Request: &extProcPb.ProcessingRequest_ResponseBody{ + ResponseBody: &extProcPb.HttpBody{ + Body: []byte(`{"model": "test-model"}`), + 
EndOfStream: true, + }, + }, + } + + resp, complete := server.HandleResponseBody(routerCtx, "test-req-id", req, utils.User{}, 0, "test-model", false, 0, false) + + // Non-language request with EndOfStream sets complete=true + assert.True(t, complete) + assert.NotNil(t, resp) +} + +func TestHandleResponseBody_EndOfStreamNoTokens(t *testing.T) { + mockCache := &MockCache{Cache: cache.NewForTest()} + // Body {} parses but has empty model - returns error, DoneRequestTrace called with 0,0,0 + mockCache.On("DoneRequestTrace", mock.Anything, "test-req-id", "test-model", int64(0), int64(0), int64(0)).Maybe() + + server := &Server{ + cache: mockCache, + } + + routerCtx := types.NewRoutingContext(context.Background(), "random", "test-model", "", "test-req-id", "") + routerCtx.ReqPath = PathChatCompletions + routerCtx.RequestTime = time.Now() + + req := &extProcPb.ProcessingRequest{ + Request: &extProcPb.ProcessingRequest_ResponseBody{ + ResponseBody: &extProcPb.HttpBody{ + Body: []byte(`{}`), + EndOfStream: true, + }, + }, + } + + resp, complete := server.HandleResponseBody(routerCtx, "test-req-id", req, utils.User{}, 0, "test-model", false, 0, false) + + assert.True(t, complete) + assert.NotNil(t, resp) +} + +func TestHandleResponseBody_TPMIncrError(t *testing.T) { + mockCache := &MockCache{Cache: cache.NewForTest()} + mockCache.On("DoneRequestTrace", mock.Anything, mock.Anything, "test-model", int64(10), int64(5), int64(0)).Maybe() + + mockRL := &mockRateLimiter{} + mockRL.On("Incr", mock.Anything, "test-user_TPM_CURRENT", int64(15)).Return(int64(0), errors.New("mock error")) + server := &Server{ + cache: mockCache, + ratelimiter: mockRL, + } + + requestID := "test-req-tpm-err-" + time.Now().Format("150405.000") + routerCtx := types.NewRoutingContext(context.Background(), "random", "test-model", "", requestID, "test-user") + routerCtx.ReqPath = PathChatCompletions + routerCtx.RequestTime = time.Now() + + req := &extProcPb.ProcessingRequest{ + Request: 
&extProcPb.ProcessingRequest_ResponseBody{ + ResponseBody: &extProcPb.HttpBody{ + Body: []byte(`{"model": "test-model", "usage": {"prompt_tokens": 10, "completion_tokens": 5, "total_tokens": 15}}`), + EndOfStream: true, + }, + }, + } + + resp, complete := server.HandleResponseBody(routerCtx, requestID, req, utils.User{Name: "test-user"}, 0, "test-model", false, 0, false) + + assert.True(t, complete) + assert.NotNil(t, resp) + // Error response uses ImmediateResponse + immResp := resp.GetImmediateResponse() + assert.NotNil(t, immResp) + headers := immResp.GetHeaders().GetSetHeaders() + found := false + for _, h := range headers { + if h.Header.Key == HeaderErrorIncrTPM { + found = true + break + } + } + assert.True(t, found, "expected HeaderErrorIncrTPM in response") + mockRL.AssertExpectations(t) +} + +func TestHandleResponseBody_LanguagePartialResponse(t *testing.T) { + mockCache := &MockCache{Cache: cache.NewForTest()} + server := &Server{cache: mockCache} + + routerCtx := types.NewRoutingContext(context.Background(), "random", "m", "", "rid-partial", "") + routerCtx.ReqPath = PathChatCompletions + routerCtx.RequestTime = time.Now() + + req := &extProcPb.ProcessingRequest{ + Request: &extProcPb.ProcessingRequest_ResponseBody{ + ResponseBody: &extProcPb.HttpBody{ + Body: []byte(`{"model":"m","usage":{"prompt_tokens":1}}`), + EndOfStream: false, + }, + }, + } + + resp, complete := server.HandleResponseBody(routerCtx, "rid-partial", req, utils.User{}, 0, "m", false, 0, false) + assert.False(t, complete) + assert.NotNil(t, resp) + assert.NotNil(t, resp.GetResponseBody().GetResponse()) +} + +func TestHandleResponseBody_DoesNotDuplicateTrace(t *testing.T) { + mockCache := &MockCache{Cache: cache.NewForTest()} + server := &Server{cache: mockCache} + + routerCtx := types.NewRoutingContext(context.Background(), "random", "m", "", "rid", "") + routerCtx.ReqPath = PathChatCompletions + routerCtx.RequestTime = time.Now() + + req := &extProcPb.ProcessingRequest{ + Request: 
&extProcPb.ProcessingRequest_ResponseBody{ + ResponseBody: &extProcPb.HttpBody{ + Body: []byte(`{"model":"m","usage":{"prompt_tokens":10,"completion_tokens":5,"total_tokens":15}}`), + EndOfStream: true, + }, + }, + } + + _, complete := server.HandleResponseBody(routerCtx, "rid", req, utils.User{}, 0, "m", false, 0, true) + assert.True(t, complete) + mockCache.AssertNotCalled(t, "DoneRequestTrace", mock.Anything, mock.Anything, mock.Anything, mock.Anything, mock.Anything, mock.Anything) +} diff --git a/pkg/plugins/gateway/gateway_rsp_headers.go b/pkg/plugins/gateway/gateway_rsp_headers.go index 9d88ee8db..68dd0f6ef 100644 --- a/pkg/plugins/gateway/gateway_rsp_headers.go +++ b/pkg/plugins/gateway/gateway_rsp_headers.go @@ -44,7 +44,10 @@ func (s *Server) HandleResponseHeaders(ctx context.Context, requestID string, mo headers := []*configPb.HeaderValueOption{} headers = buildEnvoyProxyHeaders(headers, HeaderWentIntoReqHeaders, "true", HeaderRequestID, requestID) if routerCtx != nil && routerCtx.HasRouted() { - headers = buildEnvoyProxyHeaders(headers, HeaderTargetPod, routerCtx.TargetAddress()) + headers = buildEnvoyProxyHeaders(headers, + HeaderRoutingStrategy, string(routerCtx.Algorithm), + HeaderTargetPod, routerCtx.TargetPod().Name, + HeaderTargetPodIP, routerCtx.TargetAddress()) } if routerCtx != nil && routerCtx.RespHeaders != nil { diff --git a/pkg/plugins/gateway/gateway_rsp_headers_test.go b/pkg/plugins/gateway/gateway_rsp_headers_test.go index 41c68fe12..8c945baa5 100644 --- a/pkg/plugins/gateway/gateway_rsp_headers_test.go +++ b/pkg/plugins/gateway/gateway_rsp_headers_test.go @@ -84,7 +84,9 @@ func Test_HandleResponseHeaders(t *testing.T) { headers: []*configPb.HeaderValueOption{ {Header: &configPb.HeaderValue{Key: HeaderWentIntoReqHeaders, RawValue: []byte("true")}}, {Header: &configPb.HeaderValue{Key: HeaderRequestID, RawValue: []byte("test-req-id")}}, - {Header: &configPb.HeaderValue{Key: HeaderTargetPod, RawValue: []byte("10.0.0.1:8000")}}, + {Header: 
&configPb.HeaderValue{Key: "routing-strategy", RawValue: []byte("random")}}, + {Header: &configPb.HeaderValue{Key: HeaderTargetPod, RawValue: []byte("test-pod")}}, + {Header: &configPb.HeaderValue{Key: HeaderTargetPodIP, RawValue: []byte("10.0.0.1:8000")}}, {Header: &configPb.HeaderValue{Key: "X-Custom", RawValue: []byte("value")}}, {Header: &configPb.HeaderValue{Key: ":status", RawValue: []byte("200")}}, }, @@ -100,7 +102,9 @@ func Test_HandleResponseHeaders(t *testing.T) { headers: []*configPb.HeaderValueOption{ {Header: &configPb.HeaderValue{Key: HeaderWentIntoReqHeaders, RawValue: []byte("true")}}, {Header: &configPb.HeaderValue{Key: HeaderRequestID, RawValue: []byte("test-req-id")}}, - {Header: &configPb.HeaderValue{Key: HeaderTargetPod, RawValue: []byte("10.0.0.1:8000")}}, + {Header: &configPb.HeaderValue{Key: "routing-strategy", RawValue: []byte("random")}}, + {Header: &configPb.HeaderValue{Key: HeaderTargetPod, RawValue: []byte("test-pod")}}, + {Header: &configPb.HeaderValue{Key: HeaderTargetPodIP, RawValue: []byte("10.0.0.1:8000")}}, {Header: &configPb.HeaderValue{Key: ":status", RawValue: []byte("500")}}, }, }, diff --git a/pkg/plugins/gateway/gateway_test.go b/pkg/plugins/gateway/gateway_test.go index 000dfe255..123110087 100644 --- a/pkg/plugins/gateway/gateway_test.go +++ b/pkg/plugins/gateway/gateway_test.go @@ -21,7 +21,6 @@ import ( "encoding/json" "errors" "fmt" - "os" "testing" configPb "github.com/envoyproxy/go-control-plane/envoy/config/core/v3" @@ -78,70 +77,6 @@ func Test_ValidateRoutingStrategy(t *testing.T) { } } -func TestGetRoutingStrategy(t *testing.T) { - var tests = []struct { - headers []*configPb.HeaderValue - setEnvRoutingStrategy bool - envRoutingStrategy string - expectedStrategy string - expectedEnabled bool - message string - }{ - { - headers: []*configPb.HeaderValue{}, - setEnvRoutingStrategy: false, - expectedStrategy: "", - expectedEnabled: false, - message: "no routing strategy in headers or environment variable", - }, - { - 
headers: []*configPb.HeaderValue{ - {Key: "routing-strategy", RawValue: []byte("random")}, - }, - setEnvRoutingStrategy: false, - expectedStrategy: "random", - expectedEnabled: true, - message: "routing strategy from headers", - }, - { - headers: []*configPb.HeaderValue{}, - setEnvRoutingStrategy: true, - envRoutingStrategy: "random", - expectedStrategy: "random", - expectedEnabled: true, - message: "routing strategy from environment variable", - }, - { - headers: []*configPb.HeaderValue{ - {Key: "routing-strategy", RawValue: []byte("random")}, - }, - setEnvRoutingStrategy: true, - envRoutingStrategy: "least-request", - expectedStrategy: "random", - expectedEnabled: true, - message: "header routing strategy takes priority over environment variable", - }, - } - - for _, tt := range tests { - if tt.setEnvRoutingStrategy { - _ = os.Setenv("ROUTING_ALGORITHM", tt.envRoutingStrategy) - } else { - _ = os.Unsetenv("ROUTING_ALGORITHM") - } - - // refresh default values, the process won't modify this environment variable during normal running - defaultRoutingStrategy, defaultRoutingStrategyEnabled = utils.LookupEnv(EnvRoutingAlgorithm) - - routingStrategy, enabled := getRoutingStrategy(tt.headers) - assert.Equal(t, tt.expectedStrategy, routingStrategy, tt.message) - assert.Equal(t, tt.expectedEnabled, enabled, tt.message) - - // Cleanup environment variable for next test - _ = os.Unsetenv("ROUTING_ALGORITHM") - } -} - func Test_buildEnvoyProxyHeaders(t *testing.T) { headers := []*configPb.HeaderValueOption{} diff --git a/pkg/plugins/gateway/gateway_test_helpers.go b/pkg/plugins/gateway/gateway_test_helpers.go index 20941c89b..e37d74997 100644 --- a/pkg/plugins/gateway/gateway_test_helpers.go +++ b/pkg/plugins/gateway/gateway_test_helpers.go @@ -71,8 +71,8 @@ func (m *MockCache) DoneRequestCount(ctx *types.RoutingContext, requestID string m.Called(ctx, requestID, model, term) } -func (m *MockCache) DoneRequestTrace(ctx *types.RoutingContext, requestID string, model string, 
term int64, inputTokens int64, outputTokens int64) { - m.Called(ctx, requestID, model, term, inputTokens, outputTokens) +func (m *MockCache) DoneRequestTrace(ctx *types.RoutingContext, requestID string, model string, inputTokens int64, outputTokens int64, traceTerm int64) { + m.Called(ctx, requestID, model, inputTokens, outputTokens, traceTerm) } func (m *MockCache) AddSubscriber(subscriber metrics.MetricSubscriber) { diff --git a/pkg/plugins/gateway/types.go b/pkg/plugins/gateway/types.go index 1be2f1067..bea058d46 100644 --- a/pkg/plugins/gateway/types.go +++ b/pkg/plugins/gateway/types.go @@ -45,11 +45,13 @@ const ( // Request & Target Headers HeaderWentIntoReqHeaders = "x-went-into-req-headers" + HeaderTargetPodIP = "target-pod-ip" HeaderTargetPod = "target-pod" HeaderRoutingStrategy = "routing-strategy" HeaderRequestID = "request-id" HeaderModel = "model" HeaderExternalFilter = "external-filter" + HeaderConfigProfile = "config-profile" // RPM & TPM Update Errors HeaderUpdateTPM = "x-update-tpm" diff --git a/pkg/plugins/gateway/util.go b/pkg/plugins/gateway/util.go index a0632d0b0..d0bf54626 100644 --- a/pkg/plugins/gateway/util.go +++ b/pkg/plugins/gateway/util.go @@ -33,9 +33,11 @@ import ( envoyTypePb "github.com/envoyproxy/go-control-plane/envoy/type/v3" "github.com/openai/openai-go" "github.com/openai/openai-go/packages/param" - "k8s.io/klog/v2" - + "github.com/vllm-project/aibrix/pkg/plugins/gateway/configprofiles" + "github.com/vllm-project/aibrix/pkg/types" "github.com/vllm-project/aibrix/pkg/utils" + v1 "k8s.io/api/core/v1" + "k8s.io/klog/v2" ) var ( @@ -316,19 +318,47 @@ func validateStreamOptions(requestID string, user utils.User, stream *bool, stre return nil } +// applyConfigProfile resolves the model config from pod annotation (model.aibrix.ai/config) +// and applies the selected profile: sets ConfigProfile on routingCtx. +// - If the client provides config-profile, use that profile name. 
+// - If not provided or not found, fall back to defaultProfile (or "default") in the JSON. +func applyConfigProfile(routingCtx *types.RoutingContext, pods []*v1.Pod) { + headerProfile := routingCtx.ReqConfigProfile + profile := configprofiles.ResolveProfile(pods, headerProfile) + if profile == nil { + return + } + routingCtx.ConfigProfile = &types.ResolvedConfigProfile{ + RoutingStrategy: profile.RoutingStrategy, + PromptLenBucketMinLength: profile.PromptLenBucketMinLength, + PromptLenBucketMaxLength: profile.PromptLenBucketMaxLength, + Combined: profile.Combined, + } +} + var defaultRoutingStrategy, defaultRoutingStrategyEnabled = utils.LookupEnv(EnvRoutingAlgorithm) -// getRoutingStrategy retrieves the routing strategy from the headers or environment variable -// It returns the routing strategy value and whether custom routing strategy is enabled. -func getRoutingStrategy(headers []*configPb.HeaderValue) (string, bool) { - // Check headers for routing strategy - for _, header := range headers { - if strings.ToLower(header.Key) == HeaderRoutingStrategy { - return string(header.RawValue), true +// deriveRoutingStrategyFromContext retrieves routing strategy from headers or resolved profile, falling back to env defaults. 
+func deriveRoutingStrategyFromContext(routingCtx *types.RoutingContext) (string, bool) { + // Check request headers (case-insensitive key match) + if routingCtx != nil && routingCtx.ReqHeaders != nil { + for k, v := range routingCtx.ReqHeaders { + if strings.ToLower(k) == HeaderRoutingStrategy { + if strings.TrimSpace(v) != "" { + return v, true + } + break + } } } - - // If header not set, use default routing strategy from environment variable + // Fallback to resolved profile on routing context + if routingCtx != nil && routingCtx.ConfigProfile != nil { + s := strings.TrimSpace(routingCtx.ConfigProfile.RoutingStrategy) + if s != "" { + return s, true + } + } + // Fallback to environment default return defaultRoutingStrategy, defaultRoutingStrategyEnabled } @@ -589,20 +619,13 @@ func validateTokenInputs(tokenArrays [][]int64) error { return nil } -func buildGatewayPodMetricLabels(model, status, statusCode string) ([]string, []string) { - labelNames := []string{ - "model", - "status", - "status_code", - "pod_name", - } - labelValues := []string{ - model, - status, - statusCode, - POD_NAME, +func buildGatewayPodMetricLabels(model, status, statusCode string) map[string]string { + return map[string]string{ + "model": GetModelTag(model), + "status": status, + "status_code": statusCode, + "pod_name": POD_NAME, } - return labelNames, labelValues } func GetModelTag(model string) string { diff --git a/pkg/types/router_context.go b/pkg/types/router_context.go index d994677c6..21d221b3a 100644 --- a/pkg/types/router_context.go +++ b/pkg/types/router_context.go @@ -41,6 +41,16 @@ const ( type RequestFeatures []float64 +// ResolvedConfigProfile holds the resolved model config profile for a request. +// Populated from model.aibrix.ai/config annotation based on config-profile header or defaultProfile. 
+// Nil when no config is present.
+type ResolvedConfigProfile struct {
+	RoutingStrategy          string
+	PromptLenBucketMinLength int
+	PromptLenBucketMaxLength int
+	Combined                 bool
+}
+
 // RoutingAlgorithm defines the routing algorithms
 type RoutingAlgorithm string
 
@@ -60,9 +70,10 @@ type RoutingContext struct {
 	TraceTerm    int64     // Trace term identifier, available after AddRequestCount call.
 	RoutedTime   time.Time // Time consumed during routing.
 
-	ReqHeaders map[string]string
-	ReqBody    []byte
-	ReqPath    string
+	ReqHeaders       map[string]string
+	ReqBody          []byte
+	ReqPath          string
+	ReqConfigProfile string
 
 	PrefillStartTime time.Time // Time when prefill request is started.
 	PrefillEndTime   time.Time // Time consumed during prefill.
@@ -74,6 +85,11 @@ type RoutingContext struct {
 	// during the Route() call.
 	RespHeaders map[string]string
 
+	// ConfigProfile holds the resolved model config profile for this request.
+	// Set in HandleRequestBody from model.aibrix.ai/config (annotation)
+	// based on config-profile header. Nil when no config is present.
+	ConfigProfile *ResolvedConfigProfile
+
 	targetPodSet chan struct{}
 	targetPod    atomic.Pointer[v1.Pod]
 	targetPort   atomic.Int32
@@ -306,12 +322,14 @@ func (r *RoutingContext) reset(ctx context.Context, algorithms RoutingAlgorithm,
 
 	r.ReqHeaders = map[string]string{}
 	r.ReqPath = ""
+	r.ReqConfigProfile = ""
 	r.ReqBody = []byte{}
 	r.PrefillStartTime = time.Time{}
 	r.PrefillEndTime = time.Time{}
 	// RoutedTime will not be reset, it must before ReqeustTime at this time.
r.RespHeaders = map[string]string{} + r.ConfigProfile = nil r.targetPodSet = make(chan struct{}) // Initialize channel r.targetPod.Store(nilPod) r.lastError.Store(nil) diff --git a/samples/disaggregation/sglang/pd-bucketing.yaml b/samples/disaggregation/sglang/pd-bucketing.yaml index f50ba3045..01db63dfb 100644 --- a/samples/disaggregation/sglang/pd-bucketing.yaml +++ b/samples/disaggregation/sglang/pd-bucketing.yaml @@ -25,13 +25,22 @@ spec: prometheus.io/scrape: "true" prometheus.io/port: "30000" prometheus.io/path: "/metrics" + model.aibrix.ai/config: | + { + "defaultProfile": "pd", + "profiles": { + "pd": { + "routingStrategy": "pd", + "promptLenBucketMinLength": 0, + "promptLenBucketMaxLength": 2048 + } + } + } labels: model.aibrix.ai/name: qwen3-8B model.aibrix.ai/port: "30000" model.aibrix.ai/metric-port: "30000" model.aibrix.ai/engine: sglang - prompt-min-length: "0" - prompt-max-length: "2048" spec: nodeSelector: kubernetes.io/hostname: 10.0.151.187 @@ -155,13 +164,22 @@ spec: prometheus.io/scrape: "true" prometheus.io/port: "30000" prometheus.io/path: "/metrics" + model.aibrix.ai/config: | + { + "defaultProfile": "pd", + "profiles": { + "pd": { + "routingStrategy": "pd", + "promptLenBucketMinLength": 0, + "promptLenBucketMaxLength": 2048 + } + } + } labels: model.aibrix.ai/name: qwen3-8B model.aibrix.ai/port: "30000" model.aibrix.ai/metric-port: "30000" model.aibrix.ai/engine: sglang - prompt-min-length: "0" - prompt-max-length: "2048" spec: nodeSelector: kubernetes.io/hostname: 10.0.151.187 @@ -304,13 +322,22 @@ spec: prometheus.io/scrape: "true" prometheus.io/port: "30000" prometheus.io/path: "/metrics" + model.aibrix.ai/config: | + { + "defaultProfile": "pd", + "profiles": { + "pd": { + "routingStrategy": "pd", + "promptLenBucketMinLength": 2049, + "promptLenBucketMaxLength": 4096 + } + } + } labels: model.aibrix.ai/name: qwen3-8B model.aibrix.ai/port: "30000" model.aibrix.ai/metric-port: "30000" model.aibrix.ai/engine: sglang - prompt-min-length: 
"2049"
-        prompt-max-length: "4096"
    spec:
      nodeSelector:
        kubernetes.io/hostname: 10.0.151.187
@@ -434,13 +461,22 @@ spec:
        prometheus.io/scrape: "true"
        prometheus.io/port: "30000"
        prometheus.io/path: "/metrics"
+        model.aibrix.ai/config: |
+          {
+            "defaultProfile": "pd",
+            "profiles": {
+              "pd": {
+                "routingStrategy": "pd",
+                "promptLenBucketMinLength": 2049,
+                "promptLenBucketMaxLength": 4096
+              }
+            }
+          }
      labels:
        model.aibrix.ai/name: qwen3-8B
        model.aibrix.ai/port: "30000"
        model.aibrix.ai/metric-port: "30000"
        model.aibrix.ai/engine: sglang
-        prompt-min-length: "2049"
-        prompt-max-length: "4096"
    spec:
      nodeSelector:
        kubernetes.io/hostname: 10.0.151.187
@@ -584,13 +620,22 @@ spec:
        prometheus.io/scrape: "true"
        prometheus.io/port: "30000"
        prometheus.io/path: "/metrics"
+        model.aibrix.ai/config: |
+          {
+            "defaultProfile": "pd",
+            "profiles": {
+              "pd": {
+                "routingStrategy": "pd",
+                "promptLenBucketMinLength": 0,
+                "combined": true
+              }
+            }
+          }
      labels:
        model.aibrix.ai/name: qwen3-8B
        model.aibrix.ai/port: "30000"
        model.aibrix.ai/metric-port: "30000"
        model.aibrix.ai/engine: sglang
-        prompt-min-length: "0"
-        model.aibrix.ai/combined: "true"
    spec:
      nodeSelector:
        kubernetes.io/hostname: 10.1.9.155
diff --git a/test/e2e/routing_config_profile_test.go b/test/e2e/routing_config_profile_test.go
new file mode 100644
index 000000000..5564698bd
--- /dev/null
+++ b/test/e2e/routing_config_profile_test.go
@@ -0,0 +1,89 @@
+/*
+Copyright 2025 The Aibrix Team.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/ + +package e2e + +import ( + "context" + "net/http" + "testing" + + "github.com/openai/openai-go" + "github.com/openai/openai-go/option" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +// TestConfigProfileRoutingStrategy verifies that passing config-profile as a header +// causes the gateway plugin to select the correct routing-strategy from the model's +// config (model.aibrix.ai/config annotation). +// +// The config is defined in development/app/config/mock/config-profile.yaml +// (deployment mock-qwen3-8b, annotation model.aibrix.ai/config): +// - defaultProfile: "least-request" +// - profiles: "least-request" (routingStrategy: least-request), "throughput" (routingStrategy: throughput) +// +// The gateway resolves config-profile header -> ResolveProfile -> deriveRoutingStrategyFromContext +// and sets routing-strategy in the response headers. +func TestConfigProfileRoutingStrategy(t *testing.T) { + msg := "config-profile routing test message" + + t.Run("no_config_profile_uses_default", func(t *testing.T) { + var dst *http.Response + client := createOpenAIClientWithConfigProfile(gatewayURL, apiKey, "", option.WithResponseInto(&dst)) + + _, err := client.Chat.Completions.New(context.TODO(), openai.ChatCompletionNewParams{ + Messages: []openai.ChatCompletionMessageParamUnion{openai.UserMessage(msg)}, + Model: modelNameQwen3, + }) + require.NoError(t, err) + + // No config-profile header -> defaultProfile "least-request" is used + got := dst.Header.Get("routing-strategy") + assert.Equal(t, "least-request", got, + "without config-profile header, gateway should use defaultProfile least-request") + }) + + t.Run("config_profile_least_request", func(t *testing.T) { + var dst *http.Response + client := createOpenAIClientWithConfigProfile(gatewayURL, apiKey, "least-request", option.WithResponseInto(&dst)) + + _, err := client.Chat.Completions.New(context.TODO(), openai.ChatCompletionNewParams{ + Messages: 
[]openai.ChatCompletionMessageParamUnion{openai.UserMessage(msg)}, + Model: modelNameQwen3, + }) + require.NoError(t, err) + + got := dst.Header.Get("routing-strategy") + assert.Equal(t, "least-request", got, + "config-profile: least-request should select routing-strategy least-request") + }) + + t.Run("config_profile_throughput", func(t *testing.T) { + var dst *http.Response + client := createOpenAIClientWithConfigProfile(gatewayURL, apiKey, "throughput", option.WithResponseInto(&dst)) + + _, err := client.Chat.Completions.New(context.TODO(), openai.ChatCompletionNewParams{ + Messages: []openai.ChatCompletionMessageParamUnion{openai.UserMessage(msg)}, + Model: modelNameQwen3, + }) + require.NoError(t, err) + + got := dst.Header.Get("routing-strategy") + assert.Equal(t, "throughput", got, + "config-profile: throughput should select routing-strategy throughput") + }) +} diff --git a/test/e2e/util.go b/test/e2e/util.go index 4419d9704..7f7f40664 100644 --- a/test/e2e/util.go +++ b/test/e2e/util.go @@ -41,11 +41,12 @@ import ( ) const ( - gatewayURL = "http://localhost:8888" - engineURL = "http://localhost:8000" - apiKey = "test-key-1234567890" - modelName = "llama2-7b" - namespace = "aibrix-system" + gatewayURL = "http://localhost:8888" + engineURL = "http://localhost:8000" + apiKey = "test-key-1234567890" + modelName = "llama2-7b" + modelNameQwen3 = "qwen3-8b" + namespace = "aibrix-system" ) func initializeClient(ctx context.Context, t *testing.T) (*kubernetes.Clientset, *v1alpha1.Clientset) { @@ -129,6 +130,36 @@ func createOpenAIClientWithRoutingStrategy(baseURL, apiKey, routingStrategy stri ) } +// createOpenAIClientWithConfigProfile creates a client that sends config-profile header. +// The gateway plugin selects routing-strategy from the model's config profile (model.aibrix.ai/config) +// based on this header, rather than from the routing-strategy header. 
+func createOpenAIClientWithConfigProfile(baseURL, apiKey, configProfile string, + respOpt option.RequestOption) openai.Client { + transport := &http.Transport{ + DisableKeepAlives: true, + MaxIdleConns: 0, + } + + opts := []option.RequestOption{ + option.WithBaseURL(baseURL), + option.WithAPIKey(apiKey), + option.WithHTTPClient(&http.Client{Transport: transport}), + option.WithMiddleware(func(r *http.Request, mn option.MiddlewareNext) (*http.Response, error) { + r.URL.Path = "/v1" + r.URL.Path + return mn(r) + }), + option.WithMaxRetries(0), + } + if configProfile != "" { + opts = append(opts, option.WithHeader("config-profile", configProfile)) + } + if respOpt != nil { + opts = append(opts, respOpt) + } + + return openai.NewClient(opts...) +} + func validateInference(t *testing.T, modelName string) { client := createOpenAIClient(gatewayURL, apiKey) validateInferenceWithClient(t, client, modelName)