Skip to content

Commit 6f89a3f

Browse files
committed
Merge remote-tracking branch 'upstream/main' into sync/upstream-ff5f8eab
2 parents 6a5d90e + af2b646 commit 6f89a3f

66 files changed

Lines changed: 1054 additions & 1047 deletions

File tree

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

deploy/grafana/inference_gateway.json

Lines changed: 35 additions & 35 deletions
Large diffs are not rendered by default.

docs/metrics.md

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -44,7 +44,7 @@ Metrics defined by llm-d Router are in addition to Inference Gateway metrics. Fo
4444
4545
## Opt-in ext_proc Stream Metrics
4646

47-
Three metrics covering ext_proc gRPC stream lifecycle. Disabled by default; enable with `--enable-grpc-stream-metrics`. All carry the `llm_d_router_epp_` prefix.
47+
Three metrics covering ext_proc gRPC stream lifecycle. Disabled by default; enable with `--enable-grpc-stream-metrics`. These metrics are emitted under the `llm_d_epp_` prefix (separate from `llm_d_inference_scheduler_*`).
4848

4949
### `extproc_streams_inflight`
5050

pkg/epp/flowcontrol/integration_test.go

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1122,15 +1122,15 @@ func TestFlowControlMetricsEmitted(t *testing.T) {
11221122
var queueSizeWhileQueued float64
11231123
var foundQueueSize bool
11241124
for _, f := range families {
1125-
if f.GetName() == "llm_d_router_epp_flow_control_queue_size" {
1125+
if f.GetName() == "llm_d_epp_flow_control_queue_size" {
11261126
foundQueueSize = true
11271127
for _, m := range f.GetMetric() {
11281128
queueSizeWhileQueued += m.GetGauge().GetValue()
11291129
}
11301130
}
11311131
}
11321132
require.True(t, foundQueueSize,
1133-
"llm_d_router_epp_flow_control_queue_size metric should exist")
1133+
"llm_d_epp_flow_control_queue_size metric should exist")
11341134
require.Greater(t, queueSizeWhileQueued, 0.0,
11351135
"queue_size should be > 0 while a request is actively queued")
11361136

pkg/epp/framework/plugins/flowcontrol/fairness/program-aware/README.md

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -63,7 +63,7 @@ A complete sample is shipped at [`deploy/config/sim-program-aware-config.yaml`](
6363

6464
## Observability
6565

66-
The plugin exports two shared collectors and one strategy-owned collector under the `llm_d_router_epp` Prometheus subsystem:
66+
The plugin exports two shared collectors and one strategy-owned collector under the `llm_d_epp` Prometheus subsystem:
6767

6868
| Metric | Type | Labels | Description |
6969
|---|---|---|---|

pkg/epp/framework/plugins/requestcontrol/dataproducer/approximateprefix/metrics.go

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -32,7 +32,7 @@ var (
3232
prometheus.GaugeOpts{
3333
Subsystem: eppmetrics.InferenceExtensionSubsystem,
3434
Name: "prefix_indexer_size",
35-
Help: metricsutil.HelpMsgWithStability("[Deprecated: Use llm_d_router_epp_prefix_indexer_size] Size of the prefix indexer.", compbasemetrics.ALPHA),
35+
Help: metricsutil.HelpMsgWithStability("[Deprecated: Use llm_d_epp_prefix_indexer_size] Size of the prefix indexer.", compbasemetrics.ALPHA),
3636
},
3737
[]string{},
3838
)
@@ -50,7 +50,7 @@ var (
5050
prometheus.HistogramOpts{
5151
Subsystem: eppmetrics.InferenceExtensionSubsystem,
5252
Name: "prefix_indexer_hit_ratio",
53-
Help: metricsutil.HelpMsgWithStability("[Deprecated: Use llm_d_router_epp_prefix_indexer_hit_ratio] Ratio of prefix length matched to total prefix length in the cache lookup.", compbasemetrics.ALPHA),
53+
Help: metricsutil.HelpMsgWithStability("[Deprecated: Use llm_d_epp_prefix_indexer_hit_ratio] Ratio of prefix length matched to total prefix length in the cache lookup.", compbasemetrics.ALPHA),
5454
Buckets: []float64{0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0},
5555
},
5656
[]string{},
@@ -70,7 +70,7 @@ var (
7070
prometheus.HistogramOpts{
7171
Subsystem: eppmetrics.InferenceExtensionSubsystem,
7272
Name: "prefix_indexer_hit_bytes",
73-
Help: metricsutil.HelpMsgWithStability("[Deprecated: Use llm_d_router_epp_prefix_indexer_hit_bytes] Length of the prefix match in number of bytes in the cache lookup.", compbasemetrics.ALPHA),
73+
Help: metricsutil.HelpMsgWithStability("[Deprecated: Use llm_d_epp_prefix_indexer_hit_bytes] Length of the prefix match in number of bytes in the cache lookup.", compbasemetrics.ALPHA),
7474
Buckets: []float64{0, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536},
7575
},
7676
[]string{},

pkg/epp/framework/plugins/requestcontrol/dataproducer/predictedlatency/metrics.go

Lines changed: 8 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -68,7 +68,7 @@ var (
6868
prometheus.GaugeOpts{
6969
Subsystem: eppmetrics.InferenceObjectiveSubsystem,
7070
Name: "inference_request_metric",
71-
Help: metricsutil.HelpMsgWithStability("[Deprecated: Use llm_d_router_epp_inference_request_metric] Consolidated gauge for various inference request metrics including TTFT, TPOT, SLOs, and prediction durations.", compbasemetrics.ALPHA),
71+
Help: metricsutil.HelpMsgWithStability("[Deprecated: Use llm_d_epp_inference_request_metric] Consolidated gauge for various inference request metrics including TTFT, TPOT, SLOs, and prediction durations.", compbasemetrics.ALPHA),
7272
},
7373
modelTypeLabels,
7474
)
@@ -86,7 +86,7 @@ var (
8686
prometheus.HistogramOpts{
8787
Subsystem: eppmetrics.InferenceObjectiveSubsystem,
8888
Name: "request_ttft_seconds",
89-
Help: metricsutil.HelpMsgWithStability("[Deprecated: Use llm_d_router_epp_request_ttft_seconds] Inference model TTFT distribution in seconds for each model and target model.", compbasemetrics.ALPHA),
89+
Help: metricsutil.HelpMsgWithStability("[Deprecated: Use llm_d_epp_request_ttft_seconds] Inference model TTFT distribution in seconds for each model and target model.", compbasemetrics.ALPHA),
9090
Buckets: generalLatencyBuckets,
9191
},
9292
modelLabels,
@@ -96,7 +96,7 @@ var (
9696
prometheus.HistogramOpts{
9797
Subsystem: eppmetrics.InferenceObjectiveSubsystem,
9898
Name: "request_predicted_ttft_seconds",
99-
Help: metricsutil.HelpMsgWithStability("[Deprecated: Use llm_d_router_epp_request_predicted_ttft_seconds] Inference model Predicted TTFT distribution in seconds for each model and target model.", compbasemetrics.ALPHA),
99+
Help: metricsutil.HelpMsgWithStability("[Deprecated: Use llm_d_epp_request_predicted_ttft_seconds] Inference model Predicted TTFT distribution in seconds for each model and target model.", compbasemetrics.ALPHA),
100100
Buckets: generalLatencyBuckets,
101101
},
102102
modelLabels,
@@ -116,7 +116,7 @@ var (
116116
prometheus.HistogramOpts{
117117
Subsystem: eppmetrics.InferenceObjectiveSubsystem,
118118
Name: "request_ttft_prediction_duration_seconds",
119-
Help: metricsutil.HelpMsgWithStability("[Deprecated: Use llm_d_router_epp_request_ttft_prediction_duration_seconds] Duration taken to generate TTFT predictions in seconds for each model and target model.", compbasemetrics.ALPHA),
119+
Help: metricsutil.HelpMsgWithStability("[Deprecated: Use llm_d_epp_request_ttft_prediction_duration_seconds] Duration taken to generate TTFT predictions in seconds for each model and target model.", compbasemetrics.ALPHA),
120120
Buckets: predictionLatencyBuckets,
121121
},
122122
modelLabels,
@@ -136,7 +136,7 @@ var (
136136
prometheus.HistogramOpts{
137137
Subsystem: eppmetrics.InferenceObjectiveSubsystem,
138138
Name: "request_tpot_seconds",
139-
Help: metricsutil.HelpMsgWithStability("[Deprecated: Use llm_d_router_epp_request_tpot_seconds] Inference model TPOT distribution in seconds for each model and target model.", compbasemetrics.ALPHA),
139+
Help: metricsutil.HelpMsgWithStability("[Deprecated: Use llm_d_epp_request_tpot_seconds] Inference model TPOT distribution in seconds for each model and target model.", compbasemetrics.ALPHA),
140140
Buckets: tpotBuckets,
141141
},
142142
modelLabels,
@@ -146,7 +146,7 @@ var (
146146
prometheus.HistogramOpts{
147147
Subsystem: eppmetrics.InferenceObjectiveSubsystem,
148148
Name: "request_predicted_tpot_seconds",
149-
Help: metricsutil.HelpMsgWithStability("[Deprecated: Use llm_d_router_epp_request_predicted_tpot_seconds] Inference model Predicted TPOT distribution in seconds for each model and target model.", compbasemetrics.ALPHA),
149+
Help: metricsutil.HelpMsgWithStability("[Deprecated: Use llm_d_epp_request_predicted_tpot_seconds] Inference model Predicted TPOT distribution in seconds for each model and target model.", compbasemetrics.ALPHA),
150150
Buckets: tpotBuckets,
151151
},
152152
modelLabels,
@@ -166,7 +166,7 @@ var (
166166
prometheus.HistogramOpts{
167167
Subsystem: eppmetrics.InferenceObjectiveSubsystem,
168168
Name: "request_tpot_prediction_duration_seconds",
169-
Help: metricsutil.HelpMsgWithStability("[Deprecated: Use llm_d_router_epp_request_tpot_prediction_duration_seconds] Duration taken to generate TPOT predictions in seconds for each model and target model.", compbasemetrics.ALPHA),
169+
Help: metricsutil.HelpMsgWithStability("[Deprecated: Use llm_d_epp_request_tpot_prediction_duration_seconds] Duration taken to generate TPOT predictions in seconds for each model and target model.", compbasemetrics.ALPHA),
170170
Buckets: predictionLatencyBuckets,
171171
},
172172
modelLabels,
@@ -186,7 +186,7 @@ var (
186186
prometheus.CounterOpts{
187187
Subsystem: eppmetrics.InferenceObjectiveSubsystem,
188188
Name: "request_slo_violation_total",
189-
Help: metricsutil.HelpMsgWithStability("[Deprecated: Use llm_d_router_epp_request_slo_violation_total] Counter of SLO violations for each model, target model, and violation type.", compbasemetrics.ALPHA),
189+
Help: metricsutil.HelpMsgWithStability("[Deprecated: Use llm_d_epp_request_slo_violation_total] Counter of SLO violations for each model, target model, and violation type.", compbasemetrics.ALPHA),
190190
},
191191
modelTypeLabels,
192192
)

pkg/epp/framework/plugins/scheduling/profilehandler/disagg/metrics.go

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -47,7 +47,7 @@ var (
4747
prometheus.CounterOpts{
4848
Subsystem: eppmetrics.SchedulerSubsystem,
4949
Name: "pd_decision_total",
50-
Help: metricsutil.HelpMsgWithStability("[Deprecated: Use llm_d_router_epp_pd_decision_total] Total number of P/D disaggregation decisions made", compbasemetrics.ALPHA),
50+
Help: metricsutil.HelpMsgWithStability("[Deprecated: Use llm_d_epp_pd_decision_total] Total number of P/D disaggregation decisions made", compbasemetrics.ALPHA),
5151
},
5252
[]string{"model_name", "decision_type"}, // "decode-only" or "prefill-decode"
5353
)
@@ -65,13 +65,13 @@ var (
6565
// SchedulerDisaggDecisionCount records disaggregation routing decisions,
6666
// covering all stages: decode-only, prefill-decode, encode-decode, encode-prefill-decode.
6767
//
68-
// Deprecated: Use llm_d_router_epp_disagg_decision_total instead.
68+
// Deprecated: Use llm_d_epp_disagg_decision_total instead.
6969
// Tracked in: https://github.com/llm-d/llm-d-inference-scheduler/issues/1070
7070
SchedulerDisaggDecisionCount = prometheus.NewCounterVec(
7171
prometheus.CounterOpts{
7272
Subsystem: eppmetrics.SchedulerSubsystem,
7373
Name: "disagg_decision_total",
74-
Help: metricsutil.HelpMsgWithStability("[Deprecated: Use llm_d_router_epp_disagg_decision_total] Total number of disaggregation routing decisions made", compbasemetrics.ALPHA),
74+
Help: metricsutil.HelpMsgWithStability("[Deprecated: Use llm_d_epp_disagg_decision_total] Total number of disaggregation routing decisions made", compbasemetrics.ALPHA),
7575
},
7676
[]string{"model_name", "decision_type"},
7777
)

pkg/epp/framework/plugins/scheduling/profilehandler/disagg/metrics_test.go

Lines changed: 19 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -34,7 +34,7 @@ func TestSchedulerPDDecisionCount(t *testing.T) {
3434
RecordPDDecision("test-plugin", "test-type", model, DecisionTypePrefillDecode)
3535

3636
expected := `
37-
# HELP llm_d_inference_scheduler_pd_decision_total [ALPHA] [Deprecated: Use llm_d_router_epp_pd_decision_total] Total number of P/D disaggregation decisions made
37+
# HELP llm_d_inference_scheduler_pd_decision_total [ALPHA] [Deprecated: Use llm_d_epp_pd_decision_total] Total number of P/D disaggregation decisions made
3838
# TYPE llm_d_inference_scheduler_pd_decision_total counter
3939
llm_d_inference_scheduler_pd_decision_total{decision_type="decode-only",model_name="test-model"} 1
4040
llm_d_inference_scheduler_pd_decision_total{decision_type="prefill-decode",model_name="test-model"} 2
@@ -46,14 +46,14 @@ func TestSchedulerPDDecisionCount(t *testing.T) {
4646
}
4747

4848
expectedNew := `
49-
# HELP llm_d_router_epp_pd_decision_total [ALPHA] Total number of P/D disaggregation decisions made
50-
# TYPE llm_d_router_epp_pd_decision_total counter
51-
llm_d_router_epp_pd_decision_total{decision_type="decode-only",model_name="test-model",plugin_name="test-plugin",plugin_type="test-type"} 1
52-
llm_d_router_epp_pd_decision_total{decision_type="prefill-decode",model_name="test-model",plugin_name="test-plugin",plugin_type="test-type"} 2
49+
# HELP llm_d_epp_pd_decision_total [ALPHA] Total number of P/D disaggregation decisions made
50+
# TYPE llm_d_epp_pd_decision_total counter
51+
llm_d_epp_pd_decision_total{decision_type="decode-only",model_name="test-model",plugin_name="test-plugin",plugin_type="test-type"} 1
52+
llm_d_epp_pd_decision_total{decision_type="prefill-decode",model_name="test-model",plugin_name="test-plugin",plugin_type="test-type"} 2
5353
`
5454

5555
if err := testutil.CollectAndCompare(LlmdPDDecisionCount, strings.NewReader(expectedNew),
56-
"llm_d_router_epp_pd_decision_total"); err != nil {
56+
"llm_d_epp_pd_decision_total"); err != nil {
5757
t.Errorf("RecordPDDecision() new failed: %v", err)
5858
}
5959
}
@@ -73,7 +73,7 @@ func TestRecordDisaggDecision(t *testing.T) {
7373
RecordDisaggDecision("test-plugin", "test-type", model, DecisionTypeEncodePrefillDecode)
7474

7575
expected := `
76-
# HELP llm_d_inference_scheduler_disagg_decision_total [ALPHA] [Deprecated: Use llm_d_router_epp_disagg_decision_total] Total number of disaggregation routing decisions made
76+
# HELP llm_d_inference_scheduler_disagg_decision_total [ALPHA] [Deprecated: Use llm_d_epp_disagg_decision_total] Total number of disaggregation routing decisions made
7777
# TYPE llm_d_inference_scheduler_disagg_decision_total counter
7878
llm_d_inference_scheduler_disagg_decision_total{decision_type="decode-only",model_name="test-model"} 1
7979
llm_d_inference_scheduler_disagg_decision_total{decision_type="encode-decode",model_name="test-model"} 1
@@ -87,16 +87,16 @@ func TestRecordDisaggDecision(t *testing.T) {
8787
}
8888

8989
expectedNew := `
90-
# HELP llm_d_router_epp_disagg_decision_total [ALPHA] Total number of disaggregation routing decisions made
91-
# TYPE llm_d_router_epp_disagg_decision_total counter
92-
llm_d_router_epp_disagg_decision_total{decision_type="decode-only",model_name="test-model",plugin_name="test-plugin",plugin_type="test-type"} 1
93-
llm_d_router_epp_disagg_decision_total{decision_type="encode-decode",model_name="test-model",plugin_name="test-plugin",plugin_type="test-type"} 1
94-
llm_d_router_epp_disagg_decision_total{decision_type="encode-prefill-decode",model_name="test-model",plugin_name="test-plugin",plugin_type="test-type"} 3
95-
llm_d_router_epp_disagg_decision_total{decision_type="prefill-decode",model_name="test-model",plugin_name="test-plugin",plugin_type="test-type"} 2
90+
# HELP llm_d_epp_disagg_decision_total [ALPHA] Total number of disaggregation routing decisions made
91+
# TYPE llm_d_epp_disagg_decision_total counter
92+
llm_d_epp_disagg_decision_total{decision_type="decode-only",model_name="test-model",plugin_name="test-plugin",plugin_type="test-type"} 1
93+
llm_d_epp_disagg_decision_total{decision_type="encode-decode",model_name="test-model",plugin_name="test-plugin",plugin_type="test-type"} 1
94+
llm_d_epp_disagg_decision_total{decision_type="encode-prefill-decode",model_name="test-model",plugin_name="test-plugin",plugin_type="test-type"} 3
95+
llm_d_epp_disagg_decision_total{decision_type="prefill-decode",model_name="test-model",plugin_name="test-plugin",plugin_type="test-type"} 2
9696
`
9797

9898
if err := testutil.CollectAndCompare(LlmdDisaggDecisionCount, strings.NewReader(expectedNew),
99-
"llm_d_router_epp_disagg_decision_total"); err != nil {
99+
"llm_d_epp_disagg_decision_total"); err != nil {
100100
t.Errorf("RecordDisaggDecision() new failed: %v", err)
101101
}
102102
}
@@ -108,7 +108,7 @@ func TestRecordDisaggDecisionEmptyModel(t *testing.T) {
108108
RecordDisaggDecision("test-plugin", "test-type", "", DecisionTypeDecodeOnly)
109109

110110
expected := `
111-
# HELP llm_d_inference_scheduler_disagg_decision_total [ALPHA] [Deprecated: Use llm_d_router_epp_disagg_decision_total] Total number of disaggregation routing decisions made
111+
# HELP llm_d_inference_scheduler_disagg_decision_total [ALPHA] [Deprecated: Use llm_d_epp_disagg_decision_total] Total number of disaggregation routing decisions made
112112
# TYPE llm_d_inference_scheduler_disagg_decision_total counter
113113
llm_d_inference_scheduler_disagg_decision_total{decision_type="decode-only",model_name="unknown"} 1
114114
`
@@ -119,13 +119,13 @@ func TestRecordDisaggDecisionEmptyModel(t *testing.T) {
119119
}
120120

121121
expectedNew := `
122-
# HELP llm_d_router_epp_disagg_decision_total [ALPHA] Total number of disaggregation routing decisions made
123-
# TYPE llm_d_router_epp_disagg_decision_total counter
124-
llm_d_router_epp_disagg_decision_total{decision_type="decode-only",model_name="unknown",plugin_name="test-plugin",plugin_type="test-type"} 1
122+
# HELP llm_d_epp_disagg_decision_total [ALPHA] Total number of disaggregation routing decisions made
123+
# TYPE llm_d_epp_disagg_decision_total counter
124+
llm_d_epp_disagg_decision_total{decision_type="decode-only",model_name="unknown",plugin_name="test-plugin",plugin_type="test-type"} 1
125125
`
126126

127127
if err := testutil.CollectAndCompare(LlmdDisaggDecisionCount, strings.NewReader(expectedNew),
128-
"llm_d_router_epp_disagg_decision_total"); err != nil {
128+
"llm_d_epp_disagg_decision_total"); err != nil {
129129
t.Errorf("RecordDisaggDecision() new empty model failed: %v", err)
130130
}
131131
}

pkg/epp/handlers/response_test.go

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -300,7 +300,7 @@ func TestHandleResponseBodyWithoutSchedulingRequest(t *testing.T) {
300300
server.HandleResponseBody(ctx, reqCtx, []byte(body), true)
301301
})
302302

303-
histogram := findHistogramMetric(t, "llm_d_router_epp_request_ntpot_seconds", map[string]string{
303+
histogram := findHistogramMetric(t, "llm_d_epp_request_ntpot_seconds", map[string]string{
304304
"model_name": "incoming-model",
305305
"target_model_name": "target-model",
306306
"fairness_id": metadata.DefaultFairnessID,

pkg/epp/metrics/collectors/inference_pool_test.go

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -118,10 +118,10 @@ func TestMetricsCollected(t *testing.T) {
118118
}
119119

120120
errNew := promtestutil.CollectAndCompare(collector, strings.NewReader(`
121-
# HELP llm_d_router_epp_per_endpoint_queue_size [ALPHA] The total number of requests pending in the model server queue for each underlying endpoint.
122-
# TYPE llm_d_router_epp_per_endpoint_queue_size gauge
123-
llm_d_router_epp_per_endpoint_queue_size{model_server_endpoint="pod1-rank-0",name="test-pool"} 100
124-
`), "llm_d_router_epp_per_endpoint_queue_size")
121+
# HELP llm_d_epp_per_endpoint_queue_size [ALPHA] The total number of requests pending in the model server queue for each underlying endpoint.
122+
# TYPE llm_d_epp_per_endpoint_queue_size gauge
123+
llm_d_epp_per_endpoint_queue_size{model_server_endpoint="pod1-rank-0",name="test-pool"} 100
124+
`), "llm_d_epp_per_endpoint_queue_size")
125125
if errNew != nil {
126126
t.Fatal(errNew)
127127
}

0 commit comments

Comments
 (0)