Skip to content

Commit 7f7943e

Browse files
committed
Merge remote-tracking branch 'upstream/main' into sync/upstream-ff5f8eab
2 parents 2147fa0 + e7a8c94 commit 7f7943e

69 files changed

Lines changed: 5363 additions & 748 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

DEVELOPMENT.md

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -173,14 +173,15 @@ PROM_ENABLED=true KIND_PROM_HOST_PORT=30091 make env-dev-kind
173173

174174
### Grafana Dashboard
175175

176-
The upstream [Inference Gateway dashboard] covers EPP, inference pool, and vLLM metrics.
176+
The bundled [Inference Gateway dashboard] covers EPP metrics across the inference
177+
pool, inference objective, and flow control layers.
177178

178179
Add a Prometheus datasource at `http://localhost:30090`, then import the JSON via
179180
**Dashboards > New > Import**. See the
180181
[Grafana installation docs](https://grafana.com/docs/grafana/latest/setup-grafana/installation/)
181182
for setup.
182183

183-
[Inference Gateway dashboard]:https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/tools/dashboards/inference_gateway.json
184+
[Inference Gateway dashboard]:deploy/grafana/inference_gateway.json
184185

185186
> [!NOTE]
186187
> For significant customization beyond the standard deployment, use the `deploy/components`

cmd/epp/runner/runner.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,7 @@ import (
7272
srcmodels "github.com/llm-d/llm-d-router/pkg/epp/framework/plugins/datalayer/source/models"
7373
sourcenotifications "github.com/llm-d/llm-d-router/pkg/epp/framework/plugins/datalayer/source/notifications"
7474
"github.com/llm-d/llm-d-router/pkg/epp/framework/plugins/flowcontrol/fairness/globalstrict"
75+
programaware "github.com/llm-d/llm-d-router/pkg/epp/framework/plugins/flowcontrol/fairness/program-aware"
7576
"github.com/llm-d/llm-d-router/pkg/epp/framework/plugins/flowcontrol/fairness/roundrobin"
7677
"github.com/llm-d/llm-d-router/pkg/epp/framework/plugins/flowcontrol/ordering/edf"
7778
"github.com/llm-d/llm-d-router/pkg/epp/framework/plugins/flowcontrol/ordering/fcfs"
@@ -525,6 +526,7 @@ func (r *Runner) registerInTreePlugins() {
525526
// Flow Control plugins
526527
fwkplugin.Register(globalstrict.GlobalStrictFairnessPolicyType, globalstrict.GlobalStrictFairnessPolicyFactory)
527528
fwkplugin.Register(roundrobin.RoundRobinFairnessPolicyType, roundrobin.RoundRobinFairnessPolicyFactory)
529+
fwkplugin.Register(programaware.ProgramAwarePluginType, programaware.ProgramAwarePluginFactory)
528530
fwkplugin.Register(fcfs.FCFSOrderingPolicyType, fcfs.FCFSOrderingPolicyFactory)
529531
fwkplugin.Register(edf.EDFOrderingPolicyType, edf.EDFOrderingPolicyFactory)
530532
fwkplugin.Register(slodeadline.SLODeadlineOrderingPolicyType, slodeadline.SLODeadlineOrderingPolicyFactory)

config/charts/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -130,7 +130,7 @@ Core settings for the Endpoint Picker Proxy (EPP) container and pod, including s
130130
| `router.epp.flags` | Map of command-line flags passed directly to the EPP binary. | `{}` |
131131
| `router.epp.affinity` | Affinity rules for EPP pods. | `{}` |
132132
| `router.epp.tolerations` | Tolerations for EPP pods. | `[]` |
133-
| `router.epp.resources` | EPP container resource requests and limits. | `requests.cpu: "4"`, `requests.memory: 8Gi`, `limits.memory: 16Gi` |
133+
| `router.epp.resources` | EPP container resource requests and limits. | `requests.cpu: "8"`, `requests.memory: 8Gi`, `limits.memory: 16Gi` |
134134
| `router.epp.pluginsConfigFile` | EPP plugins configuration file name. | `default-plugins.yaml` |
135135
| `router.epp.pluginsCustomConfig` | Inline custom YAML configuration for EPP plugins. | `{}` |
136136
| `router.epp.volumes` | Extra volumes for EPP pod. | `[]` |

config/charts/routerlib/templates/_config.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,11 +47,13 @@ data:
4747
parameters:
4848
affinityThreshold: 0.99
4949
ttftSource: latencyPredictor
50+
maxTTFTPenaltyMs: 5000
5051
- name: loose-affinity-filter
5152
type: prefix-cache-affinity-filter
5253
parameters:
5354
affinityThreshold: 0.80
5455
ttftSource: latencyPredictor
56+
maxTTFTPenaltyMs: 5000
5557
- type: latency-scorer
5658
- type: weighted-random-picker
5759
- type: slo-headroom-tier-filter

config/charts/routerlib/values.yaml

Lines changed: 7 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@ epp:
5959
# EPP container resources: CPU limits unset for burst capacity, memory capped at 16Gi
6060
resources:
6161
requests:
62-
cpu: "4"
62+
cpu: "8"
6363
memory: 8Gi
6464
limits:
6565
memory: 16Gi
@@ -160,10 +160,9 @@ latencyPredictor:
160160
# Training Server Configuration
161161
trainingServer:
162162
image:
163-
# TODO: Update this default once llm-d owns and publishes this image.
164-
registry: us-central1-docker.pkg.dev/k8s-staging-images
165-
repository: gateway-api-inference-extension/latency-training-server
166-
tag: main
163+
registry: ghcr.io/llm-d
164+
repository: llm-d-latency-predictor-training-server-dev
165+
tag: latest
167166
pullPolicy: Always
168167
port: 8000
169168
resources:
@@ -202,10 +201,9 @@ latencyPredictor:
202201
count: 1
203202
startPort: 8001
204203
image:
205-
# TODO: Update this default once llm-d owns and publishes this image.
206-
registry: us-central1-docker.pkg.dev/k8s-staging-images
207-
repository: gateway-api-inference-extension/latency-prediction-server
208-
tag: main
204+
registry: ghcr.io/llm-d
205+
repository: llm-d-latency-predictor-prediction-server-dev
206+
tag: latest
209207
pullPolicy: Always
210208
resources:
211209
requests:

deploy/config/epp-mm-embeddings-cache-config.yaml

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,13 +14,16 @@ plugins:
1414
parameters:
1515
cacheSizeInMB: 2048
1616
- type: mm-embeddings-cache-scorer
17-
- type: precise-prefix-cache-scorer
17+
- type: precise-prefix-cache-producer
1818
parameters:
1919
tokenProcessorConfig:
2020
blockSize: 64
2121
indexerConfig:
2222
kvBlockIndexConfig:
2323
enableMetrics: true
24+
- type: prefix-cache-scorer
25+
parameters:
26+
prefixMatchInfoProducerName: precise-prefix-cache-producer
2427
- type: queue-scorer
2528
- type: max-score-picker
2629
- type: single-profile-handler
@@ -33,12 +36,12 @@ dataLayer:
3336
- pluginRef: endpoint-notification-source
3437
extractors:
3538
- pluginRef: mm-embeddings-cache-producer
36-
- pluginRef: precise-prefix-cache-scorer
39+
- pluginRef: precise-prefix-cache-producer
3740
schedulingProfiles:
3841
- name: default
3942
plugins:
4043
- pluginRef: decode-filter
41-
- pluginRef: precise-prefix-cache-scorer
44+
- pluginRef: prefix-cache-scorer
4245
weight: 10
4346
- pluginRef: mm-embeddings-cache-scorer
4447
weight: 4

deploy/config/epp-precise-prefix-cache-config.yaml

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
1-
# Sample EPP configuration with the precise-prefix-cache scorer.
1+
# Sample EPP config: split precise-prefix-cache pipeline
2+
# token-producer -> precise-prefix-cache-producer -> prefix-cache-scorer
23
apiVersion: llm-d.ai/v1alpha1
34
kind: EndpointPickerConfig
45
plugins:
@@ -12,13 +13,16 @@ plugins:
1213
- type: core-metrics-extractor
1314
- type: single-profile-handler
1415
- type: decode-filter
15-
- type: precise-prefix-cache-scorer
16+
- type: precise-prefix-cache-producer
1617
parameters:
1718
tokenProcessorConfig:
1819
blockSize: 64
1920
indexerConfig:
2021
kvBlockIndexConfig:
21-
enableMetrics: true # enable kv-block index metrics (prometheus)
22+
enableMetrics: true # enable kv-block index metrics (prometheus)
23+
- type: prefix-cache-scorer
24+
parameters:
25+
prefixMatchInfoProducerName: precise-prefix-cache-producer
2226
- type: kv-cache-utilization-scorer
2327
- type: queue-scorer
2428
- type: max-score-picker
@@ -29,12 +33,12 @@ dataLayer:
2933
- pluginRef: core-metrics-extractor
3034
- pluginRef: endpoint-notification-source
3135
extractors:
32-
- pluginRef: precise-prefix-cache-scorer
36+
- pluginRef: precise-prefix-cache-producer
3337
schedulingProfiles:
3438
- name: default
3539
plugins:
3640
- pluginRef: decode-filter
37-
- pluginRef: precise-prefix-cache-scorer
41+
- pluginRef: prefix-cache-scorer
3842
weight: 2.0
3943
- pluginRef: kv-cache-utilization-scorer
4044
weight: 1.0

deploy/config/epp-precise-prefix-cache-split-config.yaml

Lines changed: 0 additions & 47 deletions
This file was deleted.

deploy/config/sim-epp-kvcache-config.yaml

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ plugins:
88
modelName: TinyLlama/TinyLlama-1.1B-Chat-v1.0 # replace to use a different model
99
vllm:
1010
url: http://localhost:8000
11-
- type: precise-prefix-cache-scorer
11+
- type: precise-prefix-cache-producer
1212
parameters:
1313
tokenProcessorConfig:
1414
blockSize: 16
@@ -18,6 +18,9 @@ plugins:
1818
kvBlockIndexConfig:
1919
enableMetrics: false # enable kv-block index metrics (prometheus)
2020
metricsLoggingInterval: 6000000000 # log kv-block metrics as well (value is 6s in nanoseconds; 1m would be 60000000000)
21+
- type: prefix-cache-scorer
22+
parameters:
23+
prefixMatchInfoProducerName: precise-prefix-cache-producer
2124
- type: decode-filter
2225
- type: max-score-picker
2326
- type: single-profile-handler
@@ -26,5 +29,5 @@ schedulingProfiles:
2629
plugins:
2730
- pluginRef: decode-filter
2831
- pluginRef: max-score-picker
29-
- pluginRef: precise-prefix-cache-scorer
32+
- pluginRef: prefix-cache-scorer
3033
weight: 10

deploy/config/sim-epp-no-hit-lru.yaml

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,13 +3,16 @@
33
apiVersion: llm-d.ai/v1alpha1
44
kind: EndpointPickerConfig
55
plugins:
6-
- type: precise-prefix-cache-scorer
6+
- type: precise-prefix-cache-producer
77
parameters:
88
indexerConfig:
99
tokenProcessorConfig:
1010
blockSize: 5
1111
kvBlockIndexConfig:
1212
maxPrefixBlocksToMatch: 256
13+
- type: prefix-cache-scorer
14+
parameters:
15+
prefixMatchInfoProducerName: precise-prefix-cache-producer
1316
- type: no-hit-lru-scorer
1417
parameters:
1518
lruSize: 2048
@@ -21,7 +24,7 @@ schedulingProfiles:
2124
plugins:
2225
- pluginRef: decode-filter
2326
- pluginRef: max-score-picker
24-
- pluginRef: precise-prefix-cache-scorer
27+
- pluginRef: prefix-cache-scorer
2528
weight: 2
2629
- pluginRef: no-hit-lru-scorer
2730
weight: 1

0 commit comments

Comments
 (0)