Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 14 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -13,14 +13,27 @@ EPP_TAG ?= dev
IMG = $(IMAGE_TAG_BASE):$(EPP_TAG)
NAMESPACE ?= hc4ai-operator

# Map Go architecture names (TARGETARCH) to the architecture names used by typos release artifacts
ifeq ($(TARGETARCH),amd64)
TYPOS_TARGET_ARCH = x86_64
else ifeq ($(TARGETARCH),arm64)
TYPOS_TARGET_ARCH = aarch64
else
TYPOS_TARGET_ARCH = $(TARGETARCH)
endif

ifeq ($(TARGETOS),darwin)
ifeq ($(TARGETARCH),amd64)
TOKENIZER_ARCH = x86_64
else
TOKENIZER_ARCH = $(TARGETARCH)
endif
TAR_OPTS = --strip-components 1
TYPOS_ARCH = $(TYPOS_TARGET_ARCH)-apple-darwin
else
TOKENIZER_ARCH = $(TARGETARCH)
TAR_OPTS = --wildcards '*/typos'
TYPOS_ARCH = $(TYPOS_TARGET_ARCH)-unknown-linux-musl
endif

CONTAINER_TOOL := $(shell { command -v docker >/dev/null 2>&1 && echo docker; } || { command -v podman >/dev/null 2>&1 && echo podman; } || echo "")
Expand Down Expand Up @@ -94,6 +107,7 @@ post-deploy-test: ## Run post deployment tests
lint: check-golangci-lint check-typos ## Run lint
@printf "\033[33;1m==== Running linting ====\033[0m\n"
golangci-lint run
$(TYPOS)

##@ Build

Expand Down Expand Up @@ -388,4 +402,3 @@ download-zmq: ## Install ZMQ dependencies based on OS/ARCH
fi; \
echo "✅ ZMQ dependencies installed."; \
fi

2 changes: 1 addition & 1 deletion Makefile.tools.mk
Original file line number Diff line number Diff line change
Expand Up @@ -12,5 +12,5 @@ TYPOS_VERSION ?= v1.34.0
typos: $(TYPOS)
$(TYPOS): | $(LOCALBIN)
@echo "Downloading typos $(TYPOS_VERSION)..."
curl -L https://github.com/crate-ci/typos/releases/download/$(TYPOS_VERSION)/typos-$(TYPOS_VERSION)-x86_64-unknown-linux-musl.tar.gz | tar -xz -C $(LOCALBIN) --wildcards '*/typos'
curl -L https://github.com/crate-ci/typos/releases/download/$(TYPOS_VERSION)/typos-$(TYPOS_VERSION)-$(TYPOS_ARCH).tar.gz | tar -xz -C $(LOCALBIN) $(TAR_OPTS)
chmod +x $(TYPOS)
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ schedulingProfiles:
plugins:
- pluginRef: decode-filter
- pluginRef: prefix-cache-scorer
weight: 2.0
weight: 1.0
- pluginRef: load-aware-scorer
weight: 1.0
- pluginRef: max-score-picker
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,8 @@ kind: EndpointPickerConfig
plugins:
- type: single-profile-handler
- type: decode-filter
- type: prefix-cache-scorer
- type: precise-prefix-cache-scorer
parameters:
mode: cache_tracking
indexerConfig:
tokenProcessorConfig:
blockSize: 64 # must match vLLM block size
Expand All @@ -21,8 +20,8 @@ schedulingProfiles:
- name: default
plugins:
- pluginRef: decode-filter
- pluginRef: prefix-cache-scorer
weight: 3.0
- pluginRef: precise-prefix-cache-scorer
weight: 2.0
- pluginRef: kv-cache-scorer
weight: 1.0
- pluginRef: queue-scorer
Expand Down
38 changes: 11 additions & 27 deletions docs/architecture.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ The design enables:

- Support for **multiple base models** within a shared cluster [Not supported in
Phase1]
- Efficient routing based on **KV cache locality**, **prefix**, **session affinity**, **load**, and
- Efficient routing based on **KV cache locality**, **session affinity**, **load**, and
**model metadata**
- Disaggregated **Prefill/Decode (P/D)** execution
- Pluggable **filters**, **scorers**, and **scrapers** for extensible routing
Expand Down Expand Up @@ -245,29 +245,14 @@ Filters out pods that are not marked as prefill. The filter looks for the label

---

#### PrefixCacheScorer
#### PrecisePrefixCacheScorer

The `prefix-cache-scorer` scores a request based on KV-cache localities.
It supports two modes: `estimate` and `cache_tracking`.

##### `estimate` mode (default):

This mode uses the default GIE prefix scorer and scores pods based on the estimated cache locality of the prompt.
The estimation is based on scheduling history.

- **Type**: `prefix-cache-scorer`
- **Parameters**:
- `hashBlockSize`: Specifies the size of the blocks used to split the input **prompt** when calculating block hashes. Defaults to `64` if not specified.
- `maxPrefixBlocksToMatch`: Specifies the maximum number of prefix blocks to match. Defaults to `256` if not specified.
- `lruCapacityPerServer`: Specifies the capacity of the LRU indexer, in number of entries per server (pod). Defaults to `31,250` if not specified.

**Note:** `mode: estimate` is not required, as it is the default.

##### `cache_tracking` mode:

This mode scores requests based on the actual KV-cache states across the vLLM instances.
It is more accurate than both `SessionAffinity` and `PrefixCachePlugin` in `estimate` mode,
but incurs additional computation overhead and KV-Events streaming to track the KV-cache states.
The `precise-prefix-cache-scorer` scores a request based on KV-cache locality.
Like the IGW `prefix-cache-scorer`, it scores each pod by the number of KV-cache blocks
that match between the request's prompt and that pod's KV-cache contents.
However, whereas the IGW `prefix-cache-scorer` relies on estimates derived from scheduling history,
the `precise-prefix-cache-scorer` tracks the real-time KV-cache state across the vLLM instances to
provide more accurate scoring.

When enabled, the scorer will use the `llm-d-kv-cache-manager` to track the KV-cache states
across the vLLM instances. It will use the `kvcache.Indexer` to score the pods based on the
Expand All @@ -276,9 +261,8 @@ When enabled, the scorer will use the `llm-d-kv-cache-manager` to track the KV-c

Configuration:

- **Type**: `prefix-cache-scorer`
- **Type**: `precise-prefix-cache-scorer`
- **Parameters**:
- `mode: cache_tracking`
- `indexerConfig`: Configuration for the `kvcache.Indexer`.
- `kvEventsConfig`: Configuration for the `kvevents.Pool`.

Expand All @@ -294,7 +278,7 @@ Example configuration with the above parameters set:

```yaml
plugins:
- type: prefix-cache-scorer
- type: precise-prefix-cache-scorer
parameters:
indexerConfig:
tokenProcessorConfig:
Expand All @@ -310,7 +294,7 @@ Example configuration with all parameters set:

```yaml
plugins:
- type: prefix-cache-scorer
- type: precise-prefix-cache-scorer
parameters:
kvEventsConfig:
zmqEndpoint: tcp://*:5557
Expand Down
6 changes: 2 additions & 4 deletions pkg/plugins/register.go
Original file line number Diff line number Diff line change
@@ -1,13 +1,11 @@
package plugins

import (
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/plugins"
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework/plugins/multi/prefix"

"github.com/llm-d/llm-d-inference-scheduler/pkg/plugins/filter"
prerequest "github.com/llm-d/llm-d-inference-scheduler/pkg/plugins/pre-request"
"github.com/llm-d/llm-d-inference-scheduler/pkg/plugins/profile"
"github.com/llm-d/llm-d-inference-scheduler/pkg/plugins/scorer"
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/plugins"
)

// RegisterAllPlugins registers the factory functions of all plugins in this repository.
Expand All @@ -18,7 +16,7 @@ func RegisterAllPlugins() {
plugins.Register(filter.PrefillRoleType, filter.PrefillRoleFactory)
plugins.Register(prerequest.PrefillHeaderHandlerType, prerequest.PrefillHeaderHandlerFactory)
plugins.Register(profile.PdProfileHandlerType, profile.PdProfileHandlerFactory)
plugins.Register(prefix.PrefixCachePluginType, scorer.PrefixCachePluginFactory)
plugins.Register(scorer.PrecisePrefixCachePluginType, scorer.PrecisePrefixCachePluginFactory)
plugins.Register(scorer.LoadAwareType, scorer.LoadAwareFactory)
plugins.Register(scorer.SessionAffinityType, scorer.SessionAffinityFactory)
plugins.Register(scorer.ActiveRequestType, scorer.ActiveRequestFactory)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,14 +11,18 @@ import (
"sigs.k8s.io/controller-runtime/pkg/log"
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/plugins"
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework"
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework/plugins/multi/prefix"
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types"
logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging"
)

// PrefixCacheTrackingConfig holds the configuration for the
// PrefixCacheTracking.
type PrefixCacheTrackingConfig struct {
const (
// PrecisePrefixCachePluginType is the type-name of the PrecisePrefixCacheScorer plugin.
PrecisePrefixCachePluginType = "precise-prefix-cache-scorer"
)

// PrecisePrefixCachePluginConfig holds the configuration for the
// PrecisePrefixCacheScorer plugin.
type PrecisePrefixCachePluginConfig struct {
// IndexerConfig holds the configuration for the `kvcache.Indexer` which is
// used to score pods based on the KV-cache index state.
IndexerConfig *kvcache.Config `json:"indexerConfig"`
Expand All @@ -29,13 +33,13 @@ type PrefixCacheTrackingConfig struct {
}

// compile-time type assertion
var _ framework.Scorer = &PrefixCacheTracking{}
var _ framework.Scorer = &PrecisePrefixCacheScorer{}

// PrefixCacheTrackingPluginFactory defines the factory function for creating
// PrecisePrefixCachePluginFactory defines the factory function for creating
// a new instance of the PrecisePrefixCacheScorer plugin.
func PrefixCacheTrackingPluginFactory(name string, rawParameters json.RawMessage,
func PrecisePrefixCachePluginFactory(name string, rawParameters json.RawMessage,
handle plugins.Handle) (plugins.Plugin, error) {
parameters := PrefixCacheTrackingConfig{
parameters := PrecisePrefixCachePluginConfig{
IndexerConfig: kvcache.NewDefaultConfig(),
KVEventsConfig: kvevents.DefaultConfig(),
}
Expand All @@ -47,13 +51,13 @@ func PrefixCacheTrackingPluginFactory(name string, rawParameters json.RawMessage

if rawParameters != nil {
if err := json.Unmarshal(rawParameters, &parameters); err != nil {
return nil, fmt.Errorf("failed to parse %s plugin config: %w", prefix.PrefixCachePluginType, err)
return nil, fmt.Errorf("failed to parse %s plugin config: %w", PrecisePrefixCachePluginType, err)
}
}

scorer, err := New(handle.Context(), parameters)
if err != nil {
return nil, fmt.Errorf("failed to create %s plugin: %w", prefix.PrefixCachePluginType, err)
return nil, fmt.Errorf("failed to create %s plugin: %w", PrecisePrefixCachePluginType, err)
}

return scorer.WithName(name), nil
Expand All @@ -68,7 +72,7 @@ func PrefixCacheTrackingPluginFactory(name string, rawParameters json.RawMessage
//
// If the configuration is invalid or if the indexer fails to initialize,
// an error is returned.
func New(ctx context.Context, config PrefixCacheTrackingConfig) (*PrefixCacheTracking, error) {
func New(ctx context.Context, config PrecisePrefixCachePluginConfig) (*PrecisePrefixCacheScorer, error) {
// initialize the indexer
kvCacheIndexer, err := kvcache.NewKVCacheIndexer(ctx, config.IndexerConfig)
if err != nil {
Expand All @@ -81,36 +85,36 @@ func New(ctx context.Context, config PrefixCacheTrackingConfig) (*PrefixCacheTra
pool := kvevents.NewPool(config.KVEventsConfig, kvCacheIndexer.KVBlockIndex())
pool.Start(ctx)

return &PrefixCacheTracking{
typedName: plugins.TypedName{Type: prefix.PrefixCachePluginType},
return &PrecisePrefixCacheScorer{
typedName: plugins.TypedName{Type: PrecisePrefixCachePluginType},
kvCacheIndexer: kvCacheIndexer,
}, nil
}

// PrefixCacheTracking implements the framework.Scorer interface.
// The scorer implements the `cache_tracking` mode of the prefix cache plugin.
// PrecisePrefixCacheScorer implements the framework.Scorer interface.
// The scorer implements precise prefix-cache KV-block locality scoring.
// It uses the `kvcache.Indexer` to score pods based on the KV-cache index
// state, and the `kvevents.Pool` to subscribe to KV-cache events
// to update the internal KV-cache index state.
type PrefixCacheTracking struct {
// to keep the internal KV-cache index state up-to-date.
type PrecisePrefixCacheScorer struct {
typedName plugins.TypedName
kvCacheIndexer *kvcache.Indexer
}

// TypedName returns the typed name of the plugin.
func (s *PrefixCacheTracking) TypedName() plugins.TypedName {
func (s *PrecisePrefixCacheScorer) TypedName() plugins.TypedName {
return s.typedName
}

// WithName sets the name of the plugin.
func (s *PrefixCacheTracking) WithName(name string) *PrefixCacheTracking {
func (s *PrecisePrefixCacheScorer) WithName(name string) *PrecisePrefixCacheScorer {
s.typedName.Name = name
return s
}

// Score scores the provided pods based on the KVCache index state.
// The returned scores are normalized to a range of 0-1.
func (s *PrefixCacheTracking) Score(ctx context.Context, _ *types.CycleState, request *types.LLMRequest, pods []types.Pod) map[types.Pod]float64 {
func (s *PrecisePrefixCacheScorer) Score(ctx context.Context, _ *types.CycleState, request *types.LLMRequest, pods []types.Pod) map[types.Pod]float64 {
loggerDebug := log.FromContext(ctx).WithName(s.typedName.String()).V(logutil.DEBUG)
if request == nil {
loggerDebug.Info("Request is nil, skipping scoring")
Expand Down
61 changes: 0 additions & 61 deletions pkg/plugins/scorer/prefix_cache.go

This file was deleted.

Loading
Loading