llm-d · github-actions · Aug 31, 2025 · Aug 29, 2025
diff --git a/Makefile b/Makefile
@@ -13,14 +13,27 @@ EPP_TAG ?= dev
 IMG = $(IMAGE_TAG_BASE):$(EPP_TAG)
 NAMESPACE ?= hc4ai-operator
 
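+# Map Go arch names (amd64, arm64) to the arch names used in typos release artifacts (x86_64, aarch64); other values pass through unchanged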
+ifeq ($(TARGETARCH),amd64)
+TYPOS_TARGET_ARCH = x86_64
+else ifeq ($(TARGETARCH),arm64)
+TYPOS_TARGET_ARCH = aarch64
+else
+TYPOS_TARGET_ARCH = $(TARGETARCH)
+endif
+
 ifeq ($(TARGETOS),darwin)
 ifeq ($(TARGETARCH),amd64)
 TOKENIZER_ARCH = x86_64
 else
 TOKENIZER_ARCH = $(TARGETARCH)
 endif
+TAR_OPTS = --strip-components 1
+TYPOS_ARCH = $(TYPOS_TARGET_ARCH)-apple-darwin
 else
 TOKENIZER_ARCH = $(TARGETARCH)
+TAR_OPTS = --wildcards '*/typos'
+TYPOS_ARCH = $(TYPOS_TARGET_ARCH)-unknown-linux-musl
 endif
 
 CONTAINER_TOOL := $(shell { command -v docker >/dev/null 2>&1 && echo docker; } || { command -v podman >/dev/null 2>&1 && echo podman; } || echo "")
@@ -94,6 +107,7 @@ post-deploy-test: ## Run post deployment tests
 lint: check-golangci-lint check-typos ## Run lint
 	@printf "\033[33;1m==== Running linting ====\033[0m\n"
 	golangci-lint run
+	$(TYPOS)
 
 ##@ Build
 
@@ -388,4 +402,3 @@ download-zmq: ## Install ZMQ dependencies based on OS/ARCH
 	  fi; \
 	  echo "✅ ZMQ dependencies installed."; \
 	fi
-
diff --git a/Makefile.tools.mk b/Makefile.tools.mk
@@ -12,5 +12,5 @@ TYPOS_VERSION ?= v1.34.0
 typos: $(TYPOS)
 $(TYPOS): | $(LOCALBIN)
 	@echo "Downloading typos $(TYPOS_VERSION)..."
-	curl -L https://github.com/crate-ci/typos/releases/download/$(TYPOS_VERSION)/typos-$(TYPOS_VERSION)-x86_64-unknown-linux-musl.tar.gz | tar -xz -C $(LOCALBIN) --wildcards '*/typos'
+	curl -L https://github.com/crate-ci/typos/releases/download/$(TYPOS_VERSION)/typos-$(TYPOS_VERSION)-$(TYPOS_ARCH).tar.gz | tar -xz -C $(LOCALBIN) $(TAR_OPTS)
 	chmod +x $(TYPOS)
diff --git a/...oy/config/epp-prefix-estimate-config.yaml → ...fig/epp-estimate-prefix-cache-config.yaml b/...oy/config/epp-prefix-estimate-config.yaml → ...fig/epp-estimate-prefix-cache-config.yaml
@@ -13,7 +13,7 @@ schedulingProfiles:
   plugins:
   - pluginRef: decode-filter
   - pluginRef: prefix-cache-scorer
-    weight: 2.0
+    weight: 1.0
   - pluginRef: load-aware-scorer
     weight: 1.0
   - pluginRef: max-score-picker
diff --git a/...fig/epp-prefix-cache-tracking-config.yaml → ...nfig/epp-precise-prefix-cache-config.yaml b/...fig/epp-prefix-cache-tracking-config.yaml → ...nfig/epp-precise-prefix-cache-config.yaml
@@ -5,9 +5,8 @@ kind: EndpointPickerConfig
 plugins:
   - type: single-profile-handler
   - type: decode-filter
-  - type: prefix-cache-scorer
+  - type: precise-prefix-cache-scorer
     parameters:
-      mode: cache_tracking
       indexerConfig:
         tokenProcessorConfig:
           blockSize: 64                 # must match vLLM block size
@@ -21,8 +20,8 @@ schedulingProfiles:
   - name: default
     plugins:
       - pluginRef: decode-filter
-      - pluginRef: prefix-cache-scorer
-        weight: 3.0
+      - pluginRef: precise-prefix-cache-scorer
+        weight: 2.0
       - pluginRef: kv-cache-scorer
         weight: 1.0
       - pluginRef: queue-scorer

diff --git a/docs/architecture.md b/docs/architecture.md
@@ -12,7 +12,7 @@ The design enables:
 
 - Support for **multiple base models** within a shared cluster [Not supported in
 Phase1]
-- Efficient routing based on **KV cache locality**, **prefix**, **session affinity**, **load**, and
+- Efficient routing based on **KV cache locality**, **session affinity**, **load**, and
 **model metadata**
 - Disaggregated **Prefill/Decode (P/D)** execution
 - Pluggable **filters**, **scorers**, and **scrapers** for extensible routing
@@ -245,29 +245,14 @@ Filters out pods that are not marked as prefill. The filter looks for the label
 
 ---
 
-#### PrefixCacheScorer
+#### PrecisePrefixCacheScorer
 
-The `prefix-cache-scorer` scores a request based on KV-cache localities.
-It supports two modes: `estimate` and `cache_tracking`.
-
-##### `estimate` mode (default):
-
-This mode uses the default GIE prefix scorer and scores pods based on the estimated cache locality of the prompt.
-The estimation is based on scheduling history.
-
-- **Type**: `prefix-cache-scorer`
-- **Parameters**:
-  - `hashBlockSize`: Specifies the size of the blocks used to split the input **prompt** when calculating block hashes. Defaults to `64` if not specified.
-  - `maxPrefixBlocksToMatch`: Specifies the maximum number of prefix blocks to match. Defaults to `256` if not specified.
-  - `lruCapacityPerServer`: Specifies the capacity of the LRU indexer, in number of entries per server (pod). Defaults to `31,250` if not specified.
-
-**Note:** `mode: estimate` is not required, as it is the default.
-
-##### `cache_tracking` mode:
-
-This mode scores requests based on the actual KV-cache states across the vLLM instances. 
- It is more accurate than both `SessionAffinity` and `PrefixCachePlugin` in `estimate` mode,
- but incurs additional computation overhead and KV-Events streaming to track the KV-cache states.
+The `precise-prefix-cache-scorer` scores pods for a request based on KV-cache locality.
+Like the IGW `prefix-cache-scorer`, it provides a score based on the number of
+ matching KV-cache blocks between the request's prompt and the KV-cache contents of each pod.
+ However, unlike the IGW `prefix-cache-scorer`, which relies on estimations based on scheduling history,
+ the `precise-prefix-cache-scorer` tracks the real-time KV-cache states across the vLLM instances to
+ provide more accurate scoring.
 
 When enabled, the scorer will use the `llm-d-kv-cache-manager` to track the KV-cache states
  across the vLLM instances. It will use the `kvcache.Indexer` to score the pods based on the
@@ -276,9 +261,8 @@ When enabled, the scorer will use the `llm-d-kv-cache-manager` to track the KV-c
 
 Configuration:
 
-- **Type**: `prefix-cache-scorer`
+- **Type**: `precise-prefix-cache-scorer`
 - **Parameters**:
-  - `mode: cache_tracking`
   - `indexerConfig`: Configuration for the `kvcache.Indexer`.
   - `kvEventsConfig`: Configuration for the `kvevents.Pool`.
 
@@ -294,7 +278,7 @@ Example configuration with the above parameters set:
 
 ```yaml
 plugins:
-  - type: prefix-cache-scorer
+  - type: precise-prefix-cache-scorer
     parameters:
       indexerConfig:
         tokenProcessorConfig:
@@ -310,7 +294,7 @@ Example configuration with all parameters set:
 
 ```yaml
 plugins:
-  - type: prefix-cache-scorer
+  - type: precise-prefix-cache-scorer
     parameters:
         kvEventsConfig:
           zmqEndpoint: tcp://*:5557

diff --git a/pkg/plugins/register.go b/pkg/plugins/register.go
@@ -1,13 +1,11 @@
 package plugins
 
 import (
-	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/plugins"
-	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework/plugins/multi/prefix"
-
 	"github.com/llm-d/llm-d-inference-scheduler/pkg/plugins/filter"
 	prerequest "github.com/llm-d/llm-d-inference-scheduler/pkg/plugins/pre-request"
 	"github.com/llm-d/llm-d-inference-scheduler/pkg/plugins/profile"
 	"github.com/llm-d/llm-d-inference-scheduler/pkg/plugins/scorer"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/plugins"
 )
 
 // RegisterAllPlugins registers the factory functions of all plugins in this repository.
@@ -18,7 +16,7 @@ func RegisterAllPlugins() {
 	plugins.Register(filter.PrefillRoleType, filter.PrefillRoleFactory)
 	plugins.Register(prerequest.PrefillHeaderHandlerType, prerequest.PrefillHeaderHandlerFactory)
 	plugins.Register(profile.PdProfileHandlerType, profile.PdProfileHandlerFactory)
-	plugins.Register(prefix.PrefixCachePluginType, scorer.PrefixCachePluginFactory)
+	plugins.Register(scorer.PrecisePrefixCachePluginType, scorer.PrecisePrefixCachePluginFactory)
 	plugins.Register(scorer.LoadAwareType, scorer.LoadAwareFactory)
 	plugins.Register(scorer.SessionAffinityType, scorer.SessionAffinityFactory)
 	plugins.Register(scorer.ActiveRequestType, scorer.ActiveRequestFactory)

diff --git a/pkg/plugins/scorer/prefix_cache_tracking.go → pkg/plugins/scorer/precise_prefix_cache.go b/pkg/plugins/scorer/prefix_cache_tracking.go → pkg/plugins/scorer/precise_prefix_cache.go
@@ -11,14 +11,18 @@ import (
 	"sigs.k8s.io/controller-runtime/pkg/log"
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/plugins"
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework"
-	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework/plugins/multi/prefix"
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types"
 	logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging"
 )
 
-// PrefixCacheTrackingConfig holds the configuration for the
-// PrefixCacheTracking.
-type PrefixCacheTrackingConfig struct {
+const (
+	// PrecisePrefixCachePluginType is the type-name of the PrecisePrefixCacheScorer plugin.
+	PrecisePrefixCachePluginType = "precise-prefix-cache-scorer"
+)
+
+// PrecisePrefixCachePluginConfig holds the configuration for the
+// PrecisePrefixCacheScorer plugin.
+type PrecisePrefixCachePluginConfig struct {
 	// IndexerConfig holds the configuration for the `kvcache.Indexer` which is
 	// used to score pods based on the KV-cache index state.
 	IndexerConfig *kvcache.Config `json:"indexerConfig"`
@@ -29,13 +33,13 @@ type PrefixCacheTrackingConfig struct {
 }
 
 // compile-time type assertion
-var _ framework.Scorer = &PrefixCacheTracking{}
+var _ framework.Scorer = &PrecisePrefixCacheScorer{}
 
-// PrefixCacheTrackingPluginFactory defines the factory function for creating
+// PrecisePrefixCachePluginFactory defines the factory function for creating
-// a new instance of the PrefixCacheTrackingPlugin.
+// a new instance of the PrecisePrefixCacheScorer plugin.
-func PrefixCacheTrackingPluginFactory(name string, rawParameters json.RawMessage,
+func PrecisePrefixCachePluginFactory(name string, rawParameters json.RawMessage,
 	handle plugins.Handle) (plugins.Plugin, error) {
-	parameters := PrefixCacheTrackingConfig{
+	parameters := PrecisePrefixCachePluginConfig{
 		IndexerConfig:  kvcache.NewDefaultConfig(),
 		KVEventsConfig: kvevents.DefaultConfig(),
 	}
@@ -47,13 +51,13 @@ func PrefixCacheTrackingPluginFactory(name string, rawParameters json.RawMessage
 
 	if rawParameters != nil {
 		if err := json.Unmarshal(rawParameters, &parameters); err != nil {
-			return nil, fmt.Errorf("failed to parse %s plugin config: %w", prefix.PrefixCachePluginType, err)
+			return nil, fmt.Errorf("failed to parse %s plugin config: %w", PrecisePrefixCachePluginType, err)
 		}
 	}
 
 	scorer, err := New(handle.Context(), parameters)
 	if err != nil {
-		return nil, fmt.Errorf("failed to create %s plugin: %w", prefix.PrefixCachePluginType, err)
+		return nil, fmt.Errorf("failed to create %s plugin: %w", PrecisePrefixCachePluginType, err)
 	}
 
 	return scorer.WithName(name), nil
@@ -68,7 +72,7 @@ func PrefixCacheTrackingPluginFactory(name string, rawParameters json.RawMessage
 //
 // If the configuration is invalid or if the indexer fails to initialize,
 // an error is returned.
-func New(ctx context.Context, config PrefixCacheTrackingConfig) (*PrefixCacheTracking, error) {
+func New(ctx context.Context, config PrecisePrefixCachePluginConfig) (*PrecisePrefixCacheScorer, error) {
 	// initialize the indexer
 	kvCacheIndexer, err := kvcache.NewKVCacheIndexer(ctx, config.IndexerConfig)
 	if err != nil {
@@ -81,36 +85,36 @@ func New(ctx context.Context, config PrefixCacheTrackingConfig) (*PrefixCacheTra
 	pool := kvevents.NewPool(config.KVEventsConfig, kvCacheIndexer.KVBlockIndex())
 	pool.Start(ctx)
 
-	return &PrefixCacheTracking{
-		typedName:      plugins.TypedName{Type: prefix.PrefixCachePluginType},
+	return &PrecisePrefixCacheScorer{
+		typedName:      plugins.TypedName{Type: PrecisePrefixCachePluginType},
 		kvCacheIndexer: kvCacheIndexer,
 	}, nil
 }
 
-// PrefixCacheTracking implements the framework.Scorer interface.
-// The scorer implements the `cache_tracking` mode of the prefix cache plugin.
+// PrecisePrefixCacheScorer implements the framework.Scorer interface.
+// The scorer implements precise prefix-cache KV-block locality scoring.
 // It uses the `kvcache.Indexer` to score pods based on the KV-cache index
 // state, and the `kvevents.Pool` to subscribe to KV-cache events
-// to update the internal KV-cache index state.
-type PrefixCacheTracking struct {
+// to keep the internal KV-cache index state up-to-date.
+type PrecisePrefixCacheScorer struct {
 	typedName      plugins.TypedName
 	kvCacheIndexer *kvcache.Indexer
 }
 
 // TypedName returns the typed name of the plugin.
-func (s *PrefixCacheTracking) TypedName() plugins.TypedName {
+func (s *PrecisePrefixCacheScorer) TypedName() plugins.TypedName {
 	return s.typedName
 }
 
 // WithName sets the name of the plugin.
-func (s *PrefixCacheTracking) WithName(name string) *PrefixCacheTracking {
+func (s *PrecisePrefixCacheScorer) WithName(name string) *PrecisePrefixCacheScorer {
 	s.typedName.Name = name
 	return s
 }
 
 // Score scores the provided pod based on the KVCache index state.
 // The returned scores are normalized to a range of 0-1.
-func (s *PrefixCacheTracking) Score(ctx context.Context, _ *types.CycleState, request *types.LLMRequest, pods []types.Pod) map[types.Pod]float64 {
+func (s *PrecisePrefixCacheScorer) Score(ctx context.Context, _ *types.CycleState, request *types.LLMRequest, pods []types.Pod) map[types.Pod]float64 {
 	loggerDebug := log.FromContext(ctx).WithName(s.typedName.String()).V(logutil.DEBUG)
 	if request == nil {
 		loggerDebug.Info("Request is nil, skipping scoring")

diff --git a/pkg/plugins/scorer/prefix_cache.go b/pkg/plugins/scorer/prefix_cache.go