read configuration of prefix aware scorer from environment variables

mayabar · mayabar · commit d046d75a1e36 · 2025-05-27T12:06:33.000+03:00
Signed-off-by: Maya Barnea <mayab@il.ibm.com>
diff --git a/docs/architecture.md b/docs/architecture.md
@@ -81,7 +81,7 @@ These components are maintained in the `llm-d-inference-scheduler` repository an
 | Scorer           | Description                                | Env Vars |
 |------------------|--------------------------------------------|----------|
 | Session-aware    | Prefers pods from same session             | `ENABLE_SESSION_AWARE_SCORER`, `SESSION_AWARE_SCORER_WEIGHT`, `PREFILL_ENABLE_SESSION_AWARE_SCORER`, `PREFILL_SESSION_AWARE_SCORER_WEIGHT` |
-| Prefix-aware     | Matches prompt prefix                      | `ENABLE_PREFIX_AWARE_SCORER`, `PREFIX_AWARE_SCORER_WEIGHT`, `PREFILL_ENABLE_PREFIX_AWARE_SCORER`, `PREFILL_PREFIX_AWARE_SCORER_WEIGHT`, `PREFIX_SCORER_BLOCK_SIZE`|
+| Prefix-aware     | Matches prompt prefix                      | `ENABLE_PREFIX_AWARE_SCORER`, `PREFIX_AWARE_SCORER_WEIGHT`, `PREFILL_ENABLE_PREFIX_AWARE_SCORER`, `PREFILL_PREFIX_AWARE_SCORER_WEIGHT`, `PREFIX_SCORER_BLOCK_SIZE`, `PREFIX_SCORER_MAX_CACHE_SIZE`, `PREFIX_SCORER_MAX_BLOCK_CACHE_SIZE`|
 | KVCache-aware    | Optimizes for KV reuse                     | `ENABLE_KVCACHE_AWARE_SCORER`, `KVCACHE_INDEXER_REDIS_ADDR`, `PREFILL_ENABLE_KVCACHE_AWARE_SCORER`, `PREFILL_KVCACHE_INDEXER_REDIS_ADDR`, `HF_TOKEN`, `KVCACHE_INDEXER_REDIS_ADDR` |
 | Load-aware       | Avoids busy pods                           | `ENABLE_LOAD_AWARE_SCORER`, `LOAD_AWARE_SCORER_WEIGHT`, `PREFILL_ENABLE_LOAD_AWARE_SCORER`, `PREFILL_LOAD_AWARE_SCORER_WEIGHT` |
 
diff --git a/pkg/config/config.go b/pkg/config/config.go
@@ -52,8 +52,17 @@ const (
 	pdPromptLenThresholdEnvKey  = "PD_PROMPT_LEN_THRESHOLD"
 	pdPromptLenThresholdDefault = 100
 
-	prefixScorerBlockSizeEnvKey  = "PREFIX_SCORER_BLOCK_SIZE"
-	prefixScorerBlockSizeDefault = 256
+	prefixMaxCacheSizeKey = "PREFIX_SCORER_MAX_CACHE_SIZE"
+	// DefaultPrefixMaxCacheSize sets the maximum number of blocks the LRU cache can store.
+	DefaultPrefixMaxCacheSize = 500000
+
+	prefixScorerBlockSizeEnvKey = "PREFIX_SCORER_BLOCK_SIZE"
+	// DefaultPrefixBlockSize defines how many runes each block contains in the prefix cache.
+	DefaultPrefixBlockSize = 256
+
+	prefixMaxBlockCacheSizeKey = "PREFIX_SCORER_MAX_BLOCK_CACHE_SIZE"
+	// DefaultPrefixMaxBlockCacheSize sets the maximum number of pods a block can store.
+	DefaultPrefixMaxBlockCacheSize = 100
 )
 
 // Config contains scheduler configuration, currently configuration is loaded from environment variables
@@ -62,9 +71,11 @@ type Config struct {
 	DecodeSchedulerPlugins  map[string]int
 	PrefillSchedulerPlugins map[string]int
 
-	PDEnabled       bool
-	PDThreshold     int
-	PrefixBlockSize int
+	PDEnabled            bool
+	PDThreshold          int
+	PrefixBlockSize      int
+	PrefixCacheSize      int
+	PrefixBlockCacheSize int
 }
 
 // NewConfig creates a new instance of Config
@@ -75,7 +86,9 @@ func NewConfig(logger logr.Logger) *Config {
 		PrefillSchedulerPlugins: map[string]int{},
 		PDEnabled:               false,
 		PDThreshold:             math.MaxInt,
-		PrefixBlockSize:         prefixScorerBlockSizeDefault,
+		PrefixBlockSize:         DefaultPrefixBlockSize,
+		PrefixCacheSize:         DefaultPrefixMaxCacheSize,
+		PrefixBlockCacheSize:    DefaultPrefixMaxBlockCacheSize,
 	}
 }
 
@@ -95,7 +108,9 @@ func (c *Config) LoadConfig() {
 
 	c.PDEnabled = env.GetEnvString(pdEnabledEnvKey, "false", c.logger) == "true"
 	c.PDThreshold = env.GetEnvInt(pdPromptLenThresholdEnvKey, pdPromptLenThresholdDefault, c.logger)
-	c.PrefixBlockSize = env.GetEnvInt(prefixScorerBlockSizeEnvKey, prefixScorerBlockSizeDefault, c.logger)
+	c.PrefixBlockSize = env.GetEnvInt(prefixScorerBlockSizeEnvKey, DefaultPrefixBlockSize, c.logger)
+	c.PrefixCacheSize = env.GetEnvInt(prefixMaxCacheSizeKey, DefaultPrefixMaxCacheSize, c.logger)
+	c.PrefixBlockCacheSize = env.GetEnvInt(prefixMaxBlockCacheSizeKey, DefaultPrefixMaxBlockCacheSize, c.logger)
 }
 
 func (c *Config) loadPluginInfo(plugins map[string]int, prefill bool, pluginNames ...string) {
diff --git a/pkg/scheduling/plugins/scorer/prefix_store.go b/pkg/scheduling/plugins/scorer/prefix_store.go
@@ -10,15 +10,8 @@ import (
 
 	"github.com/cespare/xxhash/v2"
 	lru "github.com/hashicorp/golang-lru/v2"
-)
 
-const (
-	// defaultMaxCacheSize sets the maximum number of blocks the LRU cache can store.
-	defaultMaxCacheSize = 500000
-	// defaultBlockSize defines how many runes each block contains in the prefix cache.
-	defaultBlockSize = 256
-	// defaultMaxBlockCacheSize sets the maximum number of pods a block can store.
-	defaultMaxBlockCacheSize = 100
+	"github.com/llm-d/llm-d-inference-scheduler/pkg/config"
 )
 
 // PrefixStoreConfig contains initialization configuration for PrefixStore.
@@ -35,9 +28,9 @@ type PrefixStoreConfig struct {
 // configuration.
 func DefaultPrefixStoreConfig() *PrefixStoreConfig {
 	return &PrefixStoreConfig{
-		CacheSize:      defaultMaxCacheSize,
-		BlockSize:      defaultBlockSize,
-		BlockCacheSize: defaultMaxBlockCacheSize,
+		CacheSize:      config.DefaultPrefixMaxCacheSize,
+		BlockSize:      config.DefaultPrefixBlockSize,
+		BlockCacheSize: config.DefaultPrefixMaxBlockCacheSize,
 	}
 }