feat: use Tinyllama as the "model" for kind test

zdtsw · zdtsw · commit 7aa5bc3ca91c · 2026-01-21T18:13:11.000+01:00
- in order to test precise-prefix-cache-scorer we cannot use
  food-review, since the scorer needs to call kv-cache-manager to get a
  tokenizer, which requires fetching a real model from HF
- the change is to switch the "default model" to TinyLlama
- also, to make the tokenizer folder writable, change its ownership to
  the non-root USER in the Dockerfile
- rename dp-epp-config.yaml to sim-dp-epp-config.yaml, as it is used
  for local testing

Signed-off-by: Wen Zhou <wenzhou@redhat.com>
diff --git a/Dockerfile.epp b/Dockerfile.epp
@@ -92,6 +92,10 @@ ENV PYTHONPATH=/workspace/kv-cache/pkg/preprocessing/chat_completions:/workspace
 RUN ${PYTHON} -c "import tokenizer_wrapper"  # verify tokenizer_wrapper is correctly installed
 
 ENV HF_HOME="/tmp/.cache"
+# used by kv-cache-manager
+ENV LOCAL_TOKENIZER_DIR="/tmp/.cache"
+# Create cache directory and set permissions for non-root user
+RUN mkdir -p /tmp/.cache && chown -R 65532:65532 ${HF_HOME}
 
 USER 65532:65532
 
@@ -103,4 +107,3 @@ EXPOSE 9090
 EXPOSE 5557
 
 ENTRYPOINT ["/app/epp"]
-
diff --git a/deploy/config/epp-config.yaml b/deploy/config/epp-config.yaml
@@ -3,7 +3,7 @@
 apiVersion: inference.networking.x-k8s.io/v1alpha1
 kind: EndpointPickerConfig
 plugins:
-- type: prefix-cache-scorer
+- type: precise-prefix-cache-scorer
 - type: decode-filter
 - type: max-score-picker
 - type: single-profile-handler
@@ -12,5 +12,5 @@ schedulingProfiles:
   plugins:
   - pluginRef: decode-filter
   - pluginRef: max-score-picker
-  - pluginRef: prefix-cache-scorer
+  - pluginRef: precise-prefix-cache-scorer
     weight: 2
diff --git a/deploy/config/sim-dp-epp-config.yaml b/deploy/config/sim-dp-epp-config.yaml
@@ -3,7 +3,13 @@
 apiVersion: inference.networking.x-k8s.io/v1alpha1
 kind: EndpointPickerConfig
 plugins:
-- type: prefix-cache-scorer
+- type: precise-prefix-cache-scorer
+  parameters:
+    indexerConfig:
+      tokenProcessorConfig:
+        blockSize: 5
+      kvBlockIndexConfig:
+        maxPrefixBlocksToMatch: 256
 - type: decode-filter
 - type: max-score-picker
 - type: data-parallel-profile-handler
@@ -14,5 +20,5 @@ schedulingProfiles:
   plugins:
   - pluginRef: decode-filter
   - pluginRef: max-score-picker
-  - pluginRef: prefix-cache-scorer
+  - pluginRef: precise-prefix-cache-scorer
     weight: 2
diff --git a/deploy/config/sim-epp-config.yaml b/deploy/config/sim-epp-config.yaml
@@ -3,11 +3,13 @@
 apiVersion: inference.networking.x-k8s.io/v1alpha1
 kind: EndpointPickerConfig
 plugins:
-- type: prefix-cache-scorer
+- type: precise-prefix-cache-scorer
   parameters:
-    hashBlockSize: 5
-    maxPrefixBlocksToMatch: 256
-    lruCapacityPerServer: 31250
+    indexerConfig:
+      tokenProcessorConfig:
+        blockSize: 5
+      kvBlockIndexConfig:
+        maxPrefixBlocksToMatch: 256
 - type: decode-filter
 - type: max-score-picker
 - type: single-profile-handler
@@ -16,5 +18,5 @@ schedulingProfiles:
   plugins:
   - pluginRef: decode-filter
   - pluginRef: max-score-picker
-  - pluginRef: prefix-cache-scorer
+  - pluginRef: precise-prefix-cache-scorer
     weight: 2
diff --git a/deploy/config/sim-epp-kvcache-config.yaml b/deploy/config/sim-epp-kvcache-config.yaml
@@ -3,7 +3,7 @@
 apiVersion: inference.networking.x-k8s.io/v1alpha1
 kind: EndpointPickerConfig
 plugins:
-- type: prefix-cache-scorer
+- type: precise-prefix-cache-scorer
   parameters:
     mode: cache_tracking
     tokenProcessorConfig:
@@ -15,7 +15,7 @@ plugins:
       prefixStoreConfig:
         blockSize: 16 
       tokenizersPoolConfig:
-        modelName: <model-name>            # specify the model name to use for tokenizer loading
+        modelName: TinyLlama/TinyLlama-1.1B-Chat-v1.0  # replace value to use different model for tokenizer loading
         hf:
           tokenizersCacheDir: "/cache/tokenizers"
       kvBlockIndexConfig:
@@ -29,5 +29,5 @@ schedulingProfiles:
   plugins:
   - pluginRef: decode-filter
   - pluginRef: max-score-picker
-  - pluginRef: prefix-cache-scorer
+  - pluginRef: precise-prefix-cache-scorer
     weight: 10
diff --git a/deploy/config/sim-epp-no-hit-lru.yaml b/deploy/config/sim-epp-no-hit-lru.yaml
@@ -3,11 +3,13 @@
 apiVersion: inference.networking.x-k8s.io/v1alpha1
 kind: EndpointPickerConfig
 plugins:
-- type: prefix-cache-scorer
+- type: precise-prefix-cache-scorer
   parameters:
-    hashBlockSize: 5
-    maxPrefixBlocksToMatch: 256
-    lruCapacityPerServer: 31250
+    indexerConfig:
+      tokenProcessorConfig:
+        blockSize: 5
+      kvBlockIndexConfig:
+        maxPrefixBlocksToMatch: 256
 - type: no-hit-lru-scorer
   parameters:
     lruSize: 2048
@@ -19,7 +21,7 @@ schedulingProfiles:
   plugins:
   - pluginRef: decode-filter
   - pluginRef: max-score-picker
-  - pluginRef: prefix-cache-scorer
+  - pluginRef: precise-prefix-cache-scorer
     weight: 2
   - pluginRef: no-hit-lru-scorer
     weight: 1
diff --git a/deploy/config/sim-pd-epp-config.yaml b/deploy/config/sim-pd-epp-config.yaml
@@ -4,11 +4,13 @@ apiVersion: inference.networking.x-k8s.io/v1alpha1
 kind: EndpointPickerConfig
 plugins:
 - type: prefill-header-handler
-- type: prefix-cache-scorer
+- type: precise-prefix-cache-scorer
   parameters:
-    hashBlockSize: 5
-    maxPrefixBlocksToMatch: 256
-    lruCapacityPerServer: 31250
+    indexerConfig:
+      tokenProcessorConfig:
+        blockSize: 5
+      kvBlockIndexConfig:
+        maxPrefixBlocksToMatch: 256
 - type: prefill-filter
 - type: decode-filter
 - type: max-score-picker
@@ -22,11 +24,11 @@ schedulingProfiles:
   plugins:
   - pluginRef: prefill-filter
   - pluginRef: max-score-picker
-  - pluginRef: prefix-cache-scorer
+  - pluginRef: precise-prefix-cache-scorer
     weight: 2
 - name: decode
   plugins:
   - pluginRef: decode-filter
   - pluginRef: max-score-picker
-  - pluginRef: prefix-cache-scorer
+  - pluginRef: precise-prefix-cache-scorer
     weight: 2
diff --git a/scripts/kind-dev-env.sh b/scripts/kind-dev-env.sh
@@ -37,7 +37,7 @@ EPP_IMAGE="${EPP_IMAGE:-${IMAGE_REGISTRY}/llm-d-inference-scheduler:${EPP_TAG}}"
 export EPP_IMAGE
 
 # Set the model name to deploy
-export MODEL_NAME="${MODEL_NAME:-food-review}"
+export MODEL_NAME="${MODEL_NAME:-TinyLlama/TinyLlama-1.1B-Chat-v1.0}"
 # Extract model family (e.g., "meta-llama" from "meta-llama/Llama-3.1-8B-Instruct")
 export MODEL_FAMILY="${MODEL_NAME%%/*}"
 # Extract model ID (e.g., "Llama-3.1-8B-Instruct")
@@ -89,7 +89,7 @@ else
         PRIMARY_PORT="8000"
       fi
     else
-      DEFAULT_EPP_CONFIG="deploy/config/dp-epp-config.yaml"
+      DEFAULT_EPP_CONFIG="deploy/config/sim-dp-epp-config.yaml"
     fi
   else
     echo "Invalid configuration: PD_ENABLED=true and KV_CACHE_ENABLED=true is not supported"