Skip to content

Commit 7aa5bc3

Browse files
committed
feat: use TinyLlama as the "model" for kind test
- in order to test precise-prefix-cache-scorer we cannot use food-review, since the scorer needs to call kv-cache-manager to get a tokenizer by fetching a real model from HF - the change is to switch the "default model" to TinyLlama - also, to make the tokenizer folder writable, its ownership needs to be changed to the USER in the Dockerfile - rename dp-epp-config.yaml to sim-dp-epp-config.yaml as it is used for local tests Signed-off-by: Wen Zhou <wenzhou@redhat.com>
1 parent 981f17a commit 7aa5bc3

File tree

8 files changed

+41
-26
lines changed

8 files changed

+41
-26
lines changed

Dockerfile.epp

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,10 @@ ENV PYTHONPATH=/workspace/kv-cache/pkg/preprocessing/chat_completions:/workspace
9292
RUN ${PYTHON} -c "import tokenizer_wrapper" # verify tokenizer_wrapper is correctly installed
9393

9494
ENV HF_HOME="/tmp/.cache"
95+
# used by kv-cache-manager
96+
ENV LOCAL_TOKENIZER_DIR="/tmp/.cache"
97+
# Create cache directory and set permissions for non-root user
98+
RUN mkdir -p /tmp/.cache && chown -R 65532:65532 ${HF_HOME}
9599

96100
USER 65532:65532
97101

@@ -103,4 +107,3 @@ EXPOSE 9090
103107
EXPOSE 5557
104108

105109
ENTRYPOINT ["/app/epp"]
106-

deploy/config/epp-config.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
apiVersion: inference.networking.x-k8s.io/v1alpha1
44
kind: EndpointPickerConfig
55
plugins:
6-
- type: prefix-cache-scorer
6+
- type: precise-prefix-cache-scorer
77
- type: decode-filter
88
- type: max-score-picker
99
- type: single-profile-handler
@@ -12,5 +12,5 @@ schedulingProfiles:
1212
plugins:
1313
- pluginRef: decode-filter
1414
- pluginRef: max-score-picker
15-
- pluginRef: prefix-cache-scorer
15+
- pluginRef: precise-prefix-cache-scorer
1616
weight: 2
Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,13 @@
33
apiVersion: inference.networking.x-k8s.io/v1alpha1
44
kind: EndpointPickerConfig
55
plugins:
6-
- type: prefix-cache-scorer
6+
- type: precise-prefix-cache-scorer
7+
parameters:
8+
indexerConfig:
9+
tokenProcessorConfig:
10+
blockSize: 5
11+
kvBlockIndexConfig:
12+
maxPrefixBlocksToMatch: 256
713
- type: decode-filter
814
- type: max-score-picker
915
- type: data-parallel-profile-handler
@@ -14,5 +20,5 @@ schedulingProfiles:
1420
plugins:
1521
- pluginRef: decode-filter
1622
- pluginRef: max-score-picker
17-
- pluginRef: prefix-cache-scorer
23+
- pluginRef: precise-prefix-cache-scorer
1824
weight: 2

deploy/config/sim-epp-config.yaml

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,13 @@
33
apiVersion: inference.networking.x-k8s.io/v1alpha1
44
kind: EndpointPickerConfig
55
plugins:
6-
- type: prefix-cache-scorer
6+
- type: precise-prefix-cache-scorer
77
parameters:
8-
hashBlockSize: 5
9-
maxPrefixBlocksToMatch: 256
10-
lruCapacityPerServer: 31250
8+
indexerConfig:
9+
tokenProcessorConfig:
10+
blockSize: 5
11+
kvBlockIndexConfig:
12+
maxPrefixBlocksToMatch: 256
1113
- type: decode-filter
1214
- type: max-score-picker
1315
- type: single-profile-handler
@@ -16,5 +18,5 @@ schedulingProfiles:
1618
plugins:
1719
- pluginRef: decode-filter
1820
- pluginRef: max-score-picker
19-
- pluginRef: prefix-cache-scorer
21+
- pluginRef: precise-prefix-cache-scorer
2022
weight: 2

deploy/config/sim-epp-kvcache-config.yaml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
apiVersion: inference.networking.x-k8s.io/v1alpha1
44
kind: EndpointPickerConfig
55
plugins:
6-
- type: prefix-cache-scorer
6+
- type: precise-prefix-cache-scorer
77
parameters:
88
mode: cache_tracking
99
tokenProcessorConfig:
@@ -15,7 +15,7 @@ plugins:
1515
prefixStoreConfig:
1616
blockSize: 16
1717
tokenizersPoolConfig:
18-
modelName: <model-name> # specify the model name to use for tokenizer loading
18+
modelName: TinyLlama/TinyLlama-1.1B-Chat-v1.0 # replace value to use different model for tokenizer loading
1919
hf:
2020
tokenizersCacheDir: "/cache/tokenizers"
2121
kvBlockIndexConfig:
@@ -29,5 +29,5 @@ schedulingProfiles:
2929
plugins:
3030
- pluginRef: decode-filter
3131
- pluginRef: max-score-picker
32-
- pluginRef: prefix-cache-scorer
32+
- pluginRef: precise-prefix-cache-scorer
3333
weight: 10

deploy/config/sim-epp-no-hit-lru.yaml

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,13 @@
33
apiVersion: inference.networking.x-k8s.io/v1alpha1
44
kind: EndpointPickerConfig
55
plugins:
6-
- type: prefix-cache-scorer
6+
- type: precise-prefix-cache-scorer
77
parameters:
8-
hashBlockSize: 5
9-
maxPrefixBlocksToMatch: 256
10-
lruCapacityPerServer: 31250
8+
indexerConfig:
9+
tokenProcessorConfig:
10+
blockSize: 5
11+
kvBlockIndexConfig:
12+
maxPrefixBlocksToMatch: 256
1113
- type: no-hit-lru-scorer
1214
parameters:
1315
lruSize: 2048
@@ -19,7 +21,7 @@ schedulingProfiles:
1921
plugins:
2022
- pluginRef: decode-filter
2123
- pluginRef: max-score-picker
22-
- pluginRef: prefix-cache-scorer
24+
- pluginRef: precise-prefix-cache-scorer
2325
weight: 2
2426
- pluginRef: no-hit-lru-scorer
2527
weight: 1

deploy/config/sim-pd-epp-config.yaml

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,13 @@ apiVersion: inference.networking.x-k8s.io/v1alpha1
44
kind: EndpointPickerConfig
55
plugins:
66
- type: prefill-header-handler
7-
- type: prefix-cache-scorer
7+
- type: precise-prefix-cache-scorer
88
parameters:
9-
hashBlockSize: 5
10-
maxPrefixBlocksToMatch: 256
11-
lruCapacityPerServer: 31250
9+
indexerConfig:
10+
tokenProcessorConfig:
11+
blockSize: 5
12+
kvBlockIndexConfig:
13+
maxPrefixBlocksToMatch: 256
1214
- type: prefill-filter
1315
- type: decode-filter
1416
- type: max-score-picker
@@ -22,11 +24,11 @@ schedulingProfiles:
2224
plugins:
2325
- pluginRef: prefill-filter
2426
- pluginRef: max-score-picker
25-
- pluginRef: prefix-cache-scorer
27+
- pluginRef: precise-prefix-cache-scorer
2628
weight: 2
2729
- name: decode
2830
plugins:
2931
- pluginRef: decode-filter
3032
- pluginRef: max-score-picker
31-
- pluginRef: prefix-cache-scorer
33+
- pluginRef: precise-prefix-cache-scorer
3234
weight: 2

scripts/kind-dev-env.sh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ EPP_IMAGE="${EPP_IMAGE:-${IMAGE_REGISTRY}/llm-d-inference-scheduler:${EPP_TAG}}"
3737
export EPP_IMAGE
3838

3939
# Set the model name to deploy
40-
export MODEL_NAME="${MODEL_NAME:-food-review}"
40+
export MODEL_NAME="${MODEL_NAME:-TinyLlama/TinyLlama-1.1B-Chat-v1.0}"
4141
# Extract model family (e.g., "meta-llama" from "meta-llama/Llama-3.1-8B-Instruct")
4242
export MODEL_FAMILY="${MODEL_NAME%%/*}"
4343
# Extract model ID (e.g., "Llama-3.1-8B-Instruct")
@@ -89,7 +89,7 @@ else
8989
PRIMARY_PORT="8000"
9090
fi
9191
else
92-
DEFAULT_EPP_CONFIG="deploy/config/dp-epp-config.yaml"
92+
DEFAULT_EPP_CONFIG="deploy/config/sim-dp-epp-config.yaml"
9393
fi
9494
else
9595
echo "Invalid configuration: PD_ENABLED=true and KV_CACHE_ENABLED=true is not supported"

0 commit comments

Comments
 (0)