File tree Expand file tree Collapse file tree 8 files changed +41
-26
lines changed
Expand file tree Collapse file tree 8 files changed +41
-26
lines changed Original file line number Diff line number Diff line change @@ -92,6 +92,10 @@ ENV PYTHONPATH=/workspace/kv-cache/pkg/preprocessing/chat_completions:/workspace
9292RUN ${PYTHON} -c "import tokenizer_wrapper" # verify tokenizer_wrapper is correctly installed
9393
9494ENV HF_HOME="/tmp/.cache"
95+ # used by kv-cache-manager
96+ ENV LOCAL_TOKENIZER_DIR="/tmp/.cache"
97+ # Create cache directory and set permissions for non-root user
98+ RUN mkdir -p /tmp/.cache && chown -R 65532:65532 ${HF_HOME}
9599
96100USER 65532:65532
97101
@@ -103,4 +107,3 @@ EXPOSE 9090
103107EXPOSE 5557
104108
105109ENTRYPOINT ["/app/epp"]
106-
Original file line number Diff line number Diff line change 33apiVersion : inference.networking.x-k8s.io/v1alpha1
44kind : EndpointPickerConfig
55plugins :
6- - type : prefix-cache-scorer
6+ - type : precise-prefix-cache-scorer
77- type : decode-filter
88- type : max-score-picker
99- type : single-profile-handler
@@ -12,5 +12,5 @@ schedulingProfiles:
1212 plugins :
1313 - pluginRef : decode-filter
1414 - pluginRef : max-score-picker
15- - pluginRef : prefix-cache-scorer
15+ - pluginRef : precise-prefix-cache-scorer
1616 weight : 2
Original file line number Diff line number Diff line change 33apiVersion : inference.networking.x-k8s.io/v1alpha1
44kind : EndpointPickerConfig
55plugins :
6- - type : prefix-cache-scorer
6+ - type : precise-prefix-cache-scorer
7+ parameters :
8+ indexerConfig :
9+ tokenProcessorConfig :
10+ blockSize : 5
11+ kvBlockIndexConfig :
12+ maxPrefixBlocksToMatch : 256
713- type : decode-filter
814- type : max-score-picker
915- type : data-parallel-profile-handler
@@ -14,5 +20,5 @@ schedulingProfiles:
1420 plugins :
1521 - pluginRef : decode-filter
1622 - pluginRef : max-score-picker
17- - pluginRef : prefix-cache-scorer
23+ - pluginRef : precise-prefix-cache-scorer
1824 weight : 2
Original file line number Diff line number Diff line change 33apiVersion : inference.networking.x-k8s.io/v1alpha1
44kind : EndpointPickerConfig
55plugins :
6- - type : prefix-cache-scorer
6+ - type : precise-prefix-cache-scorer
77 parameters :
8- hashBlockSize : 5
9- maxPrefixBlocksToMatch : 256
10- lruCapacityPerServer : 31250
8+ indexerConfig :
9+ tokenProcessorConfig :
10+ blockSize : 5
11+ kvBlockIndexConfig :
12+ maxPrefixBlocksToMatch : 256
1113- type : decode-filter
1214- type : max-score-picker
1315- type : single-profile-handler
@@ -16,5 +18,5 @@ schedulingProfiles:
1618 plugins :
1719 - pluginRef : decode-filter
1820 - pluginRef : max-score-picker
19- - pluginRef : prefix-cache-scorer
21+ - pluginRef : precise-prefix-cache-scorer
2022 weight : 2
Original file line number Diff line number Diff line change 33apiVersion : inference.networking.x-k8s.io/v1alpha1
44kind : EndpointPickerConfig
55plugins :
6- - type : prefix-cache-scorer
6+ - type : precise-prefix-cache-scorer
77 parameters :
88 mode : cache_tracking
99 tokenProcessorConfig :
@@ -15,7 +15,7 @@ plugins:
1515 prefixStoreConfig :
1616 blockSize : 16
1717 tokenizersPoolConfig :
18- modelName : <model-name> # specify the model name to use for tokenizer loading
18+ modelName : TinyLlama/TinyLlama-1.1B-Chat-v1.0 # replace value to use different model for tokenizer loading
1919 hf :
2020 tokenizersCacheDir : "/cache/tokenizers"
2121 kvBlockIndexConfig :
@@ -29,5 +29,5 @@ schedulingProfiles:
2929 plugins :
3030 - pluginRef : decode-filter
3131 - pluginRef : max-score-picker
32- - pluginRef : prefix-cache-scorer
32+ - pluginRef : precise-prefix-cache-scorer
3333 weight : 10
Original file line number Diff line number Diff line change 33apiVersion : inference.networking.x-k8s.io/v1alpha1
44kind : EndpointPickerConfig
55plugins :
6- - type : prefix-cache-scorer
6+ - type : precise-prefix-cache-scorer
77 parameters :
8- hashBlockSize : 5
9- maxPrefixBlocksToMatch : 256
10- lruCapacityPerServer : 31250
8+ indexerConfig :
9+ tokenProcessorConfig :
10+ blockSize : 5
11+ kvBlockIndexConfig :
12+ maxPrefixBlocksToMatch : 256
1113- type : no-hit-lru-scorer
1214 parameters :
1315 lruSize : 2048
@@ -19,7 +21,7 @@ schedulingProfiles:
1921 plugins :
2022 - pluginRef : decode-filter
2123 - pluginRef : max-score-picker
22- - pluginRef : prefix-cache-scorer
24+ - pluginRef : precise-prefix-cache-scorer
2325 weight : 2
2426 - pluginRef : no-hit-lru-scorer
2527 weight : 1
Original file line number Diff line number Diff line change @@ -4,11 +4,13 @@ apiVersion: inference.networking.x-k8s.io/v1alpha1
44kind : EndpointPickerConfig
55plugins :
66- type : prefill-header-handler
7- - type : prefix-cache-scorer
7+ - type : precise-prefix-cache-scorer
88 parameters :
9- hashBlockSize : 5
10- maxPrefixBlocksToMatch : 256
11- lruCapacityPerServer : 31250
9+ indexerConfig :
10+ tokenProcessorConfig :
11+ blockSize : 5
12+ kvBlockIndexConfig :
13+ maxPrefixBlocksToMatch : 256
1214- type : prefill-filter
1315- type : decode-filter
1416- type : max-score-picker
@@ -22,11 +24,11 @@ schedulingProfiles:
2224 plugins :
2325 - pluginRef : prefill-filter
2426 - pluginRef : max-score-picker
25- - pluginRef : prefix-cache-scorer
27+ - pluginRef : precise-prefix-cache-scorer
2628 weight : 2
2729- name : decode
2830 plugins :
2931 - pluginRef : decode-filter
3032 - pluginRef : max-score-picker
31- - pluginRef : prefix-cache-scorer
33+ - pluginRef : precise-prefix-cache-scorer
3234 weight : 2
Original file line number Diff line number Diff line change @@ -37,7 +37,7 @@ EPP_IMAGE="${EPP_IMAGE:-${IMAGE_REGISTRY}/llm-d-inference-scheduler:${EPP_TAG}}"
3737export EPP_IMAGE
3838
3939# Set the model name to deploy
40- export MODEL_NAME="${MODEL_NAME:-food-review}"
40+ export MODEL_NAME="${MODEL_NAME:-TinyLlama/TinyLlama-1.1B-Chat-v1.0}"
4141# Extract model family (e.g., "meta-llama" from "meta-llama/Llama-3.1-8B-Instruct")
4242export MODEL_FAMILY="${MODEL_NAME%%/*}"
4343# Extract model ID (e.g., "Llama-3.1-8B-Instruct")
8989 PRIMARY_PORT="8000"
9090 fi
9191 else
92- DEFAULT_EPP_CONFIG="deploy/config/dp-epp-config.yaml"
92+ DEFAULT_EPP_CONFIG="deploy/config/sim-dp-epp-config.yaml"
9393 fi
9494 else
9595 echo "Invalid configuration: PD_ENABLED=true and KV_CACHE_ENABLED=true is not supported"
You can’t perform that action at this time.
0 commit comments