diff --git a/Dockerfile.epp b/Dockerfile.epp index 9d1e89fdb..4996c6fbe 100644 --- a/Dockerfile.epp +++ b/Dockerfile.epp @@ -102,6 +102,10 @@ ENV PYTHONPATH=/workspace/kv-cache/pkg/preprocessing/chat_completions:/workspace RUN ${PYTHON} -c "import tokenizer_wrapper" # verify tokenizer_wrapper is correctly installed ENV HF_HOME="/tmp/.cache" +# used by kv-cache-manager +ENV LOCAL_TOKENIZER_DIR="/tmp/.cache" +# Create cache directory and set permissions for non-root user +RUN mkdir -p "${HF_HOME}" && chown -R 65532:65532 "${HF_HOME}" USER 65532:65532 @@ -113,4 +117,3 @@ EXPOSE 9090 EXPOSE 5557 ENTRYPOINT ["/app/epp"] - diff --git a/deploy/config/dp-epp-config.yaml b/deploy/config/dp-epp-config.yaml index 703a44f67..6e8418866 100644 --- a/deploy/config/dp-epp-config.yaml +++ b/deploy/config/dp-epp-config.yaml @@ -3,7 +3,13 @@ apiVersion: inference.networking.x-k8s.io/v1alpha1 kind: EndpointPickerConfig plugins: -- type: prefix-cache-scorer +- type: precise-prefix-cache-scorer + parameters: + indexerConfig: + tokenProcessorConfig: + blockSize: 5 + kvBlockIndexConfig: + maxPrefixBlocksToMatch: 256 - type: decode-filter - type: max-score-picker - type: data-parallel-profile-handler @@ -14,5 +20,5 @@ schedulingProfiles: plugins: - pluginRef: decode-filter - pluginRef: max-score-picker - - pluginRef: prefix-cache-scorer + - pluginRef: precise-prefix-cache-scorer weight: 2 diff --git a/deploy/config/sim-epp-kvcache-config.yaml b/deploy/config/sim-epp-kvcache-config.yaml index 7850950ef..76aab070f 100644 --- a/deploy/config/sim-epp-kvcache-config.yaml +++ b/deploy/config/sim-epp-kvcache-config.yaml @@ -3,7 +3,7 @@ apiVersion: inference.networking.x-k8s.io/v1alpha1 kind: EndpointPickerConfig plugins: -- type: prefix-cache-scorer +- type: precise-prefix-cache-scorer parameters: mode: cache_tracking tokenProcessorConfig: @@ -15,7 +15,7 @@ plugins: prefixStoreConfig: blockSize: 16 tokenizersPoolConfig: - modelName: # specify the model name to use for tokenizer loading + modelName: 
TinyLlama/TinyLlama-1.1B-Chat-v1.0 # replace value to use different model for tokenizer loading hf: tokenizersCacheDir: "/cache/tokenizers" kvBlockIndexConfig: @@ -29,5 +29,5 @@ schedulingProfiles: plugins: - pluginRef: decode-filter - pluginRef: max-score-picker - - pluginRef: prefix-cache-scorer + - pluginRef: precise-prefix-cache-scorer weight: 10 diff --git a/deploy/config/sim-epp-no-hit-lru.yaml b/deploy/config/sim-epp-no-hit-lru.yaml index 8d0224411..e10ec5062 100644 --- a/deploy/config/sim-epp-no-hit-lru.yaml +++ b/deploy/config/sim-epp-no-hit-lru.yaml @@ -3,11 +3,13 @@ apiVersion: inference.networking.x-k8s.io/v1alpha1 kind: EndpointPickerConfig plugins: -- type: prefix-cache-scorer +- type: precise-prefix-cache-scorer parameters: - hashBlockSize: 5 - maxPrefixBlocksToMatch: 256 - lruCapacityPerServer: 31250 + indexerConfig: + tokenProcessorConfig: + blockSize: 5 + kvBlockIndexConfig: + maxPrefixBlocksToMatch: 256 - type: no-hit-lru-scorer parameters: lruSize: 2048 @@ -19,7 +21,7 @@ schedulingProfiles: plugins: - pluginRef: decode-filter - pluginRef: max-score-picker - - pluginRef: prefix-cache-scorer + - pluginRef: precise-prefix-cache-scorer weight: 2 - pluginRef: no-hit-lru-scorer weight: 1 diff --git a/scripts/kind-dev-env.sh b/scripts/kind-dev-env.sh index 5ef91726d..6b0d22a04 100755 --- a/scripts/kind-dev-env.sh +++ b/scripts/kind-dev-env.sh @@ -37,7 +37,7 @@ EPP_IMAGE="${EPP_IMAGE:-${IMAGE_REGISTRY}/llm-d-inference-scheduler:${EPP_TAG}}" export EPP_IMAGE # Set the model name to deploy -export MODEL_NAME="${MODEL_NAME:-food-review}" +export MODEL_NAME="${MODEL_NAME:-TinyLlama/TinyLlama-1.1B-Chat-v1.0}" # Extract model family (e.g., "meta-llama" from "meta-llama/Llama-3.1-8B-Instruct") export MODEL_FAMILY="${MODEL_NAME%%/*}" # Extract model ID (e.g., "Llama-3.1-8B-Instruct") @@ -74,32 +74,41 @@ export VLLM_REPLICA_COUNT_D="${VLLM_REPLICA_COUNT_D:-2}" # Data Parallel size export VLLM_DATA_PARALLEL_SIZE="${VLLM_DATA_PARALLEL_SIZE:-1}" -PRIMARY_PORT="0" 
-if [ "${PD_ENABLED}" != "\"true\"" ] && [ ${VLLM_DATA_PARALLEL_SIZE} -eq 1 ]; then - if [ "${KV_CACHE_ENABLED}" != "true" ]; then - DEFAULT_EPP_CONFIG="deploy/config/sim-epp-config.yaml" - else - DEFAULT_EPP_CONFIG="deploy/config/sim-epp-kvcache-config.yaml" - fi -else - if [ "${KV_CACHE_ENABLED}" != "true" ]; then - if [ "${PD_ENABLED}" == "\"true\"" ]; then - DEFAULT_EPP_CONFIG="deploy/config/sim-pd-epp-config.yaml" - if [ ${VLLM_DATA_PARALLEL_SIZE} -ne 1 ]; then - PRIMARY_PORT="8000" - fi - else - DEFAULT_EPP_CONFIG="deploy/config/dp-epp-config.yaml" - fi - else +# Validate configuration constraints +if [ "${KV_CACHE_ENABLED}" == "true" ]; then + # KV cache requires simple mode: no PD and DP size must be 1 + if [ "${PD_ENABLED}" == "\"true\"" ] || [ "${VLLM_DATA_PARALLEL_SIZE}" -ne 1 ]; then - echo "Invalid configuration: PD_ENABLED=true and KV_CACHE_ENABLED=true is not supported" + echo "Invalid configuration: KV_CACHE_ENABLED=true is not supported with PD_ENABLED=true or VLLM_DATA_PARALLEL_SIZE != 1" exit 1 fi fi -export EPP_CONFIG="${EPP_CONFIG:-${DEFAULT_EPP_CONFIG}}" +# Set PRIMARY_PORT based on PD mode with data parallelism +if [ "${PD_ENABLED}" == "\"true\"" ] && [ "${VLLM_DATA_PARALLEL_SIZE}" -ne 1 ]; then + PRIMARY_PORT="8000" +else + PRIMARY_PORT="0" +fi export PRIMARY_PORT +# Determine EPP config file based on feature flags +if [ "${KV_CACHE_ENABLED}" == "true" ]; then + # KV cache mode (simple mode only) + DEFAULT_EPP_CONFIG="deploy/config/sim-epp-kvcache-config.yaml" +elif [ "${PD_ENABLED}" == "\"true\"" ]; then + # Prefill-Decode mode + DEFAULT_EPP_CONFIG="deploy/config/sim-pd-epp-config.yaml" +elif [ "${VLLM_DATA_PARALLEL_SIZE}" -ne 1 ]; then + # Data Parallel mode (only needed for Istio pre-1.28.1) + # Not really called in kind(docker.io/istio/pilot:1.28.1) by "make env-dev-kind" + DEFAULT_EPP_CONFIG="deploy/config/dp-epp-config.yaml" +else + # Simple mode + DEFAULT_EPP_CONFIG="deploy/config/sim-epp-config.yaml" +fi + +export EPP_CONFIG="${EPP_CONFIG:-${DEFAULT_EPP_CONFIG}}" + # ------------------------------------------------------------------------------ # Setup & 
Requirement Checks # ------------------------------------------------------------------------------