Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion Dockerfile.epp
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,10 @@ ENV PYTHONPATH=/workspace/kv-cache/pkg/preprocessing/chat_completions:/workspace
RUN ${PYTHON} -c "import tokenizer_wrapper" # verify tokenizer_wrapper is correctly installed

# Cache root; HF_HOME is the variable Hugging Face libraries read for their cache location.
ENV HF_HOME="/tmp/.cache"
# used by kv-cache-manager
ENV LOCAL_TOKENIZER_DIR="/tmp/.cache"
# Create the cache directories and hand them to the non-root user (65532) that
# the image switches to with USER. Both paths are taken from the variables
# above rather than a repeated literal, so the directories that get created and
# chown'ed always match what the processes are told to use.
RUN mkdir -p "${HF_HOME}" "${LOCAL_TOKENIZER_DIR}" \
    && chown -R 65532:65532 "${HF_HOME}" "${LOCAL_TOKENIZER_DIR}"

USER 65532:65532

Expand All @@ -113,4 +117,3 @@ EXPOSE 9090
EXPOSE 5557

ENTRYPOINT ["/app/epp"]

10 changes: 8 additions & 2 deletions deploy/config/dp-epp-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,13 @@
apiVersion: inference.networking.x-k8s.io/v1alpha1
kind: EndpointPickerConfig
plugins:
- type: prefix-cache-scorer
- type: precise-prefix-cache-scorer
parameters:
indexerConfig:
tokenProcessorConfig:
blockSize: 5
kvBlockIndexConfig:
maxPrefixBlocksToMatch: 256
- type: decode-filter
- type: max-score-picker
- type: data-parallel-profile-handler
Expand All @@ -14,5 +20,5 @@ schedulingProfiles:
plugins:
- pluginRef: decode-filter
- pluginRef: max-score-picker
- pluginRef: prefix-cache-scorer
- pluginRef: precise-prefix-cache-scorer
weight: 2
6 changes: 3 additions & 3 deletions deploy/config/sim-epp-kvcache-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
apiVersion: inference.networking.x-k8s.io/v1alpha1
kind: EndpointPickerConfig
plugins:
- type: prefix-cache-scorer
- type: precise-prefix-cache-scorer
parameters:
mode: cache_tracking
tokenProcessorConfig:
Expand All @@ -15,7 +15,7 @@ plugins:
prefixStoreConfig:
blockSize: 16
tokenizersPoolConfig:
modelName: <model-name> # specify the model name to use for tokenizer loading
modelName: TinyLlama/TinyLlama-1.1B-Chat-v1.0 # replace value to use different model for tokenizer loading
hf:
tokenizersCacheDir: "/cache/tokenizers"
kvBlockIndexConfig:
Expand All @@ -29,5 +29,5 @@ schedulingProfiles:
plugins:
- pluginRef: decode-filter
- pluginRef: max-score-picker
- pluginRef: prefix-cache-scorer
- pluginRef: precise-prefix-cache-scorer
weight: 10
12 changes: 7 additions & 5 deletions deploy/config/sim-epp-no-hit-lru.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,13 @@
apiVersion: inference.networking.x-k8s.io/v1alpha1
kind: EndpointPickerConfig
plugins:
- type: prefix-cache-scorer
- type: precise-prefix-cache-scorer
parameters:
hashBlockSize: 5
maxPrefixBlocksToMatch: 256
lruCapacityPerServer: 31250
indexerConfig:
tokenProcessorConfig:
blockSize: 5
kvBlockIndexConfig:
maxPrefixBlocksToMatch: 256
- type: no-hit-lru-scorer
parameters:
lruSize: 2048
Expand All @@ -19,7 +21,7 @@ schedulingProfiles:
plugins:
- pluginRef: decode-filter
- pluginRef: max-score-picker
- pluginRef: prefix-cache-scorer
- pluginRef: precise-prefix-cache-scorer
weight: 2
- pluginRef: no-hit-lru-scorer
weight: 1
49 changes: 29 additions & 20 deletions scripts/kind-dev-env.sh
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ EPP_IMAGE="${EPP_IMAGE:-${IMAGE_REGISTRY}/llm-d-inference-scheduler:${EPP_TAG}}"
export EPP_IMAGE

# Set the model name to deploy
export MODEL_NAME="${MODEL_NAME:-food-review}"
export MODEL_NAME="${MODEL_NAME:-TinyLlama/TinyLlama-1.1B-Chat-v1.0}"
# Extract model family (e.g., "meta-llama" from "meta-llama/Llama-3.1-8B-Instruct")
export MODEL_FAMILY="${MODEL_NAME%%/*}"
# Extract model ID (e.g., "Llama-3.1-8B-Instruct")
Expand Down Expand Up @@ -74,32 +74,41 @@ export VLLM_REPLICA_COUNT_D="${VLLM_REPLICA_COUNT_D:-2}"
# Data Parallel size
export VLLM_DATA_PARALLEL_SIZE="${VLLM_DATA_PARALLEL_SIZE:-1}"

PRIMARY_PORT="0"
if [ "${PD_ENABLED}" != "\"true\"" ] && [ ${VLLM_DATA_PARALLEL_SIZE} -eq 1 ]; then
if [ "${KV_CACHE_ENABLED}" != "true" ]; then
DEFAULT_EPP_CONFIG="deploy/config/sim-epp-config.yaml"
else
DEFAULT_EPP_CONFIG="deploy/config/sim-epp-kvcache-config.yaml"
fi
else
if [ "${KV_CACHE_ENABLED}" != "true" ]; then
if [ "${PD_ENABLED}" == "\"true\"" ]; then
DEFAULT_EPP_CONFIG="deploy/config/sim-pd-epp-config.yaml"
if [ ${VLLM_DATA_PARALLEL_SIZE} -ne 1 ]; then
PRIMARY_PORT="8000"
fi
else
DEFAULT_EPP_CONFIG="deploy/config/dp-epp-config.yaml"
fi
else
# Validate configuration constraints
if [ "${KV_CACHE_ENABLED}" == "true" ]; then
  # KV cache requires simple mode: no PD and DP size must be 1
  if [ "${PD_ENABLED}" == "\"true\"" ] || [ "${VLLM_DATA_PARALLEL_SIZE}" -ne 1 ]; then
    # The check rejects either PD being enabled or a DP size other than 1, so
    # the message names both constraints and echoes the offending values.
    # Errors go to stderr so they are not mixed into captured stdout.
    echo "Invalid configuration: KV_CACHE_ENABLED=true requires PD to be disabled and VLLM_DATA_PARALLEL_SIZE=1 (got PD_ENABLED=${PD_ENABLED}, VLLM_DATA_PARALLEL_SIZE=${VLLM_DATA_PARALLEL_SIZE})" >&2
    exit 1
  fi
fi

export EPP_CONFIG="${EPP_CONFIG:-${DEFAULT_EPP_CONFIG}}"
# PRIMARY_PORT is "8000" only when PD is enabled together with data parallelism
# (DP size other than 1); every other combination keeps "0".
PRIMARY_PORT="0"
if [ "${PD_ENABLED}" == "\"true\"" ]; then
  if [ ${VLLM_DATA_PARALLEL_SIZE} -ne 1 ]; then
    PRIMARY_PORT="8000"
  fi
fi
export PRIMARY_PORT

# Choose the default EPP config file from the feature flags. Start from the
# simple-mode config and override it in priority order: KV cache first, then
# Prefill-Decode, then Data Parallel.
DEFAULT_EPP_CONFIG="deploy/config/sim-epp-config.yaml"
if [ "${KV_CACHE_ENABLED}" == "true" ]; then
  # KV cache mode (simple mode only)
  DEFAULT_EPP_CONFIG="deploy/config/sim-epp-kvcache-config.yaml"
elif [ "${PD_ENABLED}" == "\"true\"" ]; then
  # Prefill-Decode mode
  DEFAULT_EPP_CONFIG="deploy/config/sim-pd-epp-config.yaml"
elif [ ${VLLM_DATA_PARALLEL_SIZE} -ne 1 ]; then
  # Data Parallel mode (only needed for Istio pre-1.28.1).
  # Per the original author's note, this branch is not really exercised in
  # kind (docker.io/istio/pilot:1.28.1) by "make env-dev-kind".
  DEFAULT_EPP_CONFIG="deploy/config/dp-epp-config.yaml"
fi

# An EPP_CONFIG already set (and non-empty) in the environment wins over the default.
: "${EPP_CONFIG:=${DEFAULT_EPP_CONFIG}}"
export EPP_CONFIG

# ------------------------------------------------------------------------------
# Setup & Requirement Checks
# ------------------------------------------------------------------------------
Expand Down