116 changes: 116 additions & 0 deletions docs/deployment/setup.md
@@ -0,0 +1,116 @@
# KV-Cache Manager Setup Guide

This guide provides a complete walkthrough for setting up and testing the example llm-d-kv-cache-manager system. You will deploy vLLM with LMCache and Redis, then run an example application that demonstrates KV-cache indexing.

By following this guide, you will:

1. **Deploy the Infrastructure**: Use Helm to set up:
   - vLLM nodes with LMCache CPU offloading (4 replicas) serving the Llama 3.1 8B Instruct model
   - A Redis server
2. **Test with the Example Application**: Run a Go application that:
   - Connects to your deployed vLLM and Redis infrastructure
   - Demonstrates KV-cache indexing by processing a sample prompt

The KV-cache indexer demonstrated here enables cache-aware routing, accelerating inference across the system by minimizing redundant computation.

## vLLM Deployment

The llm-d-kv-cache-manager repository includes a Helm chart for deploying vLLM with CPU offloading (LMCache) and KV-events indexing (Redis). This section describes how to use this Helm chart for a complete deployment.

*Note*: Ensure that the Kubernetes nodes designated for running vLLM support GPU workloads.

### Prerequisites

- Kubernetes cluster with GPU support
- Helm 3.x
- HuggingFace token for accessing models
- kubectl configured to access your cluster

### Installation

1. Set environment variables:

```bash
export HF_TOKEN=<your-huggingface-token>
export NAMESPACE=<your-namespace>
export MODEL_NAME="meta-llama/Llama-3.1-8B-Instruct"
export VLLM_POOLLABEL="vllm-model-pool"
```

> Note that both the Helm deployment and the example application use the same `MODEL_NAME` environment variable,
> ensuring alignment between the vLLM deployment configuration and the KV cache indexer.
> Set this variable once during initial setup and both components will use the same model configuration.

2. Deploy using Helm:

```bash
helm upgrade --install vllm-stack ./vllm-setup-helm \
--namespace $NAMESPACE \
--create-namespace \
--set secret.create=true \
--set secret.hfTokenValue=$HF_TOKEN \
--set vllm.model.name=$MODEL_NAME \
--set vllm.poolLabelValue=$VLLM_POOLLABEL \
-f ./vllm-setup-helm/values.yaml
```

**Note:**

- Adjust the resource and limit allocations for vLLM and Redis in `values.yaml` to match your cluster's capacity.
- By default, the chart uses a `PersistentVolume` to cache the model. To disable this, set `.persistence.enabled` to `false`.

3. Verify the deployment:

```bash
kubectl get deployments -n $NAMESPACE
```

You should see:

- vLLM pods (default: 4 replicas)
- Redis lookup server pod

### Configuration Options

The Helm chart supports various configuration options. See [values.yaml](../../vllm-setup-helm/values.yaml) for all available options.

Key configuration parameters:

- `vllm.model.name`: The HuggingFace model to use (default: `meta-llama/Llama-3.1-8B-Instruct`)
- `vllm.replicaCount`: Number of vLLM replicas (default: 4)
- `vllm.poolLabelValue`: Label value for the inference pool (used by scheduler)
- `redis.enabled`: Whether to deploy Redis for KV cache indexing (default: true)
- `persistence.enabled`: Enable persistent storage for model cache (default: true)
- `secret.create`: Create HuggingFace token secret (default: true)
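For repeatable deployments, these overrides can live in a values file instead of `--set` flags. A hypothetical `custom-values.yaml` is sketched below; the key paths follow the parameters listed above, but consult `values.yaml` for the authoritative schema:

```yaml
# custom-values.yaml -- illustrative override file; verify key paths
# against vllm-setup-helm/values.yaml before use.
vllm:
  replicaCount: 2                 # scale down from the default 4
  model:
    name: "meta-llama/Llama-3.1-8B-Instruct"
persistence:
  enabled: false                  # skip the model-cache PersistentVolume
redis:
  enabled: true
```

Pass it to the Helm command shown earlier with an additional `-f custom-values.yaml`; values files given later on the command line take precedence over earlier ones.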

## Using the KV Cache Indexer Example

### Prerequisites

Ensure you have a running deployment with vLLM and Redis as described above.

### Running the Example

The deployment can be tested with the sample prompt defined in `examples/kv-cache-index/main.go`.

First, download the tokenizer bindings required by the `kvcache.Indexer` for prompt tokenization:

```bash
make download-tokenizer
```

Then, set the required environment variables and run the example:

```bash
export HF_TOKEN=<token>
export REDIS_ADDR=<redis://$user:$pass@localhost:6379/$db> # optional, defaults to localhost:6379
export MODEL_NAME=<model_name_used_in_vllm_deployment> # optional, defaults to meta-llama/Llama-3.1-8B-Instruct

go run -ldflags="-extldflags '-L$(pwd)/lib'" examples/kv-cache-index/main.go
```

Environment variables:

- `HF_TOKEN` (required): HuggingFace access token
- `REDIS_ADDR` (optional): Redis address; defaults to localhost:6379.
- `MODEL_NAME` (optional): The model name used in vLLM deployment; defaults to meta-llama/Llama-3.1-8B-Instruct. Use the same value you set during Helm deployment.
4 changes: 3 additions & 1 deletion examples/kv-cache-index/README.md
@@ -1,3 +1,5 @@
# KVCacheIndex Use Example

The code in main.go showcases how to configure and use a KVCacheIndex module.

For instructions on setting up an example environment for this demonstration, please refer to [docs/deployment/setup.md](../../docs/deployment/setup.md).
78 changes: 58 additions & 20 deletions examples/kv-cache-index/main.go
@@ -18,16 +18,19 @@ package main

import (
"context"
"fmt"
"os"
"time"

"github.com/redis/go-redis/v9"

"k8s.io/klog/v2"

"github.com/llm-d/llm-d-kv-cache-manager/pkg/kvcache"
)

/*
Refer to docs/deployment/setup.md

In Redis:
1) "meta-llama/Llama-3.1-8B-Instruct@33c26f4ed679005e733e382beeb8df69d8362c07400bb07fec69712413cb4310"
@@ -37,58 +40,93 @@
*/

//nolint:lll // need prompt as-is, chunking to string concatenation is too much of a hassle
const (
prompt = `lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. Lorem ipsum dolor sit amet, consectetur adipiscing elit. Curabitur pretium tincidunt lacus. Nulla gravida orci a odio. Nullam varius, turpis et commodo pharetra, est eros bibendum elit, nec luctus magna felis sollicitudin mauris. Integer in mauris eu nibh euismod gravida. Duis ac tellus et risus vulputate vehicula. Donec lobortis risus a elit. Etiam tempor. Ut ullamcorper, ligula eu tempor congue, eros est euismod turpis, id tincidunt sapien risus a quam. Maecenas fermentum consequat mi. Donec fermentum. Pellentesque malesuada nulla a mi. Duis sapien sem, aliquet nec, commodo eget, consequat quis, neque. Aliquam faucibus, elit ut dictum aliquet, felis nisl adipiscing sapien, sed malesuada diam lacus eget erat. Cras mollis scelerisque nunc. Nullam arcu. Aliquam consequat. Curabitur augue lorem, dapibus quis, laoreet et, pretium ac, nisi. Aenean magna nisl, mollis quis, molestie eu, feugiat in, orci. In hac habitasse platea dictumst.`
defaultModelName = "meta-llama/Llama-3.1-8B-Instruct"

envRedisAddr = "REDIS_ADDR"
envHFToken = "HF_TOKEN"
envModelName = "MODEL_NAME"
)

func getKVCacheIndexerConfig() (*kvcache.Config, error) {
config := kvcache.NewDefaultConfig()

// A HuggingFace token is needed when the model's tokenizer is gated
huggingFaceToken := os.Getenv(envHFToken)
if huggingFaceToken != "" {
config.TokenizersPoolConfig.HuggingFaceToken = huggingFaceToken
}

redisAddr := os.Getenv(envRedisAddr)
if redisAddr != "" {
redisOpt, err := redis.ParseURL(redisAddr)
if err != nil {
return nil, fmt.Errorf("failed to parse redis host: %w", err)
}

config.KVBlockIndexConfig.RedisConfig.RedisOpt = redisOpt
}

return config, nil
}

func getModelName() string {
modelName := os.Getenv(envModelName)
if modelName != "" {
return modelName
}

return defaultModelName
}

func main() {
ctx := context.Background()
logger := klog.FromContext(ctx)

if err := kvCacheIndexer(ctx, logger); err != nil {
logger.Error(err, "failed to run kv-cache-indexer")
os.Exit(1)
}
}

func kvCacheIndexer(ctx context.Context, logger klog.Logger) error {
config, err := getKVCacheIndexerConfig()
if err != nil {
return err
}

//nolint:contextcheck // NewKVCacheIndexer does not accept context parameter
kvCacheIndexer, err := kvcache.NewKVCacheIndexer(config)
if err != nil {
return err
}

logger.Info("Created Indexer")

go kvCacheIndexer.Run(ctx)
modelName := getModelName()
logger.Info("Started Indexer", "model", modelName)

// Get pods for the prompt
pods, err := kvCacheIndexer.GetPodScores(ctx, prompt, modelName, nil)
if err != nil {
return err
}

// Print the pods - should be empty because no tokenization
logger.Info("Got pods", "pods", pods)

// Wait for background tokenization to complete
time.Sleep(3 * time.Second)

// Get pods for the prompt
pods, err = kvCacheIndexer.GetPodScores(ctx, prompt, modelName, nil)
if err != nil {
return err
}

// Print the pods - should now be populated once tokenization has completed
logger.Info("Got pods", "pods", pods)
return nil
}
1 change: 1 addition & 0 deletions vllm-setup-helm/README.md
@@ -16,6 +16,7 @@ Deploying (repo root as working directory):
```
helm upgrade --install vllm-p2p ./vllm-setup-helm \
--namespace $NAMESPACE \
--create-namespace \
--set secret.create=true \
--set secret.hfTokenValue=$HF_TOKEN \
--set vllm.poolLabelValue="vllm-llama3-8b-instruct"
2 changes: 1 addition & 1 deletion vllm-setup-helm/templates/deployment.yaml
@@ -1,7 +1,7 @@
apiVersion: apps/v1
kind: Deployment
metadata:
name: {{ .Release.Name }}-vllm-{{ lower .Values.vllm.model.label }}
namespace: {{ .Release.Namespace | default .Values.namespace }}
labels:
{{- include "chart.labels" . | nindent 4 }}