This guide walks you through setting up Valkey for RAG caching from scratch.

Prerequisites:
- Docker or Podman
- Python 3.11+, Node.js 18+, or Go 1.21+
- OpenAI API key (or Ollama for local development)
cd deployment/docker
docker-compose up -d valkey-stack

# Or run a standalone container:
docker run -d --name valkey \
-p 6379:6379 \
valkey/valkey-stack:latest

# macOS
brew install valkey
# Ubuntu/Debian
sudo apt-get install valkey
# Start server
valkey-server

# Using valkey-cli
valkey-cli ping
# Expected: PONG
# Or using redis-cli (compatible)
redis-cli ping

Connect to Valkey and create a vector search index:
valkey-cli

FT.CREATE idx:my_cache
ON HASH
PREFIX 1 "cache:"
SCHEMA
query TEXT
response TEXT
embedding VECTOR HNSW 6
TYPE FLOAT32
DIM 1536
DISTANCE_METRIC COSINE
# Store a cache entry (query, response, vector embedding) as a Valkey hash.
import valkey
import numpy as np
# Connect (assumes a Valkey server listening on localhost:6379)
client = valkey.Valkey(host='localhost', port=6379)
# Create embedding (mock - use real embeddings in production)
# 1536 matches the DIM declared in the FT.CREATE index above.
embedding = np.random.randn(1536).astype(np.float32)
# The index stores raw FLOAT32 bytes, so serialize with tobytes().
embedding_bytes = embedding.tobytes()
# Store
# The "cache:" key prefix must match the PREFIX declared in FT.CREATE
# so the entry is picked up by the index.
client.hset("cache:entry1", mapping={
"query": "What is machine learning?",
"response": "Machine learning is a subset of AI...",
"embedding": embedding_bytes,
})
print("Stored cache entry!")

import { createClient } from '@redis/client';
// Store a cache entry (query, response, vector embedding) as a hash.
// @redis/client is protocol-compatible with Valkey.
const client = createClient({ url: 'redis://localhost:6379' });
await client.connect();
// Create embedding (mock)
// 1536 matches the DIM declared in the FT.CREATE index.
const embedding = new Float32Array(1536);
for (let i = 0; i < 1536; i++) embedding[i] = Math.random();
// Store
// The index stores raw FLOAT32 bytes; Buffer.from(embedding.buffer) sends the
// Float32Array's underlying byte buffer. The "cache:" key prefix must match
// the PREFIX declared in FT.CREATE.
await client.hSet('cache:entry1', {
query: 'What is machine learning?',
response: 'Machine learning is a subset of AI...',
embedding: Buffer.from(embedding.buffer),
});
console.log('Stored cache entry!');

import (
"github.com/redis/go-redis/v9"
// NOTE(review): encoding/binary and math are presumably used by the
// float32ToBytes helper (not shown here) — confirm against the full example.
"encoding/binary"
"math"
)
// Connect (assumes a Valkey server on localhost:6379; go-redis is protocol-compatible).
client := redis.NewClient(&redis.Options{Addr: "localhost:6379"})
// Create embedding (mock)
// 1536 matches the DIM declared in the FT.CREATE index.
embedding := make([]float32, 1536)
// NOTE(review): float32ToBytes is assumed to be defined elsewhere in the
// example (serializes []float32 into raw FLOAT32 bytes for the index) — confirm.
embeddingBytes := float32ToBytes(embedding)
// Store
// NOTE(review): assumes ctx (context.Context) is declared earlier,
// e.g. ctx := context.Background() — confirm.
client.HSet(ctx, "cache:entry1",
"query", "What is machine learning?",
"response", "Machine learning is a subset of AI...",
"embedding", embeddingBytes,
)

import valkey
# Query the cache: KNN vector search against the idx:my_cache index.
import numpy as np
# decode_responses=False keeps replies as raw bytes (embeddings are binary).
client = valkey.Valkey(host='localhost', port=6379, decode_responses=False)
# Query embedding
# Mock vector; must have the same dimension (1536) as the indexed embeddings.
query_embedding = np.random.randn(1536).astype(np.float32)
query_bytes = query_embedding.tobytes()
# KNN Search
# '*' matches every document; KNN 1 returns the single nearest neighbor, with
# its distance bound to the alias "score". DIALECT 2 is required for the
# parameterized vector syntax ($query_vec).
results = client.execute_command(
'FT.SEARCH', 'idx:my_cache',
'*=>[KNN 1 @embedding $query_vec AS score]',
'PARAMS', '2', 'query_vec', query_bytes,
'SORTBY', 'score',
'RETURN', '3', 'query', 'response', 'score',
'DIALECT', '2'
)
# results[0] is the total match count, followed by alternating document keys
# and [field, value, ...] lists for the RETURNed fields.
print(f"Found {results[0]} results")
if results[0] > 0:
print(f"Best match score: {results[2][5]}")  # Lower is better for distance

Choose your language and run the complete example:
cd examples/python/semantic-cache
pip install -r requirements.txt
export OPENAI_API_KEY="sk-..."
python main.py

cd examples/typescript/semantic-cache
npm install
export OPENAI_API_KEY="sk-..."
npm run dev

cd examples/go/semantic-cache
go run main.go

# First query - cache miss
curl -X POST http://localhost:8000/query \
-H "Content-Type: application/json" \
-d '{"query": "What is machine learning?"}'
# Similar query - cache hit!
curl -X POST http://localhost:8000/query \
-H "Content-Type: application/json" \
-d '{"query": "Explain ML to me"}'
# Check cache stats
curl http://localhost:8000/cache/stats

Troubleshooting:

- Index not found: The index hasn't been created. Run the FT.CREATE command or let the application create it on startup.
- Vector dimension mismatch: Your query vector dimensions don't match the index. Ensure both use the same dimension (e.g., 1536 for OpenAI).
- Connection refused: Valkey isn't running or is on a different port. Check with docker ps or valkey-cli ping.
Next steps:

- Semantic Caching Patterns - Learn threshold tuning
- Vector Search Deep Dive - Advanced search techniques
- Production Scaling - Deploy to production