vllm-project · daric93 · Apr 9, 2026 · Apr 9, 2026 · Apr 9, 2026 · Apr 10, 2026
@@ -1416,9 +1416,25 @@ global:
           enable_metrics: true
     memory:
       enabled: true
+      backend: milvus
       auto_store: true
       disabled_routes: []
       disabled_models: []
+      valkey:
+        host: valkey
+        port: 6379
+        database: 0
+        password: valkey-secret  # example value only; never commit real credentials
+        timeout: 10  # seconds
+        collection_prefix: "mem:"
+        index_name: mem_idx
+        dimension: 384
+        metric_type: COSINE
+        index_m: 16
+        index_ef_construction: 256  # kept in line with the documented default (256)
+        tls_enabled: false
+        tls_ca_path: ""
+        tls_insecure_skip_verify: false
       milvus:
         address: milvus:19530
         collection: agentic_memory

@@ -2,8 +2,10 @@
 
 This directory under `deploy/examples/runtime/` holds repo-owned runtime support examples that are not part of the user-facing `config/` contract.
 
+- `memory/`: agentic memory backend configuration references (Milvus, Valkey)
 - `semantic-cache/`: external semantic-cache backend example files
 - `response-api/`: external Response API Redis example files
 - `tools/`: local tools database examples
+- `vector-store/`: vector store backend configuration references
 
 These files exist for local development, tutorials, and tests. They are not the canonical router config surface.
@@ -0,0 +1,93 @@
+# Valkey Memory Store Configuration for Agentic Memory
+# This configuration file contains settings for using Valkey (with the Search
+# module) as the agentic memory backend.
+#
+# To use this configuration:
+# 1. Set backend: "valkey" in global.stores.memory in your config.yaml
+# 2. Inline the valkey settings from this file into global.stores.memory.valkey
+# 3. Ensure Valkey server with the Search module is running and accessible
+#
+# The Valkey Search module is required for FT.CREATE / FT.SEARCH vector operations.
+# Use the valkey/valkey-bundle image, or load the valkey-search module into your own Valkey server.
+
+# Connection settings
+host: "localhost"        # For production: use your Valkey endpoint
+port: 6379               # Standard Valkey port
+database: 0              # Logical database index (0-15 unless the server's "databases" setting is changed)
+password: ""             # Authentication password (leave empty if not required)
+timeout: 10              # Connection/request timeout in seconds
+
+# Index and storage settings
+collection_prefix: "mem:"   # Key prefix for HASH documents
+index_name: "mem_idx"       # FT.CREATE index name
+dimension: 384              # Embedding vector dimension (must match the embedding model's output size)
+metric_type: "COSINE"       # Distance metric: COSINE, L2, or IP
+
+# HNSW index tuning
+index_m: 16                 # Bi-directional links per node (higher = more accurate, more RAM)
+index_ef_construction: 256  # Build-time search width (higher = better recall, slower build)
+
+# TLS settings
+tls_enabled: false                 # Enable TLS for the Valkey connection
+tls_ca_path: ""                    # Path to PEM-encoded CA cert (empty = system trust store)
+tls_insecure_skip_verify: false    # Skip server cert verification (development only; never in production)
+
+# Full canonical config.yaml usage example:
+#
+# global:
+#   stores:
+#     memory:
+#       enabled: true
+#       backend: valkey                       # <-- select Valkey backend
+#       auto_store: true
+#       valkey:                               # <-- Valkey-specific settings
+#         host: valkey
+#         port: 6379
+#         database: 0
+#         timeout: 10
+#         collection_prefix: "mem:"
+#         index_name: mem_idx
+#         dimension: 384
+#         metric_type: COSINE
+#         index_m: 16
+#         index_ef_construction: 256
+#         tls_enabled: false
+#         tls_ca_path: ""
+#         tls_insecure_skip_verify: false
+#       embedding_model: bert
+#       default_retrieval_limit: 5
+#       default_similarity_threshold: 0.70
+#       hybrid_search: true
+#       hybrid_mode: rerank
+#       adaptive_threshold: true
+#       quality_scoring:
+#         initial_strength_days: 30
+#         prune_threshold: 0.15
+#         max_memories_per_user: 200
+#       reflection:
+#         enabled: true
+#         algorithm: recency_semantic
+#         max_inject_tokens: 512
+#         recency_decay_days: 14
+#         dedup_threshold: 0.9
+#
+# Example configurations for different environments:
+#
+# Local Development (Docker):
+#   host: "localhost"
+#   port: 6379
+#   password: ""
+#
+# Production (Docker / Kubernetes):
+#   host: "valkey-service.valkey-system.svc.cluster.local"
+#   port: 6379
+#   password: "${VALKEY_PASSWORD}"     # from secret
+#   index_m: 32                        # higher recall for production
+#   index_ef_construction: 512
+#
+# Kubernetes with TLS:
+#   host: "valkey-tls.valkey-system.svc.cluster.local"
+#   port: 6380
+#   password: "${VALKEY_PASSWORD}"
+#   tls_enabled: true
+#   tls_ca_path: "/etc/valkey/certs/ca.pem"   # mounted from secret
@@ -0,0 +1,134 @@
+# Same routing as config.memory-user.yaml but uses the Valkey memory backend
+# instead of Milvus.  Requires a Valkey instance with the Search module on the
+# same Docker network (e.g. semantic-router-valkey:6379).
+#
+# Usage: point your router's CONFIG_FILE to this file.
+
+version: v0.3
+listeners:
+  - name: http-8888
+    address: 0.0.0.0  # bind on all interfaces
+    port: 8888
+    timeout: 300s
+providers:
+  defaults:
+    default_model: qwen3
+  models:
+    - name: qwen3  # the only model defined; every route below references it
+      provider_model_id: qwen3
+      backend_refs:
+        - name: llm_katan
+          weight: 1
+          endpoint: host.docker.internal:8000  # NOTE(review): presumably a backend running on the Docker host; confirm
+          protocol: http
+routing:
+  modelCards:
+    - name: qwen3
+      modality: text
+  signals:
+    domains:
+      - name: general
+        description: General queries for memory testing
+        mmlu_categories: [other]
+    keywords:
+      - name: no_memory_trigger  # referenced by no_memory_route
+        operator: OR
+        keywords: [NOMEM_MARKER]
+      - name: custom_threshold_trigger  # referenced by custom_threshold_route
+        operator: OR
+        keywords: [THRESHOLD_MARKER]
+  decisions:
+    - name: no_memory_route
+      description: Route with memory explicitly disabled for per-decision testing
+      priority: 200  # largest priority value of the three routes
+      rules:
+        operator: OR
+        conditions:
+          - type: keyword
+            name: no_memory_trigger
+      modelRefs:
+        - model: qwen3
+          use_reasoning: false
+      plugins:
+        - type: system_prompt
+          configuration:
+            system_prompt: You are a helpful assistant. Memory access is disabled for this route.
+            mode: insert
+        - type: memory
+          configuration:
+            enabled: false  # memory plugin switched off for this route only
+
+    - name: custom_threshold_route
+      description: Route with high similarity threshold for per-decision testing
+      priority: 150
+      rules:
+        operator: OR
+        conditions:
+          - type: keyword
+            name: custom_threshold_trigger
+      modelRefs:
+        - model: qwen3
+          use_reasoning: false
+      plugins:
+        - type: system_prompt
+          configuration:
+            system_prompt: You are a helpful assistant with strict memory matching.
+            mode: insert
+        - type: memory
+          configuration:
+            enabled: true
+            retrieval_limit: 5
+            similarity_threshold: 0.99  # far stricter than the 0.45 used by default_route
+            auto_store: true
+
+    - name: default_route
+      description: Default route for memory testing
+      priority: 1  # smallest priority value of the three routes
+      rules:
+        operator: OR
+        conditions:
+          - type: domain
+            name: general
+      modelRefs:
+        - model: qwen3
+          use_reasoning: false
+      plugins:
+        - type: system_prompt
+          configuration:
+            system_prompt: You are MoM, a helpful AI assistant with memory. You remember important facts about users and use this context to provide personalized assistance.
+            mode: insert
+        - type: memory
+          configuration:
+            enabled: true
+            retrieval_limit: 5
+            similarity_threshold: 0.45  # same value as global default_similarity_threshold
+            auto_store: true
+global:
+  services:
+    response_api:
+      enabled: true
+      store_backend: memory  # in-memory store; ResponseAPIConfig docs reserve this for local development
+      ttl_seconds: 86400  # 24 hours
+  stores:
+    memory:
+      enabled: true
+      backend: valkey  # use the Valkey memory backend instead of Milvus
+      auto_store: true
+      valkey:
+        host: semantic-router-valkey  # Valkey instance on the same Docker network
+        port: 6379
+        database: 0
+        timeout: 10  # seconds
+        collection_prefix: "mem:"
+        index_name: mem_idx
+        dimension: 384  # must match the embedding model
+        metric_type: COSINE
+        index_m: 16
+        index_ef_construction: 256
+      embedding_model: mmbert
+      default_retrieval_limit: 5
+      default_similarity_threshold: 0.45
+    semantic_cache:
+      embedding_model: mmbert
+    vector_store:
+      embedding_model: mmbert
@@ -119,6 +119,7 @@ func assertReferenceConfigSemanticCacheCoverage(t testingT, semanticCache map[st
 // assertReferenceConfigMemoryCoverage checks the reference config's
 // global.stores.memory map, and its milvus, valkey, quality_scoring and
 // reflection sub-maps, against the corresponding Go config struct types.
 // NOTE(review): presumably each helper call fails the test when a struct field
 // has no matching key in the map; confirm against assertMapCoversStructFields.
 func assertReferenceConfigMemoryCoverage(t testingT, memory map[string]interface{}) {
 	assertMapCoversStructFields(t, memory, reflect.TypeOf(MemoryConfig{}), "global.stores.memory")
 	assertMapCoversStructFields(t, mustMapAt(t, memory, "milvus"), reflect.TypeOf(MemoryMilvusConfig{}), "global.stores.memory.milvus")
+	assertMapCoversStructFields(t, mustMapAt(t, memory, "valkey"), reflect.TypeOf(MemoryValkeyConfig{}), "global.stores.memory.valkey")
 	assertMapCoversStructFields(t, mustMapAt(t, memory, "quality_scoring"), reflect.TypeOf(MemoryQualityScoringConfig{}), "global.stores.memory.quality_scoring")
 	assertMapCoversStructFields(t, mustMapAt(t, memory, "reflection"), reflect.TypeOf(MemoryReflectionConfig{}), "global.stores.memory.reflection")
 }

@@ -207,10 +207,12 @@ type SemanticCache struct {
 
 type MemoryConfig struct {
 	Enabled                    bool                       `yaml:"enabled,omitempty"`
+	Backend                    string                     `yaml:"backend,omitempty"`
 	AutoStore                  bool                       `yaml:"auto_store,omitempty"`
 	DisabledRoutes             []string                   `yaml:"disabled_routes,omitempty"`
 	DisabledModels             []string                   `yaml:"disabled_models,omitempty"`
 	Milvus                     MemoryMilvusConfig         `yaml:"milvus,omitempty"`
+	Valkey                     *MemoryValkeyConfig        `yaml:"valkey,omitempty"`
 	RedisCache                 *MemoryRedisCacheConfig    `yaml:"redis_cache,omitempty"`
 	EmbeddingModel             string                     `yaml:"embedding_model,omitempty"`
 	ExtractionBatchSize        int                        `yaml:"extraction_batch_size,omitempty"`
@@ -262,6 +264,40 @@ type MemoryMilvusConfig struct {
 	NumPartitions int    `yaml:"num_partitions,omitempty"`
 }
 
+// MemoryValkeyConfig holds configuration for the Valkey memory store backend.
+// The backend relies on Valkey's Search module for vector similarity operations.
+type MemoryValkeyConfig struct {
+	// Host is the Valkey server hostname (default "localhost").
+	Host string `yaml:"host"`
+	// Port is the Valkey server port (default 6379).
+	Port int `yaml:"port"`
+	// Database is the Valkey logical database number (default 0).
+	Database int `yaml:"database"`
+	// Password is the password used for Valkey authentication (optional).
+	Password string `yaml:"password,omitempty"`
+	// Timeout is the connection/request timeout in seconds (default 10).
+	Timeout int `yaml:"timeout"`
+	// CollectionPrefix is the key prefix for stored HASH documents (default "mem:").
+	CollectionPrefix string `yaml:"collection_prefix,omitempty"`
+	// IndexName is the name of the FT.CREATE search index (default "mem_idx").
+	IndexName string `yaml:"index_name,omitempty"`
+	// Dimension is the embedding vector dimension; it must match the embedding model (default 384).
+	Dimension int `yaml:"dimension,omitempty"`
+	// MetricType is the distance metric: "COSINE", "L2", or "IP" (default "COSINE").
+	MetricType string `yaml:"metric_type,omitempty"`
+	// IndexM is the HNSW M parameter: bi-directional links per node (default 16).
+	IndexM int `yaml:"index_m,omitempty"`
+	// IndexEfConstruction is the HNSW efConstruction parameter: build-time search width (default 256).
+	IndexEfConstruction int `yaml:"index_ef_construction,omitempty"`
+	// TLSEnabled enables TLS for the Valkey connection.
+	TLSEnabled bool `yaml:"tls_enabled,omitempty"`
+	// TLSCAPath is the path to a PEM-encoded CA certificate file for server verification.
+	// When empty and TLS is enabled, the system's default trust store is used.
+	TLSCAPath string `yaml:"tls_ca_path,omitempty"`
+	// TLSInsecureSkipVerify skips server certificate verification (development only).
+	TLSInsecureSkipVerify bool `yaml:"tls_insecure_skip_verify,omitempty"`
+}
+
 // ResponseAPIConfig controls response and conversation history storage.
 // StoreBackend defaults to "redis" for durable storage that survives router
 // restarts. Set to "memory" only for local development — all history is lost