-
Notifications
You must be signed in to change notification settings - Fork 623
Expand file tree
/
Copy pathvllm.ai_v1alpha1_semanticrouter_hybrid_cache.yaml
More file actions
151 lines (134 loc) · 3.64 KB
/
vllm.ai_v1alpha1_semanticrouter_hybrid_cache.yaml
File metadata and controls
151 lines (134 loc) · 3.64 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
---
# SemanticRouter with Hybrid Cache Backend
# This example demonstrates how to configure the semantic router to use Hybrid caching.
# Hybrid cache combines an in-memory HNSW index with persistent Milvus storage
# for optimal performance.
apiVersion: vllm.ai/v1alpha1
kind: SemanticRouter
metadata:
  name: semantic-router-hybrid
  namespace: default
spec:
  replicas: 2
  # Configure vLLM backend endpoints
  vllmEndpoints:
    - name: small-model
      model: qwen3-1.8b
      reasoningFamily: qwen3
      backend:
        type: service
        service:
          name: vllm-small
          port: 8000
    - name: large-model
      model: llama3-70b
      # NOTE(review): "qwen3" looks copy-pasted from the small-model endpoint —
      # confirm the correct reasoning family for llama3-70b against the CRD schema.
      reasoningFamily: qwen3
      backend:
        type: service
        service:
          name: vllm-large
          port: 8000
  # Persistence for model cache
  persistence:
    enabled: true
    size: 20Gi
  # Autoscaling configuration
  autoscaling:
    enabled: true
    minReplicas: 2
    maxReplicas: 5
    targetCPUUtilizationPercentage: 70
  # Resource limits
  resources:
    requests:
      cpu: 2000m
      memory: 4Gi
    limits:
      cpu: 4000m
      memory: 8Gi
  # Semantic router configuration
  config:
    embedding_models:
      mmbert_model_path: models/mom-embedding-ultra
      use_cpu: true
    # Semantic cache with Hybrid backend (HNSW + Milvus)
    semantic_cache:
      enabled: true
      backend_type: hybrid
      # Kept quoted as in the original example; presumably the schema expects a
      # string-typed decimal — verify before changing to a bare float.
      similarity_threshold: "0.85"
      ttl_seconds: 3600
      max_entries: 5000
      eviction_policy: lru
      embedding_model: mmbert
      # HNSW configuration for the in-memory half of the hybrid backend
      hnsw:
        use_hnsw: true
        hnsw_m: 32  # Higher for better recall
        hnsw_ef_construction: 128
        max_memory_entries: 5000
      # Milvus configuration for persistent storage
      # NOTE: Update host to match your Milvus deployment namespace
      milvus:
        connection:
          host: milvus-standalone.cache-backends.svc.cluster.local
          port: 19530
          database: semantic_router_cache
          timeout: 30
          auth:
            enabled: true
            username: root
            # Password is sourced from a Secret; see the milvus-credentials
            # Secret document below.
            password_secret_ref:
              name: milvus-credentials
              key: password
        collection:
          name: semantic_cache_hybrid
          description: "Hybrid cache backend storage"
          vector_field:
            name: embedding
            dimension: 384  # For BERT embeddings
            metric_type: IP
          index:
            type: HNSW
            params:
              M: 32
              efConstruction: 128
        search:
          params:
            ef: 128
          topk: 20
          consistency_level: Eventually  # Faster for hybrid usage
        performance:
          connection_pool:
            max_connections: 20
            max_idle_connections: 10
            acquire_timeout: 30
          batch:
            insert_batch_size: 200
            timeout: 60
        data_management:
          ttl:
            enabled: true
            timestamp_field: created_at
            cleanup_interval: 1800
          compaction:
            enabled: true
            interval: 43200  # Twice daily
        development:
          drop_collection_on_startup: false
          auto_create_collection: true
          verbose_errors: false
    # Disable features that require additional models for this example.
    # Enable these features in production with proper model configuration.
    prompt_guard:
      enabled: false
    tools:
      enabled: false
---
# Secret for the Milvus password (create separately from the SemanticRouter).
# Replace the placeholder value before applying; do not commit real credentials.
apiVersion: v1
kind: Secret
metadata:
  name: milvus-credentials
  namespace: default
type: Opaque
stringData:
  password: "your-milvus-password-here"