Skip to content

Commit 4eb1009

Browse files
authored
feat(onnx-binding): add ONNX Runtime binding with ROCm/GPU support for 2DMSE model (#1218)
1 parent da16389 commit 4eb1009

40 files changed

+11297
-2
lines changed
Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
# Simple test configuration for onnx-binding mmBERT
2+
# This config enables only the embedding-backed features (semantic cache and tool matching) for testing; all classifiers are disabled
3+
4+
# Response API Configuration
5+
response_api:
6+
enabled: true
7+
store_backend: "memory"
8+
ttl_seconds: 86400
9+
max_responses: 1000
10+
11+
# Router Replay Configuration
12+
router_replay:
13+
store_backend: "memory"
14+
ttl_seconds: 2592000
15+
async_writes: true
16+
17+
# Semantic Cache Configuration - uses mmBERT embeddings
18+
semantic_cache:
19+
enabled: true
20+
backend_type: "memory"
21+
similarity_threshold: 0.85
22+
max_entries: 1000
23+
ttl_seconds: 3600
24+
eviction_policy: "fifo"
25+
# Use mmBERT 32K ONNX model for embeddings
26+
embedding_model: "mmbert"
27+
28+
# Embedding Models Configuration - mmBERT 32K via ONNX Runtime
29+
embedding_models:
30+
use_cpu: true
31+
# mmBERT 32K YaRN model path (local ONNX)
32+
mmbert_model_path: "onnx-binding/mmbert-32k-yarn-onnx"
33+
34+
# Tools Configuration - uses embeddings for semantic matching
35+
tools:
36+
enabled: true
37+
tools_db_path: "config/tools_db.json"
38+
model_type: "mmbert" # Use mmBERT for tool matching
39+
target_dim: 0 # Use full dimension (768)
40+
similarity_threshold: 0.7
41+
42+
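# Disable all classifier-based features (prompt guard, category/PII classifiers, feedback detector, hallucination mitigation) for simple testing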
43+
prompt_guard:
44+
enabled: false
45+
46+
classifier:
47+
category_model:
48+
enabled: false
49+
pii_model:
50+
enabled: false
51+
52+
feedback_detector:
53+
enabled: false
54+
55+
hallucination_mitigation:
56+
enabled: false
57+
58+
# vLLM Endpoints Configuration - backend endpoints; this test uses a single local endpoint named "ollama"
59+
vllm_endpoints:
60+
- name: "ollama"
61+
address: "127.0.0.1"
62+
port: 11434
63+
weight: 1
64+
65+
model_config:
66+
"qwen2.5:3b":
67+
reasoning_family: "qwen3"
68+
preferred_endpoints: ["ollama"]
69+
70+
# Simple categories for testing (only "general" is referenced by a decision in this config)
71+
categories:
72+
- name: general
73+
description: "General queries"
74+
- name: technical
75+
description: "Technical questions"
76+
77+
decisions:
78+
- name: "default"
79+
description: "Default routing"
80+
priority: 100
81+
rules:
82+
operator: "OR"
83+
conditions:
84+
- type: "domain"
85+
name: "general"
86+
modelRefs:
87+
- model: "qwen2.5:3b"
88+
use_reasoning: false
89+
weight: 100
Lines changed: 224 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,224 @@
1+
# Test configuration for onnx-binding with classifiers
2+
# Uses ONNX models for both embeddings and classification
3+
4+
# Model-On-Demand Registry - maps local paths to HuggingFace repos
5+
# The ONNX models are stored in the onnx/ subdirectory of the merged model repos
6+
mom_registry:
7+
# ONNX classifier models (already downloaded/exported locally)
8+
"models/mmbert32k-intent-classifier-merged-onnx": "llm-semantic-router/mmbert32k-intent-classifier-merged"
9+
"models/mmbert32k-jailbreak-detector-merged-onnx": "llm-semantic-router/mmbert32k-jailbreak-detector-merged"
10+
"models/mmbert32k-pii-detector-merged-onnx": "llm-semantic-router/mmbert32k-pii-detector-merged"
11+
"models/mmbert32k-factcheck-classifier-merged-onnx": "llm-semantic-router/mmbert32k-factcheck-classifier-merged"
12+
"models/mmbert32k-feedback-detector-merged-onnx": "llm-semantic-router/mmbert32k-feedback-detector-merged"
13+
# Required by hallucination_mitigation (enabled in this config): used as its hallucination_model and nli_model
14+
"models/mom-halugate-detector": "KRLabsOrg/lettucedect-base-modernbert-en-v1"
15+
"models/mom-halugate-explainer": "tasksource/ModernBERT-base-nli"
16+
17+
# Response API Configuration
18+
response_api:
19+
enabled: true
20+
store_backend: "memory"
21+
ttl_seconds: 86400
22+
max_responses: 1000
23+
24+
# Router Replay Configuration
25+
router_replay:
26+
store_backend: "memory"
27+
ttl_seconds: 2592000
28+
async_writes: true
29+
30+
# Semantic Cache Configuration - uses mmBERT embeddings
31+
semantic_cache:
32+
enabled: true
33+
backend_type: "memory"
34+
similarity_threshold: 0.85
35+
max_entries: 1000
36+
ttl_seconds: 3600
37+
eviction_policy: "fifo"
38+
embedding_model: "mmbert"
39+
40+
# Embedding Models Configuration - mmBERT 32K via ONNX Runtime
41+
embedding_models:
42+
use_cpu: true
43+
# mmBERT 32K YaRN model path (local ONNX)
44+
mmbert_model_path: "onnx-binding/mmbert-32k-yarn-onnx"
45+
46+
# Tools Configuration - uses embeddings for semantic matching
47+
tools:
48+
enabled: true
49+
tools_db_path: "config/tools_db.json"
50+
model_type: "mmbert"
51+
target_dim: 0
52+
similarity_threshold: 0.7
53+
54+
# Classification Models Configuration - ONNX Runtime via mmBERT-32K
55+
# Enable classifiers with mmBERT-32K (uses onnx-binding)
56+
prompt_guard:
57+
enabled: true
58+
use_mmbert_32k: true # Enable mmBERT-32K for jailbreak detection
59+
model_id: "models/mmbert32k-jailbreak-detector-merged-onnx"
60+
jailbreak_mapping_path: "models/mmbert32k-jailbreak-detector-merged-onnx/jailbreak_mapping.json"
61+
use_cpu: true
62+
threshold: 0.5
63+
64+
classifier:
65+
category_model:
66+
enabled: true
67+
use_mmbert_32k: true # Enable mmBERT-32K for intent classification
68+
model_id: "models/mmbert32k-intent-classifier-merged-onnx"
69+
category_mapping_path: "models/mmbert32k-intent-classifier-merged-onnx/category_mapping.json"
70+
use_cpu: true
71+
threshold: 0.5
72+
pii_model:
73+
enabled: true
74+
use_mmbert_32k: true # Enable mmBERT-32K for PII detection
75+
model_id: "models/mmbert32k-pii-detector-merged-onnx"
76+
pii_mapping_path: "models/mmbert32k-pii-detector-merged-onnx/pii_mapping.json"
77+
use_cpu: true
78+
threshold: 0.5
79+
80+
feedback_detector:
81+
enabled: true
82+
use_mmbert_32k: true
83+
model_id: "models/mmbert32k-feedback-detector-merged-onnx"
84+
feedback_mapping_path: "models/mmbert32k-feedback-detector-merged-onnx/label_mapping.json"
85+
use_cpu: true
86+
threshold: 0.5
87+
88+
hallucination_mitigation:
89+
enabled: true
90+
fact_check_model:
91+
model_id: "models/mmbert32k-factcheck-classifier-merged-onnx"
92+
use_cpu: true
93+
use_mmbert_32k: true
94+
threshold: 0.5
95+
hallucination_model:
96+
model_id: "models/mom-halugate-detector"
97+
use_cpu: true
98+
nli_model:
99+
model_id: "models/mom-halugate-explainer"
100+
use_cpu: true
101+
102+
# vLLM Endpoints Configuration - backend endpoints; this test uses a single local endpoint named "ollama"
103+
vllm_endpoints:
104+
- name: "ollama"
105+
address: "127.0.0.1"
106+
port: 11434
107+
weight: 1
108+
109+
model_config:
110+
"qwen2.5:3b":
111+
reasoning_family: "qwen3"
112+
preferred_endpoints: ["ollama"]
113+
114+
# Categories for testing (only "general" is referenced by a domain condition in the decisions of this config)
115+
categories:
116+
- name: general
117+
description: "General queries"
118+
- name: technical
119+
description: "Technical questions"
120+
- name: biology
121+
description: "Biology questions"
122+
- name: math
123+
description: "Math questions"
124+
- name: physics
125+
description: "Physics questions"
126+
- name: computer_science
127+
description: "Computer science questions"
128+
129+
# Fact-check signal rules: signal names emitted by the fact-check classifier, matched by "fact_check" conditions in decisions
130+
fact_check_rules:
131+
- name: "needs_fact_check"
132+
description: "Query requires factual verification"
133+
- name: "no_fact_check_needed"
134+
description: "Query does not require factual verification"
135+
136+
# User feedback signal rules: signal names emitted by the feedback detector, matched by "user_feedback" conditions in decisions
137+
user_feedback_rules:
138+
- name: "satisfied"
139+
description: "User is satisfied with the response"
140+
- name: "need_clarification"
141+
description: "User needs clarification"
142+
- name: "wrong_answer"
143+
description: "User indicates the answer is wrong"
144+
- name: "want_different"
145+
description: "User wants a different answer"
146+
147+
decisions:
148+
- name: "needs_fact_check"
149+
priority: 90
150+
rules:
151+
operator: "AND"
152+
conditions:
153+
- type: "fact_check"
154+
name: "needs_fact_check"
155+
modelRefs:
156+
- model: "qwen2.5:3b"
157+
use_reasoning: false
158+
weight: 100
159+
- name: "no_fact_check_needed"
160+
priority: 89
161+
rules:
162+
operator: "AND"
163+
conditions:
164+
- type: "fact_check"
165+
name: "no_fact_check_needed"
166+
modelRefs:
167+
- model: "qwen2.5:3b"
168+
use_reasoning: false
169+
weight: 100
170+
- name: "satisfied"
171+
priority: 88
172+
rules:
173+
operator: "AND"
174+
conditions:
175+
- type: "user_feedback"
176+
name: "satisfied"
177+
modelRefs:
178+
- model: "qwen2.5:3b"
179+
use_reasoning: false
180+
weight: 100
181+
- name: "need_clarification"
182+
priority: 87
183+
rules:
184+
operator: "AND"
185+
conditions:
186+
- type: "user_feedback"
187+
name: "need_clarification"
188+
modelRefs:
189+
- model: "qwen2.5:3b"
190+
use_reasoning: false
191+
weight: 100
192+
- name: "wrong_answer"
193+
priority: 86
194+
rules:
195+
operator: "AND"
196+
conditions:
197+
- type: "user_feedback"
198+
name: "wrong_answer"
199+
modelRefs:
200+
- model: "qwen2.5:3b"
201+
use_reasoning: false
202+
weight: 100
203+
- name: "want_different"
204+
priority: 85
205+
rules:
206+
operator: "AND"
207+
conditions:
208+
- type: "user_feedback"
209+
name: "want_different"
210+
modelRefs:
211+
- model: "qwen2.5:3b"
212+
use_reasoning: false
213+
weight: 100
214+
- name: "default"
215+
priority: 100
216+
rules:
217+
operator: "OR"
218+
conditions:
219+
- type: "domain"
220+
name: "general"
221+
modelRefs:
222+
- model: "qwen2.5:3b"
223+
use_reasoning: false
224+
weight: 100

0 commit comments

Comments
 (0)