Skip to content

Commit b28e092

Browse files
committed
feat: add hallucination detection pipeline for output safety
Two-stage pipeline (R9) for detecting hallucinated content in LLM responses: Stage 1 — Sentinel: Lightweight heuristic that determines if a response needs detailed fact-checking via word overlap analysis. High-confidence responses skip Stage 2 to save compute. Stage 2 — Sentence-level detector: Splits response into sentences and scores each against the user's prompt for factual consistency using a cross-encoder model (vectara/hallucination_evaluation_model). Falls back to heuristic scoring when ML model is unavailable. Changes: - Add hallucination_detector.rs with HallucinationDetector struct - Extend OutputSafetyConfig with hallucination_enabled, hallucination_model, hallucination_threshold, hallucination_min_response_length fields - Integrate into OutputAnalyzer via analyze_output_with_prompt() - Add hallucination config section to config.example.yaml - Add unit tests for the detector, threshold behaviour, skip-short-response logic, sentence splitting, and result-to-findings conversion, plus integration tests
1 parent 41b7f2b commit b28e092

7 files changed

Lines changed: 1285 additions & 9 deletions

File tree

config.example.yaml

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -297,6 +297,23 @@ output_safety:
297297
# (severe_toxic, threat, or score >= 0.9). Use with caution.
298298
block_on_critical: false
299299

300+
# --- Hallucination detection (R9) ---
301+
# Enable hallucination detection on response content. Uses a two-stage
302+
# pipeline: a sentinel heuristic gates expensive cross-encoder inference,
303+
# then each response sentence is scored against the user's prompt for
304+
# factual consistency.
305+
hallucination_enabled: false
306+
# HuggingFace model ID for the cross-encoder hallucination detector.
307+
# The model scores (premise, hypothesis) pairs for factual consistency.
308+
hallucination_model: "vectara/hallucination_evaluation_model"
309+
# Threshold below which a sentence is considered potentially hallucinated
310+
# (0.0–1.0). Lower values are more permissive; higher values flag more sentences.
311+
hallucination_threshold: 0.5
312+
# Minimum response length (in characters) to run hallucination detection.
313+
# Responses shorter than this are skipped to avoid false positives on
314+
# brief answers and save compute.
315+
hallucination_min_response_length: 50
316+
300317
# ---------------------------------------------------------------------------
301318
# ML-based security analysis
302319
# ---------------------------------------------------------------------------
@@ -318,6 +335,12 @@ output_safety:
318335
# ner_enabled: false
319336
# # HuggingFace model ID for NER-based PII detection.
320337
# ner_model: "dslim/bert-base-NER"
338+
# # Enable dedicated jailbreak detection (runs alongside prompt injection detection).
339+
# # Detects DAN/character jailbreaks, system prompt extraction, privilege escalation,
340+
# # and encoding-based evasion (base64, ROT13, leetspeak, reversed text).
341+
# jailbreak_enabled: true
342+
# # Confidence threshold for jailbreak detection (0.0–1.0).
343+
# jailbreak_threshold: 0.7
321344

322345
# ---------------------------------------------------------------------------
323346
# Graceful shutdown — connection draining and task completion

crates/llmtrace-core/src/lib.rs

Lines changed: 69 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1399,8 +1399,8 @@ impl Default for StreamingAnalysisConfig {
13991399
/// Output safety configuration for response content analysis.
14001400
///
14011401
/// When enabled, the proxy analyses LLM response content for toxicity,
1402-
/// PII leakage, and secret exposure. This is a post-processing step that
1403-
/// runs after the upstream response is received.
1402+
/// PII leakage, secret exposure, and hallucinated content. This is a
1403+
/// post-processing step that runs after the upstream response is received.
14041404
///
14051405
/// # Example (YAML)
14061406
///
@@ -1410,6 +1410,10 @@ impl Default for StreamingAnalysisConfig {
14101410
/// toxicity_enabled: true
14111411
/// toxicity_threshold: 0.7
14121412
/// block_on_critical: false
1413+
/// hallucination_enabled: false
1414+
/// hallucination_model: "vectara/hallucination_evaluation_model"
1415+
/// hallucination_threshold: 0.5
1416+
/// hallucination_min_response_length: 50
14131417
/// ```
14141418
#[derive(Debug, Clone, Serialize, Deserialize)]
14151419
pub struct OutputSafetyConfig {
@@ -1425,19 +1429,52 @@ pub struct OutputSafetyConfig {
14251429
/// Block (replace) the response if critical toxicity is detected.
14261430
#[serde(default)]
14271431
pub block_on_critical: bool,
1432+
/// Enable hallucination detection on response content.
1433+
///
1434+
/// When enabled, response sentences are scored against the user's prompt
1435+
/// for factual consistency using a cross-encoder model.
1436+
#[serde(default)]
1437+
pub hallucination_enabled: bool,
1438+
/// HuggingFace model ID for hallucination detection.
1439+
#[serde(default = "default_hallucination_model")]
1440+
pub hallucination_model: String,
1441+
/// Threshold below which a sentence is considered potentially hallucinated
1442+
/// (0.0–1.0). Sentences scoring below this are flagged.
1443+
#[serde(default = "default_hallucination_threshold")]
1444+
pub hallucination_threshold: f32,
1445+
/// Minimum response length (in characters) to run hallucination detection.
1446+
/// Responses shorter than this are skipped to save compute.
1447+
#[serde(default = "default_hallucination_min_response_length")]
1448+
pub hallucination_min_response_length: usize,
14281449
}
14291450

14301451
fn default_toxicity_threshold() -> f32 {
14311452
0.7
14321453
}
14331454

1455+
fn default_hallucination_model() -> String {
1456+
"vectara/hallucination_evaluation_model".to_string()
1457+
}
1458+
1459+
fn default_hallucination_threshold() -> f32 {
1460+
0.5
1461+
}
1462+
1463+
fn default_hallucination_min_response_length() -> usize {
1464+
50
1465+
}
1466+
14341467
impl Default for OutputSafetyConfig {
14351468
fn default() -> Self {
14361469
Self {
14371470
enabled: false,
14381471
toxicity_enabled: false,
14391472
toxicity_threshold: default_toxicity_threshold(),
14401473
block_on_critical: false,
1474+
hallucination_enabled: false,
1475+
hallucination_model: default_hallucination_model(),
1476+
hallucination_threshold: default_hallucination_threshold(),
1477+
hallucination_min_response_length: default_hallucination_min_response_length(),
14411478
}
14421479
}
14431480
}
@@ -1538,6 +1575,8 @@ impl Default for ShutdownConfig {
15381575
/// ml_download_timeout_seconds: 300
15391576
/// ner_enabled: true
15401577
/// ner_model: "dslim/bert-base-NER"
1578+
/// jailbreak_enabled: true
1579+
/// jailbreak_threshold: 0.7
15411580
/// ```
15421581
#[derive(Debug, Clone, Serialize, Deserialize)]
15431582
pub struct SecurityAnalysisConfig {
@@ -1578,6 +1617,16 @@ pub struct SecurityAnalysisConfig {
15781617
/// (suitable for architecture validation; not for production inference).
15791618
#[serde(default)]
15801619
pub fusion_model_path: Option<String>,
1620+
/// Enable dedicated jailbreak detection (runs alongside prompt injection).
1621+
///
1622+
/// When `true` (the default), a separate
1623+
/// jailbreak detector with heuristic patterns and encoding evasion checks
1624+
/// is run on every request.
1625+
#[serde(default = "default_jailbreak_enabled")]
1626+
pub jailbreak_enabled: bool,
1627+
/// Confidence threshold for jailbreak detection (0.0–1.0).
1628+
#[serde(default = "default_jailbreak_threshold")]
1629+
pub jailbreak_threshold: f32,
15811630
}
15821631

15831632
fn default_ml_model() -> String {
@@ -1604,6 +1653,14 @@ fn default_ner_model() -> String {
16041653
"dslim/bert-base-NER".to_string()
16051654
}
16061655

1656+
fn default_jailbreak_enabled() -> bool {
1657+
true
1658+
}
1659+
1660+
fn default_jailbreak_threshold() -> f32 {
1661+
0.7
1662+
}
1663+
16071664
impl Default for SecurityAnalysisConfig {
16081665
fn default() -> Self {
16091666
Self {
@@ -1617,6 +1674,8 @@ impl Default for SecurityAnalysisConfig {
16171674
ner_model: default_ner_model(),
16181675
fusion_enabled: false,
16191676
fusion_model_path: None,
1677+
jailbreak_enabled: default_jailbreak_enabled(),
1678+
jailbreak_threshold: default_jailbreak_threshold(),
16201679
}
16211680
}
16221681
}
@@ -2604,6 +2663,8 @@ mod tests {
26042663
ner_model: default_ner_model(),
26052664
fusion_enabled: false,
26062665
fusion_model_path: None,
2666+
jailbreak_enabled: true,
2667+
jailbreak_threshold: 0.7,
26072668
},
26082669
otel_ingest: OtelIngestConfig::default(),
26092670
auth: AuthConfig::default(),
@@ -3059,6 +3120,8 @@ mod tests {
30593120
assert_eq!(config.security_analysis.ner_model, "dslim/bert-base-NER");
30603121
assert!(!config.security_analysis.fusion_enabled);
30613122
assert!(config.security_analysis.fusion_model_path.is_none());
3123+
assert!(config.security_analysis.jailbreak_enabled);
3124+
assert!((config.security_analysis.jailbreak_threshold - 0.7).abs() < f32::EPSILON);
30623125
}
30633126

30643127
#[test]
@@ -3077,6 +3140,8 @@ mod tests {
30773140
assert_eq!(config.ner_model, "dslim/bert-base-NER");
30783141
assert!(!config.fusion_enabled);
30793142
assert!(config.fusion_model_path.is_none());
3143+
assert!(config.jailbreak_enabled);
3144+
assert!((config.jailbreak_threshold - 0.7).abs() < f32::EPSILON);
30803145
}
30813146

30823147
#[test]
@@ -3092,6 +3157,8 @@ mod tests {
30923157
ner_model: "dslim/bert-base-NER".to_string(),
30933158
fusion_enabled: false,
30943159
fusion_model_path: None,
3160+
jailbreak_enabled: true,
3161+
jailbreak_threshold: 0.7,
30953162
};
30963163
let json = serde_json::to_string(&config).unwrap();
30973164
let deserialized: SecurityAnalysisConfig = serde_json::from_str(&json).unwrap();

crates/llmtrace-proxy/src/streaming.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -959,6 +959,7 @@ mod tests {
959959
toxicity_enabled: true,
960960
toxicity_threshold: 0.5,
961961
block_on_critical: false,
962+
..Default::default()
962963
}
963964
}
964965

@@ -1047,6 +1048,7 @@ mod tests {
10471048
toxicity_enabled: false,
10481049
toxicity_threshold: 0.5,
10491050
block_on_critical: false,
1051+
..Default::default()
10501052
};
10511053
let mut monitor = StreamingOutputMonitor::new(&streaming, &output).unwrap();
10521054

@@ -1074,6 +1076,7 @@ mod tests {
10741076
toxicity_enabled: false,
10751077
toxicity_threshold: 0.5,
10761078
block_on_critical: false,
1079+
..Default::default()
10771080
};
10781081
let mut monitor = StreamingOutputMonitor::new(&streaming, &output).unwrap();
10791082

0 commit comments

Comments
 (0)