techlab-innov
diff --git a/‎config.example.yaml‎
Lines changed: 25 additions & 0 deletions b/‎config.example.yaml‎
Lines changed: 25 additions & 0 deletions
diff --git a/‎crates/llmtrace-core/src/lib.rs‎
Lines changed: 86 additions & 0 deletions b/‎crates/llmtrace-core/src/lib.rs‎
Lines changed: 86 additions & 0 deletions
diff --git a/‎crates/llmtrace-proxy/src/proxy.rs‎
Lines changed: 81 additions & 3 deletions b/‎crates/llmtrace-proxy/src/proxy.rs‎
Lines changed: 81 additions & 3 deletions
@@ -271,6 +271,31 @@ streaming_analysis:
   # Number of completion tokens between each incremental analysis check.
   # Lower values detect threats faster but add marginal CPU overhead per chunk.
   token_interval: 50
+  # Enable output-side analysis during SSE streaming (PII, secrets, toxicity
+  # on response content in real time). Requires output_safety.enabled: true.
+  output_enabled: false
+  # If a critical output safety finding is detected mid-stream, inject a
+  # warning into the SSE stream and terminate it. Use with caution — this will
+  # cut off the LLM response mid-generation.
+  early_stop_on_critical: false
+
+# ---------------------------------------------------------------------------
+# Output safety — toxicity detection and response content analysis (R6)
+# ---------------------------------------------------------------------------
+
+output_safety:
+  # Enable output safety analysis on LLM responses. When enabled, the proxy
+  # analyses response content for toxicity, PII leakage, and secret exposure.
+  enabled: false
+  # Enable toxicity detection on response content. Uses a BERT-based classifier
+  # (unitary/toxic-bert) or falls back to keyword-based detection.
+  toxicity_enabled: false
+  # Confidence threshold for toxicity detection (0.0–1.0). Categories scoring
+  # above this threshold are reported as findings.
+  toxicity_threshold: 0.7
+  # Block (replace) the entire response if critical toxicity is detected
+  # (severe_toxic, threat, or score >= 0.9). Use with caution.
+  block_on_critical: false
 
 # ---------------------------------------------------------------------------
 # ML-based security analysis
 
@@ -915,6 +915,9 @@ pub struct ProxyConfig {
     /// PII detection and redaction configuration.
     #[serde(default)]
     pub pii: PiiConfig,
+    /// Output safety configuration (toxicity detection, output analysis).
+    #[serde(default)]
+    pub output_safety: OutputSafetyConfig,
     /// Graceful shutdown configuration.
     #[serde(default)]
     pub shutdown: ShutdownConfig,
@@ -952,6 +955,7 @@ impl Default for ProxyConfig {
             anomaly_detection: AnomalyDetectionConfig::default(),
             streaming_analysis: StreamingAnalysisConfig::default(),
             pii: PiiConfig::default(),
+            output_safety: OutputSafetyConfig::default(),
             shutdown: ShutdownConfig::default(),
         }
     }
@@ -1365,6 +1369,12 @@ pub struct StreamingAnalysisConfig {
     /// Number of tokens between each incremental analysis check.
     #[serde(default = "default_streaming_token_interval")]
     pub token_interval: u32,
+    /// Enable output-side analysis during SSE streaming (PII, secrets, toxicity on response content).
+    #[serde(default)]
+    pub output_enabled: bool,
+    /// If a critical finding is detected mid-stream, inject a warning and stop the stream.
+    #[serde(default)]
+    pub early_stop_on_critical: bool,
 }
 
 fn default_streaming_token_interval() -> u32 {
@@ -1376,6 +1386,58 @@ impl Default for StreamingAnalysisConfig {
         Self {
             enabled: false,
             token_interval: default_streaming_token_interval(),
+            output_enabled: false,
+            early_stop_on_critical: false,
+        }
+    }
+}
+
+// ---------------------------------------------------------------------------
+// Output safety configuration
+// ---------------------------------------------------------------------------
+
+/// Output safety configuration for response content analysis.
+///
+/// When enabled, the proxy analyses LLM response content for toxicity,
+/// PII leakage, and secret exposure once the upstream response is received.
+/// Every field is optional when deserialising; omitted fields use defaults.
+///
+/// # Example (YAML)
+///
+/// ```yaml
+/// output_safety:
+///   enabled: true
+///   toxicity_enabled: true
+///   toxicity_threshold: 0.7
+///   block_on_critical: false
+/// ```
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct OutputSafetyConfig {
+    /// Enable output safety analysis on LLM responses. Defaults to `false`.
+    #[serde(default)]
+    pub enabled: bool,
+    /// Enable toxicity detection on response content. Defaults to `false`.
+    #[serde(default)]
+    pub toxicity_enabled: bool,
+    /// Confidence threshold for toxicity detection (0.0–1.0, not validated here). Defaults to 0.7.
+    #[serde(default = "default_toxicity_threshold")]
+    pub toxicity_threshold: f32,
+    /// Block (replace) the response if critical toxicity is detected. Defaults to `false`.
+    #[serde(default)]
+    pub block_on_critical: bool,
+}
+
+fn default_toxicity_threshold() -> f32 {
+    0.7 // shared by the serde field default and `OutputSafetyConfig::default()`
+}
+
+impl Default for OutputSafetyConfig {
+    fn default() -> Self {
+        Self {
+            enabled: false, // output analysis is opt-in
+            toxicity_enabled: false, // toxicity detection is opt-in as well
+            toxicity_threshold: default_toxicity_threshold(), // same helper the serde default uses
+            block_on_critical: false, // responses are never replaced unless configured
         }
     }
 }
@@ -1503,6 +1565,19 @@ pub struct SecurityAnalysisConfig {
     /// HuggingFace model ID for NER-based PII detection.
     #[serde(default = "default_ner_model")]
     pub ner_model: String,
+    /// Enable feature-level fusion classifier (ADR-013).
+    ///
+    /// When `true`, the ensemble concatenates DeBERTa embeddings with heuristic
+    /// feature vectors and feeds them through a learned fusion classifier instead
+    /// of combining scores after independent classification.
+    #[serde(default)]
+    pub fusion_enabled: bool,
+    /// Optional file path for trained fusion classifier weights.
+    ///
+    /// When `None`, the fusion classifier is initialised with random weights
+    /// (suitable for architecture validation; not for production inference).
+    #[serde(default)]
+    pub fusion_model_path: Option<String>,
 }
 
 fn default_ml_model() -> String {
@@ -1540,6 +1615,8 @@ impl Default for SecurityAnalysisConfig {
             ml_download_timeout_seconds: default_ml_download_timeout_seconds(),
             ner_enabled: false,
             ner_model: default_ner_model(),
+            fusion_enabled: false,
+            fusion_model_path: None,
         }
     }
 }
@@ -2525,6 +2602,8 @@ mod tests {
                 ml_download_timeout_seconds: 300,
                 ner_enabled: false,
                 ner_model: default_ner_model(),
+                fusion_enabled: false,
+                fusion_model_path: None,
             },
             otel_ingest: OtelIngestConfig::default(),
             auth: AuthConfig::default(),
@@ -2534,6 +2613,7 @@ mod tests {
             pii: PiiConfig {
                 action: PiiAction::AlertAndRedact,
             },
+            output_safety: OutputSafetyConfig::default(),
             shutdown: ShutdownConfig::default(),
         };
 
@@ -2977,6 +3057,8 @@ mod tests {
         assert_eq!(config.security_analysis.ml_download_timeout_seconds, 300);
         assert!(!config.security_analysis.ner_enabled);
         assert_eq!(config.security_analysis.ner_model, "dslim/bert-base-NER");
+        assert!(!config.security_analysis.fusion_enabled);
+        assert!(config.security_analysis.fusion_model_path.is_none());
     }
 
     #[test]
@@ -2993,6 +3075,8 @@ mod tests {
         assert_eq!(config.ml_download_timeout_seconds, 300);
         assert!(!config.ner_enabled);
         assert_eq!(config.ner_model, "dslim/bert-base-NER");
+        assert!(!config.fusion_enabled);
+        assert!(config.fusion_model_path.is_none());
     }
 
     #[test]
@@ -3006,6 +3090,8 @@ mod tests {
             ml_download_timeout_seconds: 600,
             ner_enabled: true,
             ner_model: "dslim/bert-base-NER".to_string(),
+            fusion_enabled: false,
+            fusion_model_path: None,
         };
         let json = serde_json::to_string(&config).unwrap();
         let deserialized: SecurityAnalysisConfig = serde_json::from_str(&json).unwrap();
 
@@ -7,7 +7,7 @@
 use crate::circuit_breaker::CircuitBreaker;
 use crate::cost::CostEstimator;
 use crate::provider::{self, ParsedResponse};
-use crate::streaming::{StreamingAccumulator, StreamingSecurityMonitor};
+use crate::streaming::{StreamingAccumulator, StreamingOutputMonitor, StreamingSecurityMonitor};
 use axum::body::Body;
 use axum::extract::State;
 use axum::http::{HeaderMap, Request, Response, StatusCode};
@@ -433,6 +433,15 @@ pub async fn proxy_handler(
         } else {
             None
         };
+        // Initialise the streaming output monitor for response-side analysis (R7).
+        let mut output_monitor = if is_streaming {
+            StreamingOutputMonitor::new(
+                &state_bg.config.streaming_analysis,
+                &state_bg.config.output_safety,
+            )
+        } else {
+            None
+        };
         let mut raw_collected = Vec::new();
         let mut ttft_ms: Option<u64> = None;
 
@@ -466,6 +475,36 @@ pub async fn proxy_handler(
                                 }
                             }
                         }
+
+                        // --- Real-time streaming OUTPUT analysis (R7) ---
+                        if let Some(ref mut out_mon) = output_monitor {
+                            if out_mon.should_analyze(acc.completion_token_count) {
+                                let new_findings = out_mon
+                                    .analyze_incremental(&acc.content, acc.completion_token_count);
+                                if !new_findings.is_empty() {
+                                    info!(
+                                        %trace_id,
+                                        count = new_findings.len(),
+                                        tokens = acc.completion_token_count,
+                                        "Streaming output safety findings detected mid-stream"
+                                    );
+                                    if let Some(ref engine) = state_bg.alert_engine {
+                                        engine.check_and_alert(trace_id, tenant_id, &new_findings);
+                                    }
+                                }
+                            }
+
+                            // Early stop: inject warning and terminate stream
+                            if out_mon.should_early_stop() {
+                                warn!(
+                                    %trace_id,
+                                    "Critical output safety issue detected — early stopping stream"
+                                );
+                                let warning = StreamingOutputMonitor::early_stop_sse_event();
+                                let _ = body_sender.send(Ok(Bytes::from(warning))).await;
+                                break;
+                            }
+                        }
                     }
                     raw_collected.extend_from_slice(&bytes);
                     if body_sender.send(Ok(bytes)).await.is_err() {
@@ -500,12 +539,33 @@ pub async fn proxy_handler(
             }
         }
 
+        // Run one final streaming OUTPUT analysis flush.
+        if let (Some(ref acc), Some(ref mut out_mon)) = (&sse_accumulator, &mut output_monitor) {
+            let final_findings =
+                out_mon.analyze_incremental(&acc.content, acc.completion_token_count);
+            if !final_findings.is_empty() {
+                info!(
+                    %trace_id,
+                    count = final_findings.len(),
+                    "Streaming output safety findings in final flush"
+                );
+                if let Some(ref engine) = state_bg.alert_engine {
+                    engine.check_and_alert(trace_id, tenant_id, &final_findings);
+                }
+            }
+        }
+
         // Collect streaming security findings for attachment to the trace span.
-        let streaming_findings: Vec<SecurityFinding> = streaming_monitor
+        let mut streaming_findings: Vec<SecurityFinding> = streaming_monitor
             .as_mut()
             .map(|m| m.take_findings())
             .unwrap_or_default();
 
+        // Merge in streaming output findings.
+        if let Some(ref mut out_mon) = output_monitor {
+            streaming_findings.extend(out_mon.take_findings());
+        }
+
         // Build the captured interaction with streaming metrics if applicable
         let (response_text, prompt_tokens, completion_tokens, total_tokens) =
             if let Some(acc) = sse_accumulator {
@@ -744,7 +804,7 @@ async fn run_security_analysis(
         parameters: std::collections::HashMap::new(),
     };
 
-    match state
+    let mut all_findings = match state
         .security
         .analyze_interaction(&captured.prompt_text, &captured.response_text, &context)
         .await
@@ -775,7 +835,25 @@ async fn run_security_analysis(
             error!(trace_id = %captured.trace_id, "Security analysis failed: {}", e);
             Vec::new()
         }
+    };
+
+    // --- Output safety analysis (R6) ---
+    if state.config.output_safety.enabled && !captured.response_text.is_empty() {
+        let output_analyzer =
+            llmtrace_security::OutputAnalyzer::new_with_fallback(&state.config.output_safety);
+        let result = output_analyzer.analyze_output(&captured.response_text);
+        if !result.findings.is_empty() {
+            info!(
+                trace_id = %captured.trace_id,
+                finding_count = result.findings.len(),
+                has_critical = result.has_critical_toxicity,
+                "Output safety findings detected"
+            );
+            all_findings.extend(result.findings);
+        }
     }
+
+    all_findings
 }
 
 /// Store a trace event enriched with security findings.