|
1 | | -import logging |
2 | 1 | import hashlib |
| 2 | +import logging |
| 3 | +import os |
3 | 4 |
|
4 | 5 | from langchain_core.messages import AIMessage |
5 | 6 |
|
|
8 | 9 | from app.utils.validators import ValidationError, validate_ci, validate_email, validate_phone |
9 | 10 | from app.agents.wizard_workflow.messages import WIZARD_COMPLETION_MESSAGE |
10 | 11 |
|
| 12 | +try: |
| 13 | + from guardrails import Guard |
| 14 | + from guardrails.hub import DetectJailbreak |
| 15 | +except ImportError: # pragma: no cover - optional dependency during local dev |
| 16 | + Guard = None |
| 17 | + DetectJailbreak = None |
| 18 | + |
11 | 19 | logger = logging.getLogger(__name__) |
12 | 20 |
|
13 | 21 | MAX_ANSWER_LENGTH = 2000 |
|
26 | 34 | "<system>", |
27 | 35 | ) |
28 | 36 |
|
| 37 | +_DEFAULT_JAILBREAK_THRESHOLD = 0.9 |
| 38 | + |
| 39 | + |
| 40 | +def _env_flag(value: str | None, *, default: bool = True) -> bool: |
| 41 | + if value is None: |
| 42 | + return default |
| 43 | + return value.strip().lower() not in {"", "0", "false", "no", "off"} |
| 44 | + |
| 45 | + |
| 46 | +_DETECT_JAILBREAK_ENABLED = _env_flag(os.getenv("WIZARD_DETECT_JAILBREAK_ENABLED"), default=True) |
| 47 | + |
| 48 | + |
def _load_detect_jailbreak_guard():
    """Build the DetectJailbreak guard, or return ``None`` when it is unavailable.

    ``None`` is returned (and the reason logged) when:
      * the feature is switched off through ``_DETECT_JAILBREAK_ENABLED``;
      * ``Guard`` / ``DetectJailbreak`` could not be imported;
      * constructing the guard raises for any reason.

    The detection threshold is read from ``WIZARD_DETECT_JAILBREAK_THRESHOLD``.
    Values that are not numeric, not finite, or outside ``[0, 1]`` are rejected
    with a warning and replaced by ``_DEFAULT_JAILBREAK_THRESHOLD``.

    Returns:
        A configured guard object, or ``None`` if detection should be skipped.
    """
    if not _DETECT_JAILBREAK_ENABLED:
        logger.info("[WIZARD/guardrails] DetectJailbreak disabled via env flag.")
        return None
    if Guard is None or DetectJailbreak is None:
        logger.warning(
            "[WIZARD/guardrails] guardrails-ai is not installed. "
            "Install guardrails-ai>=0.5.10 and the DetectJailbreak hub package."
        )
        return None

    threshold = _DEFAULT_JAILBREAK_THRESHOLD
    raw_threshold = os.getenv("WIZARD_DETECT_JAILBREAK_THRESHOLD")
    if raw_threshold is not None:
        try:
            parsed = float(raw_threshold)
        except ValueError:
            parsed = None
        # float() happily parses "nan", "inf" and values such as "5"; any of
        # those would silently stop the detector from ever firing. The chained
        # comparison is False for NaN and infinities, so they are rejected too.
        # NOTE(review): assumes the validator score is in [0, 1] - confirm
        # against the installed hub package.
        if parsed is not None and 0.0 <= parsed <= 1.0:
            threshold = parsed
        else:
            logger.warning(
                "[WIZARD/guardrails] Invalid threshold %r. Falling back to %.2f.",
                raw_threshold,
                _DEFAULT_JAILBREAK_THRESHOLD,
            )

    try:
        # Pin the on-fail action instead of relying on the library default, so
        # a flagged message is reported through the validation result rather
        # than (depending on the guardrails version) raised as an exception
        # that a fail-open caller would treat as "allowed".
        return Guard().use(DetectJailbreak, threshold=threshold, on_fail="noop")
    except Exception:
        logger.exception(
            "[WIZARD/guardrails] Could not initialize DetectJailbreak. "
            "Run `guardrails hub install hub://guardrails/detect_jailbreak` and retry."
        )
        return None
| 79 | + |
| 80 | + |
| 81 | +_DETECT_JAILBREAK_GUARD = _load_detect_jailbreak_guard() |
| 82 | + |
| 83 | + |
def _is_detected_as_jailbreak(message: str) -> bool:
    """Report whether the module-level DetectJailbreak guard flags ``message``.

    The check fails open: when no guard is configured, or when the guard
    raises while validating, the message is treated as not flagged.

    Args:
        message: The text to run through the guard.

    Returns:
        ``True`` only when validation completed and the result's
        ``validation_passed`` attribute is falsy; a result without that
        attribute counts as passed.
    """
    active_guard = _DETECT_JAILBREAK_GUARD
    if active_guard is None:
        return False

    try:
        outcome = active_guard.validate(message)
    except Exception:
        # Best-effort by design: a broken detector must not block the wizard.
        logger.exception("[WIZARD/guardrails] DetectJailbreak validation failed. Allowing message as fallback.")
        return False
    else:
        passed = getattr(outcome, "validation_passed", True)
        return not passed
| 96 | + |
| 97 | + |
def _blocked_guardrail_response(state: WizardState, current_q: int, cleaned: str, reason: str):
    """Log a blocked answer and build the state that re-asks the current question.

    Args:
        state: Current wizard state; copied into the result, and read for
            ``wizard_session_id`` when logging.
        current_q: Value logged as ``current_question``.
        cleaned: The user's answer text; only its first 64 characters and a
            12-hex-character SHA-256 prefix are written to the log.
        reason: Short explanation placed at the start of the warning.

    Returns:
        A new dict with every key of ``state`` plus overrides: a single
        assistant message asking for just the requested data,
        ``awaiting_answer=True``, ``completed=False``,
        ``wizard_status="ACTIVE"`` and ``valid=False``.
    """
    digest = hashlib.sha256(cleaned.encode("utf-8")).hexdigest()
    logger.warning(
        "[WIZARD/guardrails] %s session_id=%s current_question=%s msg_preview=%r msg_hash=%s",
        reason,
        state.get("wizard_session_id"),
        current_q,
        cleaned[:64],
        digest[:12],
    )

    retry_prompt = AIMessage(
        content="Tu mensaje parece una instruccion para alterar el asistente. Responde solo con el dato solicitado."
    )
    overrides = {
        "messages": [retry_prompt],
        "awaiting_answer": True,
        "completed": False,
        "wizard_status": "ACTIVE",
        "valid": False,
    }
    return {**state, **overrides}
| 121 | + |
29 | 122 |
|
30 | 123 | def _normalize_answer(value): |
31 | 124 | if isinstance(value, str): |
@@ -186,28 +279,17 @@ def input_guardrails_node(state: WizardState): |
186 | 279 |
|
187 | 280 | lowered = cleaned.lower() |
188 | 281 | if any(pattern in lowered for pattern in GUARDRAIL_BLOCK_PATTERNS): |
189 | | - msg_preview = cleaned[:64] |
190 | | - msg_hash = hashlib.sha256(cleaned.encode("utf-8")).hexdigest()[:12] |
191 | | - logger.warning( |
192 | | - "[WIZARD/guardrails] Possible prompt-injection-like answer blocked " |
193 | | - "session_id=%s current_question=%s msg_preview=%r msg_hash=%s", |
194 | | - state.get("wizard_session_id"), |
| 282 | + return _blocked_guardrail_response( |
| 283 | + state, current_q, cleaned, "Possible prompt-injection-like answer blocked." |
| 284 | + ) |
| 285 | + |
| 286 | + if _is_detected_as_jailbreak(cleaned): |
| 287 | + return _blocked_guardrail_response( |
| 288 | + state, |
195 | 289 | current_q, |
196 | | - msg_preview, |
197 | | - msg_hash, |
| 290 | + cleaned, |
| 291 | + "DetectJailbreak flagged potential jailbreak attempt.", |
198 | 292 | ) |
199 | | - return { |
200 | | - **state, |
201 | | - "messages": [ |
202 | | - AIMessage( |
203 | | - content="Tu mensaje parece una instruccion para alterar el asistente. Responde solo con el dato solicitado." |
204 | | - ) |
205 | | - ], |
206 | | - "awaiting_answer": True, |
207 | | - "completed": False, |
208 | | - "wizard_status": "ACTIVE", |
209 | | - "valid": False, |
210 | | - } |
211 | 293 |
|
212 | 294 | return { |
213 | 295 | **state, |
|
0 commit comments