fix v5

DrChumbo · DrChumbo · commit 876f6277a0a1 · 2026-02-25T19:31:26.000-03:00
diff --git a/.env.example b/.env.example
@@ -15,3 +15,8 @@ POSTGRES_DB=ithaka
 # Other environment variables (add as needed)
 # TWILIO_ACCOUNT_SID=your_twilio_sid
 # TWILIO_AUTH_TOKEN=your_twilio_token
+
+# Wizard guardrails
+WIZARD_DETECT_JAILBREAK_ENABLED=true
+WIZARD_DETECT_JAILBREAK_THRESHOLD=0.9
+# GUARDRAILS_HUB_TOKEN=coloca_tu_token_del_guardrails_hub
diff --git a/Dockerfile b/Dockerfile
@@ -12,6 +12,8 @@ WORKDIR /app
 
 # Pin uv to a trusted version; bump deliberately during dependency maintenance.
 ARG UV_VERSION=0.5.31
+# Optional Guardrails Hub token to pre-install validators during build.
+ARG GUARDRAILS_HUB_TOKEN=""
 
 # Install system dependencies
 RUN apt-get update \
@@ -34,6 +36,15 @@ RUN chmod +x start.sh
 # Create a non-root user
 RUN adduser --disabled-password --gecos '' appuser \
     && chown -R appuser:appuser /app
+
+# Optionally install Guardrails validators at build-time when a Hub token is provided.
+# The steps are chained with && so a failed `guardrails configure` aborts the build
+# instead of being silently ignored, and the rc file written by `configure` (it stores
+# the token) is removed so the credential is not left in the build user's home.
+# NOTE: build args can still be recovered from `docker history`; prefer a BuildKit
+# secret mount if the token must stay confidential.
+RUN if [ -n "$GUARDRAILS_HUB_TOKEN" ]; then \
+        guardrails configure --token "${GUARDRAILS_HUB_TOKEN}" --disable-metrics --disable-remote-inferencing \
+        && guardrails hub install hub://guardrails/detect_jailbreak \
+        && rm -f "${HOME:-/root}/.guardrailsrc"; \
+    else \
+        echo "Skipping Guardrails Hub install (provide GUARDRAILS_HUB_TOKEN build arg to enable)."; \
+    fi
+
 USER appuser
 
 # Expose port
diff --git a/README.md b/README.md
@@ -72,6 +72,28 @@ TWILIO_ACCOUNT_SID=your-sid
 TWILIO_AUTH_TOKEN=your-token
 ```
 
+### Guardrails para el Wizard
+
+El wizard ahora usa el validador [`DetectJailbreak`](https://guardrailsai.com/hub/validator/guardrails/detect_jailbreak) para bloquear intentos de prompt injection.
+
+1. Instalá las dependencias (el `requirements.txt` ya incluye `guardrails-ai>=0.5.10`).
+2. Descargá el recurso desde Guardrails Hub una sola vez (antes ejecutá `guardrails configure` con tu token del Hub, ya que la instalación lo requiere):
+   ```bash
+   guardrails hub install hub://guardrails/detect_jailbreak
+   ```
+3. Configurá (o dejá por defecto) las variables:
+   ```bash
+   WIZARD_DETECT_JAILBREAK_ENABLED=true
+   WIZARD_DETECT_JAILBREAK_THRESHOLD=0.9
+   ```
+4. (Opcional) Para imágenes Docker, pasá el token del Guardrails Hub como build-arg para que el validador quede horneado:
+   ```bash
+   docker compose build --build-arg GUARDRAILS_HUB_TOKEN=tu_token
+   ```
+   También podés definir `GUARDRAILS_HUB_TOKEN` en tu archivo `.env` (no lo comitees) y `docker compose` lo inyectará automáticamente gracias a `build.args`.
+
+Si el validador no está disponible, el sistema seguirá usando el filtro de patrones, pero se recomienda mantener ambos mecanismos activos.
+
 ### 5. Configurar base de datos
 
 #### Crear usuario y base de datos (si no existen):
diff --git a/app/agents/wizard_workflow/nodes.py b/app/agents/wizard_workflow/nodes.py
@@ -1,5 +1,6 @@
-import logging
 import hashlib
+import logging
+import os
 
 from langchain_core.messages import AIMessage
 
@@ -8,6 +9,13 @@
 from app.utils.validators import ValidationError, validate_ci, validate_email, validate_phone
 from app.agents.wizard_workflow.messages import WIZARD_COMPLETION_MESSAGE
 
+try:
+    from guardrails import Guard
+    from guardrails.hub import DetectJailbreak
+except ImportError:  # pragma: no cover - optional dependency during local dev
+    Guard = None
+    DetectJailbreak = None
+
 logger = logging.getLogger(__name__)
 
 MAX_ANSWER_LENGTH = 2000
@@ -26,6 +34,91 @@
     "<system>",
 )
 
+# Threshold handed to DetectJailbreak when WIZARD_DETECT_JAILBREAK_THRESHOLD is unset or invalid.
+_DEFAULT_JAILBREAK_THRESHOLD = 0.9
+
+
+def _env_flag(value: str | None, *, default: bool = True) -> bool:
+    if value is None:
+        return default
+    return value.strip().lower() not in {"", "0", "false", "no", "off"}
+
+
+# Evaluated once at import time; an unset variable leaves the detector enabled.
+_DETECT_JAILBREAK_ENABLED = _env_flag(os.getenv("WIZARD_DETECT_JAILBREAK_ENABLED"), default=True)
+
+
+def _load_detect_jailbreak_guard():
+    if not _DETECT_JAILBREAK_ENABLED:
+        logger.info("[WIZARD/guardrails] DetectJailbreak disabled via env flag.")
+        return None
+    if Guard is None or DetectJailbreak is None:
+        logger.warning(
+            "[WIZARD/guardrails] guardrails-ai is not installed. "
+            "Install guardrails-ai>=0.5.10 and the DetectJailbreak hub package."
+        )
+        return None
+
+    raw_threshold = os.getenv("WIZARD_DETECT_JAILBREAK_THRESHOLD")
+    try:
+        threshold = float(raw_threshold) if raw_threshold is not None else _DEFAULT_JAILBREAK_THRESHOLD
+    except ValueError:
+        threshold = _DEFAULT_JAILBREAK_THRESHOLD
+        logger.warning(
+            "[WIZARD/guardrails] Invalid threshold %r. Falling back to %.2f.",
+            raw_threshold,
+            _DEFAULT_JAILBREAK_THRESHOLD,
+        )
+
+    try:
+        return Guard().use(DetectJailbreak, threshold=threshold)
+    except Exception:
+        logger.exception(
+            "[WIZARD/guardrails] Could not initialize DetectJailbreak. "
+            "Run `guardrails hub install hub://guardrails/detect_jailbreak` and retry."
+        )
+        return None
+
+
+# Built once when this module is imported; None when the detector is disabled or could not be loaded.
+_DETECT_JAILBREAK_GUARD = _load_detect_jailbreak_guard()
+
+
+def _is_detected_as_jailbreak(message: str) -> bool:
+    guard = _DETECT_JAILBREAK_GUARD
+    if guard is None:
+        return False
+
+    try:
+        result = guard.validate(message)
+    except Exception:
+        logger.exception("[WIZARD/guardrails] DetectJailbreak validation failed. Allowing message as fallback.")
+        return False
+
+    return not bool(getattr(result, "validation_passed", True))
+
+
+def _blocked_guardrail_response(state: WizardState, current_q: int, cleaned: str, reason: str):
+    msg_preview = cleaned[:64]
+    msg_hash = hashlib.sha256(cleaned.encode("utf-8")).hexdigest()[:12]
+    logger.warning(
+        "[WIZARD/guardrails] %s session_id=%s current_question=%s msg_preview=%r msg_hash=%s",
+        reason,
+        state.get("wizard_session_id"),
+        current_q,
+        msg_preview,
+        msg_hash,
+    )
+    return {
+        **state,
+        "messages": [
+            AIMessage(
+                content="Tu mensaje parece una instruccion para alterar el asistente. Responde solo con el dato solicitado."
+            )
+        ],
+        "awaiting_answer": True,
+        "completed": False,
+        "wizard_status": "ACTIVE",
+        "valid": False,
+    }
+
 
 def _normalize_answer(value):
     if isinstance(value, str):
@@ -186,28 +279,17 @@ def input_guardrails_node(state: WizardState):
 
     lowered = cleaned.lower()
     if any(pattern in lowered for pattern in GUARDRAIL_BLOCK_PATTERNS):
-        msg_preview = cleaned[:64]
-        msg_hash = hashlib.sha256(cleaned.encode("utf-8")).hexdigest()[:12]
-        logger.warning(
-            "[WIZARD/guardrails] Possible prompt-injection-like answer blocked "
-            "session_id=%s current_question=%s msg_preview=%r msg_hash=%s",
-            state.get("wizard_session_id"),
+        return _blocked_guardrail_response(
+            state, current_q, cleaned, "Possible prompt-injection-like answer blocked."
+        )
+
+    if _is_detected_as_jailbreak(cleaned):
+        return _blocked_guardrail_response(
+            state,
             current_q,
-            msg_preview,
-            msg_hash,
+            cleaned,
+            "DetectJailbreak flagged potential jailbreak attempt.",
         )
-        return {
-            **state,
-            "messages": [
-                AIMessage(
-                    content="Tu mensaje parece una instruccion para alterar el asistente. Responde solo con el dato solicitado."
-                )
-            ],
-            "awaiting_answer": True,
-            "completed": False,
-            "wizard_status": "ACTIVE",
-            "valid": False,
-        }
 
     return {
         **state,
diff --git a/docker-compose.yml b/docker-compose.yml
@@ -5,6 +5,8 @@ services:
     build:
       context: .
       dockerfile: Dockerfile
+      args:
+        GUARDRAILS_HUB_TOKEN: ${GUARDRAILS_HUB_TOKEN:-}
     container_name: ithaka-backend
     ports:
       - "8000:8000"
diff --git a/requirements.txt b/requirements.txt
@@ -15,6 +15,8 @@ openai==1.99.1
 langgraph==0.2.76
 langgraph-checkpoint==2.1.1
 langgraph-sdk==0.1.74
+guardrails-ai>=0.5.10
+rich<14
 
 # Data validation and serialization
 pydantic==2.11.7