Skip to content

Commit 876f627

Browse files
committed
fix v5
1 parent 0e463a1 commit 876f627

File tree

6 files changed

+145
-21
lines changed

6 files changed

+145
-21
lines changed

.env.example

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,3 +15,8 @@ POSTGRES_DB=ithaka
1515
# Other environment variables (add as needed)
1616
# TWILIO_ACCOUNT_SID=your_twilio_sid
1717
# TWILIO_AUTH_TOKEN=your_twilio_token
18+
19+
# Wizard guardrails
20+
WIZARD_DETECT_JAILBREAK_ENABLED=true
21+
WIZARD_DETECT_JAILBREAK_THRESHOLD=0.9
22+
# GUARDRAILS_HUB_TOKEN=coloca_tu_token_del_guardrails_hub

Dockerfile

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@ WORKDIR /app
1212

1313
# Pin uv to a trusted version; bump deliberately during dependency maintenance.
1414
ARG UV_VERSION=0.5.31
15+
# Optional Guardrails Hub token to pre-install validators during build.
16+
ARG GUARDRAILS_HUB_TOKEN=""
1517

1618
# Install system dependencies
1719
RUN apt-get update \
@@ -34,6 +36,15 @@ RUN chmod +x start.sh
3436
# Create a non-root user
3537
RUN adduser --disabled-password --gecos '' appuser \
3638
&& chown -R appuser:appuser /app
39+
40+
# Optionally install Guardrails validators at build-time when a Hub token is provided.
41+
# The two guardrails steps are chained with && so that a failed
# `guardrails configure` aborts the build instead of being ignored before the
# hub install runs.
# NOTE(review): build args can be recovered from image metadata
# (`docker history`); consider a BuildKit secret mount for the Hub token.
RUN if [ -n "$GUARDRAILS_HUB_TOKEN" ]; then \
      guardrails configure --token "${GUARDRAILS_HUB_TOKEN}" --disable-metrics --disable-remote-inferencing \
      && guardrails hub install hub://guardrails/detect_jailbreak; \
    else \
      echo "Skipping Guardrails Hub install (provide GUARDRAILS_HUB_TOKEN build arg to enable)."; \
    fi
47+
3748
USER appuser
3849

3950
# Expose port

README.md

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,28 @@ TWILIO_ACCOUNT_SID=your-sid
7272
TWILIO_AUTH_TOKEN=your-token
7373
```
7474

75+
### Guardrails para el Wizard
76+
77+
El wizard ahora usa el validador [`DetectJailbreak`](https://guardrailsai.com/hub/validator/guardrails/detect_jailbreak) para bloquear intentos de prompt injection.
78+
79+
1. Instalá las dependencias (el `requirements.txt` ya incluye `guardrails-ai>=0.5.10`).
80+
2. Descargá el recurso desde Guardrails Hub una sola vez:
81+
```bash
82+
guardrails hub install hub://guardrails/detect_jailbreak
83+
```
84+
3. Configurá (o dejá por defecto) las variables:
85+
```bash
86+
WIZARD_DETECT_JAILBREAK_ENABLED=true
87+
WIZARD_DETECT_JAILBREAK_THRESHOLD=0.9
88+
```
89+
4. (Opcional) Para imágenes Docker, pasá el token del Guardrails Hub como build-arg para que el validador quede horneado:
90+
```bash
91+
docker compose build --build-arg GUARDRAILS_HUB_TOKEN=tu_token
92+
```
93+
También podés definir `GUARDRAILS_HUB_TOKEN` en tu archivo `.env` (no lo comitees) y `docker compose` lo inyectará automáticamente gracias a `build.args`.
94+
95+
Si el validador no está disponible, el sistema seguirá usando el filtro de patrones, pero se recomienda mantener ambos mecanismos activos.
96+
7597
### 5. Configurar base de datos
7698

7799
#### Crear usuario y base de datos (si no existen):

app/agents/wizard_workflow/nodes.py

Lines changed: 103 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
1-
import logging
21
import hashlib
2+
import logging
3+
import os
34

45
from langchain_core.messages import AIMessage
56

@@ -8,6 +9,13 @@
89
from app.utils.validators import ValidationError, validate_ci, validate_email, validate_phone
910
from app.agents.wizard_workflow.messages import WIZARD_COMPLETION_MESSAGE
1011

12+
try:
13+
from guardrails import Guard
14+
from guardrails.hub import DetectJailbreak
15+
except ImportError: # pragma: no cover - optional dependency during local dev
16+
Guard = None
17+
DetectJailbreak = None
18+
1119
logger = logging.getLogger(__name__)
1220

1321
MAX_ANSWER_LENGTH = 2000
@@ -26,6 +34,91 @@
2634
"<system>",
2735
)
2836

37+
# Fallback DetectJailbreak threshold, used when WIZARD_DETECT_JAILBREAK_THRESHOLD
# is unset or cannot be parsed as a float.
_DEFAULT_JAILBREAK_THRESHOLD = 0.9
38+
39+
40+
def _env_flag(value: str | None, *, default: bool = True) -> bool:
41+
if value is None:
42+
return default
43+
return value.strip().lower() not in {"", "0", "false", "no", "off"}
44+
45+
46+
# Evaluated once at import time; the check is enabled when the variable is unset.
_DETECT_JAILBREAK_ENABLED = _env_flag(os.getenv("WIZARD_DETECT_JAILBREAK_ENABLED"), default=True)
47+
48+
49+
def _load_detect_jailbreak_guard():
    """Build the DetectJailbreak guard, or return ``None`` when it cannot be used.

    ``None`` is returned (and the reason logged) when the feature is disabled
    via ``WIZARD_DETECT_JAILBREAK_ENABLED``, when the optional ``guardrails``
    imports failed, or when constructing the guard raises.

    The threshold is read from ``WIZARD_DETECT_JAILBREAK_THRESHOLD``. Values
    that are not parseable as a float, or that fall outside ``[0.0, 1.0]``
    (including ``nan`` and ``inf``, which ``float()`` accepts), are rejected
    with a warning and replaced by ``_DEFAULT_JAILBREAK_THRESHOLD``.

    Returns:
        A configured ``Guard`` instance, or ``None``.
    """
    if not _DETECT_JAILBREAK_ENABLED:
        logger.info("[WIZARD/guardrails] DetectJailbreak disabled via env flag.")
        return None
    if Guard is None or DetectJailbreak is None:
        logger.warning(
            "[WIZARD/guardrails] guardrails-ai is not installed. "
            "Install guardrails-ai>=0.5.10 and the DetectJailbreak hub package."
        )
        return None

    raw_threshold = os.getenv("WIZARD_DETECT_JAILBREAK_THRESHOLD")
    threshold = _DEFAULT_JAILBREAK_THRESHOLD
    if raw_threshold is not None:
        try:
            parsed = float(raw_threshold)
        except ValueError:
            parsed = None
        # The chained comparison is False for NaN and +/-inf, so those are
        # rejected together with plainly out-of-range numbers.
        if parsed is not None and 0.0 <= parsed <= 1.0:
            threshold = parsed
        else:
            logger.warning(
                "[WIZARD/guardrails] Invalid threshold %r. Falling back to %.2f.",
                raw_threshold,
                _DEFAULT_JAILBREAK_THRESHOLD,
            )

    try:
        # on_fail="noop" makes a failed validation surface through
        # `validation_passed` on the outcome rather than as a raised
        # exception, which is the signal the caller inspects.
        return Guard().use(DetectJailbreak, threshold=threshold, on_fail="noop")
    except Exception:
        logger.exception(
            "[WIZARD/guardrails] Could not initialize DetectJailbreak. "
            "Run `guardrails hub install hub://guardrails/detect_jailbreak` and retry."
        )
        return None
79+
80+
81+
# Built once at import time and reused for every message; None means the
# DetectJailbreak check is skipped.
_DETECT_JAILBREAK_GUARD = _load_detect_jailbreak_guard()
82+
83+
84+
def _is_detected_as_jailbreak(message: str) -> bool:
    """Return ``True`` when the DetectJailbreak guard rejects ``message``.

    A rejection can surface in two ways, and both are treated as a detection:
    the outcome's ``validation_passed`` is falsy, or the guard raises the
    Guardrails ``ValidationError`` (what a validator configured with
    ``on_fail="exception"`` does on failure).

    Any other exception is an operational failure of the validator itself;
    it is logged and the message is allowed so the pattern filter remains the
    only gate. When no guard is loaded the check is skipped.

    Args:
        message: The already-cleaned user answer to classify.

    Returns:
        ``True`` if the message was flagged, ``False`` otherwise.
    """
    guard = _DETECT_JAILBREAK_GUARD
    if guard is None:
        return False

    # Imported locally and aliased: guardrails is optional, and this module
    # already has an unrelated name `ValidationError` in scope.
    try:
        from guardrails.errors import ValidationError as GuardrailsValidationError
    except ImportError:
        GuardrailsValidationError = None

    try:
        outcome = guard.validate(message)
    except Exception as exc:
        if GuardrailsValidationError is not None and isinstance(exc, GuardrailsValidationError):
            # The validator failed the input: this is a detection, not an error.
            return True
        logger.exception("[WIZARD/guardrails] DetectJailbreak validation failed. Allowing message as fallback.")
        return False

    return not bool(getattr(outcome, "validation_passed", True))
96+
97+
98+
def _blocked_guardrail_response(state: WizardState, current_q: int, cleaned: str, reason: str):
    """Log a blocked answer and build the state update that re-asks the question.

    Args:
        state: Current wizard state; its entries are carried into the result.
        current_q: Value logged as ``current_question``.
        cleaned: The cleaned user answer that was blocked.
        reason: Short description of why the answer was blocked, used in the log.

    Returns:
        A dict with every entry of ``state`` plus the rejection message and the
        flags that keep the wizard active and awaiting an answer.
    """
    # Only a 64-character preview and a truncated SHA-256 digest are logged,
    # not the full message.
    digest = hashlib.sha256(cleaned.encode("utf-8")).hexdigest()
    logger.warning(
        "[WIZARD/guardrails] %s session_id=%s current_question=%s msg_preview=%r msg_hash=%s",
        reason,
        state.get("wizard_session_id"),
        current_q,
        cleaned[:64],
        digest[:12],
    )

    rejection = AIMessage(
        content="Tu mensaje parece una instruccion para alterar el asistente. Responde solo con el dato solicitado."
    )
    blocked_state = dict(state)
    blocked_state.update(
        {
            "messages": [rejection],
            "awaiting_answer": True,
            "completed": False,
            "wizard_status": "ACTIVE",
            "valid": False,
        }
    )
    return blocked_state
121+
29122

30123
def _normalize_answer(value):
31124
if isinstance(value, str):
@@ -186,28 +279,17 @@ def input_guardrails_node(state: WizardState):
186279

187280
lowered = cleaned.lower()
188281
if any(pattern in lowered for pattern in GUARDRAIL_BLOCK_PATTERNS):
189-
msg_preview = cleaned[:64]
190-
msg_hash = hashlib.sha256(cleaned.encode("utf-8")).hexdigest()[:12]
191-
logger.warning(
192-
"[WIZARD/guardrails] Possible prompt-injection-like answer blocked "
193-
"session_id=%s current_question=%s msg_preview=%r msg_hash=%s",
194-
state.get("wizard_session_id"),
282+
return _blocked_guardrail_response(
283+
state, current_q, cleaned, "Possible prompt-injection-like answer blocked."
284+
)
285+
286+
if _is_detected_as_jailbreak(cleaned):
287+
return _blocked_guardrail_response(
288+
state,
195289
current_q,
196-
msg_preview,
197-
msg_hash,
290+
cleaned,
291+
"DetectJailbreak flagged potential jailbreak attempt.",
198292
)
199-
return {
200-
**state,
201-
"messages": [
202-
AIMessage(
203-
content="Tu mensaje parece una instruccion para alterar el asistente. Responde solo con el dato solicitado."
204-
)
205-
],
206-
"awaiting_answer": True,
207-
"completed": False,
208-
"wizard_status": "ACTIVE",
209-
"valid": False,
210-
}
211293

212294
return {
213295
**state,

docker-compose.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@ services:
55
build:
66
context: .
77
dockerfile: Dockerfile
8+
args:
9+
GUARDRAILS_HUB_TOKEN: ${GUARDRAILS_HUB_TOKEN:-}
810
container_name: ithaka-backend
911
ports:
1012
- "8000:8000"

requirements.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@ openai==1.99.1
1515
langgraph==0.2.76
1616
langgraph-checkpoint==2.1.1
1717
langgraph-sdk==0.1.74
18+
guardrails-ai>=0.5.10
19+
rich<14
1820

1921
# Data validation and serialization
2022
pydantic==2.11.7

0 commit comments

Comments
 (0)