|
2 | 2 |
|
3 | 3 | import logging |
4 | 4 |
|
5 | | -from guardrails import Guard |
6 | | -from guardrails.hub import DetectJailbreak, ToxicLanguage |
| 5 | +# Guardrails disabled for now |
| 6 | +# from guardrails import Guard |
| 7 | +# from guardrails.hub import DetectJailbreak, ToxicLanguage |
7 | 8 |
|
8 | 9 | from app.agents.state import AgentState |
9 | | -from app.core.config import settings |
| 10 | +# from app.core.config import settings |
10 | 11 |
|
11 | 12 | logger = logging.getLogger(__name__) |
12 | 13 |
|
13 | | -# Initialize Guard with DetectJailbreak and ToxicLanguage validators |
14 | | -# Note: The validators must be installed via: |
15 | | -# guardrails hub install hub://guardrails/detect_jailbreak |
16 | | -# guardrails hub install hub://guardrails/toxic_language |
17 | | -_guard_inicial = Guard().use( |
18 | | - DetectJailbreak( |
19 | | - threshold=settings.guardrails_jailbreak_threshold, |
20 | | - device=settings.guardrails_device, |
21 | | - on_fail="noop", # Don't raise exceptions, handle via state flags |
22 | | - ) |
23 | | -).use( |
24 | | - ToxicLanguage( |
25 | | - on_fail="noop", # Don't raise exceptions, handle via state flags |
26 | | - ) |
27 | | -) |
| 14 | +# Guardrails disabled for now - the Guard initialization below is commented out, so no validators are loaded |
| 15 | +# # Initialize Guard with DetectJailbreak and ToxicLanguage validators |
| 16 | +# # Note: The validators must be installed via: |
| 17 | +# # guardrails hub install hub://guardrails/detect_jailbreak |
| 18 | +# # guardrails hub install hub://guardrails/toxic_language |
| 19 | +# _guard_inicial = Guard().use( |
| 20 | +# DetectJailbreak( |
| 21 | +# threshold=settings.guardrails_jailbreak_threshold, |
| 22 | +# device=settings.guardrails_device, |
| 23 | +# on_fail="noop", # Don't raise exceptions, handle via state flags |
| 24 | +# ) |
| 25 | +# ).use( |
| 26 | +# ToxicLanguage( |
| 27 | +# on_fail="noop", # Don't raise exceptions, handle via state flags |
| 28 | +# ) |
| 29 | +# ) |
28 | 30 |
|
29 | 31 |
|
30 | 32 | def guard_inicial(state: AgentState) -> AgentState: |
@@ -53,31 +55,36 @@ def guard_inicial(state: AgentState) -> AgentState: |
53 | 55 | updated_state["error_message"] = None |
54 | 56 | return updated_state |
55 | 57 |
|
56 | | - try: |
57 | | - # Validate the prompt using Guardrails |
58 | | - validation_result = _guard_inicial.validate(prompt) |
59 | | - |
60 | | - # Check if validation passed |
61 | | - # The validator returns ValidationResult with outcome |
62 | | - # If validation fails, outcome will indicate failure |
63 | | - if validation_result.validation_passed: |
64 | | - updated_state["is_malicious"] = False |
65 | | - updated_state["error_message"] = None |
66 | | - logger.debug("Prompt passed jailbreak and toxic language detection") |
67 | | - else: |
68 | | - # Jailbreak or toxic language detected |
69 | | - updated_state["is_malicious"] = True |
70 | | - updated_state["error_message"] = ( |
71 | | - "Contenido malicioso detectado. Tu solicitud contiene contenido que viola las políticas de seguridad." |
72 | | - ) |
73 | | - logger.warning("Malicious content detected. Prompt content not logged for security.") |
74 | | - |
75 | | - except Exception as e: |
76 | | - # If validation fails due to error, log it but don't block the request |
77 | | - # This is a safety measure - if Guardrails fails, we allow the request |
78 | | - # but log the error for monitoring |
79 | | - logger.error(f"Error during jailbreak detection: {e}") |
80 | | - updated_state["is_malicious"] = False |
81 | | - updated_state["error_message"] = None |
| 58 | + # Guardrails disabled for now - every prompt is marked non-malicious without any validation |
| 59 | + updated_state["is_malicious"] = False |
| 60 | + updated_state["error_message"] = None |
| 61 | + logger.debug("Guardrails disabled - prompt passed through without validation") |
| 62 | + |
| 63 | + # try: |
| 64 | + # # Validate the prompt using Guardrails |
| 65 | + # validation_result = _guard_inicial.validate(prompt) |
| 66 | + # |
| 67 | + # # Check if validation passed |
| 68 | + # # The validator returns ValidationResult with outcome |
| 69 | + # # If validation fails, outcome will indicate failure |
| 70 | + # if validation_result.validation_passed: |
| 71 | + # updated_state["is_malicious"] = False |
| 72 | + # updated_state["error_message"] = None |
| 73 | + # logger.debug("Prompt passed jailbreak and toxic language detection") |
| 74 | + # else: |
| 75 | + # # Jailbreak or toxic language detected |
| 76 | + # updated_state["is_malicious"] = True |
| 77 | + # updated_state["error_message"] = ( |
| 78 | + # "Contenido malicioso detectado. Tu solicitud contiene contenido que viola las políticas de seguridad." |
| 79 | + # ) |
| 80 | + # logger.warning("Malicious content detected. Prompt content not logged for security.") |
| 81 | + # |
| 82 | + # except Exception as e: |
| 83 | + # # If validation fails due to error, log it but don't block the request |
| 84 | + # # This is a safety measure - if Guardrails fails, we allow the request |
| 85 | + # # but log the error for monitoring |
| 86 | + # logger.error(f"Error during jailbreak detection: {e}") |
| 87 | + # updated_state["is_malicious"] = False |
| 88 | + # updated_state["error_message"] = None |
82 | 89 |
|
83 | 90 | return updated_state |
0 commit comments