From dd8ee5fc53ad460c408ab22904bde5fb48feda23 Mon Sep 17 00:00:00 2001 From: JPAmorin Date: Sun, 14 Dec 2025 18:21:21 -0300 Subject: [PATCH 1/2] Separated guard and fallback logic into initial and final. --- RAGManager/app/agents/graph.py | 64 +++++++++++++++++++--------------- 1 file changed, 36 insertions(+), 28 deletions(-) diff --git a/RAGManager/app/agents/graph.py b/RAGManager/app/agents/graph.py index bbe2040..056a4d1 100644 --- a/RAGManager/app/agents/graph.py +++ b/RAGManager/app/agents/graph.py @@ -8,11 +8,15 @@ fallback_final, fallback_inicial, generator, - guard, + guard_final, + guard_inicial, parafraseo, retriever, ) -from app.agents.routing import route_after_fallback_final, route_after_guard +from app.agents.routing import ( + route_after_guard_final, + route_after_guard_inicial, +) from app.agents.state import AgentState @@ -22,14 +26,15 @@ def create_agent_graph() -> StateGraph: The graph implements the following flow: 1. START -> agent_host (Nodo 1) - 2. agent_host -> guard (Nodo 2) - 3. guard -> [conditional] -> fallback_inicial (Nodo 3) or END - 4. fallback_inicial -> parafraseo (Nodo 4) - 5. parafraseo -> retriever (Nodo 5) - 6. retriever -> context_builder (Nodo 6) - 7. context_builder -> generator (Nodo 7) - 8. generator -> fallback_final (Nodo 8) - 9. fallback_final -> [conditional] -> END (with final_response) or END (with error) + 2. agent_host -> guard_inicial (Nodo 2) + 3. guard_inicial -> [conditional] -> fallback_inicial (if malicious) or parafraseo (if continue) + 4. fallback_inicial -> END (error: breaks rules) + 5. parafraseo -> retriever (Nodo 4) + 6. retriever -> context_builder (Nodo 5) + 7. context_builder -> generator (Nodo 6) + 8. generator -> guard_final (Nodo 7) + 9. guard_final -> [conditional] -> fallback_final (if risky) or END (if continue) + 10. fallback_final -> END (error: classified info) Returns: Configured StateGraph instance ready for execution @@ -39,33 +44,34 @@ def create_agent_graph() -> StateGraph: # Add nodes workflow.add_node("agent_host", agent_host) - workflow.add_node("guard", guard) + workflow.add_node("guard_inicial", guard_inicial) workflow.add_node("fallback_inicial", fallback_inicial) workflow.add_node("parafraseo", parafraseo) workflow.add_node("retriever", retriever) workflow.add_node("context_builder", context_builder) workflow.add_node("generator", generator) + workflow.add_node("guard_final", guard_final) workflow.add_node("fallback_final", fallback_final) # Define edges # Start -> agent_host workflow.add_edge(START, "agent_host") - # agent_host -> guard - workflow.add_edge("agent_host", "guard") + # agent_host -> guard_inicial + workflow.add_edge("agent_host", "guard_inicial") - # guard -> conditional routing + # guard_inicial -> conditional routing workflow.add_conditional_edges( - "guard", - route_after_guard, + "guard_inicial", + route_after_guard_inicial, { - "malicious": END, # End with error if malicious - "continue": "fallback_inicial", # Continue to fallback_inicial if valid + "malicious": "fallback_inicial", # Exception path: malicious content detected + "continue": "parafraseo", # Normal path: continue processing }, ) - # fallback_inicial -> parafraseo - workflow.add_edge("fallback_inicial", "parafraseo") + # fallback_inicial -> END (stop flow with error message) + workflow.add_edge("fallback_inicial", END) # parafraseo -> retriever workflow.add_edge("parafraseo", "retriever") @@ -77,19 +83,21 @@ def create_agent_graph() -> StateGraph: # Note: Primary LLM is called within context_builder node workflow.add_edge("context_builder", "generator") - # generator -> fallback_final - workflow.add_edge("generator", "fallback_final") + # generator -> guard_final + workflow.add_edge("generator", "guard_final") - # fallback_final -> conditional routing + # guard_final -> conditional routing workflow.add_conditional_edges( - "fallback_final", - route_after_fallback_final, + "guard_final", + route_after_guard_final, { - "risky": END, # End with error if risky - "continue": END, # End with final_response if valid - # Note: Final LLM is called within fallback_final node + "risky": "fallback_final", # Exception path: risky content detected + "continue": END, # Normal path: end successfully }, ) + # fallback_final -> END (stop flow with error message) + workflow.add_edge("fallback_final", END) + # Compile the graph return workflow.compile() From 28a971fe0df9d9750b6ef8374e6f2fbae29b1a32 Mon Sep 17 00:00:00 2001 From: JPAmorin Date: Sun, 14 Dec 2025 19:29:16 -0300 Subject: [PATCH 2/2] Edited logging of PII containing responses and Jailbreak attempting prompts. --- RAGManager/app/agents/nodes/__init__.py | 6 +- RAGManager/app/agents/nodes/fallback_final.py | 36 ++++----- .../app/agents/nodes/fallback_inicial.py | 36 +++------ RAGManager/app/agents/nodes/guard_final.py | 74 +++++++++++++++++++ .../nodes/{guard.py => guard_inicial.py} | 12 +-- RAGManager/app/agents/routing.py | 8 +- RAGManager/app/core/config.py | 13 ++++ RAGManager/pyproject.toml | 4 +- 8 files changed, 126 insertions(+), 63 deletions(-) create mode 100644 RAGManager/app/agents/nodes/guard_final.py rename RAGManager/app/agents/nodes/{guard.py => guard_inicial.py} (84%) diff --git a/RAGManager/app/agents/nodes/__init__.py b/RAGManager/app/agents/nodes/__init__.py index 0525938..3f3f674 100644 --- a/RAGManager/app/agents/nodes/__init__.py +++ b/RAGManager/app/agents/nodes/__init__.py @@ -5,13 +5,15 @@ from app.agents.nodes.fallback_final import fallback_final from app.agents.nodes.fallback_inicial import fallback_inicial from app.agents.nodes.generator import generator -from app.agents.nodes.guard import guard +from app.agents.nodes.guard_final import guard_final +from app.agents.nodes.guard_inicial import guard_inicial from app.agents.nodes.parafraseo import parafraseo from app.agents.nodes.retriever import retriever __all__ = [ "agent_host", - "guard", + "guard_inicial", + "guard_final", "fallback_inicial", "parafraseo", "retriever", diff --git a/RAGManager/app/agents/nodes/fallback_final.py b/RAGManager/app/agents/nodes/fallback_final.py index 5c9f31d..d6b1d73 100644 --- a/RAGManager/app/agents/nodes/fallback_final.py +++ b/RAGManager/app/agents/nodes/fallback_final.py @@ -1,40 +1,30 @@ -"""Nodo 8: Fallback Final - Final validation for risky/sensitive content.""" +"""Nodo 8: Fallback Final - Stops processing when risky content is detected.""" + +import logging from app.agents.state import AgentState +logger = logging.getLogger(__name__) + def fallback_final(state: AgentState) -> AgentState: """ - Fallback Final node - Validates response for risky/sensitive content. + Fallback Final node - Stops processing when risky content is detected. This node: - 1. Analyzes the generated response for risky/sensitive content - 2. Sets is_risky flag - 3. Sets error_message if risky content is detected - 4. If valid, calls Final LLM to generate final response + 1. Sets error message indicating that the information requested is classified or not free to know + 2. Stops the flow by routing to END Args: - state: Agent state containing generated_response + state: Agent state containing the response flagged as risky Returns: - Updated state with is_risky, error_message, and final_response set + Updated state with error_message set, ready to route to END """ - # TODO: Implement risky content detection and final LLM call - # This should: - # 1. Check generated_response for sensitive/risky content - # 2. Set is_risky = True if risky content is detected - # 3. Set error_message with appropriate message if risky - # 4. If not risky, call Final LLM with generated_response - # 5. Store Final LLM response in final_response - - # Placeholder: For now, we'll assume all responses are safe updated_state = state.copy() - updated_state["is_risky"] = False - updated_state["error_message"] = None - # TODO: Call Final LLM here if not risky - # if not updated_state["is_risky"]: - # updated_state["final_response"] = call_final_llm(updated_state["generated_response"]) - updated_state["final_response"] = updated_state.get("generated_response") + # Set error message for risky content + updated_state["error_message"] = "The information requested is classified or not free to know." + logger.warning("Risky content detected. Stopping processing. Response content not logged for security.") return updated_state diff --git a/RAGManager/app/agents/nodes/fallback_inicial.py b/RAGManager/app/agents/nodes/fallback_inicial.py index 735e7d1..e687bf8 100644 --- a/RAGManager/app/agents/nodes/fallback_inicial.py +++ b/RAGManager/app/agents/nodes/fallback_inicial.py @@ -1,4 +1,4 @@ -"""Nodo 3: Fallback Inicial - Initial fallback processing.""" +"""Nodo 3: Fallback Inicial - Stops processing when malicious content is detected.""" import logging @@ -9,40 +9,22 @@ def fallback_inicial(state: AgentState) -> AgentState: """ - Fallback Inicial node - Performs initial fallback processing. + Fallback Inicial node - Stops processing when malicious content is detected. This node: - 1. Defensively checks if the prompt was flagged as malicious - 2. Adjusts the text if needed (e.g., formatting, normalization) - 3. Prepares text for paraphrasing step + 1. Sets error message indicating that the user's intentions break the chatbot's rules + 2. Stops the flow by routing to END Args: - state: Agent state containing the prompt or initial context + state: Agent state containing the prompt flagged as malicious Returns: - Updated state with adjusted_text set (if applicable) or error_message if malicious + Updated state with error_message set, ready to route to END """ updated_state = state.copy() - # Defensive check: Verify that the prompt was not flagged as malicious - # This should not happen due to routing, but serves as an extra safety layer - if state.get("is_malicious", False): - logger.warning( - "Defensive check triggered: Malicious prompt reached fallback_inicial node. " - "This indicates a potential routing issue." - ) - updated_state["error_message"] = "The requested information or action is not possible by the agent." - updated_state["adjusted_text"] = None - return updated_state - - # TODO: Implement initial fallback logic - # This should: - # 1. Normalize text (remove extra spaces, fix encoding, etc.) - # 2. Apply any necessary text adjustments - # 3. Set adjusted_text if adjustments were made, otherwise None - - # Placeholder: For now, we'll use the prompt as-is - prompt = state.get("prompt", "") - updated_state["adjusted_text"] = prompt if prompt else None + # Set error message for malicious content + updated_state["error_message"] = "The user's intentions break the chatbot's rules." + logger.warning("Malicious content detected. Stopping processing. Prompt content not logged for security.") return updated_state diff --git a/RAGManager/app/agents/nodes/guard_final.py b/RAGManager/app/agents/nodes/guard_final.py new file mode 100644 index 0000000..3e017c3 --- /dev/null +++ b/RAGManager/app/agents/nodes/guard_final.py @@ -0,0 +1,74 @@ +"""Nodo Guard Final - Validates generated response for PII (risky information detection).""" + +import logging + +from guardrails import Guard +from guardrails.hub import DetectPII + +from app.agents.state import AgentState +from app.core.config import settings + +logger = logging.getLogger(__name__) + +# Initialize Guard with DetectPII validator +# Note: The validator must be installed via: guardrails hub install hub://guardrails/detect_pii +_guard_final = Guard().use( + DetectPII( + pii_entities=settings.guardrails_pii_entities, + on_fail="noop", # Don't raise exceptions, handle via state flags + ) +) + + +def guard_final(state: AgentState) -> AgentState: + """ + Guard final node - Validates generated response for PII using Guardrails DetectPII. + + This node: + 1. Validates the generated_response using Guardrails DetectPII validator + 2. Sets is_risky flag if PII is detected + 3. Sets error_message if risky content is detected + + Args: + state: Agent state containing the generated_response + + Returns: + Updated state with is_risky and error_message set + """ + updated_state = state.copy() + generated_response = state.get("generated_response", "") + + if not generated_response: + # Empty response is considered safe + updated_state["is_risky"] = False + updated_state["error_message"] = None + return updated_state + + try: + # Validate the generated response using Guardrails + validation_result = _guard_final.validate(generated_response) + + # Check if validation passed + # The validator returns ValidationResult with outcome + # If validation fails, outcome will indicate failure + if validation_result.validation_passed: + updated_state["is_risky"] = False + updated_state["error_message"] = None + logger.debug("Generated response passed PII detection") + else: + # PII detected + updated_state["is_risky"] = True + updated_state["error_message"] = ( + "PII detected in generated response. The information requested is classified or not free to know." + ) + logger.warning("PII detected in generated response. Response content not logged for security.") + + except Exception as e: + # If validation fails due to error, log it but don't block the request + # This is a safety measure - if Guardrails fails, we allow the request + # but log the error for monitoring + logger.error(f"Error during PII detection: {e}") + updated_state["is_risky"] = False + updated_state["error_message"] = None + + return updated_state diff --git a/RAGManager/app/agents/nodes/guard.py b/RAGManager/app/agents/nodes/guard_inicial.py similarity index 84% rename from RAGManager/app/agents/nodes/guard.py rename to RAGManager/app/agents/nodes/guard_inicial.py index af8af2c..28b510c 100644 --- a/RAGManager/app/agents/nodes/guard.py +++ b/RAGManager/app/agents/nodes/guard_inicial.py @@ -1,4 +1,4 @@ -"""Nodo 2: Guard - Validates for malicious content.""" +"""Nodo 2: Guard Inicial - Validates for malicious content (jailbreak detection).""" import logging @@ -12,7 +12,7 @@ # Initialize Guard with DetectJailbreak validator # Note: The validator must be installed via: guardrails hub install hub://guardrails/detect_jailbreak -_guard = Guard().use( +_guard_inicial = Guard().use( DetectJailbreak( threshold=settings.guardrails_jailbreak_threshold, device=settings.guardrails_device, @@ -21,9 +21,9 @@ ) -def guard(state: AgentState) -> AgentState: +def guard_inicial(state: AgentState) -> AgentState: """ - Guard node - Validates user input for malicious content using Guardrails DetectJailbreak. + Guard inicial node - Validates user input for jailbreak attempts using Guardrails DetectJailbreak. This node: 1. Validates the prompt using Guardrails DetectJailbreak validator @@ -47,7 +47,7 @@ def guard(state: AgentState) -> AgentState: try: # Validate the prompt using Guardrails - validation_result = _guard.validate(prompt) + validation_result = _guard_inicial.validate(prompt) # Check if validation passed # The validator returns ValidationResult with outcome @@ -62,7 +62,7 @@ def guard(state: AgentState) -> AgentState: updated_state["error_message"] = ( "Jailbreak attempt detected. Your request contains content that violates security policies." ) - logger.warning(f"Jailbreak attempt detected in prompt: {prompt[:100]}...") + logger.warning("Jailbreak attempt detected. Prompt content not logged for security.") except Exception as e: # If validation fails due to error, log it but don't block the request diff --git a/RAGManager/app/agents/routing.py b/RAGManager/app/agents/routing.py index cd88a77..5807313 100644 --- a/RAGManager/app/agents/routing.py +++ b/RAGManager/app/agents/routing.py @@ -3,9 +3,9 @@ from app.agents.state import AgentState -def route_after_guard(state: AgentState) -> str: +def route_after_guard_inicial(state: AgentState) -> str: """ - Route after Guard node (Nodo 2) validation. + Route after Guard Inicial node validation. Determines the next step based on whether the prompt was flagged as malicious. @@ -20,9 +20,9 @@ def route_after_guard(state: AgentState) -> str: return "continue" -def route_after_fallback_final(state: AgentState) -> str: +def route_after_guard_final(state: AgentState) -> str: """ - Route after Fallback Final node (Nodo 8) validation. + Route after Guard Final node validation. Determines the next step based on whether the response was flagged as risky. diff --git a/RAGManager/app/core/config.py b/RAGManager/app/core/config.py index d9e5a7d..b96d119 100644 --- a/RAGManager/app/core/config.py +++ b/RAGManager/app/core/config.py @@ -47,6 +47,19 @@ class Settings(BaseSettings): default="cpu", description="Device for model inference.", ) + guardrails_pii_entities: list[str] = Field( + default=[ + "EMAIL_ADDRESS", + "PHONE_NUMBER", + "CREDIT_CARD", + "SSN", + "US_PASSPORT", + "US_DRIVER_LICENSE", + "IBAN_CODE", + "IP_ADDRESS", + ], + description="List of PII entity types to detect using DetectPII validator.", + ) model_config = SettingsConfigDict( env_file=".env", env_file_encoding="utf-8", diff --git a/RAGManager/pyproject.toml b/RAGManager/pyproject.toml index 284bbb7..e7191ac 100644 --- a/RAGManager/pyproject.toml +++ b/RAGManager/pyproject.toml @@ -17,7 +17,9 @@ dependencies = [ "pydantic-settings>=2.0.0", "typing-extensions>=4.15.0", "uvicorn>=0.38.0", - "guardrails-ai>=0.5.10", + "guardrails-ai>=0.6.2", + "presidio-analyzer>=2.2.360", + "presidio-anonymizer>=2.2.360", ] [project.optional-dependencies]