From dd8ee5fc53ad460c408ab22904bde5fb48feda23 Mon Sep 17 00:00:00 2001
From: JPAmorin <juanpabloamorinjusto@gmail.com>
Date: Sun, 14 Dec 2025 18:21:21 -0300
Subject: [PATCH 1/2] Separated guard and fallback logic into initial and
 final stages.

---
 RAGManager/app/agents/graph.py | 64 +++++++++++++++++++---------------
 1 file changed, 36 insertions(+), 28 deletions(-)

diff --git a/RAGManager/app/agents/graph.py b/RAGManager/app/agents/graph.py
index bbe2040..056a4d1 100644
--- a/RAGManager/app/agents/graph.py
+++ b/RAGManager/app/agents/graph.py
@@ -8,11 +8,15 @@
     fallback_final,
     fallback_inicial,
     generator,
-    guard,
+    guard_final,
+    guard_inicial,
     parafraseo,
     retriever,
 )
-from app.agents.routing import route_after_fallback_final, route_after_guard
+from app.agents.routing import (
+    route_after_guard_final,
+    route_after_guard_inicial,
+)
 from app.agents.state import AgentState
 
 
@@ -22,14 +26,15 @@ def create_agent_graph() -> StateGraph:
 
     The graph implements the following flow:
     1. START -> agent_host (Nodo 1)
-    2. agent_host -> guard (Nodo 2)
-    3. guard -> [conditional] -> fallback_inicial (Nodo 3) or END
-    4. fallback_inicial -> parafraseo (Nodo 4)
-    5. parafraseo -> retriever (Nodo 5)
-    6. retriever -> context_builder (Nodo 6)
-    7. context_builder -> generator (Nodo 7)
-    8. generator -> fallback_final (Nodo 8)
-    9. fallback_final -> [conditional] -> END (with final_response) or END (with error)
+    2. agent_host -> guard_inicial (Nodo 2)
+    3. guard_inicial -> [conditional] -> fallback_inicial (if malicious) or parafraseo (if continue)
+    4. fallback_inicial -> END (error: breaks rules)
+    5. parafraseo (Nodo 4) -> retriever
+    6. retriever (Nodo 5) -> context_builder
+    7. context_builder (Nodo 6) -> generator
+    8. generator (Nodo 7) -> guard_final
+    9. guard_final -> [conditional] -> fallback_final (if risky) or END (if continue)
+    10. fallback_final -> END (error: classified info)
 
     Returns:
         Configured StateGraph instance ready for execution
@@ -39,33 +44,34 @@ def create_agent_graph() -> StateGraph:
 
     # Add nodes
     workflow.add_node("agent_host", agent_host)
-    workflow.add_node("guard", guard)
+    workflow.add_node("guard_inicial", guard_inicial)
     workflow.add_node("fallback_inicial", fallback_inicial)
     workflow.add_node("parafraseo", parafraseo)
     workflow.add_node("retriever", retriever)
     workflow.add_node("context_builder", context_builder)
     workflow.add_node("generator", generator)
+    workflow.add_node("guard_final", guard_final)
     workflow.add_node("fallback_final", fallback_final)
 
     # Define edges
     # Start -> agent_host
     workflow.add_edge(START, "agent_host")
 
-    # agent_host -> guard
-    workflow.add_edge("agent_host", "guard")
+    # agent_host -> guard_inicial
+    workflow.add_edge("agent_host", "guard_inicial")
 
-    # guard -> conditional routing
+    # guard_inicial -> conditional routing
     workflow.add_conditional_edges(
-        "guard",
-        route_after_guard,
+        "guard_inicial",
+        route_after_guard_inicial,
         {
-            "malicious": END,  # End with error if malicious
-            "continue": "fallback_inicial",  # Continue to fallback_inicial if valid
+            "malicious": "fallback_inicial",  # Exception path: malicious content detected
+            "continue": "parafraseo",  # Normal path: continue processing
         },
     )
 
-    # fallback_inicial -> parafraseo
-    workflow.add_edge("fallback_inicial", "parafraseo")
+    # fallback_inicial -> END (stop flow with error message)
+    workflow.add_edge("fallback_inicial", END)
 
     # parafraseo -> retriever
     workflow.add_edge("parafraseo", "retriever")
@@ -77,19 +83,21 @@ def create_agent_graph() -> StateGraph:
     # Note: Primary LLM is called within context_builder node
     workflow.add_edge("context_builder", "generator")
 
-    # generator -> fallback_final
-    workflow.add_edge("generator", "fallback_final")
+    # generator -> guard_final
+    workflow.add_edge("generator", "guard_final")
 
-    # fallback_final -> conditional routing
+    # guard_final -> conditional routing
     workflow.add_conditional_edges(
-        "fallback_final",
-        route_after_fallback_final,
+        "guard_final",
+        route_after_guard_final,
         {
-            "risky": END,  # End with error if risky
-            "continue": END,  # End with final_response if valid
-            # Note: Final LLM is called within fallback_final node
+            "risky": "fallback_final",  # Exception path: risky content detected
+            "continue": END,  # Normal path: end successfully
         },
     )
 
+    # fallback_final -> END (stop flow with error message)
+    workflow.add_edge("fallback_final", END)
+
     # Compile the graph
     return workflow.compile()

From 28a971fe0df9d9750b6ef8374e6f2fbae29b1a32 Mon Sep 17 00:00:00 2001
From: JPAmorin <juanpabloamorinjusto@gmail.com>
Date: Sun, 14 Dec 2025 19:29:16 -0300
Subject: [PATCH 2/2] Edited logging of PII-containing responses and
 jailbreak-attempt prompts.

---
 RAGManager/app/agents/nodes/__init__.py       |  6 +-
 RAGManager/app/agents/nodes/fallback_final.py | 36 ++++-----
 .../app/agents/nodes/fallback_inicial.py      | 36 +++------
 RAGManager/app/agents/nodes/guard_final.py    | 74 +++++++++++++++++++
 .../nodes/{guard.py => guard_inicial.py}      | 12 +--
 RAGManager/app/agents/routing.py              |  8 +-
 RAGManager/app/core/config.py                 | 13 ++++
 RAGManager/pyproject.toml                     |  4 +-
 8 files changed, 126 insertions(+), 63 deletions(-)
 create mode 100644 RAGManager/app/agents/nodes/guard_final.py
 rename RAGManager/app/agents/nodes/{guard.py => guard_inicial.py} (84%)

diff --git a/RAGManager/app/agents/nodes/__init__.py b/RAGManager/app/agents/nodes/__init__.py
index 0525938..3f3f674 100644
--- a/RAGManager/app/agents/nodes/__init__.py
+++ b/RAGManager/app/agents/nodes/__init__.py
@@ -5,13 +5,15 @@
 from app.agents.nodes.fallback_final import fallback_final
 from app.agents.nodes.fallback_inicial import fallback_inicial
 from app.agents.nodes.generator import generator
-from app.agents.nodes.guard import guard
+from app.agents.nodes.guard_final import guard_final
+from app.agents.nodes.guard_inicial import guard_inicial
 from app.agents.nodes.parafraseo import parafraseo
 from app.agents.nodes.retriever import retriever
 
 __all__ = [
     "agent_host",
-    "guard",
+    "guard_inicial",
+    "guard_final",
     "fallback_inicial",
     "parafraseo",
     "retriever",
diff --git a/RAGManager/app/agents/nodes/fallback_final.py b/RAGManager/app/agents/nodes/fallback_final.py
index 5c9f31d..d6b1d73 100644
--- a/RAGManager/app/agents/nodes/fallback_final.py
+++ b/RAGManager/app/agents/nodes/fallback_final.py
@@ -1,40 +1,30 @@
-"""Nodo 8: Fallback Final - Final validation for risky/sensitive content."""
+"""Nodo 8: Fallback Final - Stops processing when risky content is detected."""
+
+import logging
 
 from app.agents.state import AgentState
 
+logger = logging.getLogger(__name__)
+
 
 def fallback_final(state: AgentState) -> AgentState:
     """
-    Fallback Final node - Validates response for risky/sensitive content.
+    Fallback Final node - Stops processing when risky content is detected.
 
     This node:
-    1. Analyzes the generated response for risky/sensitive content
-    2. Sets is_risky flag
-    3. Sets error_message if risky content is detected
-    4. If valid, calls Final LLM to generate final response
+    1. Sets error message indicating that the information requested is classified or not free to know
+    2. Stops the flow by routing to END
 
     Args:
-        state: Agent state containing generated_response
+        state: Agent state containing the response flagged as risky
 
     Returns:
-        Updated state with is_risky, error_message, and final_response set
+        Updated state with error_message set, ready to route to END
     """
-    # TODO: Implement risky content detection and final LLM call
-    # This should:
-    # 1. Check generated_response for sensitive/risky content
-    # 2. Set is_risky = True if risky content is detected
-    # 3. Set error_message with appropriate message if risky
-    # 4. If not risky, call Final LLM with generated_response
-    # 5. Store Final LLM response in final_response
-
-    # Placeholder: For now, we'll assume all responses are safe
     updated_state = state.copy()
-    updated_state["is_risky"] = False
-    updated_state["error_message"] = None
 
-    # TODO: Call Final LLM here if not risky
-    # if not updated_state["is_risky"]:
-    #     updated_state["final_response"] = call_final_llm(updated_state["generated_response"])
-    updated_state["final_response"] = updated_state.get("generated_response")
+    # Set error message for risky content
+    updated_state["error_message"] = "The information requested is classified or not free to know."
+    logger.warning("Risky content detected. Stopping processing. Response content not logged for security.")
 
     return updated_state
diff --git a/RAGManager/app/agents/nodes/fallback_inicial.py b/RAGManager/app/agents/nodes/fallback_inicial.py
index 735e7d1..e687bf8 100644
--- a/RAGManager/app/agents/nodes/fallback_inicial.py
+++ b/RAGManager/app/agents/nodes/fallback_inicial.py
@@ -1,4 +1,4 @@
-"""Nodo 3: Fallback Inicial - Initial fallback processing."""
+"""Nodo 3: Fallback Inicial - Stops processing when malicious content is detected."""
 
 import logging
 
@@ -9,40 +9,22 @@
 
 def fallback_inicial(state: AgentState) -> AgentState:
     """
-    Fallback Inicial node - Performs initial fallback processing.
+    Fallback Inicial node - Stops processing when malicious content is detected.
 
     This node:
-    1. Defensively checks if the prompt was flagged as malicious
-    2. Adjusts the text if needed (e.g., formatting, normalization)
-    3. Prepares text for paraphrasing step
+    1. Sets error message indicating that the user's intentions break the chatbot's rules
+    2. Stops the flow by routing to END
 
     Args:
-        state: Agent state containing the prompt or initial context
+        state: Agent state containing the prompt flagged as malicious
 
     Returns:
-        Updated state with adjusted_text set (if applicable) or error_message if malicious
+        Updated state with error_message set, ready to route to END
     """
     updated_state = state.copy()
 
-    # Defensive check: Verify that the prompt was not flagged as malicious
-    # This should not happen due to routing, but serves as an extra safety layer
-    if state.get("is_malicious", False):
-        logger.warning(
-            "Defensive check triggered: Malicious prompt reached fallback_inicial node. "
-            "This indicates a potential routing issue."
-        )
-        updated_state["error_message"] = "The requested information or action is not possible by the agent."
-        updated_state["adjusted_text"] = None
-        return updated_state
-
-    # TODO: Implement initial fallback logic
-    # This should:
-    # 1. Normalize text (remove extra spaces, fix encoding, etc.)
-    # 2. Apply any necessary text adjustments
-    # 3. Set adjusted_text if adjustments were made, otherwise None
-
-    # Placeholder: For now, we'll use the prompt as-is
-    prompt = state.get("prompt", "")
-    updated_state["adjusted_text"] = prompt if prompt else None
+    # Set error message for malicious content
+    updated_state["error_message"] = "The user's intentions break the chatbot's rules."
+    logger.warning("Malicious content detected. Stopping processing. Prompt content not logged for security.")
 
     return updated_state
diff --git a/RAGManager/app/agents/nodes/guard_final.py b/RAGManager/app/agents/nodes/guard_final.py
new file mode 100644
index 0000000..3e017c3
--- /dev/null
+++ b/RAGManager/app/agents/nodes/guard_final.py
@@ -0,0 +1,74 @@
+"""Nodo Guard Final - Validates generated response for PII (risky information detection)."""
+
+import logging
+
+from guardrails import Guard
+from guardrails.hub import DetectPII
+
+from app.agents.state import AgentState
+from app.core.config import settings
+
+logger = logging.getLogger(__name__)
+
+# Initialize Guard with DetectPII validator
+# Note: The validator must be installed via: guardrails hub install hub://guardrails/detect_pii
+_guard_final = Guard().use(
+    DetectPII(
+        pii_entities=settings.guardrails_pii_entities,
+        on_fail="noop",  # Don't raise exceptions, handle via state flags
+    )
+)
+
+
+def guard_final(state: AgentState) -> AgentState:
+    """
+    Guard final node - Validates generated response for PII using Guardrails DetectPII.
+
+    This node:
+    1. Validates the generated_response using Guardrails DetectPII validator
+    2. Sets is_risky flag if PII is detected
+    3. Sets error_message if risky content is detected
+
+    Args:
+        state: Agent state containing the generated_response
+
+    Returns:
+        Updated state with is_risky and error_message set
+    """
+    updated_state = state.copy()
+    generated_response = state.get("generated_response", "")
+
+    if not generated_response:
+        # Empty response is considered safe
+        updated_state["is_risky"] = False
+        updated_state["error_message"] = None
+        return updated_state
+
+    try:
+        # Validate the generated response using Guardrails
+        validation_result = _guard_final.validate(generated_response)
+
+        # Check if validation passed
+        # Guard.validate() returns a ValidationOutcome; its validation_passed
+        # attribute is False when the DetectPII validator reports a failure
+        if validation_result.validation_passed:
+            updated_state["is_risky"] = False
+            updated_state["error_message"] = None
+            logger.debug("Generated response passed PII detection")
+        else:
+            # PII detected
+            updated_state["is_risky"] = True
+            updated_state["error_message"] = (
+                "PII detected in generated response. The information requested is classified or not free to know."
+            )
+            logger.warning("PII detected in generated response. Response content not logged for security.")
+
+    except Exception as e:
+        # If the validator itself errors, log it but don't block the response.
+        # NOTE: this fails open - if Guardrails fails, the response is treated as
+        # not risky and no PII check is applied, so monitor this error log
+        logger.error(f"Error during PII detection: {e}")
+        updated_state["is_risky"] = False
+        updated_state["error_message"] = None
+
+    return updated_state
diff --git a/RAGManager/app/agents/nodes/guard.py b/RAGManager/app/agents/nodes/guard_inicial.py
similarity index 84%
rename from RAGManager/app/agents/nodes/guard.py
rename to RAGManager/app/agents/nodes/guard_inicial.py
index af8af2c..28b510c 100644
--- a/RAGManager/app/agents/nodes/guard.py
+++ b/RAGManager/app/agents/nodes/guard_inicial.py
@@ -1,4 +1,4 @@
-"""Nodo 2: Guard - Validates for malicious content."""
+"""Nodo 2: Guard Inicial - Validates for malicious content (jailbreak detection)."""
 
 import logging
 
@@ -12,7 +12,7 @@
 
 # Initialize Guard with DetectJailbreak validator
 # Note: The validator must be installed via: guardrails hub install hub://guardrails/detect_jailbreak
-_guard = Guard().use(
+_guard_inicial = Guard().use(
     DetectJailbreak(
         threshold=settings.guardrails_jailbreak_threshold,
         device=settings.guardrails_device,
@@ -21,9 +21,9 @@
 )
 
 
-def guard(state: AgentState) -> AgentState:
+def guard_inicial(state: AgentState) -> AgentState:
     """
-    Guard node - Validates user input for malicious content using Guardrails DetectJailbreak.
+    Guard inicial node - Validates user input for jailbreak attempts using Guardrails DetectJailbreak.
 
     This node:
     1. Validates the prompt using Guardrails DetectJailbreak validator
@@ -47,7 +47,7 @@ def guard(state: AgentState) -> AgentState:
 
     try:
         # Validate the prompt using Guardrails
-        validation_result = _guard.validate(prompt)
+        validation_result = _guard_inicial.validate(prompt)
 
         # Check if validation passed
         # The validator returns ValidationResult with outcome
@@ -62,7 +62,7 @@ def guard(state: AgentState) -> AgentState:
             updated_state["error_message"] = (
                 "Jailbreak attempt detected. Your request contains content that violates security policies."
             )
-            logger.warning(f"Jailbreak attempt detected in prompt: {prompt[:100]}...")
+            logger.warning("Jailbreak attempt detected. Prompt content not logged for security.")
 
     except Exception as e:
         # If validation fails due to error, log it but don't block the request
diff --git a/RAGManager/app/agents/routing.py b/RAGManager/app/agents/routing.py
index cd88a77..5807313 100644
--- a/RAGManager/app/agents/routing.py
+++ b/RAGManager/app/agents/routing.py
@@ -3,9 +3,9 @@
 from app.agents.state import AgentState
 
 
-def route_after_guard(state: AgentState) -> str:
+def route_after_guard_inicial(state: AgentState) -> str:
     """
-    Route after Guard node (Nodo 2) validation.
+    Route after Guard Inicial node validation.
 
     Determines the next step based on whether the prompt was flagged as malicious.
 
@@ -20,9 +20,9 @@ def route_after_guard(state: AgentState) -> str:
     return "continue"
 
 
-def route_after_fallback_final(state: AgentState) -> str:
+def route_after_guard_final(state: AgentState) -> str:
     """
-    Route after Fallback Final node (Nodo 8) validation.
+    Route after Guard Final node validation.
 
     Determines the next step based on whether the response was flagged as risky.
 
diff --git a/RAGManager/app/core/config.py b/RAGManager/app/core/config.py
index d9e5a7d..b96d119 100644
--- a/RAGManager/app/core/config.py
+++ b/RAGManager/app/core/config.py
@@ -47,6 +47,19 @@ class Settings(BaseSettings):
         default="cpu",
         description="Device for model inference.",
     )
+    guardrails_pii_entities: list[str] = Field(
+        default=[
+            "EMAIL_ADDRESS",
+            "PHONE_NUMBER",
+            "CREDIT_CARD",
+            "SSN",
+            "US_PASSPORT",
+            "US_DRIVER_LICENSE",
+            "IBAN_CODE",
+            "IP_ADDRESS",
+        ],
+        description="List of PII entity types to detect using DetectPII validator.",
+    )
     model_config = SettingsConfigDict(
         env_file=".env",
         env_file_encoding="utf-8",
diff --git a/RAGManager/pyproject.toml b/RAGManager/pyproject.toml
index 284bbb7..e7191ac 100644
--- a/RAGManager/pyproject.toml
+++ b/RAGManager/pyproject.toml
@@ -17,7 +17,9 @@ dependencies = [
     "pydantic-settings>=2.0.0",
     "typing-extensions>=4.15.0",
     "uvicorn>=0.38.0",
-    "guardrails-ai>=0.5.10",
+    "guardrails-ai>=0.6.2",
+    "presidio-analyzer>=2.2.360",
+    "presidio-anonymizer>=2.2.360",
 ]
 
 [project.optional-dependencies]