amd
diff --git a/src/gaia/agents/base/agent.py b/src/gaia/agents/base/agent.py
Lines changed: 57 additions & 25 deletions
diff --git a/src/gaia/agents/builder/agent.py b/src/gaia/agents/builder/agent.py
Lines changed: 54 additions & 9 deletions
diff --git a/src/gaia/agents/builder/system_prompt.py b/src/gaia/agents/builder/system_prompt.py
Lines changed: 7 additions & 0 deletions
diff --git a/src/gaia/agents/registry.py b/src/gaia/agents/registry.py
Lines changed: 29 additions & 0 deletions
@@ -695,6 +695,13 @@ def _extract_embedded_tool_call(self, response: str) -> Optional[Dict[str, Any]]
         "Let me search for that.\n{"thought": "...", "tool": "query_documents",
          "tool_args": {"query": "..."}}"
 
+        Decision logic over all {…} candidates that contain a "tool" key,
+        each tagged fenced/unfenced:
+          1. ≥1 unfenced candidate → return the first (unchanged — zero regression).
+          2. else exactly one fenced candidate → return it (the fix for #1428).
+          3. else >1 fenced, 0 unfenced → ambiguous (looks like docs) → None + warning.
+          4. else → None.
+
         This method finds the JSON block using brace-depth matching and returns
         the parsed tool call if it contains a "tool" key.  Returns None if no
         embedded tool call is found, allowing the caller to treat the response
@@ -705,7 +712,6 @@ def _extract_embedded_tool_call(self, response: str) -> Optional[Dict[str, Any]]
             return None
 
         # Build a set of character ranges inside code fences (```...```)
-        # so we don't accidentally extract example JSON from markdown.
         _code_ranges: list[tuple[int, int]] = []
         _search_from = 0
         while True:
@@ -723,17 +729,31 @@ def _extract_embedded_tool_call(self, response: str) -> Optional[Dict[str, Any]]
         def _inside_code_fence(pos: int) -> bool:
             return any(start <= pos < end for start, end in _code_ranges)
 
-        # Walk through looking for { that starts a JSON-like block with "tool"
+        def _parse_candidate(raw: str) -> Optional[Dict[str, Any]]:
+            """Return raw parsed as a dict with a 'tool' key, else None."""
+            # Verbatim first: the comma repair also rewrites ",}" inside strings.
+            repaired = re.sub(r",\s*]", "]", re.sub(r",\s*}", "}", raw))
+            for text in (raw, repaired):
+                try:
+                    parsed = json.loads(text)
+                except json.JSONDecodeError:
+                    continue
+                if isinstance(parsed, dict) and "tool" in parsed:
+                    parsed.setdefault("tool_args", {})  # normalise: always present
+                    return parsed
+            return None
+
+        # Collect all tool-call candidates, tagged by whether they are inside a fence
+        unfenced: list[Dict[str, Any]] = []
+        fenced: list[Dict[str, Any]] = []
+
         idx = 0
         while idx < len(response):
             brace_pos = response.find("{", idx)
             if brace_pos == -1:
                 break
 
-            # Skip JSON inside markdown code fences (example/documentation)
-            if _inside_code_fence(brace_pos):
-                idx = brace_pos + 1
-                continue
+            is_fenced = _inside_code_fence(brace_pos)
 
             # Look ahead for "tool" near this brace (within 200 chars)
             look_ahead = response[brace_pos : brace_pos + 200]
@@ -766,31 +786,43 @@ def _inside_code_fence(pos: int) -> bool:
                             break
 
             if depth != 0:
-                # Unclosed braces — skip
                 idx = brace_pos + 1
                 continue
 
-            candidate = response[brace_pos : end_pos + 1]
-            try:
-                # Fix common trailing comma issues
-                fixed = re.sub(r",\s*}", "}", candidate)
-                fixed = re.sub(r",\s*]", "]", fixed)
-                parsed = json.loads(fixed)
-
-                # Only accept if it has a "tool" key (it's a tool call)
-                if isinstance(parsed, dict) and "tool" in parsed:
-                    if "tool_args" not in parsed:
-                        parsed["tool_args"] = {}
-                    logger.debug(
-                        f"[PARSE] Extracted embedded tool call: "
-                        f"{parsed.get('tool')}"
-                    )
-                    return parsed
-            except json.JSONDecodeError:
-                pass
+            raw = response[brace_pos : end_pos + 1]
+            parsed = _parse_candidate(raw)
+            if parsed is not None:
+                if is_fenced:
+                    fenced.append(parsed)
+                else:
+                    unfenced.append(parsed)
 
             idx = brace_pos + 1
 
+        # Decision logic
+        if unfenced:
+            # Rule 1: prefer unfenced (unchanged behaviour — zero regression)
+            logger.debug(
+                "[PARSE] Extracted embedded tool call: %s", unfenced[0].get("tool")
+            )
+            return unfenced[0]
+
+        if len(fenced) == 1:
+            # Rule 2: exactly one fenced call — trust it (fix for #1428)
+            logger.debug(
+                "[PARSE] Extracted fenced tool call: %s", fenced[0].get("tool")
+            )
+            return fenced[0]
+
+        if len(fenced) > 1:
+            # Rule 3: multiple fenced calls — ambiguous, likely documentation examples
+            logger.warning(
+                "[PARSE] ambiguous: %d fenced tool-call candidates found and no "
+                "unfenced call; cannot determine which is real — returning None",
+                len(fenced),
+            )
+            return None
+
         return None
 
     def _extract_json_from_response(self, response: str) -> Optional[Dict[str, Any]]:
 
@@ -177,8 +177,11 @@ def create_agent(
                     Valid options: "rag" (document Q&A), "file_search" (fuzzy file
                     search), "file_io" (read/write/edit files), "shell" (sandboxed
                     shell), "screenshot" (screen capture), "sd" (image generation),
-                    "vlm" (vision LLM). Combine freely; they are added to the
-                    class's base list alongside Agent.
+                    "vlm" (vision LLM), "code_index" (semantic code search),
+                    "filesystem" (file system navigation), "scratchpad" (SQL scratch
+                    tables for data analysis), "browser" (web search and page fetch).
+                    Combine freely; they are added to the class's base list alongside
+                    Agent.
 
             Returns:
                 Confirmation message with the path to the created agent.py.
@@ -230,6 +233,7 @@ def _process_query_impl(  # type: ignore[override]
         self.console.print_processing_start(user_input, steps_limit, self.model_id)
 
         final_answer: Optional[str] = None
+        created_ok = False
         steps_taken = 0
 
         while steps_taken < steps_limit and final_answer is None:
@@ -287,6 +291,18 @@ def _process_query_impl(  # type: ignore[override]
                     if isinstance(tool_result, dict)
                     else str(tool_result)
                 )
+                # Fail loudly: if create_agent returned an error, don't let the
+                # LLM fabricate a success message — end immediately with an honest answer.
+                if tool_name == "create_agent" and str(tool_result).startswith(
+                    "Error:"
+                ):
+                    final_answer = (
+                        f"I was unable to create the agent: {tool_result}\n\n"
+                        "Please check the name is valid and try again."
+                    )
+                    break
+                if tool_name == "create_agent":
+                    created_ok = True
                 messages.append(
                     {
                         "role": "user",
@@ -295,16 +311,45 @@ def _process_query_impl(  # type: ignore[override]
                 )
                 # Continue loop so the LLM can summarize the result
             else:
-                final_answer = (
-                    parsed.get("answer")
-                    or response.strip()
-                    or ("I wasn't able to generate a response. Please try again.")
-                )
+                # No tool call was extracted. If creation already succeeded this
+                # is a post-success summary — return it without triggering the
+                # fabrication check (which misfires on ✅ in a real summary).
+                if created_ok:
+                    candidate = parsed.get("answer") or response.strip()
+                    final_answer = candidate or "Agent created successfully."
+                # Otherwise check whether the model hallucinated success markers.
+                elif any(
+                    m in (parsed.get("answer") or response.strip())
+                    for m in ("Agent Created", "✅", "File location")
+                ):
+                    # The model wrote a fake success — push a corrective turn and
+                    # loop again (steps_limit guards infinite recursion).
+                    logger.warning(
+                        "BuilderAgent: fabricated success detected without tool call; "
+                        "injecting corrective user turn"
+                    )
+                    messages.append(
+                        {
+                            "role": "user",
+                            "content": (
+                                "You did not actually call create_agent. "
+                                "Output ONLY the bare JSON tool call, no prose, "
+                                "no code fences."
+                            ),
+                        }
+                    )
+                    # Do not set final_answer — the loop will continue
+                else:
+                    final_answer = (
+                        parsed.get("answer")
+                        or response.strip()
+                        or "I wasn't able to generate a response. Please try again."
+                    )
 
         if final_answer is None:
             final_answer = (
-                "I've used the maximum number of steps. "
-                "Check ~/.gaia/agents/ for any agents that were created."
+                "I was unable to create the agent after several attempts. "
+                "Please try again with a clear agent name."
             )
 
         self.console.print_final_answer(final_answer, streaming=self.streaming)
 
@@ -24,6 +24,10 @@
    - "Take screenshots" → tools=["screenshot"]
    - "Generate images (Stable Diffusion)" → tools=["sd"]
    - "Vision / image understanding" → tools=["vlm"]
+   - "Semantic code search" → tools=["code_index"]
+   - "File system navigation" → tools=["filesystem"]
+   - "Data analysis with SQL scratch tables" → tools=["scratchpad"]
+   - "Web search and page fetch" → tools=["browser"]
    You can combine them, e.g. tools=["rag", "file_search"] for a research assistant.
    If the user wants none of these, skip the tools argument.
 5. Ask if they would like MCP server support. Explain briefly: \
@@ -39,6 +43,9 @@
 ## Rules
 - ALWAYS call the `create_agent` tool once you have a name and have asked about \
   capabilities + MCP. Do not just describe what you would do — actually call the tool.
+- When calling a tool, output ONLY the bare JSON object — no prose before or after, \
+  no ``` code fences, and never write your own success message. The system writes the \
+  confirmation after the tool actually runs.
 - If the user provides a name in their very first message, skip the greeting \
   pleasantries but still ask about capabilities and MCP before calling the tool.
 - Keep responses concise and friendly.
 
@@ -427,6 +427,33 @@ def __init__(self):
         self._lemonade_models: Optional[List[str]] = None  # cache
         self._lemonade_models_last_fail: Optional[float] = None  # monotonic timestamp
         self._lock = threading.Lock()
+        # Records agent IDs whose load failed during discover() / register_from_dir().
+        # Populated by _record_load_error(); read by get_load_error() and Stage D.
+        self._load_errors: Dict[str, str] = {}
+
+    # ------------------------------------------------------------------
+    # Load-error tracking
+    # ------------------------------------------------------------------
+
+    def _record_load_error(self, agent_id: str, reason: str) -> None:
+        """Store *reason* as the load-failure message for *agent_id*.
+
+        The entry goes into ``_load_errors`` under ``_lock`` and replaces any
+        reason stored earlier for the same id, so Stage D (chat helpers) can
+        surface it later.  The discovery try/except itself is unchanged.
+        """
+        with self._lock:
+            self._load_errors.update({agent_id: reason})
+
+    def get_load_error(self, agent_id: str) -> Optional[str]:
+        """Look up the load-failure reason stored for *agent_id*.
+
+        Reasons are keyed by the agent directory name (``agent_dir.name``),
+        so an id normalised differently simply misses.  ``None`` means no
+        failure is on record for that key.
+        """
+        reason: Optional[str] = self._load_errors.get(agent_id)
+        return reason
 
     # ------------------------------------------------------------------
     # Legacy ID resolution
@@ -471,6 +498,7 @@ def discover(self) -> None:
                     logger.warning(
                         "registry: Failed to load agent from %s: %s", agent_dir, e
                     )
+                    self._record_load_error(agent_dir.name, f"{type(e).__name__}: {e}")
         else:
             logger.info("registry: No custom agent directory found at %s", agents_dir)
 
@@ -1306,6 +1334,7 @@ def register_from_dir(self, agent_dir: Path) -> None:
             logger.warning(
                 "registry: Failed to hot-load agent from %s: %s", agent_dir, exc
             )
+            self._record_load_error(agent_dir.name, f"{type(exc).__name__}: {exc}")
             raise
 
     # ------------------------------------------------------------------