Skip to content

Commit e3a67dd

Browse files
authored
fix(agents): resilient fenced tool-call extraction + builder fail-loudly (#1430)
The Agent Builder told users "✅ Agent Created!" while writing no agent to disk, ~40% of the time. The model wraps its real \`create_agent\` call in a \` \`\`\`json \` fence; the base parser skipped all fenced JSON to avoid grabbing documentation examples — so the tool never ran and the model's hallucinated success text was returned verbatim. This also exposed four sites in the UI that silently swap any unresolvable agent for ChatAgent, and swallowed per-agent load errors in the registry. After this fix: fenced tool calls are extracted when they are the only candidate (unfenced wins when both exist; multiple fenced with no unfenced is treated as ambiguous docs → None). The builder only surfaces success after \`create_agent\` actually runs and the file exists. Unresolvable agents surface a user-friendly error. Registry load failures are recorded and exposed. Cross-reference: #1394 (separate bug — cold-start pre-flight timeout for large models, not fixed here). ## Test plan - [x] `pytest tests/unit/agents/test_tool_call_extraction.py` — 20 tests including verbatim Zephyr regression fixture — **PASS** - [x] `pytest tests/unit/agents/test_builder_fail_loudly.py` — error result and fabricated-success paths — **PASS** - [x] `pytest tests/unit/agents/test_builder_fenced_integration.py` — fenced call fires create_agent end-to-end (mocked LLM) — **PASS** - [x] `pytest tests/unit/agents/test_registry_load_errors.py` — broken agent dir records error, discovery continues — **PASS** - [x] `pytest tests/unit/chat/ui/test_agent_unavailable.py` — all 4 fallback sites return user-friendly error, not ChatAgent — **PASS** - [x] `gaia eval agent --category tool_selection` vs `qwen-3.5-35b-3b51ca92` baseline — **no regression** (Strix Halo class hardware; `multi_step_plan` improved FAIL→PASS; `smart_discovery` timing outlier unrelated to parser change) - [x] Real-world builder loop ×5 with distinct names against `Qwen3.5-35B-A3B-GGUF` — **all 5 agents created**, `🔧 Executing operation / Tool: create_agent` confirmed in server logs for every run, all 5 appear in `GET /api/agents` via hot-reload (`orion`, `nebula`, `pulsar`, `quasar`, `vortex`) Closes #1428
1 parent 30d5654 commit e3a67dd

10 files changed

Lines changed: 857 additions & 93 deletions

src/gaia/agents/base/agent.py

Lines changed: 57 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -695,6 +695,13 @@ def _extract_embedded_tool_call(self, response: str) -> Optional[Dict[str, Any]]
695695
"Let me search for that.\n{"thought": "...", "tool": "query_documents",
696696
"tool_args": {"query": "..."}}"
697697
698+
Decision logic over all {…} candidates that contain a "tool" key,
699+
each tagged fenced/unfenced:
700+
1. ≥1 unfenced candidate → return the first (unchanged — zero regression).
701+
2. else exactly one fenced candidate → return it (the fix for #1428).
702+
3. else >1 fenced, 0 unfenced → ambiguous (looks like docs) → None + warning.
703+
4. else → None.
704+
698705
This method finds the JSON block using brace-depth matching and returns
699706
the parsed tool call if it contains a "tool" key. Returns None if no
700707
embedded tool call is found, allowing the caller to treat the response
@@ -705,7 +712,6 @@ def _extract_embedded_tool_call(self, response: str) -> Optional[Dict[str, Any]]
705712
return None
706713

707714
# Build a set of character ranges inside code fences (```...```)
708-
# so we don't accidentally extract example JSON from markdown.
709715
_code_ranges: list[tuple[int, int]] = []
710716
_search_from = 0
711717
while True:
@@ -723,17 +729,31 @@ def _extract_embedded_tool_call(self, response: str) -> Optional[Dict[str, Any]]
723729
def _inside_code_fence(pos: int) -> bool:
724730
return any(start <= pos < end for start, end in _code_ranges)
725731

726-
# Walk through looking for { that starts a JSON-like block with "tool"
732+
def _parse_candidate(raw: str) -> Optional[Dict[str, Any]]:
733+
"""Return the parsed dict if raw is valid JSON with a 'tool' key, else None."""
734+
try:
735+
fixed = re.sub(r",\s*}", "}", raw)
736+
fixed = re.sub(r",\s*]", "]", fixed)
737+
parsed = json.loads(fixed)
738+
if isinstance(parsed, dict) and "tool" in parsed:
739+
if "tool_args" not in parsed:
740+
parsed["tool_args"] = {}
741+
return parsed
742+
except json.JSONDecodeError:
743+
pass
744+
return None
745+
746+
# Collect all tool-call candidates, tagged by whether they are inside a fence
747+
unfenced: list[Dict[str, Any]] = []
748+
fenced: list[Dict[str, Any]] = []
749+
727750
idx = 0
728751
while idx < len(response):
729752
brace_pos = response.find("{", idx)
730753
if brace_pos == -1:
731754
break
732755

733-
# Skip JSON inside markdown code fences (example/documentation)
734-
if _inside_code_fence(brace_pos):
735-
idx = brace_pos + 1
736-
continue
756+
is_fenced = _inside_code_fence(brace_pos)
737757

738758
# Look ahead for "tool" near this brace (within 200 chars)
739759
look_ahead = response[brace_pos : brace_pos + 200]
@@ -766,31 +786,43 @@ def _inside_code_fence(pos: int) -> bool:
766786
break
767787

768788
if depth != 0:
769-
# Unclosed braces — skip
770789
idx = brace_pos + 1
771790
continue
772791

773-
candidate = response[brace_pos : end_pos + 1]
774-
try:
775-
# Fix common trailing comma issues
776-
fixed = re.sub(r",\s*}", "}", candidate)
777-
fixed = re.sub(r",\s*]", "]", fixed)
778-
parsed = json.loads(fixed)
779-
780-
# Only accept if it has a "tool" key (it's a tool call)
781-
if isinstance(parsed, dict) and "tool" in parsed:
782-
if "tool_args" not in parsed:
783-
parsed["tool_args"] = {}
784-
logger.debug(
785-
f"[PARSE] Extracted embedded tool call: "
786-
f"{parsed.get('tool')}"
787-
)
788-
return parsed
789-
except json.JSONDecodeError:
790-
pass
792+
raw = response[brace_pos : end_pos + 1]
793+
parsed = _parse_candidate(raw)
794+
if parsed is not None:
795+
if is_fenced:
796+
fenced.append(parsed)
797+
else:
798+
unfenced.append(parsed)
791799

792800
idx = brace_pos + 1
793801

802+
# Decision logic
803+
if unfenced:
804+
# Rule 1: prefer unfenced (unchanged behaviour — zero regression)
805+
logger.debug(
806+
"[PARSE] Extracted embedded tool call: %s", unfenced[0].get("tool")
807+
)
808+
return unfenced[0]
809+
810+
if len(fenced) == 1:
811+
# Rule 2: exactly one fenced call — trust it (fix for #1428)
812+
logger.debug(
813+
"[PARSE] Extracted fenced tool call: %s", fenced[0].get("tool")
814+
)
815+
return fenced[0]
816+
817+
if len(fenced) > 1:
818+
# Rule 3: multiple fenced calls — ambiguous, likely documentation examples
819+
logger.warning(
820+
"[PARSE] ambiguous: %d fenced tool-call candidates found and no "
821+
"unfenced call; cannot determine which is real — returning None",
822+
len(fenced),
823+
)
824+
return None
825+
794826
return None
795827

796828
def _extract_json_from_response(self, response: str) -> Optional[Dict[str, Any]]:

src/gaia/agents/builder/agent.py

Lines changed: 54 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -177,8 +177,11 @@ def create_agent(
177177
Valid options: "rag" (document Q&A), "file_search" (fuzzy file
178178
search), "file_io" (read/write/edit files), "shell" (sandboxed
179179
shell), "screenshot" (screen capture), "sd" (image generation),
180-
"vlm" (vision LLM). Combine freely; they are added to the
181-
class's base list alongside Agent.
180+
"vlm" (vision LLM), "code_index" (semantic code search),
181+
"filesystem" (file system navigation), "scratchpad" (SQL scratch
182+
tables for data analysis), "browser" (web search and page fetch).
183+
Combine freely; they are added to the class's base list alongside
184+
Agent.
182185
183186
Returns:
184187
Confirmation message with the path to the created agent.py.
@@ -230,6 +233,7 @@ def _process_query_impl( # type: ignore[override]
230233
self.console.print_processing_start(user_input, steps_limit, self.model_id)
231234

232235
final_answer: Optional[str] = None
236+
created_ok = False
233237
steps_taken = 0
234238

235239
while steps_taken < steps_limit and final_answer is None:
@@ -287,6 +291,18 @@ def _process_query_impl( # type: ignore[override]
287291
if isinstance(tool_result, dict)
288292
else str(tool_result)
289293
)
294+
# Fail loudly: if create_agent returned an error, don't let the
295+
# LLM fabricate a success message — end immediately with an honest answer.
296+
if tool_name == "create_agent" and str(tool_result).startswith(
297+
"Error:"
298+
):
299+
final_answer = (
300+
f"I was unable to create the agent: {tool_result}\n\n"
301+
"Please check the name is valid and try again."
302+
)
303+
break
304+
if tool_name == "create_agent":
305+
created_ok = True
290306
messages.append(
291307
{
292308
"role": "user",
@@ -295,16 +311,45 @@ def _process_query_impl( # type: ignore[override]
295311
)
296312
# Continue loop so the LLM can summarize the result
297313
else:
298-
final_answer = (
299-
parsed.get("answer")
300-
or response.strip()
301-
or ("I wasn't able to generate a response. Please try again.")
302-
)
314+
# No tool call was extracted. If creation already succeeded this
315+
# is a post-success summary — return it without triggering the
316+
# fabrication check (which misfires on ✅ in a real summary).
317+
if created_ok:
318+
candidate = parsed.get("answer") or response.strip()
319+
final_answer = candidate or "Agent created successfully."
320+
# Otherwise check whether the model hallucinated success markers.
321+
elif any(
322+
m in (parsed.get("answer") or response.strip())
323+
for m in ("Agent Created", "✅", "File location")
324+
):
325+
# The model wrote a fake success — push a corrective turn and
326+
# loop again (steps_limit guards infinite recursion).
327+
logger.warning(
328+
"BuilderAgent: fabricated success detected without tool call; "
329+
"injecting corrective user turn"
330+
)
331+
messages.append(
332+
{
333+
"role": "user",
334+
"content": (
335+
"You did not actually call create_agent. "
336+
"Output ONLY the bare JSON tool call, no prose, "
337+
"no code fences."
338+
),
339+
}
340+
)
341+
# Do not set final_answer — the loop will continue
342+
else:
343+
final_answer = (
344+
parsed.get("answer")
345+
or response.strip()
346+
or "I wasn't able to generate a response. Please try again."
347+
)
303348

304349
if final_answer is None:
305350
final_answer = (
306-
"I've used the maximum number of steps. "
307-
"Check ~/.gaia/agents/ for any agents that were created."
351+
"I was unable to create the agent after several attempts. "
352+
"Please try again with a clear agent name."
308353
)
309354

310355
self.console.print_final_answer(final_answer, streaming=self.streaming)

src/gaia/agents/builder/system_prompt.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,10 @@
2424
- "Take screenshots" → tools=["screenshot"]
2525
- "Generate images (Stable Diffusion)" → tools=["sd"]
2626
- "Vision / image understanding" → tools=["vlm"]
27+
- "Semantic code search" → tools=["code_index"]
28+
- "File system navigation" → tools=["filesystem"]
29+
- "Data analysis with SQL scratch tables" → tools=["scratchpad"]
30+
- "Web search and page fetch" → tools=["browser"]
2731
You can combine them, e.g. tools=["rag", "file_search"] for a research assistant.
2832
If the user wants none of these, skip the tools argument.
2933
5. Ask if they would like MCP server support. Explain briefly: \
@@ -39,6 +43,9 @@
3943
## Rules
4044
- ALWAYS call the `create_agent` tool once you have a name and have asked about \
4145
capabilities + MCP. Do not just describe what you would do — actually call the tool.
46+
- When calling a tool, output ONLY the bare JSON object — no prose before or after, \
47+
no ``` code fences, and never write your own success message. The system writes the \
48+
confirmation after the tool actually runs.
4249
- If the user provides a name in their very first message, skip the greeting \
4350
pleasantries but still ask about capabilities and MCP before calling the tool.
4451
- Keep responses concise and friendly.

src/gaia/agents/registry.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -427,6 +427,33 @@ def __init__(self):
427427
self._lemonade_models: Optional[List[str]] = None # cache
428428
self._lemonade_models_last_fail: Optional[float] = None # monotonic timestamp
429429
self._lock = threading.Lock()
430+
# Records agent IDs whose load failed during discover() / register_from_dir().
431+
# Populated by _record_load_error(); read by get_load_error() and Stage D.
432+
self._load_errors: Dict[str, str] = {}
433+
434+
# ------------------------------------------------------------------
435+
# Load-error tracking
436+
# ------------------------------------------------------------------
437+
438+
def _record_load_error(self, agent_id: str, reason: str) -> None:
439+
"""Record a concise load-failure reason for *agent_id*.
440+
441+
Kept in ``_load_errors`` so Stage D (chat helpers) can surface a
442+
helpful message when the user requests a broken agent. The existing
443+
discovery try/except is unchanged — this is additive only.
444+
"""
445+
with self._lock:
446+
self._load_errors[agent_id] = reason
447+
448+
def get_load_error(self, agent_id: str) -> Optional[str]:
449+
"""Return the recorded load-error reason for *agent_id*, or None.
450+
451+
Errors are keyed by the agent's directory name (e.g. 'my-bot'),
452+
which matches the resolved agent id. A caller that passes a type
453+
string that was normalised differently will get None gracefully.
454+
None also means the agent loaded fine or was never attempted.
455+
"""
456+
return self._load_errors.get(agent_id)
430457

431458
# ------------------------------------------------------------------
432459
# Legacy ID resolution
@@ -471,6 +498,7 @@ def discover(self) -> None:
471498
logger.warning(
472499
"registry: Failed to load agent from %s: %s", agent_dir, e
473500
)
501+
self._record_load_error(agent_dir.name, f"{type(e).__name__}: {e}")
474502
else:
475503
logger.info("registry: No custom agent directory found at %s", agents_dir)
476504

@@ -1306,6 +1334,7 @@ def register_from_dir(self, agent_dir: Path) -> None:
13061334
logger.warning(
13071335
"registry: Failed to hot-load agent from %s: %s", agent_dir, exc
13081336
)
1337+
self._record_load_error(agent_dir.name, f"{type(exc).__name__}: {exc}")
13091338
raise
13101339

13111340
# ------------------------------------------------------------------

0 commit comments

Comments
 (0)