fix(agents): support parallel tool calls and improve recovery prompt

theonlychant · theonlychant · commit 57fbc6b1f49d · 2026-05-02T21:11:56.000-05:00
diff --git a/src/gaia/agents/base/agent.py b/src/gaia/agents/base/agent.py
@@ -976,10 +976,45 @@ def _parse_llm_response(self, response: str) -> Dict[str, Any]:
                     f"finishing the call — increase AgentConfig.max_tokens."
                 )
             if len(tool_calls) > 1:
-                raise NotImplementedError(
-                    "Parallel tool calls (multiple tool_calls in one response) are not yet supported. "
-                    f"Received {len(tool_calls)} tool calls."
+                # Support multiple native tool_calls by returning a list of
+                # parsed tool call dicts. Each element has the same shape
+                # as the single-call return value so callers can either
+                # handle a list (preferred for tool-calling models) or
+                # fall back to the old single-dict behaviour.
+                parsed_calls = []
+                for tc in tool_calls:
+                    name = tc["function"]["name"]
+                    arguments_raw = tc["function"].get("arguments")
+
+                    if arguments_raw is None or arguments_raw == "":
+                        tool_args = {}
+                    elif isinstance(arguments_raw, dict):
+                        tool_args = arguments_raw
+                    elif isinstance(arguments_raw, (str, bytes, bytearray)):
+                        try:
+                            tool_args = json.loads(arguments_raw)
+                        except json.JSONDecodeError as exc:
+                            raise ValueError(
+                                f"Malformed tool_call arguments for '{name}': {exc}. "
+                                f"Raw arguments: {str(arguments_raw)[:200]}"
+                            ) from exc
+                    else:
+                        raise ValueError(
+                            f"Malformed tool_call arguments for '{name}': expected "
+                            f"str or dict, got {type(arguments_raw).__name__}"
+                        )
+
+                    parsed_calls.append({
+                        "thought": "",
+                        "goal": "",
+                        "tool": name,
+                        "tool_args": tool_args,
+                    })
+                logger.debug(
+                    "[PARSE] Native tool_calls: returning %d parsed calls",
+                    len(parsed_calls),
                 )
+                return parsed_calls
             tc = tool_calls[0]
             name = tc["function"]["name"]
             arguments_raw = tc["function"].get("arguments")
@@ -2514,33 +2549,149 @@ def process_query(
                         "rephrase or break the request into smaller pieces?"
                     )
                     break
+
                 # Push a synthetic assistant turn + recovery user message so the
                 # next LLM call has context. Don't include the raw envelope to
                 # keep noise out of the conversation history.
-                messages.append(
-                    {
-                        "role": "assistant",
+                recovery_assistant = {
+                    "role": "assistant",
+                    "content": "[I tried to call a tool but my arguments were malformed.]",
+                }
+                messages.append(recovery_assistant)
+                conversation.append(recovery_assistant)
+
+                # Provide different guidance depending on the parse failure type.
+                if isinstance(parse_exc, NotImplementedError):
+                    # NotImplementedError historically meant "multiple tool_calls"
+                    # when native tool-calling models returned parallel calls.
+                    # Give the model a clear instruction to either emit a single
+                    # tool call or a JSON `plan` describing multiple steps.
+                    recovery_user = {
+                        "role": "user",
                         "content": (
-                            "[I tried to call a tool but my arguments were "
-                            "malformed.]"
+                            "Your last response contained MULTIPLE tool calls in a single reply. "
+                            "This agent prefers either a single tool call per response, "
+                            "or a structured JSON 'plan' containing an ordered array of steps. "
+                            "Please either: (A) output a single tool call JSON object, "
+                            "or (B) output a JSON plan in the format: {\"plan\": [{\"tool\": \"name\", \"tool_args\": {...}}]}. "
+                            "If you don't need to call a tool, answer in plain text."
                         ),
                     }
-                )
-                messages.append(
-                    {
+                    messages.append(recovery_user)
+                    conversation.append(recovery_user)
+                else:
+                    # ValueError or other parse errors usually mean malformed args.
+                    recovery_user = {
                         "role": "user",
                         "content": (
                             "Your last tool call had malformed arguments. "
                             "Please try again. Use ONLY the documented enum "
-                            "values for each argument (e.g. 'brief', "
-                            "'detailed', 'bullets' — never a long sentence). "
+                            "values for each argument (e.g. 'brief', 'detailed', 'bullets'). "
                             "If you don't need a tool, answer in plain text."
                         ),
                     }
-                )
+                    messages.append(recovery_user)
+                    conversation.append(recovery_user)
+
                 steps_taken += 1
                 continue
             logger.debug(f"Parsed response: {parsed}")
+
+            # If the parser returned multiple native tool calls, execute them
+            # sequentially in this same LLM turn (one LLM turn -> N tool turns).
+            if isinstance(parsed, list):
+                # Record assistant turn containing multiple tool_calls
+                conversation.append({"role": "assistant", "content": {"tool_calls": parsed}})
+                # Preserve raw assistant response for history
+                messages.append({"role": "assistant", "content": response})
+
+                for call in parsed:
+                    if not call.get("tool") or "tool_args" not in call:
+                        continue
+
+                    tool_name = call["tool"]
+                    tool_args = call["tool_args"]
+                    logger.debug(f"Sequential native tool call: {tool_name} {tool_args}")
+
+                    # Display the tool call in real-time
+                    self.console.print_tool_usage(tool_name)
+                    if tool_args:
+                        self.console.pretty_print_json(tool_args, "Arguments")
+
+                    # Start progress indicator for tool execution
+                    self.console.start_progress(f"Executing {tool_name}")
+
+                    # Track call history and detect repeats
+                    current_call = (tool_name, str(tool_args))
+                    tool_call_history.append(current_call)
+                    tool_call_log.append(current_call)
+                    if len(tool_call_history) > 5:
+                        tool_call_history.pop(0)
+
+                    consecutive_count = 0
+                    for c in reversed(tool_call_history):
+                        if c == current_call:
+                            consecutive_count += 1
+                        else:
+                            break
+                    if consecutive_count >= self.max_consecutive_repeats:
+                        self.console.stop_progress()
+                        final_answer = f"Task completed with {tool_name}. No further action needed."
+                        self.console.print_repeated_tool_warning()
+                        break
+
+                    # Execute the tool
+                    tool_result = self._execute_tool(tool_name, tool_args)
+
+                    # Stop progress indicator
+                    self.console.stop_progress()
+
+                    # Domain-specific post-processing
+                    self._post_process_tool_result(tool_name, tool_args, tool_result)
+
+                    # Handle and append large tool results
+                    truncated_result = self._handle_large_tool_result(
+                        tool_name, tool_result, conversation, tool_args
+                    )
+
+                    # Display the tool result
+                    self.console.pretty_print_json(tool_result, "Result")
+                    self.console.print_tool_complete()
+
+                    previous_outputs.append({"tool": tool_name, "args": tool_args, "result": truncated_result})
+                    step_results.append(tool_result)
+
+                    # Share tool output with subsequent LLM calls
+                    messages.append(self._create_tool_message(tool_name, truncated_result))
+
+                    # Error handling
+                    is_error = isinstance(tool_result, dict) and (
+                        tool_result.get("status") == "error"
+                        or tool_result.get("success") is False
+                        or tool_result.get("has_errors") is True
+                        or tool_result.get("return_code", 0) != 0
+                    )
+                    if is_error:
+                        error_count += 1
+                        last_error = (
+                            tool_result.get("error_brief")
+                            or tool_result.get("error")
+                            or tool_result.get("stderr")
+                            or tool_result.get("hint")
+                            or tool_result.get("suggested_fix")
+                            or f"Command failed with return code {tool_result.get('return_code')}"
+                        )
+                        logger.warning(f"Tool execution error in sequential calls (count: {error_count}): {last_error}")
+                        if not tool_result.get("error_displayed"):
+                            self.console.print_error(last_error)
+                        self.execution_state = self.STATE_ERROR_RECOVERY
+                        # Keep going: the remaining calls in this batch still run after an error.
+
+                # After executing all sequential native calls, continue the main loop
+                # so the LLM can process the combined tool results.
+                continue
+
+            # Single parsed response — append as before
             conversation.append({"role": "assistant", "content": parsed})
 
             # Add assistant response to messages for chat history
diff --git a/tests/unit/test_agent_parallel_tool_calls.py b/tests/unit/test_agent_parallel_tool_calls.py
@@ -0,0 +1,58 @@
+import json
+import pytest
+
+from gaia.agents.base.agent import Agent
+from gaia.agents.base.tools import _TOOL_REGISTRY
+
+
+def test_process_query_executes_multiple_native_tool_calls(monkeypatch):
+    """Two native tool_calls in one LLM reply are both executed; registry edits are undone on teardown."""
+    def tool_one(a=""):
+        return {"status": "success", "value": f"one:{a}"}
+
+    def tool_two(b=""):
+        return {"status": "success", "value": f"two:{b}"}
+
+    monkeypatch.setitem(_TOOL_REGISTRY, "tool_one", {
+        "function": tool_one,
+        "parameters": {"a": {"type": "str", "required": False}},
+        "description": "Test tool one",
+    })
+    monkeypatch.setitem(_TOOL_REGISTRY, "tool_two", {
+        "function": tool_two,
+        "parameters": {"b": {"type": "str", "required": False}},
+        "description": "Test tool two",
+    })
+
+    class DummyAgent(Agent):
+        def _register_tools(self):
+            # No-op; this test injects its tools directly into the registry
+            return None
+
+    agent = DummyAgent(skip_lemonade=True, silent_mode=True)
+
+    # Prepare a native envelope with two tool_calls (as Lemonade encodes them)
+    envelope = {
+        "__tool_calls__": [
+            {"function": {"name": "tool_one", "arguments": json.dumps({"a": "X"})}},
+            {"function": {"name": "tool_two", "arguments": json.dumps({"b": "Y"})}},
+        ],
+        "finish_reason": "",
+    }
+
+    # Stub send_messages so every LLM call returns the same two-call envelope.
+    # The reply object only needs the .text and .stats attributes used here.
+    monkeypatch.setattr(
+        agent.chat,
+        "send_messages",
+        lambda messages, system_prompt, tools: type(
+            "R", (), {"text": json.dumps(envelope), "stats": {}}
+        )(),
+    )
+
+    result = agent.process_query("execute both tools", max_steps=6)
+
+    # Both tools must show up as role="tool" entries in the returned conversation
+    tool_names = [m.get("name") for m in result["conversation"] if m.get("role") == "tool"]
+    assert "tool_one" in tool_names
+    assert "tool_two" in tool_names
diff --git a/tests/unit/test_agent_parallel_tool_calls_extra.py b/tests/unit/test_agent_parallel_tool_calls_extra.py
@@ -0,0 +1,112 @@
+import json
+import pytest
+
+from gaia.agents.base.agent import Agent
+from gaia.agents.base.tools import _TOOL_REGISTRY
+
+
+def _make_agent(monkeypatch):
+    """Return an Agent subclass instance built with skip_lemonade=True and silent_mode=True."""
+    class DummyAgent(Agent):
+        def _register_tools(self):
+            # Nothing to register: each test puts its own tools into the registry.
+            return None
+    return DummyAgent(skip_lemonade=True, silent_mode=True)
+
+
+def test_parallel_calls_with_error(monkeypatch):
+    """A failing call in the middle of a batch must not prevent the later calls from running."""
+    def t_ok1(x=""):
+        return {"status": "success", "value": f"ok1:{x}"}
+
+    def t_err(y=""):
+        return {"status": "error", "error": "boom"}
+
+    def t_ok2(z=""):
+        return {"status": "success", "value": f"ok2:{z}"}
+
+    monkeypatch.setitem(_TOOL_REGISTRY, "ok1", {"function": t_ok1, "parameters": {}, "description": ""})
+    monkeypatch.setitem(_TOOL_REGISTRY, "errtool", {"function": t_err, "parameters": {}, "description": ""})
+    monkeypatch.setitem(_TOOL_REGISTRY, "ok2", {"function": t_ok2, "parameters": {}, "description": ""})
+
+    agent = _make_agent(monkeypatch)
+
+    envelope = {
+        "__tool_calls__": [
+            {"function": {"name": "ok1", "arguments": json.dumps({"x": "A"})}},
+            {"function": {"name": "errtool", "arguments": json.dumps({"y": "B"})}},
+            {"function": {"name": "ok2", "arguments": json.dumps({"z": "C"})}},
+        ],
+        "finish_reason": "",
+    }
+
+    # First reply is the batch; every later call gets the final answer, so the stub never runs dry.
+    responses = [type("R", (), {"text": json.dumps(body), "stats": {}})() for body in (envelope, {"answer": "All done"})]
+
+    monkeypatch.setattr(agent.chat, "send_messages", lambda messages, system_prompt, tools: responses.pop(0) if len(responses) > 1 else responses[0])
+
+    result = agent.process_query("run three tools", max_steps=10)
+
+    # Ensure we got three tool entries in conversation
+    tool_entries = [m for m in result["conversation"] if m.get("role") == "tool"]
+    names = [t.get("name") for t in tool_entries]
+    assert "ok1" in names and "errtool" in names and "ok2" in names
+
+    # Find the errtool result and ensure it's an error
+    err_entry = next((t for t in tool_entries if t.get("name") == "errtool"), None)
+    assert err_entry is not None
+    assert isinstance(err_entry.get("content"), dict) and err_entry["content"].get("status") == "error"
+
+
+def test_plan_then_native_tool_calls(monkeypatch):
+    """A batch of two native calls to the same tool runs twice, then the next LLM reply is the answer."""
+    def q(a=""):
+        return {"status": "success", "value": f"q:{a}"}
+
+    monkeypatch.setitem(_TOOL_REGISTRY, "q", {"function": q, "parameters": {}, "description": ""})
+
+    agent = _make_agent(monkeypatch)
+
+    envelope = {
+        "__tool_calls__": [
+            {"function": {"name": "q", "arguments": json.dumps({"a": "1"})}},
+            {"function": {"name": "q", "arguments": json.dumps({"a": "2"})}},
+        ],
+        "finish_reason": "",
+    }
+
+    # Second LLM response will be a final answer
+    final_answer = {"answer": "All done"}
+
+    responses = [
+        type("R", (), {"text": json.dumps(envelope), "stats": {}})(),
+        type("R", (), {"text": json.dumps(final_answer), "stats": {}})(),
+    ]
+
+    def fake_send(messages, system_prompt, tools):
+        return responses.pop(0)
+
+    monkeypatch.setattr(agent.chat, "send_messages", fake_send)
+
+    result = agent.process_query("do q twice and answer", max_steps=10)
+
+    # Should have run two q tool calls and then returned the final answer
+    tool_entries = [m for m in result["conversation"] if m.get("role") == "tool"]
+    assert len([t for t in tool_entries if t.get("name") == "q"]) == 2
+    assert result.get("result") and "All done" in result.get("result")
+
+
+def test_notimplementederror_recovery_message(monkeypatch):
+    """If parsing raises NotImplementedError, some user turn must carry the multiple-tool-call guidance."""
+    agent = _make_agent(monkeypatch)
+
+    def raising_parser(r):
+        raise NotImplementedError("multiple")
+
+    reply_cls = type("R", (), {"text": "{\"bad\":1}", "stats": {}})
+    monkeypatch.setattr(agent, "_parse_llm_response", raising_parser)
+    monkeypatch.setattr(agent.chat, "send_messages", lambda messages, system_prompt, tools: reply_cls())
+
+    result = agent.process_query("trigger parse error", max_steps=3)
+    user_texts = [str(m.get("content")) for m in result["conversation"] if m.get("role") == "user"]
+    assert any("MULTIPLE tool calls" in text or "single tool call" in text for text in user_texts)