fix(agent): harden backend execution and Playwright MCP error handling

Jerry-Terrasse · Jerry-Terrasse · commit 10545acfbd47 · 2026-04-05T18:03:36.000+08:00
diff --git a/agent/run_agent.py b/agent/run_agent.py
@@ -182,9 +182,17 @@ def main() -> None:
         "You are testing a text-based adventure game. Focus on exploration, items, and puzzle logic.",
     )
     report = orchestrator.run(game_profile)
-    game_base_url = game_config.get("base_url") or (
-        f"http://localhost:{game_config['port']}/api/agent"
+    port = game_config.get("port")
+    if port is None:
+        raise ValueError(f"Game config for '{args.game}' is missing required 'port'")
+    game_base_url = str(game_config.get("base_url") or "").strip() or (
+        f"http://localhost:{port}/api/agent"
     )
+    frontend_url = str(
+        game_config.get("frontend_url")
+        or backend_spec.settings.get("frontend_url")
+        or f"http://localhost:{port}"
+    ).strip()
     report.metadata["llm"] = {
         "model": model,
         "platform": resolved_platform,
@@ -199,6 +207,7 @@ def main() -> None:
         "name": args.game,
         "port": game_config.get("port"),
         "base_url": game_base_url,
+        "frontend_url": frontend_url,
         "backend_type": backend_spec.backend_type,
         "have_ground_truth": bool(game_config.get("ground_truth", False)),
         "profile": game_profile,
diff --git a/agent/src/computeruse/mcp_client.py b/agent/src/computeruse/mcp_client.py
@@ -2,6 +2,7 @@
 
 from __future__ import annotations
 
+from collections import deque
 import json
 import subprocess
 import threading
@@ -32,8 +33,11 @@ def __init__(
         self._startup_timeout = startup_timeout
         self._process: subprocess.Popen[bytes] | None = None
         self._reader: threading.Thread | None = None
+        self._stderr_reader: threading.Thread | None = None
         self._messages: Queue[Dict[str, Any]] = Queue()
         self._next_id = 1
+        self._stderr_lines: deque[str] = deque(maxlen=200)
+        self._stderr_lock = threading.Lock()
 
     def start(self) -> None:
         if self._process is not None:
@@ -47,6 +51,11 @@ def start(self) -> None:
         )
         self._reader = threading.Thread(target=self._read_loop, daemon=True)
         self._reader.start()
+        self._stderr_reader = threading.Thread(
+            target=self._read_stderr_loop,
+            daemon=True,
+        )
+        self._stderr_reader.start()
         self.initialize()
         self.notify("notifications/initialized", {})
 
@@ -152,25 +161,31 @@ def _raise_if_process_exited(self, method: str) -> None:
         return_code = process.poll()
         if return_code is None:
             return
-        stderr_text = self._read_stderr(process)
+        stderr_text = self._recent_stderr()
         detail = (
             f"MCP process exited with code {return_code} while waiting for {method}."
         )
         if stderr_text:
             detail = f"{detail} stderr: {stderr_text}"
         raise McpProtocolError(detail)
 
-    @staticmethod
-    def _read_stderr(process: subprocess.Popen[bytes]) -> str:
-        if process.stderr is None:
-            return ""
-        try:
-            raw = process.stderr.read()
-        except Exception:  # noqa: BLE001
-            return ""
-        if not raw:
-            return ""
-        return raw.decode("utf-8", errors="replace").strip()
+    def _read_stderr_loop(self) -> None:
+        if not self._process or not self._process.stderr:
+            return
+        stderr = self._process.stderr
+        while True:
+            raw_line = stderr.readline()
+            if not raw_line:
+                return
+            text = raw_line.decode("utf-8", errors="replace").rstrip()
+            if not text:
+                continue
+            with self._stderr_lock:
+                self._stderr_lines.append(text)
+
+    def _recent_stderr(self) -> str:
+        with self._stderr_lock:
+            return "\n".join(self._stderr_lines).strip()
 
 
 def default_mcp_cwd(root_path: str) -> str:
diff --git a/agent/src/computeruse/playwright_backend.py b/agent/src/computeruse/playwright_backend.py
@@ -50,6 +50,8 @@ def __init__(
         *,
         client_factory: Optional[Callable[[], StdioMcpClient]] = None,
     ) -> None:
+        if not settings.command:
+            raise ValueError("Playwright MCP command must not be empty")
         self._settings = settings
         self._client_factory = client_factory or (
             lambda: StdioMcpClient(
@@ -95,21 +97,25 @@ def from_config(
     def start_session(self, run_context: Dict[str, Any]) -> SessionHandle:
         del run_context
         client = self._client_factory()
-        client.start()
-        tools = client.list_tools()
-        self._call_tool(
-            client,
-            self._settings.navigate_tool,
-            {"url": self._settings.frontend_url},
-        )
-        initial_observation = self._snapshot_observation(client)
-        return SessionHandle(
-            session_id=str(uuid4()),
-            backend_type=self.backend_type,
-            raw={"client": client, "tools": tools},
-            metadata={"frontend_url": self._settings.frontend_url},
-            initial_observation=initial_observation,
-        )
+        try:
+            client.start()
+            tools = client.list_tools()
+            self._call_tool(
+                client,
+                self._settings.navigate_tool,
+                {"url": self._settings.frontend_url},
+            )
+            initial_observation = self._snapshot_observation(client)
+            return SessionHandle(
+                session_id=str(uuid4()),
+                backend_type=self.backend_type,
+                raw={"client": client, "tools": tools},
+                metadata={"frontend_url": self._settings.frontend_url},
+                initial_observation=initial_observation,
+            )
+        except Exception:
+            client.close()
+            raise
 
     def describe_capabilities(
         self,
@@ -200,40 +206,33 @@ def execute(
             attempt.final_status = "completed"
             observation.execution = {
                 "attempts": [self._attempt_to_dict(attempt)],
-                "diagnostics": {"backend_type": self.backend_type},
+                "diagnostics": {
+                    "backend_type": self.backend_type,
+                    "per_call_results": per_call_results,
+                },
             }
             return BackendExecutionResult(
                 observation=observation,
                 attempts=[attempt],
-                diagnostics={"backend_type": self.backend_type},
+                diagnostics={
+                    "backend_type": self.backend_type,
+                    "per_call_results": per_call_results,
+                },
             )
         except McpProtocolError as exc:
-            error_text = str(exc)
-            error_kind = self._error_kind(error_text)
-            attempt.per_call_results = per_call_results
-            attempt.error = error_text
-            attempt.suspected_origin = "execution"
-            observation = Observation(
-                success=False,
-                message=error_text,
-                state={},
-                summary=f"Execution failure in Playwright MCP: {error_text}",
-                env_state={},
-                artifacts={},
-                execution={
-                    "attempts": [self._attempt_to_dict(attempt)],
-                    "diagnostics": {
-                        "backend_type": self.backend_type,
-                        "error": error_text,
-                        "error_kind": error_kind,
-                    },
-                    "suspected_origin": "execution",
-                },
+            return self._execution_failure_result(
+                attempt=attempt,
+                per_call_results=per_call_results,
+                error_text=str(exc),
+                error_kind=self._error_kind(str(exc)),
             )
-            return BackendExecutionResult(
-                observation=observation,
-                attempts=[attempt],
-                diagnostics={"backend_type": self.backend_type, "error": error_text},
+        except Exception as exc:  # noqa: BLE001
+            return self._execution_failure_result(
+                attempt=attempt,
+                per_call_results=per_call_results,
+                error_text=str(exc),
+                error_kind="backend_exception",
+                exception_type=type(exc).__name__,
             )
 
     def close_session(self, session: SessionHandle) -> None:
@@ -249,6 +248,9 @@ def _snapshot_observation(
         screenshots: Optional[List[Dict[str, Any]]] = None,
     ) -> Observation:
         snapshot = self._call_tool(client, self._settings.snapshot_tool, {})
+        snapshot_error = self._tool_result_error(snapshot)
+        if snapshot_error:
+            raise McpProtocolError(snapshot_error)
         snapshot_text = self._extract_text(snapshot)
         env_state = self._extract_env_state(snapshot_text)
         artifacts = {"snapshot": snapshot}
@@ -292,7 +294,9 @@ def _map_call(self, call: ExecutionCall) -> tuple[str, Dict[str, Any]]:
         if call.kind == "press":
             return self._settings.press_tool, {"key": call.text or call.target}
         if call.kind == "wait":
-            return self._settings.wait_tool, {"time": call.duration_ms or 1000}
+            return self._settings.wait_tool, {
+                "time": max(call.duration_ms or 1000, 0) / 1000.0
+            }
         if call.kind == "screenshot":
             filename = self._screenshot_filename(call)
             arguments: Dict[str, Any] = {"filename": filename}
@@ -501,6 +505,45 @@ def _result_excerpt(payload: Dict[str, Any]) -> str:
             return text[:500]
         return json.dumps(payload, ensure_ascii=False)[:500]
 
+    def _execution_failure_result(
+        self,
+        *,
+        attempt: ExecutionAttempt,
+        per_call_results: List[Dict[str, Any]],
+        error_text: str,
+        error_kind: str,
+        exception_type: str = "",
+    ) -> BackendExecutionResult:
+        attempt.per_call_results = per_call_results
+        attempt.error = error_text
+        attempt.suspected_origin = "execution"
+        diagnostics: Dict[str, Any] = {
+            "backend_type": self.backend_type,
+            "error": error_text,
+            "error_kind": error_kind,
+            "per_call_results": per_call_results,
+        }
+        if exception_type:
+            diagnostics["exception_type"] = exception_type
+        observation = Observation(
+            success=False,
+            message=error_text,
+            state={},
+            summary=f"Execution failure in Playwright MCP: {error_text}",
+            env_state={},
+            artifacts={},
+            execution={
+                "attempts": [self._attempt_to_dict(attempt)],
+                "diagnostics": diagnostics,
+                "suspected_origin": "execution",
+            },
+        )
+        return BackendExecutionResult(
+            observation=observation,
+            attempts=[attempt],
+            diagnostics=diagnostics,
+        )
+
     @staticmethod
     def _attempt_to_dict(attempt: ExecutionAttempt) -> Dict[str, Any]:
         payload = {
diff --git a/agent/src/execution_backends.py b/agent/src/execution_backends.py
@@ -118,7 +118,7 @@ def execute(
                 summary="No executable calls were produced for this step.",
                 env_state={},
                 execution={
-                    "attempts": [attempt.__dict__],
+                    "attempts": [self._attempt_to_dict(attempt)],
                     "diagnostics": {"error": "empty_execution_request"},
                     "suspected_origin": "execution",
                 },
@@ -144,7 +144,7 @@ def execute(
                 summary=f"Execution failure while sending command: {exc}",
                 env_state={},
                 execution={
-                    "attempts": [attempt.__dict__],
+                    "attempts": [self._attempt_to_dict(attempt)],
                     "diagnostics": {"error": str(exc), "backend_type": self.backend_type},
                     "suspected_origin": "execution",
                 },
diff --git a/agent/src/operator.py b/agent/src/operator.py
@@ -80,7 +80,24 @@ def execute(
                     self._translation_failure_result(str(exc)),
                     attempts,
                 )
-            result = backend.execute(session, request)
+            try:
+                result = backend.execute(session, request)
+            except Exception as exc:  # noqa: BLE001
+                attempts.append(
+                    ExecutionAttempt(
+                        attempt=operator_attempt,
+                        translated_calls=request.calls,
+                        retry_reason=retry_reason,
+                        success=False,
+                        final_status="backend_exception",
+                        suspected_origin="execution",
+                        error=str(exc),
+                    )
+                )
+                return self._merge_attempts(
+                    self._backend_exception_result(exc),
+                    attempts,
+                )
             attempt = self._coerce_attempt(
                 request=request,
                 result=result,
@@ -305,6 +322,36 @@ def _translation_failure_result(error_text: str) -> BackendExecutionResult:
             },
         )
 
+    @staticmethod
+    def _backend_exception_result(exc: Exception) -> BackendExecutionResult:
+        error_text = str(exc)
+        observation = Observation(
+            success=False,
+            message=error_text,
+            state={},
+            summary=f"Execution backend raised an unexpected exception: {error_text}",
+            env_state={},
+            artifacts={},
+            execution={
+                "attempts": [],
+                "diagnostics": {
+                    "error": error_text,
+                    "error_kind": "backend_exception",
+                    "exception_type": type(exc).__name__,
+                },
+                "suspected_origin": "execution",
+            },
+        )
+        return BackendExecutionResult(
+            observation=observation,
+            attempts=[],
+            diagnostics={
+                "error": error_text,
+                "error_kind": "backend_exception",
+                "exception_type": type(exc).__name__,
+            },
+        )
+
     @staticmethod
     def _describe_capabilities(
         capability: CapabilityDescriptor,