chore: browser read image in context (#3535)

nitpicker55555 · fengju0213 · web-flow · commit bce7af252e8f · 2025-12-18T13:32:12.000+08:00
Co-authored-by: Tao Sun <168447269+fengju0213@users.noreply.github.com>
Co-authored-by: Sun Tao <2605127667@qq.com>
diff --git a/camel/agents/chat_agent.py b/camel/agents/chat_agent.py
@@ -105,6 +105,7 @@
 )
 from camel.utils.commons import dependencies_required
 from camel.utils.context_utils import ContextUtility
+from camel.utils.tool_result import ToolResult
 
 TOKEN_LIMIT_ERROR_MARKERS = (
     "context_length_exceeded",
@@ -4036,6 +4037,65 @@ def _record_tool_calling(
                 cast(List[MemoryRecord], func_records),
             )
 
+        if isinstance(result, ToolResult) and result.images:
+            try:
+                import base64
+                import io
+
+                try:
+                    from PIL import Image
+                except ImportError:
+                    logger.warning(
+                        f"Tool '{func_name}' returned images but PIL "
+                        "is not installed. Install with: pip install "
+                        "Pillow. Skipping visual context injection."
+                    )
+                    # Continue without injecting images
+                    result = (
+                        result.text if hasattr(result, 'text') else str(result)
+                    )
+                else:
+                    logger.info(
+                        f"Tool '{func_name}' returned ToolResult with "
+                        f"{len(result.images)} image(s), injecting into "
+                        "context"
+                    )
+
+                    # Convert base64 images to PIL Image objects
+                    pil_images: List[Union[Image.Image, str]] = []
+                    for img_data in result.images:
+                        if img_data.startswith('data:image/'):
+                            # Extract base64 data
+                            base64_str = img_data.split(',', 1)[1]
+                            img_bytes = base64.b64decode(base64_str)
+                            pil_img = Image.open(io.BytesIO(img_bytes))
+                            pil_images.append(pil_img)
+
+                    if pil_images:
+                        # Create a user message with the image(s)
+                        visual_msg = BaseMessage.make_user_message(
+                            role_name="Tool",
+                            content=f"[Visual output from {func_name}]",
+                            image_list=pil_images,
+                        )
+
+                        # Inject into conversation context with slight
+                        # timestamp increment
+                        self.update_memory(
+                            visual_msg,
+                            OpenAIBackendRole.USER,
+                            timestamp=base_timestamp + 2e-6,
+                            return_records=False,
+                        )
+                        logger.info(
+                            f"Successfully injected {len(pil_images)} "
+                            "image(s) into agent context"
+                        )
+            except Exception as e:
+                logger.error(
+                    f"Failed to inject visual content from {func_name}: {e}"
+                )
+
         # Record information about this tool call
         tool_record = ToolCallingRecord(
             tool_name=func_name,
diff --git a/camel/toolkits/hybrid_browser_toolkit/hybrid_browser_toolkit_ts.py b/camel/toolkits/hybrid_browser_toolkit/hybrid_browser_toolkit_ts.py
@@ -27,10 +27,9 @@
 )
 
 from camel.logger import get_logger
-from camel.messages import BaseMessage
 from camel.toolkits.base import BaseToolkit, RegisteredAgentToolkit
 from camel.toolkits.function_tool import FunctionTool
-from camel.utils.commons import dependencies_required
+from camel.utils.tool_result import ToolResult
 
 from .config_loader import ConfigLoader
 from .ws_wrapper import WebSocketBrowserWrapper, high_level_action
@@ -554,12 +553,10 @@ async def browser_get_page_snapshot(self) -> str:
             logger.error(f"Failed to get page snapshot: {e}")
             return f"Error capturing snapshot: {e}"
 
-    @dependencies_required('PIL')
     async def browser_get_som_screenshot(
         self,
         read_image: bool = True,
-        instruction: Optional[str] = None,
-    ) -> str:
+    ) -> "str | ToolResult":
         r"""Captures a screenshot with interactive elements highlighted.
 
         "SoM" stands for "Set of Marks". This tool takes a screenshot and
@@ -569,17 +566,17 @@ async def browser_get_som_screenshot(
         textual snapshot is not enough.
 
         Args:
-            read_image (bool, optional): If `True`, the agent will analyze
-                the screenshot. Requires agent to be registered.
+            read_image (bool, optional): If `True`, the screenshot image will
+                be included in the agent's context for direct visual analysis.
+                If `False`, only a text message (including the saved file
+                path) will be returned.
                 (default: :obj:`True`)
-            instruction (Optional[str], optional): A specific question or
-                command for the agent regarding the screenshot, used only if
-                `read_image` is `True`. For example: "Find the login button."
 
         Returns:
-            str: A confirmation message indicating the screenshot was
-                captured, the file path where it was saved, and optionally the
-                agent's analysis if `read_image` is `True`.
+            str | ToolResult: If `read_image` is `True`, returns a ToolResult
+                containing the text message and the screenshot image (which
+                will be automatically added to agent's context). If `False`,
+                returns a string with the file path only.
         """
         import base64
         import datetime
@@ -631,38 +628,19 @@ async def browser_get_som_screenshot(
                         result_text += f" (saved to: {file_path})"
                         break
 
-            if read_image and file_path:
-                if self.agent is None:
-                    logger.error(
-                        "Cannot analyze screenshot: No agent registered. "
-                        "Please pass this toolkit to ChatAgent via "
-                        "toolkits_to_register_agent parameter."
-                    )
-                    result_text += (
-                        " Error: No agent registered for image analysis. "
-                        "Please pass this toolkit to ChatAgent via "
-                        "toolkits_to_register_agent parameter."
-                    )
-                else:
-                    try:
-                        from PIL import Image
-
-                        img = Image.open(file_path)
-                        inst = instruction if instruction is not None else ""
-                        message = BaseMessage.make_user_message(
-                            role_name="User",
-                            content=inst,
-                            image_list=[img],
-                        )
-
-                        response = await self.agent.astep(message)
-                        agent_response = response.msgs[0].content
-                        result_text += f". Agent analysis: {agent_response}"
-                    except Exception as e:
-                        logger.error(f"Error analyzing screenshot: {e}")
-                        result_text += f". Error analyzing screenshot: {e}"
-
-            return result_text
+            # Return ToolResult with image if read_image is True
+            if read_image and result.images:
+                logger.info(
+                    f"Returning ToolResult with {len(result.images)} image(s) "
+                    "for agent context"
+                )
+                return ToolResult(
+                    text=result_text,
+                    images=result.images,  # Base64 images from WebSocket
+                )
+            else:
+                # Return plain text if read_image is False
+                return result_text
         except Exception as e:
             logger.error(f"Failed to get screenshot: {e}")
             return f"Error capturing screenshot: {e}"
diff --git a/examples/toolkits/hybrid_browser_toolkit_example.py b/examples/toolkits/hybrid_browser_toolkit_example.py
@@ -88,7 +88,6 @@
 agent = ChatAgent(
     model=model_backend,
     tools=[*web_toolkit_custom.get_tools()],
-    toolkits_to_register_agent=[web_toolkit_custom],
     max_iteration=10,
 )
 

Original file line number	Diff line number	Diff line change
`@@ -88,7 +88,6 @@`
`88`	`88`	`agent = ChatAgent(`
`89`	`89`	`model=model_backend,`
`90`	`90`	`tools=[*web_toolkit_custom.get_tools()],`
`91`		`- toolkits_to_register_agent=[web_toolkit_custom],`
`92`	`91`	`max_iteration=10,`
`93`	`92`	`)`
`94`	`93`