2727)
2828
2929from camel .logger import get_logger
30- from camel .messages import BaseMessage
3130from camel .toolkits .base import BaseToolkit , RegisteredAgentToolkit
3231from camel .toolkits .function_tool import FunctionTool
33- from camel .utils .commons import dependencies_required
32+ from camel .utils .tool_result import ToolResult
3433
3534from .config_loader import ConfigLoader
3635from .ws_wrapper import WebSocketBrowserWrapper , high_level_action
@@ -554,12 +553,10 @@ async def browser_get_page_snapshot(self) -> str:
554553 logger .error (f"Failed to get page snapshot: { e } " )
555554 return f"Error capturing snapshot: { e } "
556555
557- @dependencies_required ('PIL' )
558556 async def browser_get_som_screenshot (
559557 self ,
560558 read_image : bool = True ,
561- instruction : Optional [str ] = None ,
562- ) -> str :
559+ ) -> "str | ToolResult" :
563560 r"""Captures a screenshot with interactive elements highlighted.
564561
565562 "SoM" stands for "Set of Marks". This tool takes a screenshot and
@@ -569,17 +566,17 @@ async def browser_get_som_screenshot(
569566 textual snapshot is not enough.
570567
571568 Args:
572- read_image (bool, optional): If `True`, the agent will analyze
573- the screenshot. Requires agent to be registered.
569+ read_image (bool, optional): If `True`, the screenshot image will
570+ be included in the agent's context for direct visual analysis.
571+ If `False`, only a text message (including the saved file
572+ path) will be returned.
574573 (default: :obj:`True`)
575- instruction (Optional[str], optional): A specific question or
576- command for the agent regarding the screenshot, used only if
577- `read_image` is `True`. For example: "Find the login button."
578574
579575 Returns:
580- str: A confirmation message indicating the screenshot was
581- captured, the file path where it was saved, and optionally the
582- agent's analysis if `read_image` is `True`.
576+ str | ToolResult: If `read_image` is `True` and an image was
577+ captured, returns a ToolResult containing the text message and
578+ the screenshot image (which will be automatically added to the
579+ agent's context). Otherwise, returns the text message (including the saved file path) as a string.
583580 """
584581 import base64
585582 import datetime
@@ -631,38 +628,19 @@ async def browser_get_som_screenshot(
631628 result_text += f" (saved to: { file_path } )"
632629 break
633630
634- if read_image and file_path :
635- if self .agent is None :
636- logger .error (
637- "Cannot analyze screenshot: No agent registered. "
638- "Please pass this toolkit to ChatAgent via "
639- "toolkits_to_register_agent parameter."
640- )
641- result_text += (
642- " Error: No agent registered for image analysis. "
643- "Please pass this toolkit to ChatAgent via "
644- "toolkits_to_register_agent parameter."
645- )
646- else :
647- try :
648- from PIL import Image
649-
650- img = Image .open (file_path )
651- inst = instruction if instruction is not None else ""
652- message = BaseMessage .make_user_message (
653- role_name = "User" ,
654- content = inst ,
655- image_list = [img ],
656- )
657-
658- response = await self .agent .astep (message )
659- agent_response = response .msgs [0 ].content
660- result_text += f". Agent analysis: { agent_response } "
661- except Exception as e :
662- logger .error (f"Error analyzing screenshot: { e } " )
663- result_text += f". Error analyzing screenshot: { e } "
664-
665- return result_text
631+ # Return ToolResult with image if read_image is True
632+ if read_image and result .images :
633+ logger .info (
634+ f"Returning ToolResult with { len (result .images )} image(s) "
635+ "for agent context"
636+ )
637+ return ToolResult (
638+ text = result_text ,
639+ images = result .images , # Base64 images from WebSocket
640+ )
641+ else :
642+ # Return plain text if read_image is False or no image was returned
643+ return result_text
666644 except Exception as e :
667645 logger .error (f"Failed to get screenshot: { e } " )
668646 return f"Error capturing screenshot: { e } "
0 commit comments