Skip to content

Commit bce7af2

Browse files
chore: browser read image in context (#3535)
Co-authored-by: Tao Sun <[email protected]> Co-authored-by: Sun Tao <[email protected]>
1 parent 8faa89f commit bce7af2

File tree

3 files changed: 83 additions and 46 deletions.

camel/agents/chat_agent.py

Lines changed: 60 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -105,6 +105,7 @@
105105
)
106106
from camel.utils.commons import dependencies_required
107107
from camel.utils.context_utils import ContextUtility
108+
from camel.utils.tool_result import ToolResult
108109

109110
TOKEN_LIMIT_ERROR_MARKERS = (
110111
"context_length_exceeded",
@@ -4036,6 +4037,65 @@ def _record_tool_calling(
40364037
cast(List[MemoryRecord], func_records),
40374038
)
40384039

4040+
if isinstance(result, ToolResult) and result.images:
4041+
try:
4042+
import base64
4043+
import io
4044+
4045+
try:
4046+
from PIL import Image
4047+
except ImportError:
4048+
logger.warning(
4049+
f"Tool '{func_name}' returned images but PIL "
4050+
"is not installed. Install with: pip install "
4051+
"Pillow. Skipping visual context injection."
4052+
)
4053+
# Continue without injecting images
4054+
result = (
4055+
result.text if hasattr(result, 'text') else str(result)
4056+
)
4057+
else:
4058+
logger.info(
4059+
f"Tool '{func_name}' returned ToolResult with "
4060+
f"{len(result.images)} image(s), injecting into "
4061+
"context"
4062+
)
4063+
4064+
# Convert base64 images to PIL Image objects
4065+
pil_images: List[Union[Image.Image, str]] = []
4066+
for img_data in result.images:
4067+
if img_data.startswith('data:image/'):
4068+
# Extract base64 data
4069+
base64_str = img_data.split(',', 1)[1]
4070+
img_bytes = base64.b64decode(base64_str)
4071+
pil_img = Image.open(io.BytesIO(img_bytes))
4072+
pil_images.append(pil_img)
4073+
4074+
if pil_images:
4075+
# Create a user message with the image(s)
4076+
visual_msg = BaseMessage.make_user_message(
4077+
role_name="Tool",
4078+
content=f"[Visual output from {func_name}]",
4079+
image_list=pil_images,
4080+
)
4081+
4082+
# Inject into conversation context with slight
4083+
# timestamp increment
4084+
self.update_memory(
4085+
visual_msg,
4086+
OpenAIBackendRole.USER,
4087+
timestamp=base_timestamp + 2e-6,
4088+
return_records=False,
4089+
)
4090+
logger.info(
4091+
f"Successfully injected {len(pil_images)} "
4092+
"image(s) into agent context"
4093+
)
4094+
except Exception as e:
4095+
logger.error(
4096+
f"Failed to inject visual content from {func_name}: {e}"
4097+
)
4098+
40394099
# Record information about this tool call
40404100
tool_record = ToolCallingRecord(
40414101
tool_name=func_name,

camel/toolkits/hybrid_browser_toolkit/hybrid_browser_toolkit_ts.py

Lines changed: 23 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -27,10 +27,9 @@
2727
)
2828

2929
from camel.logger import get_logger
30-
from camel.messages import BaseMessage
3130
from camel.toolkits.base import BaseToolkit, RegisteredAgentToolkit
3231
from camel.toolkits.function_tool import FunctionTool
33-
from camel.utils.commons import dependencies_required
32+
from camel.utils.tool_result import ToolResult
3433

3534
from .config_loader import ConfigLoader
3635
from .ws_wrapper import WebSocketBrowserWrapper, high_level_action
@@ -554,12 +553,10 @@ async def browser_get_page_snapshot(self) -> str:
554553
logger.error(f"Failed to get page snapshot: {e}")
555554
return f"Error capturing snapshot: {e}"
556555

557-
@dependencies_required('PIL')
558556
async def browser_get_som_screenshot(
559557
self,
560558
read_image: bool = True,
561-
instruction: Optional[str] = None,
562-
) -> str:
559+
) -> "str | ToolResult":
563560
r"""Captures a screenshot with interactive elements highlighted.
564561
565562
"SoM" stands for "Set of Marks". This tool takes a screenshot and
@@ -569,17 +566,17 @@ async def browser_get_som_screenshot(
569566
textual snapshot is not enough.
570567
571568
Args:
572-
read_image (bool, optional): If `True`, the agent will analyze
573-
the screenshot. Requires agent to be registered.
569+
read_image (bool, optional): If `True`, the screenshot image will
570+
be included in the agent's context for direct visual analysis.
571+
If `False`, only a text message (including the saved file
572+
path) will be returned.
574573
(default: :obj:`True`)
575-
instruction (Optional[str], optional): A specific question or
576-
command for the agent regarding the screenshot, used only if
577-
`read_image` is `True`. For example: "Find the login button."
578574
579575
Returns:
580-
str: A confirmation message indicating the screenshot was
581-
captured, the file path where it was saved, and optionally the
582-
agent's analysis if `read_image` is `True`.
576+
str | ToolResult: If `read_image` is `True`, returns a ToolResult
577+
containing the text message and the screenshot image (which
578+
will be automatically added to agent's context). If `False`,
579+
returns a string with the file path only.
583580
"""
584581
import base64
585582
import datetime
@@ -631,38 +628,19 @@ async def browser_get_som_screenshot(
631628
result_text += f" (saved to: {file_path})"
632629
break
633630

634-
if read_image and file_path:
635-
if self.agent is None:
636-
logger.error(
637-
"Cannot analyze screenshot: No agent registered. "
638-
"Please pass this toolkit to ChatAgent via "
639-
"toolkits_to_register_agent parameter."
640-
)
641-
result_text += (
642-
" Error: No agent registered for image analysis. "
643-
"Please pass this toolkit to ChatAgent via "
644-
"toolkits_to_register_agent parameter."
645-
)
646-
else:
647-
try:
648-
from PIL import Image
649-
650-
img = Image.open(file_path)
651-
inst = instruction if instruction is not None else ""
652-
message = BaseMessage.make_user_message(
653-
role_name="User",
654-
content=inst,
655-
image_list=[img],
656-
)
657-
658-
response = await self.agent.astep(message)
659-
agent_response = response.msgs[0].content
660-
result_text += f". Agent analysis: {agent_response}"
661-
except Exception as e:
662-
logger.error(f"Error analyzing screenshot: {e}")
663-
result_text += f". Error analyzing screenshot: {e}"
664-
665-
return result_text
631+
# Return ToolResult with image if read_image is True
632+
if read_image and result.images:
633+
logger.info(
634+
f"Returning ToolResult with {len(result.images)} image(s) "
635+
"for agent context"
636+
)
637+
return ToolResult(
638+
text=result_text,
639+
images=result.images, # Base64 images from WebSocket
640+
)
641+
else:
642+
# Return plain text if read_image is False
643+
return result_text
666644
except Exception as e:
667645
logger.error(f"Failed to get screenshot: {e}")
668646
return f"Error capturing screenshot: {e}"

examples/toolkits/hybrid_browser_toolkit_example.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,6 @@
8888
agent = ChatAgent(
8989
model=model_backend,
9090
tools=[*web_toolkit_custom.get_tools()],
91-
toolkits_to_register_agent=[web_toolkit_custom],
9291
max_iteration=10,
9392
)
9493

0 commit comments

Comments
 (0)