Skip to content

Commit 2164371

Browse files
authored
Update browser-use agent in alias to use base64 data for image context
Update browser-use agent in alias to use base64 data for image context
2 parents dde19f0 + ad8ba0f commit 2164371

File tree

3 files changed

+38
-81
lines changed

3 files changed

+38
-81
lines changed

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,3 +61,6 @@ uv.lock
6161
# Logs
6262
logs/
6363
*.log
64+
65+
# Agent-generated files
66+
**sessions_mount_dir/

alias/.gitignore

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,12 +7,9 @@ __pycache__/
77
# Logs
88
logs/
99
src/alias/agent/agents/log/
10-
sessions_mount_dir/
11-
1210
# Python
1311
*.py[cod]
1412
*$py.class
1513

1614
# Package
1715
alias.egg-info/
18-

alias/src/alias/agent/agents/_browser_agent.py

Lines changed: 35 additions & 78 deletions
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,6 @@
1111
from typing import Type, Optional, Any
1212
import asyncio
1313
import copy
14-
import base64
15-
import shutil
1614
from loguru import logger
1715
from pydantic import BaseModel
1816

@@ -24,6 +22,7 @@
2422
TextBlock,
2523
ToolResultBlock,
2624
ImageBlock,
25+
Base64Source,
2726
)
2827
from agentscope.model import ChatModelBase
2928
from agentscope.tool import (
@@ -194,22 +193,6 @@ def __init__(
194193
self.toolkit.register_tool_function(self.browser_subtask_manager)
195194
self.toolkit.register_tool_function(self.image_understanding)
196195

197-
if (
198-
self.model.model_name.startswith("qvq")
199-
or "-vl" in self.model.model_name
200-
or "4o" in self.model.model_name
201-
or "gpt-5" in self.model.model_name
202-
):
203-
# If the model supports multimodal input,
204-
# prepare a directory for screenshots
205-
screenshot_dir = os.path.join(
206-
"./logs/screenshots/",
207-
"tmp" + "_browser_agent",
208-
)
209-
if os.path.exists(screenshot_dir):
210-
shutil.rmtree(screenshot_dir)
211-
os.makedirs(screenshot_dir, exist_ok=True)
212-
self.screenshot_dir = screenshot_dir
213196
self.no_screenshot_tool_list = [
214197
tool
215198
for tool in self.toolkit.get_json_schemas()
@@ -243,7 +226,7 @@ async def reply(
243226
if isinstance(msg, list)
244227
else ""
245228
)
246-
229+
247230
if self.start_url and not self._has_initial_navigated:
248231
await self._navigate_to_start_url()
249232
self._has_initial_navigated = True
@@ -264,7 +247,6 @@ async def reply(
264247
await self._summarize_mem()
265248

266249
msg_reasoning = await self._pure_reasoning()
267-
268250
tool_calls = msg_reasoning.get_content_blocks("tool_use")
269251
if tool_calls and tool_calls[0]["name"] == "browser_snapshot":
270252
msg_reasoning = await self._reasoning_with_observation()
@@ -299,7 +281,7 @@ async def reply(
299281

300282
async def _pure_reasoning(
301283
self,
302-
):
284+
) -> Msg:
303285
msg = Msg(
304286
"user",
305287
content=self.pure_reasoning_prompt.format(
@@ -329,7 +311,7 @@ async def _pure_reasoning(
329311
msg = Msg(self.name, [], "assistant")
330312
async for content_chunk in res:
331313
msg.content = content_chunk.content
332-
await self.print(msg, False)
314+
await self.print(msg)
333315
else:
334316
msg = Msg(self.name, list(res.content), "assistant")
335317
await self.print(msg)
@@ -349,12 +331,6 @@ async def _pure_reasoning(
349331

350332
# Post-process for user interruption
351333
if interrupted_by_user and msg:
352-
# Fake tool results
353-
tool_use_blocks: list = (
354-
msg.get_content_blocks( # pylint: disable=E1133
355-
"tool_use",
356-
)
357-
)
358334
for tool_call in tool_use_blocks: # pylint: disable=E1133
359335
msg_res = Msg(
360336
"system",
@@ -371,7 +347,7 @@ async def _pure_reasoning(
371347
)
372348

373349
await self.memory.add(msg_res)
374-
await self.print(msg_res, True)
350+
await self.print(msg_res)
375351

376352
async def _reasoning_with_observation(
377353
self,
@@ -389,7 +365,6 @@ async def _reasoning_with_observation(
389365

390366
for _ in self.snapshot_in_chunk:
391367
observe_msg = await self._build_observation()
392-
393368
prompt = await self.formatter.format(
394369
msgs=[
395370
Msg("system", self.sys_prompt, "system"),
@@ -448,7 +423,7 @@ async def _reasoning_with_observation(
448423
)
449424

450425
await self.memory.add(msg_res)
451-
await self.print(msg_res, True)
426+
await self.print(msg_res)
452427
if not self.chunk_continue_status:
453428
break
454429

@@ -467,26 +442,18 @@ async def _build_observation(
467442
self,
468443
) -> Msg:
469444
"""Get a snapshot in text before reasoning"""
470-
471-
image_path: Optional[str] = None
445+
image_data: Optional[str] = None
472446
if (
473447
self.model.model_name.startswith("qvq")
474448
or "-vl" in self.model.model_name
475449
or "4o" in self.model.model_name
476450
or "gpt-5" in self.model.model_name
477451
):
478452
# If the model supports multimodal input, take a screenshot
479-
# and pass it to the observation message
480-
img_path = os.path.join(
481-
self.screenshot_dir,
482-
f"screenshot_{self.iter_n}.png",
483-
)
484-
# if the img_path already exists,
485-
# do not need to take a screenshot again
486-
if not os.path.exists(img_path):
487-
image_path = await self._get_screenshot(img_path)
453+
# and pass it to the observation message as base64
454+
image_data = await self._get_screenshot()
488455

489-
observe_msg = self.observe_by_chunk(image_path)
456+
observe_msg = self.observe_by_chunk(image_data)
490457
return observe_msg
491458

492459
async def _update_chunk_observation_status(
@@ -550,7 +517,6 @@ async def _acting(self, tool_call: ToolUseBlock) -> Msg | None:
550517
Return a message to the user if the `_finish_function` is
551518
called, otherwise return `None`.
552519
"""
553-
554520
tool_res_msg = Msg(
555521
"system",
556522
[
@@ -575,6 +541,7 @@ async def _acting(self, tool_call: ToolUseBlock) -> Msg | None:
575541
"output"
576542
] = chunk.content
577543
# Return message if generate_response is called successfully
544+
578545
if tool_call[
579546
"name"
580547
] == self.finish_function_name and chunk.metadata.get(
@@ -601,7 +568,8 @@ async def _acting(self, tool_call: ToolUseBlock) -> Msg | None:
601568
await self.memory.delete(mem_len - 1)
602569
else:
603570
await self.memory.add(tool_res_msg)
604-
await self.print(tool_res_msg, False)
571+
if tool_call["name"] != self.finish_function_name:
572+
await self.print(tool_res_msg)
605573

606574
def _clean_tool_excution_content(
607575
self,
@@ -651,11 +619,11 @@ async def _task_decomposition_and_reformat( # pylint: disable=too-many-statemen
651619
async for content_chunk in res:
652620
decompose_text = content_chunk.content[0]["text"]
653621
print_msg.content = content_chunk.content
654-
await self.print(print_msg, last=False)
622+
await self.print(print_msg, False)
655623
else:
656624
decompose_text = res.content[0]["text"]
657625
print_msg.content = [TextBlock(type="text", text=decompose_text)]
658-
await self.print(print_msg, last=True)
626+
await self.print(print_msg, True)
659627

660628
# Use path relative to this file for robustness
661629
reflection_prompt_path = os.path.join(
@@ -818,7 +786,6 @@ async def _get_snapshot_in_text(self) -> list:
818786
snapshot_in_chunk = self._split_snapshot_by_chunk(
819787
snapshot_str,
820788
)
821-
822789
return snapshot_in_chunk
823790

824791
async def _memory_summarizing(self) -> None:
@@ -902,11 +869,10 @@ async def _memory_summarizing(self) -> None:
902869
for msg in summarized_memory:
903870
await self.memory.add(msg)
904871

905-
async def _get_screenshot(self, img_path: str = "") -> Optional[str]:
872+
async def _get_screenshot(self) -> Optional[str]:
906873
"""
907-
Optionally take a screenshot of the current web page
908-
for use in multimodal prompts.
909-
Returns the path to the image if available, else None.
874+
Optionally take a screenshot of the current web page for multimodal prompts.
875+
Returns base64-encoded PNG data if available, else None.
910876
"""
911877
try:
912878
# Prepare tool call for screenshot
@@ -920,25 +886,20 @@ async def _get_screenshot(self, img_path: str = "") -> Optional[str]:
920886
screenshot_response = await self.toolkit.call_tool_function(
921887
tool_call,
922888
)
923-
# Extract image path from response
889+
# Extract image base64 from response
924890
async for chunk in screenshot_response:
925891
if (
926892
chunk.content
927893
and len(chunk.content) > 1
928894
and "data" in chunk.content[1]
929895
):
930896
image_data = chunk.content[1]["data"]
931-
image_data = base64.b64decode(image_data)
932-
with open(img_path, "wb") as fi:
933-
fi.write(image_data)
934-
returned_img_path = img_path
935-
# Exit loop on success
936897
else:
937-
returned_img_path = None
898+
image_data = None
938899

939900
except Exception:
940-
returned_img_path = None
941-
return returned_img_path
901+
image_data = None
902+
return image_data
942903

943904
@staticmethod
944905
def _filter_execution_text(
@@ -993,7 +954,7 @@ def _split_snapshot_by_chunk(
993954
for i in range(0, len(snapshot_str), max_length)
994955
]
995956

996-
def observe_by_chunk(self, image_path: str | None = "") -> Msg:
957+
def observe_by_chunk(self, image_data: str | None = "") -> Msg:
997958
"""Create an observation message for chunk-based reasoning.
998959
999960
This method formats the current chunk of the webpage snapshot with
@@ -1024,13 +985,14 @@ def observe_by_chunk(self, image_path: str | None = "") -> Msg:
1024985
or "4o" in self.model.model_name
1025986
or "gpt-5" in self.model.model_name
1026987
):
1027-
if image_path:
988+
if image_data:
1028989
image_block = ImageBlock(
1029990
type="image",
1030-
source={
1031-
"type": "url",
1032-
"url": image_path,
1033-
},
991+
source=Base64Source(
992+
type="base64",
993+
media_type="image/png",
994+
data=image_data,
995+
),
1034996
)
1035997
content.append(image_block)
1036998

@@ -1383,20 +1345,15 @@ async def image_understanding(
13831345
),
13841346
]
13851347
# Attach screenshot if available
1348+
13861349
if image_data:
1387-
image_data = base64.b64decode(image_data)
1388-
img_path = os.path.join(
1389-
self.screenshot_dir,
1390-
f"screenshot_image_understanding_{self.iter_n}.png",
1391-
)
1392-
with open(img_path, "wb") as fi:
1393-
fi.write(image_data)
13941350
image_block = ImageBlock(
13951351
type="image",
1396-
source={
1397-
"type": "url",
1398-
"url": img_path,
1399-
},
1352+
source=Base64Source(
1353+
type="base64",
1354+
media_type="image/png",
1355+
data=image_data,
1356+
),
14001357
)
14011358
content_blocks.append(image_block)
14021359

0 commit comments

Comments
 (0)