1111from typing import Type , Optional , Any
1212import asyncio
1313import copy
14- import base64
15- import shutil
1614from loguru import logger
1715from pydantic import BaseModel
1816
2422 TextBlock ,
2523 ToolResultBlock ,
2624 ImageBlock ,
25+ Base64Source ,
2726)
2827from agentscope .model import ChatModelBase
2928from agentscope .tool import (
@@ -194,22 +193,6 @@ def __init__(
194193 self .toolkit .register_tool_function (self .browser_subtask_manager )
195194 self .toolkit .register_tool_function (self .image_understanding )
196195
197- if (
198- self .model .model_name .startswith ("qvq" )
199- or "-vl" in self .model .model_name
200- or "4o" in self .model .model_name
201- or "gpt-5" in self .model .model_name
202- ):
203- # If the model supports multimodal input,
204- # prepare a directory for screenshots
205- screenshot_dir = os .path .join (
206- "./logs/screenshots/" ,
207- "tmp" + "_browser_agent" ,
208- )
209- if os .path .exists (screenshot_dir ):
210- shutil .rmtree (screenshot_dir )
211- os .makedirs (screenshot_dir , exist_ok = True )
212- self .screenshot_dir = screenshot_dir
213196 self .no_screenshot_tool_list = [
214197 tool
215198 for tool in self .toolkit .get_json_schemas ()
@@ -243,7 +226,7 @@ async def reply(
243226 if isinstance (msg , list )
244227 else ""
245228 )
246-
229+
247230 if self .start_url and not self ._has_initial_navigated :
248231 await self ._navigate_to_start_url ()
249232 self ._has_initial_navigated = True
@@ -264,7 +247,6 @@ async def reply(
264247 await self ._summarize_mem ()
265248
266249 msg_reasoning = await self ._pure_reasoning ()
267-
268250 tool_calls = msg_reasoning .get_content_blocks ("tool_use" )
269251 if tool_calls and tool_calls [0 ]["name" ] == "browser_snapshot" :
270252 msg_reasoning = await self ._reasoning_with_observation ()
@@ -299,7 +281,7 @@ async def reply(
299281
300282 async def _pure_reasoning (
301283 self ,
302- ):
284+ ) -> Msg :
303285 msg = Msg (
304286 "user" ,
305287 content = self .pure_reasoning_prompt .format (
@@ -329,7 +311,7 @@ async def _pure_reasoning(
329311 msg = Msg (self .name , [], "assistant" )
330312 async for content_chunk in res :
331313 msg .content = content_chunk .content
332- await self .print (msg , False )
314+ await self .print (msg )
333315 else :
334316 msg = Msg (self .name , list (res .content ), "assistant" )
335317 await self .print (msg )
@@ -349,12 +331,6 @@ async def _pure_reasoning(
349331
350332 # Post-process for user interruption
351333 if interrupted_by_user and msg :
352- # Fake tool results
353- tool_use_blocks : list = (
354- msg .get_content_blocks ( # pylint: disable=E1133
355- "tool_use" ,
356- )
357- )
358334 for tool_call in tool_use_blocks : # pylint: disable=E1133
359335 msg_res = Msg (
360336 "system" ,
@@ -371,7 +347,7 @@ async def _pure_reasoning(
371347 )
372348
373349 await self .memory .add (msg_res )
374- await self .print (msg_res , True )
350+ await self .print (msg_res )
375351
376352 async def _reasoning_with_observation (
377353 self ,
@@ -389,7 +365,6 @@ async def _reasoning_with_observation(
389365
390366 for _ in self .snapshot_in_chunk :
391367 observe_msg = await self ._build_observation ()
392-
393368 prompt = await self .formatter .format (
394369 msgs = [
395370 Msg ("system" , self .sys_prompt , "system" ),
@@ -448,7 +423,7 @@ async def _reasoning_with_observation(
448423 )
449424
450425 await self .memory .add (msg_res )
451- await self .print (msg_res , True )
426+ await self .print (msg_res )
452427 if not self .chunk_continue_status :
453428 break
454429
@@ -467,26 +442,18 @@ async def _build_observation(
467442 self ,
468443 ) -> Msg :
469444 """Get a snapshot in text before reasoning"""
470-
471- image_path : Optional [str ] = None
445+ image_data : Optional [str ] = None
472446 if (
473447 self .model .model_name .startswith ("qvq" )
474448 or "-vl" in self .model .model_name
475449 or "4o" in self .model .model_name
476450 or "gpt-5" in self .model .model_name
477451 ):
478452 # If the model supports multimodal input, take a screenshot
479- # and pass it to the observation message
480- img_path = os .path .join (
481- self .screenshot_dir ,
482- f"screenshot_{ self .iter_n } .png" ,
483- )
484- # if the img_path already exists,
485- # do not need to take a screenshot again
486- if not os .path .exists (img_path ):
487- image_path = await self ._get_screenshot (img_path )
453+ # and pass it to the observation message as base64
454+ image_data = await self ._get_screenshot ()
488455
489- observe_msg = self .observe_by_chunk (image_path )
456+ observe_msg = self .observe_by_chunk (image_data )
490457 return observe_msg
491458
492459 async def _update_chunk_observation_status (
@@ -550,7 +517,6 @@ async def _acting(self, tool_call: ToolUseBlock) -> Msg | None:
550517 Return a message to the user if the `_finish_function` is
551518 called, otherwise return `None`.
552519 """
553-
554520 tool_res_msg = Msg (
555521 "system" ,
556522 [
@@ -575,6 +541,7 @@ async def _acting(self, tool_call: ToolUseBlock) -> Msg | None:
575541 "output"
576542 ] = chunk .content
577543 # Return message if generate_response is called successfully
544+
578545 if tool_call [
579546 "name"
580547 ] == self .finish_function_name and chunk .metadata .get (
@@ -601,7 +568,8 @@ async def _acting(self, tool_call: ToolUseBlock) -> Msg | None:
601568 await self .memory .delete (mem_len - 1 )
602569 else :
603570 await self .memory .add (tool_res_msg )
604- await self .print (tool_res_msg , False )
571+ if tool_call ["name" ] != self .finish_function_name :
572+ await self .print (tool_res_msg )
605573
606574 def _clean_tool_excution_content (
607575 self ,
@@ -651,11 +619,11 @@ async def _task_decomposition_and_reformat( # pylint: disable=too-many-statemen
651619 async for content_chunk in res :
652620 decompose_text = content_chunk .content [0 ]["text" ]
653621 print_msg .content = content_chunk .content
654- await self .print (print_msg , last = False )
622+ await self .print (print_msg , False )
655623 else :
656624 decompose_text = res .content [0 ]["text" ]
657625 print_msg .content = [TextBlock (type = "text" , text = decompose_text )]
658- await self .print (print_msg , last = True )
626+ await self .print (print_msg , True )
659627
660628 # Use path relative to this file for robustness
661629 reflection_prompt_path = os .path .join (
@@ -818,7 +786,6 @@ async def _get_snapshot_in_text(self) -> list:
818786 snapshot_in_chunk = self ._split_snapshot_by_chunk (
819787 snapshot_str ,
820788 )
821-
822789 return snapshot_in_chunk
823790
824791 async def _memory_summarizing (self ) -> None :
@@ -902,11 +869,10 @@ async def _memory_summarizing(self) -> None:
902869 for msg in summarized_memory :
903870 await self .memory .add (msg )
904871
905- async def _get_screenshot (self , img_path : str = "" ) -> Optional [str ]:
872+ async def _get_screenshot (self ) -> Optional [str ]:
906873 """
907- Optionally take a screenshot of the current web page
908- for use in multimodal prompts.
909- Returns the path to the image if available, else None.
874+ Optionally take a screenshot of the current web page for multimodal prompts.
875+ Returns base64-encoded PNG data if available, else None.
910876 """
911877 try :
912878 # Prepare tool call for screenshot
@@ -920,25 +886,20 @@ async def _get_screenshot(self, img_path: str = "") -> Optional[str]:
920886 screenshot_response = await self .toolkit .call_tool_function (
921887 tool_call ,
922888 )
923- # Extract image path from response
889+ # Extract image base64 from response
924890 async for chunk in screenshot_response :
925891 if (
926892 chunk .content
927893 and len (chunk .content ) > 1
928894 and "data" in chunk .content [1 ]
929895 ):
930896 image_data = chunk .content [1 ]["data" ]
931- image_data = base64 .b64decode (image_data )
932- with open (img_path , "wb" ) as fi :
933- fi .write (image_data )
934- returned_img_path = img_path
935- # Exit loop on success
936897 else :
937- returned_img_path = None
898+ image_data = None
938899
939900 except Exception :
940- returned_img_path = None
941- return returned_img_path
901+ image_data = None
902+ return image_data
942903
943904 @staticmethod
944905 def _filter_execution_text (
@@ -993,7 +954,7 @@ def _split_snapshot_by_chunk(
993954 for i in range (0 , len (snapshot_str ), max_length )
994955 ]
995956
996- def observe_by_chunk (self , image_path : str | None = "" ) -> Msg :
957+ def observe_by_chunk (self , image_data : str | None = "" ) -> Msg :
997958 """Create an observation message for chunk-based reasoning.
998959
999960 This method formats the current chunk of the webpage snapshot with
@@ -1024,13 +985,14 @@ def observe_by_chunk(self, image_path: str | None = "") -> Msg:
1024985 or "4o" in self .model .model_name
1025986 or "gpt-5" in self .model .model_name
1026987 ):
1027- if image_path :
988+ if image_data :
1028989 image_block = ImageBlock (
1029990 type = "image" ,
1030- source = {
1031- "type" : "url" ,
1032- "url" : image_path ,
1033- },
991+ source = Base64Source (
992+ type = "base64" ,
993+ media_type = "image/png" ,
994+ data = image_data ,
995+ ),
1034996 )
1035997 content .append (image_block )
1036998
@@ -1383,20 +1345,15 @@ async def image_understanding(
13831345 ),
13841346 ]
13851347 # Attach screenshot if available
1348+
13861349 if image_data :
1387- image_data = base64 .b64decode (image_data )
1388- img_path = os .path .join (
1389- self .screenshot_dir ,
1390- f"screenshot_image_understanding_{ self .iter_n } .png" ,
1391- )
1392- with open (img_path , "wb" ) as fi :
1393- fi .write (image_data )
13941350 image_block = ImageBlock (
13951351 type = "image" ,
1396- source = {
1397- "type" : "url" ,
1398- "url" : img_path ,
1399- },
1352+ source = Base64Source (
1353+ type = "base64" ,
1354+ media_type = "image/png" ,
1355+ data = image_data ,
1356+ ),
14001357 )
14011358 content_blocks .append (image_block )
14021359
0 commit comments