@@ -9,8 +9,12 @@ import { BrowserOperator } from '@gui-agent/operator-browser';
99import { ConsoleLogger , AgentEventStream , Tool , z } from '@tarko/mcp-agent' ;
1010import { ImageCompressor , formatBytes } from '@tarko/shared-media-utils' ;
1111import { ActionInputs , PredictionParsed } from '@agent-tars/interface' ;
12+ import { ActionParserHelper } from '@gui-agent/action-parser' ;
13+ import { Coordinates , NormalizeCoordinates } from '@gui-agent/shared/types' ;
14+ import { normalizeActionCoords } from '@gui-agent/shared/utils' ;
1215import {
1316 convertToGUIResponse ,
17+ convertToAgentUIAction ,
1418 createGUIErrorResponse ,
1519 GUIExecuteResult ,
1620} from '@tarko/shared-utils' ;
@@ -37,6 +41,22 @@ export interface GUIAgentOptions {
3741 eventStream ?: AgentEventStream . Processor ;
3842}
3943
// Shared parser instance; the GUI tool handler calls parseActionCallString on it to parse the model's action string.
44+ const actionParserHelper = new ActionParserHelper ( ) ;
45+
/**
 * Divisor applied to raw coordinates to turn them into normalized values.
 * NOTE(review): presumably the model emits points on a 0-1000 grid, so the
 * result is a 0-1 fraction of the screen — confirm against the model contract.
 */
const RAW_COORDS_DIVISOR = 1000;

/**
 * Default coordinate normalizer passed to `normalizeActionCoords`.
 *
 * - If `rawCoords.raw` is missing, the input object is returned as-is
 *   (same reference) under the `normalized` key.
 * - Otherwise a shallow copy of `rawCoords` is returned whose `normalized`
 *   field holds `raw.x` and `raw.y` divided by {@link RAW_COORDS_DIVISOR}.
 *   The input object is not mutated.
 *
 * @param rawCoords - Coordinates parsed from the action string.
 * @returns An object whose `normalized` property carries the resulting coordinates.
 */
const defaultNormalizeCoords: NormalizeCoordinates = (rawCoords: Coordinates) => {
  const { raw } = rawCoords;

  // Nothing to convert: hand the original coordinates back untouched.
  if (!raw) {
    return { normalized: rawCoords };
  }

  return {
    normalized: {
      ...rawCoords,
      normalized: {
        x: raw.x / RAW_COORDS_DIVISOR,
        y: raw.y / RAW_COORDS_DIVISOR,
      },
    },
  };
};
59+
4060/**
4161 * Browser GUI Agent for visual browser automation
4262 */
@@ -90,7 +110,7 @@ scroll(point='<point>x1 y1</point>', direction='down or up or right or left') -
90110wait() - Wait 5 seconds and take a screenshot to check for changes
91111
92112## Note
93- - Folow user lanuage in in \`thought\` part.
113+ - Follow user language in in \`thought\` part.
94114- Describe your thought in \`step\` part.
95115- Describe your action in \`Step\` part.
96116- Extract the data your see in \`pageData\` part.
@@ -109,30 +129,39 @@ wait() - Wait 5 seconds and take a scree
109129 } ) ,
110130 function : async ( { thought, step, action } ) => {
111131 try {
112- const parsed = this . parseAction ( action ) ;
113- parsed . thought = thought ;
132+ const parsedAction = actionParserHelper . parseActionCallString ( action ) ;
133+ if ( ! parsedAction ) {
134+ return createGUIErrorResponse ( action , 'Invalid action format' ) ;
135+ }
136+ const normalizedCoordsAction = normalizeActionCoords (
137+ parsedAction ,
138+ defaultNormalizeCoords ,
139+ ) ;
114140
115141 this . logger . debug ( {
116142 thought,
117143 step,
118144 action,
119- parsedAction : JSON . stringify ( parsed , null , 2 ) ,
145+ normalizedCoordsAction : JSON . stringify ( normalizedCoordsAction , null , 2 ) ,
120146 screenDimensions : {
121147 width : this . screenWidth ,
122148 height : this . screenHeight ,
123149 } ,
124150 } ) ;
125151
126- const operatorResult : GUIExecuteResult = await this . browserOperator . execute ( {
127- parsedPrediction : parsed ,
128- screenWidth : this . screenWidth || 1920 ,
129- screenHeight : this . screenHeight || 1080 ,
152+ const operatorResult = await this . browserOperator . doExecute ( {
153+ actions : [ normalizedCoordsAction ] ,
130154 } ) ;
155+ this . logger . debug ( 'Browser action completed' , operatorResult ) ;
131156
132157 await sleep ( 500 ) ;
133158
134- const guiResponse = convertToGUIResponse ( action , parsed , operatorResult ) ;
135- return guiResponse ;
159+ return {
160+ success : true ,
161+ action : action ,
162+ normalizedAction : convertToAgentUIAction ( normalizedCoordsAction ) ,
163+ observation : undefined , // Reserved for future implementation
164+ } ;
136165 } catch ( error ) {
137166 this . logger . error (
138167 `Browser action failed: ${ error instanceof Error ? error . message : String ( error ) } ` ,
@@ -164,7 +193,7 @@ wait() - Wait 5 seconds and take a scree
164193 // Record screenshot start time
165194 const startTime = performance . now ( ) ;
166195
167- const output = await this . browserOperator . screenshot ( ) ;
196+ const output = await this . browserOperator . doScreenshot ( ) ;
168197
169198 // Calculate screenshot time
170199 const endTime = performance . now ( ) ;
0 commit comments