diff --git a/.eslintrc.cjs b/.eslintrc.cjs index 3e40cc313d..cc99d4e475 100644 --- a/.eslintrc.cjs +++ b/.eslintrc.cjs @@ -37,5 +37,6 @@ module.exports = { 'no-case-declarations': 'off', 'no-await-in-loop': 'off', 'react/prop-types': 'off', + '@typescript-eslint/no-namespace': 'off', }, }; diff --git a/multimodal/agent-tars/core/src/browser/browser-gui-agent.ts b/multimodal/agent-tars/core/src/browser/browser-gui-agent.ts index dcbcc0441e..4393e352fa 100644 --- a/multimodal/agent-tars/core/src/browser/browser-gui-agent.ts +++ b/multimodal/agent-tars/core/src/browser/browser-gui-agent.ts @@ -8,25 +8,12 @@ import { LocalBrowser, Page, RemoteBrowser } from '@agent-infra/browser'; import { BrowserOperator } from '@gui-agent/operator-browser'; import { ConsoleLogger, AgentEventStream, Tool, z } from '@tarko/mcp-agent'; import { ImageCompressor, formatBytes } from '@tarko/shared-media-utils'; - -/** - * Coordinate type definition - */ -export type Coords = [number, number] | []; - -/** - * Action input parameters for browser actions - */ -export interface ActionInputs { - content?: string; - start_box?: string; - end_box?: string; - key?: string; - hotkey?: string; - direction?: string; - start_coords?: Coords; - end_coords?: Coords; -} +import { ActionInputs, PredictionParsed } from '@agent-tars/interface'; +import { + convertToGUIResponse, + createGUIErrorResponse, + GUIExecuteResult, +} from '@tarko/shared-utils'; function sleep(time: number) { return new Promise(function (resolve) { @@ -34,18 +21,6 @@ function sleep(time: number) { }); } -/** - * Parsed prediction from GUI agent - */ -export interface PredictionParsed { - /** Action inputs parsed from action_type(action_inputs) */ - action_inputs: ActionInputs; - /** Action type parsed from action_type(action_inputs) */ - action_type: string; - /** Thinking content */ - thought?: string; -} - /** * Browser initialization options */ @@ -131,12 +106,8 @@ wait() - Wait 5 seconds and take a scree .string() .describe('Finally summarize the next action (with its target element) in one sentence'), action: z.string().describe('Some action in action space like click or press'), - // pageData: z - // .array(z.object({})) - // .describe("The information you see and extract from the page based on the user's query") - // .optional(), }), - function: async ({ thought, step, action, pageData }) => { + function: async ({ thought, step, action }) => { try { const parsed = this.parseAction(action); parsed.thought = thought; @@ -152,7 +123,7 @@ wait() - Wait 5 seconds and take a scree }, }); - const result = await this.browserOperator.execute({ + const operatorResult: GUIExecuteResult = await this.browserOperator.execute({ parsedPrediction: parsed, screenWidth: this.screenWidth || 1920, screenHeight: this.screenHeight || 1080, @@ -160,106 +131,20 @@ wait() - Wait 5 seconds and take a scree await sleep(500); - // Automatically get page content after browser interaction - // await this.capturePageContentAsEnvironmentInfo(); - - return { action, status: 'success', result, pageData }; + const guiResponse = convertToGUIResponse(action, parsed, operatorResult); + return guiResponse; } catch (error) { this.logger.error( `Browser action failed: ${error instanceof Error ? error.message : String(error)}`, ); - return { - action, - status: 'fail', - error: error instanceof Error ? error.message : String(error), - }; + + // Return error response in new format + return createGUIErrorResponse(action, error); } }, }); } - /** - * Capture page content and add it to event stream as environment info - * This is called automatically after each browser_vision_control action - */ - private async capturePageContentAsEnvironmentInfo(): Promise { - // Only proceed if eventStream is provided - if (!this.eventStream) return; - - try { - const page = await this.getPage(); - - // Get page content as markdown - const markdown = await page.evaluate(() => { - // Simple function to extract page content as markdown - const extractMarkdown = () => { - // Get page title - const title = document.title || 'Untitled Page'; - - const getVisibleText = (node: any) => { - if (node.nodeType === Node.TEXT_NODE) { - return node.textContent || ''; - } - - const style = window.getComputedStyle(node); - if ( - style.display === 'none' || - style.visibility === 'hidden' || - style.opacity === '0' - ) { - return ''; - } - - let text = ''; - for (const child of Array.from(node.childNodes)) { - // @ts-expect-error - if (child.nodeType === Node.ELEMENT_NODE) { - text += getVisibleText(child); - // @ts-expect-error - } else if (child.nodeType === Node.TEXT_NODE) { - // @ts-expect-error - text += child.textContent || ''; - } - } - - return text.trim(); - }; - - // Get main content, prefer article or main elements - const mainContent = - document.querySelector('article, main, #content, .content') || document.body; - const content = getVisibleText(mainContent); - - // Format as markdown - return `# ${title}\n\n${content}`; - }; - - return extractMarkdown(); - }); - - // If content is available, add it to event stream - if (markdown && markdown.trim()) { - // Create an environment input event with the markdown content - const event = this.eventStream.createEvent('environment_input', { - content: markdown, - description: 'Page Content After Browser Action', - metadata: { - type: 'text', - }, - }); - - // Send the event - this.eventStream.sendEvent(event); - this.logger.debug('Added page content to event stream as environment info'); - } - } catch (error) { - // Log error but don't fail the main operation - this.logger.warn( - `Failed to capture page content: ${error instanceof Error ? error.message : String(error)}`, - ); - } - } - /** * Set the event stream instance * @param eventStream - The event stream instance diff --git a/multimodal/pnpm-lock.yaml b/multimodal/pnpm-lock.yaml index 29951432e2..3995c6c416 100644 --- a/multimodal/pnpm-lock.yaml +++ b/multimodal/pnpm-lock.yaml @@ -1777,19 +1777,16 @@ packages: '@computer-use/libnut-darwin@2.7.1': resolution: {integrity: sha512-7B/aPcIYS4a4S7D3IYIHSpZ4B4m8Z3CjlYq0efTr+/JYmEu+LlO67ZhPvisLKifhagxf7goqEfnphg1F4jq5jw==} engines: {node: '>=10.15.3'} - cpu: [x64, arm64] os: [darwin, linux, win32] '@computer-use/libnut-linux@2.7.1': resolution: {integrity: sha512-QJD5URTFJ/2+JwBwRyajRF2BB+3eXpd4+t5btGeRVeiRQLKQ4lgorbMHySo6IrfAbSfnU1OVOrxAUygGxj0cFg==} engines: {node: '>=10.15.3'} - cpu: [x64, arm64] os: [darwin, linux, win32] '@computer-use/libnut-win32@2.7.1': resolution: {integrity: sha512-nDvH5kP1zoO2cBtFYWV0om9xtTu523cc1LIk8r/wizqPrIAm0wCizTU+odF3Fi42zcKJWT6J+Pguy8fKrZyIuA==} engines: {node: '>=10.15.3'} - cpu: [x64, arm64] os: [darwin, linux, win32] '@computer-use/libnut@4.2.0': @@ -1809,7 +1806,6 @@ packages: '@computer-use/nut-js@4.2.0': resolution: {integrity: sha512-Xyq5ixrAEdy6qu/mtX021XfU0Y//YFgL0CW1xF6L05KUCbJcZZqViiEeSPulhKfLdvXuhIjnhf+exMb2T6jQIg==} engines: {node: '>=16'} - cpu: [x64, arm64] os: [linux, darwin, win32] '@computer-use/provider-interfaces@4.2.0': diff --git a/multimodal/tarko/agent-interface/src/gui-agent.ts b/multimodal/tarko/agent-interface/src/gui-agent.ts new file mode 100644 index 0000000000..d3adce4f5a --- /dev/null +++ b/multimodal/tarko/agent-interface/src/gui-agent.ts @@ -0,0 +1,210 @@ +/* eslint-disable @typescript-eslint/no-explicit-any */ +/* + * Copyright (c) 2025 Bytedance, Inc. and its affiliates. + * SPDX-License-Identifier: Apache-2.0 + */ + +/** + * FIXME: migrate to GUI-Agent package + * + * GUI Agent types for Tarko Agent Web UI display. + * These types are specifically designed for UI rendering and are not part of the internal GUIAgent protocol. + */ +export namespace GUIAgent { + /** + * Base interface for all GUI Agent action types + * Defines the fundamental structure that all actions must follow + */ + export interface BaseAction< + T extends string = string, + I extends Record = Record, + > { + type: T; + inputs: I; + } + + /** + * Click action with coordinates + */ + export type ClickAction = BaseAction< + 'click', + { + startX: number; // Percentage coordinates (0-1) + startY: number; // Percentage coordinates (0-1) + } + >; + + /** + * Double click action with coordinates + */ + export type DoubleClickAction = BaseAction< + 'double_click' | 'left_double', + { + startX: number; // Percentage coordinates (0-1) + startY: number; // Percentage coordinates (0-1) + } + >; + + /** + * Right click action with coordinates + */ + export type RightClickAction = BaseAction< + 'right_click' | 'right_single', + { + startX: number; // Percentage coordinates (0-1) + startY: number; // Percentage coordinates (0-1) + } + >; + + /** + * Drag action with start and end coordinates + */ + export type DragAction = BaseAction< + 'drag', + { + startX: number; // Percentage coordinates (0-1) + startY: number; // Percentage coordinates (0-1) + endX: number; // Percentage coordinates (0-1) + endY: number; // Percentage coordinates (0-1) + } + >; + + /** + * Type action with text content + */ + export type TypeAction = BaseAction< + 'type', + { + content: string; + } + >; + + /** + * Hotkey action with key combination + */ + export type HotkeyAction = BaseAction< + 'hotkey', + { + key: string; + } + >; + + /** + * Scroll action with coordinates and direction + */ + export type ScrollAction = BaseAction< + 'scroll', + { + startX: number; // Percentage coordinates (0-1) + startY: number; // Percentage coordinates (0-1) + direction: 'up' | 'down' | 'left' | 'right'; + } + >; + + /** + * Wait action with no inputs + */ + export type WaitAction = BaseAction<'wait', Record>; + + /** + * Navigate action with URL + */ + export type NavigateAction = BaseAction< + 'navigate', + { + url: string; + } + >; + + /** + * Navigate back action + */ + export type NavigateBackAction = BaseAction<'navigate_back', Record>; + + /** + * Union type of all possible GUI actions + */ + export type Action = + | ClickAction + | DoubleClickAction + | RightClickAction + | DragAction + | TypeAction + | HotkeyAction + | ScrollAction + | WaitAction + | NavigateAction + | NavigateBackAction; + + /** + * Generic GUI Agent tool response with strict typing + */ + export interface ToolResponse { + /** + * Whether the operation was successful + */ + success: boolean; + + /** + * Raw action string as received from the model + */ + action: string; + + /** + * Parsed and normalized action with strict typing + */ + normalizedAction: T; + + /** + * Optional observation after the action (reserved for future implementation) + */ + observation?: string; + + /** + * Error message if the operation failed + */ + error?: string; + } + + /** + * Type-specific response types for better type safety + */ + export type ClickResponse = ToolResponse; + export type DoubleClickResponse = ToolResponse; + export type RightClickResponse = ToolResponse; + export type DragResponse = ToolResponse; + export type TypeResponse = ToolResponse; + export type HotkeyResponse = ToolResponse; + export type ScrollResponse = ToolResponse; + export type WaitResponse = ToolResponse; + export type NavigateResponse = ToolResponse; + export type NavigateBackResponse = ToolResponse; +} + +/** + * Legacy action inputs interface for backward compatibility + * @deprecated Use the new GUIAgent types instead + */ +export interface ActionInputs { + content?: string; + start_box?: string; + end_box?: string; + key?: string; + hotkey?: string; + direction?: string; + start_coords?: [number, number] | []; + end_coords?: [number, number] | []; +} + +/** + * Legacy parsed prediction interface for backward compatibility + * @deprecated Use the new GUIAgent types instead + */ +export interface PredictionParsed { + /** Action inputs parsed from action_type(action_inputs) */ + action_inputs: ActionInputs; + /** Action type parsed from action_type(action_inputs) */ + action_type: string; + /** Thinking content */ + thought?: string; +} diff --git a/multimodal/tarko/agent-interface/src/index.ts b/multimodal/tarko/agent-interface/src/index.ts index 636840ec9f..9c1b382804 100644 --- a/multimodal/tarko/agent-interface/src/index.ts +++ b/multimodal/tarko/agent-interface/src/index.ts @@ -11,4 +11,5 @@ export * from './agent-constructor'; export * from './tool'; export * from './tool-call-engine'; export * from './agent-event-stream'; +export * from './gui-agent'; export * from '@tarko/model-provider/types'; diff --git a/multimodal/tarko/agent-web-ui/src/standalone/workspace/renderers/BrowserControlRenderer.tsx b/multimodal/tarko/agent-web-ui/src/standalone/workspace/renderers/BrowserControlRenderer.tsx index eb3c44c48a..23e0f1b482 100644 --- a/multimodal/tarko/agent-web-ui/src/standalone/workspace/renderers/BrowserControlRenderer.tsx +++ b/multimodal/tarko/agent-web-ui/src/standalone/workspace/renderers/BrowserControlRenderer.tsx @@ -26,7 +26,7 @@ export const BrowserControlRenderer: React.FC = ({ panelContent, onAction, }) => { - const { activeSessionId, messages, toolResults, replayState } = useSession(); + const { activeSessionId, messages, toolResults } = useSession(); const [relatedImage, setRelatedImage] = useState(null); const [mousePosition, setMousePosition] = useState<{ x: number; y: number } | null>(null); const [previousMousePosition, setPreviousMousePosition] = useState<{ @@ -52,20 +52,38 @@ export const BrowserControlRenderer: React.FC = ({ const sessionResults = toolResults[activeSessionId] || []; const matchingResult = sessionResults.find((result) => result.toolCallId === toolCallId); - if (matchingResult && matchingResult.content && matchingResult.content.result) { - const { startXPercent, startYPercent } = matchingResult.content.result; - - // Save previous position before updating - if (mousePosition) { - setPreviousMousePosition(mousePosition); - } + if (matchingResult?.content?.normalizedAction?.inputs) { + const { normalizedAction } = matchingResult.content; + const { startX, startY } = normalizedAction.inputs; + + // Check if action type supports coordinate display + const coordinateBasedActions = [ + 'click', + 'double_click', + 'left_double', + 'right_click', + 'right_single', + 'drag', + 'scroll', + ]; + + if (coordinateBasedActions.includes(normalizedAction.type)) { + // Save previous position before updating + if (mousePosition) { + setPreviousMousePosition(mousePosition); + } - // Set new position if percentage coordinates are valid - if (typeof startXPercent === 'number' && typeof startYPercent === 'number') { - setMousePosition({ - x: startXPercent * 100, // Convert to percentage - y: startYPercent * 100, // Convert to percentage - }); + // Set new position if percentage coordinates are valid + if (typeof startX === 'number' && typeof startY === 'number') { + setMousePosition({ + x: startX * 100, // Convert to percentage + y: startY * 100, // Convert to percentage + }); + } + } else { + console.log( + `[BrowserControlRenderer] Action type '${normalizedAction.type}' does not support coordinate display`, + ); } } }, [activeSessionId, toolCallId, toolResults]); diff --git a/multimodal/tarko/shared-utils/src/gui-agent.ts b/multimodal/tarko/shared-utils/src/gui-agent.ts new file mode 100644 index 0000000000..da2acd9b3c --- /dev/null +++ b/multimodal/tarko/shared-utils/src/gui-agent.ts @@ -0,0 +1,224 @@ +/** + * Copyright (c) 2025 Bytedance, Inc. and its affiliates. + * SPDX-License-Identifier: Apache-2.0 + */ + +import { GUIAgent, ActionInputs, PredictionParsed } from '@tarko/agent-interface'; + +/** + * Execute result interface for GUI operations + * FIXME: migrate to GUI-Agent package + */ +export interface GUIExecuteResult { + startX?: number | null; + startY?: number | null; + startXPercent?: number | null; + startYPercent?: number | null; + action_inputs: ActionInputs; +} + +/** + * Convert legacy prediction result to new GUI Agent response format + * This utility allows any Agent to easily adopt the new GUI Agent type system + * + * @param actionStr - Raw action string as received from the model + * @param parsed - Parsed prediction with action type and inputs + * @param result - Execute result with coordinates and action inputs + * @returns Standardized GUI Agent tool response + */ +export function convertToGUIResponse( + actionStr: string, + parsed: PredictionParsed, + result: GUIExecuteResult, +): GUIAgent.ToolResponse { + const normalizedAction = convertToNormalizedAction(parsed, result); + + return { + success: true, + action: actionStr, + normalizedAction, + observation: undefined, // Reserved for future implementation + }; +} + +/** + * Convert parsed prediction to normalized GUI action with percentage coordinates + * + * @param parsed - Parsed prediction with action type and inputs + * @param result - Execute result with coordinates + * @returns Normalized GUI action with strict typing + */ +export function convertToNormalizedAction( + parsed: PredictionParsed, + result: GUIExecuteResult, +): GUIAgent.Action { + const { action_type, action_inputs } = parsed; + const { startXPercent, startYPercent } = result; + + switch (action_type) { + case 'click': + case 'left_click': + case 'left_single': { + const clickAction: GUIAgent.ClickAction = { + type: 'click', + inputs: { + startX: startXPercent || 0, + startY: startYPercent || 0, + }, + }; + return clickAction; + } + + case 'double_click': + case 'left_double': { + const doubleClickAction: GUIAgent.DoubleClickAction = { + type: 'double_click', + inputs: { + startX: startXPercent || 0, + startY: startYPercent || 0, + }, + }; + return doubleClickAction; + } + + case 'right_click': + case 'right_single': { + const rightClickAction: GUIAgent.RightClickAction = { + type: 'right_click', + inputs: { + startX: startXPercent || 0, + startY: startYPercent || 0, + }, + }; + return rightClickAction; + } + + case 'drag': { + // Parse end coordinates from action_inputs.end_box + const endBox = action_inputs.end_box; + let endXPercent = 0; + let endYPercent = 0; + if (endBox) { + try { + const coords = JSON.parse(endBox); + if (Array.isArray(coords) && coords.length >= 2) { + endXPercent = coords[0]; + endYPercent = coords[1]; + } + } catch (e) { + console.warn('Failed to parse end_box coordinates:', endBox); + } + } + const dragAction: GUIAgent.DragAction = { + type: 'drag', + inputs: { + startX: startXPercent || 0, + startY: startYPercent || 0, + endX: endXPercent, + endY: endYPercent, + }, + }; + return dragAction; + } + + case 'type': { + const typeAction: GUIAgent.TypeAction = { + type: 'type', + inputs: { + content: action_inputs.content || '', + }, + }; + return typeAction; + } + + case 'hotkey': { + const hotkeyAction: GUIAgent.HotkeyAction = { + type: 'hotkey', + inputs: { + key: action_inputs.key || action_inputs.hotkey || '', + }, + }; + return hotkeyAction; + } + + case 'scroll': { + const scrollAction: GUIAgent.ScrollAction = { + type: 'scroll', + inputs: { + startX: startXPercent || 0, + startY: startYPercent || 0, + direction: (action_inputs.direction as 'up' | 'down' | 'left' | 'right') || 'down', + }, + }; + return scrollAction; + } + + case 'wait': { + const waitAction: GUIAgent.WaitAction = { + type: 'wait', + inputs: {}, + }; + return waitAction; + } + + case 'navigate': { + const navigateAction: GUIAgent.NavigateAction = { + type: 'navigate', + inputs: { + url: action_inputs.content || '', + }, + }; + return navigateAction; + } + + case 'navigate_back': { + const navigateBackAction: GUIAgent.NavigateBackAction = { + type: 'navigate_back', + inputs: {}, + }; + return navigateBackAction; + } + + default: { + // Fallback to a generic click action for unknown types + console.warn(`Unknown action type: ${action_type}, falling back to click`); + const fallbackAction: GUIAgent.ClickAction = { + type: 'click', + inputs: { + startX: startXPercent || 0, + startY: startYPercent || 0, + }, + }; + return fallbackAction; + } + } +} + +/** + * Create a default error action for failed operations + * + * @returns Default wait action for error scenarios + */ +export function createErrorAction(): GUIAgent.Action { + const errorAction: GUIAgent.WaitAction = { + type: 'wait', + inputs: {}, + }; + return errorAction; +} + +/** + * Create an error response for failed GUI operations + * + * @param actionStr - Raw action string that failed + * @param error - Error that occurred + * @returns Error response in GUI Agent format + */ +export function createGUIErrorResponse(actionStr: string, error: unknown): GUIAgent.ToolResponse { + return { + success: false, + action: actionStr, + normalizedAction: createErrorAction(), + error: error instanceof Error ? error.message : String(error), + }; +} diff --git a/multimodal/tarko/shared-utils/src/index.ts b/multimodal/tarko/shared-utils/src/index.ts index 4171dc431f..5707c3cf08 100644 --- a/multimodal/tarko/shared-utils/src/index.ts +++ b/multimodal/tarko/shared-utils/src/index.ts @@ -7,3 +7,4 @@ export * from './logger'; export * from './deepMerge'; export * from './env'; export * from './filter'; +export * from './gui-agent';