Skip to content

Commit 36eb8d1

Browse files
authored
Merge branch 'main' into patch-1
2 parents 7f897fd + e71fce1 commit 36eb8d1

133 files changed

Lines changed: 3099 additions & 6267 deletions

File tree

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.
Lines changed: 71 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,71 @@
1+
/*
2+
* Quick real-environment runner for AgentTARS
3+
* - Initializes an AgentTARS instance and prints its available tools
4+
* - Runs a fixed list of browser tasks one after another, logging each response or failure
5+
* - Cleans up the agent's resources and exits the process
6+
*/
7+
import { AgentTARS } from '../src';
8+
9+
async function main() {
10+
const localAgent = new AgentTARS({
11+
model: {
12+
provider: 'volcengine',
13+
id: 'ep-20250510145437-5sxhs',
14+
apiKey: process.env.ARK_API_KEY,
15+
displayName: 'doubao-1.5-thinking-vision-pro',
16+
},
17+
toolCallEngine: 'structured_outputs',
18+
});
19+
await localAgent.initialize();
20+
const tools = localAgent.getTools();
21+
console.log('\n📋 Available Tools:');
22+
console.log('─'.repeat(80));
23+
tools.forEach((tool, index) => {
24+
const num = (index + 1).toString().padStart(2, ' ');
25+
const name = tool.name.padEnd(30, ' ');
26+
const desc = (tool.description || 'No description').substring(0, 45).replace(/\n/g, ' ');
27+
console.log(`${num}. ${name}${desc}`);
28+
});
29+
console.log('─'.repeat(80));
30+
console.log(`Total: ${tools.length} tools\n`);
31+
32+
// Test tasks to run
33+
const tasks = [
34+
'Open https://seed-tars.com',
35+
'Use gui to go to https://www.producthunt.com/, search the top products for "AI", from the results, identify the top-listed product (the top 3 result). Collect the following information from that product\'s card: 1. Product name 2. Short description 3. Number of upvotes summarize it and report to me.',
36+
'Use gui action, go to https://sample-files.com/documents/pdf/, find the 65KB pdf file, preview it, scroll the file from top to bottom.',
37+
];
38+
39+
// Execute tasks iteratively
40+
for (let i = 0; i < tasks.length; i++) {
41+
const task = tasks[i];
42+
console.log(`\n🚀 Executing Task ${i + 1}:`);
43+
console.log(`📝 ${task}`);
44+
console.log('─'.repeat(80));
45+
46+
try {
47+
const response = await localAgent.run(task);
48+
console.log(`✅ Task ${i + 1} Response:`, response);
49+
} catch (error) {
50+
console.error(`❌ Task ${i + 1} Failed:`, error);
51+
}
52+
53+
console.log('─'.repeat(80));
54+
}
55+
console.log('🎉 All tasks completed. Exiting...');
56+
57+
// Clean up resources and exit
58+
console.log('\n🧹 Cleaning up resources...');
59+
try {
60+
await localAgent.cleanup();
61+
console.log('✅ Cleanup completed');
62+
} catch (error) {
63+
console.error('❌ Cleanup failed:', error);
64+
}
65+
process.exit(0);
66+
}
67+
68+
main().catch((err) => {
69+
console.error('Runner failed:', err);
70+
process.exit(1);
71+
});

multimodal/agent-tars/core/package.json

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
"prepublishOnly": "pnpm run build",
1818
"test": "vitest run",
1919
"test:watch": "vitest",
20-
"agent:snapshot:genreate": "npx tsx snapshot/runner.ts generate all",
20+
"agent:snapshot:generate": "npx tsx snapshot/runner.ts generate all",
2121
"agent:snapshot:test": "npx vitest snapshot/index.test.ts",
2222
"benchmark:crawl": "cd benchmark/crawl && pnpm start"
2323
},
@@ -29,6 +29,8 @@
2929
},
3030
"devDependencies": {
3131
"@gui-agent/operator-browser": "workspace:*",
32+
"@gui-agent/action-parser": "workspace:*",
33+
"@gui-agent/shared": "workspace:*",
3234
"@agent-infra/mcp-server-browser": "1.1.10",
3335
"@agent-infra/mcp-server-commands": "1.1.10",
3436
"@agent-infra/mcp-server-filesystem": "1.1.10",

multimodal/agent-tars/core/src/environments/local/browser/browser-gui-agent.ts

Lines changed: 40 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,12 @@ import { BrowserOperator } from '@gui-agent/operator-browser';
99
import { ConsoleLogger, AgentEventStream, Tool, z } from '@tarko/mcp-agent';
1010
import { ImageCompressor, formatBytes } from '@tarko/shared-media-utils';
1111
import { ActionInputs, PredictionParsed } from '@agent-tars/interface';
12+
import { ActionParserHelper } from '@gui-agent/action-parser';
13+
import { Coordinates, NormalizeCoordinates } from '@gui-agent/shared/types';
14+
import { normalizeActionCoords } from '@gui-agent/shared/utils';
1215
import {
1316
convertToGUIResponse,
17+
convertToAgentUIAction,
1418
createGUIErrorResponse,
1519
GUIExecuteResult,
1620
} from '@tarko/shared-utils';
@@ -37,6 +41,22 @@ export interface GUIAgentOptions {
3741
eventStream?: AgentEventStream.Processor;
3842
}
3943

44+
const actionParserHelper = new ActionParserHelper();
45+
46+
const defaultNormalizeCoords: NormalizeCoordinates = (rawCoords: Coordinates) => {
47+
if (!rawCoords.raw) {
48+
return { normalized: rawCoords };
49+
}
50+
const normalizedCoords = {
51+
...rawCoords,
52+
normalized: {
53+
x: rawCoords.raw.x / 1000,
54+
y: rawCoords.raw.y / 1000,
55+
},
56+
};
57+
return { normalized: normalizedCoords };
58+
};
59+
4060
/**
4161
* Browser GUI Agent for visual browser automation
4262
*/
@@ -90,7 +110,7 @@ scroll(point='<point>x1 y1</point>', direction='down or up or right or left') -
90110
wait() - Wait 5 seconds and take a screenshot to check for changes
91111
92112
## Note
93-
- Folow user lanuage in in \`thought\` part.
113+
- Follow user language in in \`thought\` part.
94114
- Describe your thought in \`step\` part.
95115
- Describe your action in \`Step\` part.
96116
- Extract the data your see in \`pageData\` part.
@@ -109,30 +129,39 @@ wait() - Wait 5 seconds and take a scree
109129
}),
110130
function: async ({ thought, step, action }) => {
111131
try {
112-
const parsed = this.parseAction(action);
113-
parsed.thought = thought;
132+
const parsedAction = actionParserHelper.parseActionCallString(action);
133+
if (!parsedAction) {
134+
return createGUIErrorResponse(action, 'Invalid action format');
135+
}
136+
const normalizedCoordsAction = normalizeActionCoords(
137+
parsedAction,
138+
defaultNormalizeCoords,
139+
);
114140

115141
this.logger.debug({
116142
thought,
117143
step,
118144
action,
119-
parsedAction: JSON.stringify(parsed, null, 2),
145+
normalizedCoordsAction: JSON.stringify(normalizedCoordsAction, null, 2),
120146
screenDimensions: {
121147
width: this.screenWidth,
122148
height: this.screenHeight,
123149
},
124150
});
125151

126-
const operatorResult: GUIExecuteResult = await this.browserOperator.execute({
127-
parsedPrediction: parsed,
128-
screenWidth: this.screenWidth || 1920,
129-
screenHeight: this.screenHeight || 1080,
152+
const operatorResult = await this.browserOperator.doExecute({
153+
actions: [normalizedCoordsAction],
130154
});
155+
this.logger.debug('Browser action completed', operatorResult);
131156

132157
await sleep(500);
133158

134-
const guiResponse = convertToGUIResponse(action, parsed, operatorResult);
135-
return guiResponse;
159+
return {
160+
success: true,
161+
action: action,
162+
normalizedAction: convertToAgentUIAction(normalizedCoordsAction),
163+
observation: undefined, // Reserved for future implementation
164+
};
136165
} catch (error) {
137166
this.logger.error(
138167
`Browser action failed: ${error instanceof Error ? error.message : String(error)}`,
@@ -164,7 +193,7 @@ wait() - Wait 5 seconds and take a scree
164193
// Record screenshot start time
165194
const startTime = performance.now();
166195

167-
const output = await this.browserOperator.screenshot();
196+
const output = await this.browserOperator.doScreenshot();
168197

169198
// Calculate screenshot time
170199
const endTime = performance.now();

multimodal/agent-tars/core/src/environments/local/browser/browser-manager.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,7 @@ export class BrowserManager {
8080
this.logger.info('🌐 Launching browser instance...');
8181
const browser = this.getBrowser();
8282
await browser.launch(options);
83-
// FIXME: Create new page here to avoid the mcp server browser createing
83+
// FIXME: Create new page here to avoid the mcp server browser creating
8484
// another browser instance, we need a better solution here.
8585
// const openingPage = await browser.createPage();
8686
// await openingPage.goto('about:blank', {

multimodal/agent-tars/core/src/webui-config.ts

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -146,4 +146,7 @@ export const AGENT_TARS_WEBUI_CONFIG: AgentWebUIImplementation = {
146146
defaultLayout: 'narrow-chat',
147147
enableLayoutSwitchButton: true,
148148
},
149+
debug: {
150+
enableEventStreamViewer: true,
151+
},
149152
};

multimodal/agent-tars/core/tests/__snapshots__/browser_tools_hybrid.snap

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ scroll(point='<point>x1 y1</point>', direction='down or up or right or left') -
1515
wait() - Wait 5 seconds and take a screenshot to check for changes
1616

1717
## Note
18-
- Folow user lanuage in in `thought` part.
18+
- Follow user language in in `thought` part.
1919
- Describe your thought in `step` part.
2020
- Describe your action in `Step` part.
2121
- Extract the data your see in `pageData` part.

multimodal/agent-tars/core/tests/__snapshots__/browser_tools_visual-grounding.snap

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ scroll(point='<point>x1 y1</point>', direction='down or up or right or left') -
1515
wait() - Wait 5 seconds and take a screenshot to check for changes
1616

1717
## Note
18-
- Folow user lanuage in in `thought` part.
18+
- Follow user language in in `thought` part.
1919
- Describe your thought in `step` part.
2020
- Describe your action in `Step` part.
2121
- Extract the data your see in `pageData` part.
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1 +1,2 @@
11
snapshots
2+
snapshot

multimodal/gui-agent/agent-sdk/examples/configs/browser-ve-15vp.config.ts

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,14 +6,15 @@ import 'dotenv/config';
66
import path from 'path';
77

88
import { defineConfig } from '@tarko/agent-cli';
9-
import { browserOperator } from './operators';
9+
import { browserOperator, remoteBrowserOperator } from './operators';
1010
import { doubao_1_5_vp } from './models';
11-
import { systemPromptTemplate1 } from './promptTemps';
11+
import { systemPromptTemplate2 } from './promptTemps';
1212

1313
export default defineConfig({
14-
operator: browserOperator,
14+
// operator: browserOperator,
15+
operator: remoteBrowserOperator,
1516
model: doubao_1_5_vp,
16-
systemPrompt: systemPromptTemplate1,
17+
systemPrompt: systemPromptTemplate2,
1718
snapshot: {
1819
enable: true,
1920
storageDirectory: path.join(__dirname, '../snapshots/browser-ve-15vp'),

multimodal/gui-agent/agent-sdk/examples/configs/operators.ts

Lines changed: 18 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -4,35 +4,27 @@
44
*/
55
import { AdbOperator } from '@gui-agent/operator-adb';
66
import { NutJSOperator } from '@gui-agent/operator-nutjs';
7-
import { Operator, ScreenContext } from '@gui-agent/shared/base';
87
import {
9-
SupportedActionType,
10-
ScreenshotOutput,
11-
ExecuteParams,
12-
ExecuteOutput,
13-
} from 'gui-agent/shared/src/types';
8+
LocalBrowserOperator,
9+
RemoteBrowserOperator,
10+
SearchEngine,
11+
} from '@gui-agent/operator-browser';
1412

1513
const computerOperator = new NutJSOperator();
1614
const androidOperator = new AdbOperator();
15+
const browserOperator = new LocalBrowserOperator({
16+
searchEngine: SearchEngine.GOOGLE,
17+
showActionInfo: false,
18+
showWaterFlow: false,
19+
highlightClickableElements: false,
20+
});
1721

18-
class MockedBrowserOperator extends Operator {
19-
protected initialize(): Promise<void> {
20-
throw new Error('Method not implemented.');
21-
}
22-
protected supportedActions(): Array<SupportedActionType> {
23-
throw new Error('Method not implemented.');
24-
}
25-
protected screenContext(): ScreenContext {
26-
throw new Error('Method not implemented.');
27-
}
28-
protected screenshot(): Promise<ScreenshotOutput> {
29-
throw new Error('Method not implemented.');
30-
}
31-
protected execute(params: ExecuteParams): Promise<ExecuteOutput> {
32-
throw new Error('Method not implemented.');
33-
}
34-
}
22+
const remoteBrowserOperator = new RemoteBrowserOperator({
23+
wsEndpoint: 'ws://localhost:9222/devtools/browser/<id>',
24+
searchEngine: SearchEngine.GOOGLE,
25+
showActionInfo: true,
26+
showWaterFlow: true,
27+
highlightClickableElements: true,
28+
});
3529

36-
const browserOperator = new MockedBrowserOperator();
37-
38-
export { computerOperator, androidOperator, browserOperator };
30+
export { computerOperator, androidOperator, browserOperator, remoteBrowserOperator };

0 commit comments

Comments
 (0)