Skip to content

Commit b8e3d68

Browse files
authored
feat(omni-agent): compress images for gui screenshot (#1647)
1 parent 37babcc commit b8e3d68

4 files changed

Lines changed: 40 additions & 24 deletions

File tree

multimodal/omni-tars/gui-agent/package.json

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -38,6 +38,7 @@
3838
"@gui-agent/operator-browser": "workspace:*",
3939
"@gui-agent/operator-aio": "workspace:*",
4040
"@gui-agent/action-parser": "workspace:*",
41+
"@tarko/shared-media-utils": "workspace:*",
4142
"lodash.isnumber": "3.0.3"
4243
},
4344
"devDependencies": {
@@ -48,4 +49,4 @@
4849
"openai": "4.93.0",
4950
"@types/lodash.isnumber": "3.0.3"
5051
}
51-
}
52+
}

multimodal/omni-tars/gui-agent/src/GuiAgentPlugin.ts

Lines changed: 23 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -3,19 +3,10 @@
33
* SPDX-License-Identifier: Apache-2.0
44
*/
55
import { AgentPlugin, COMPUTER_USE_ENVIRONMENT } from '@omni-tars/core';
6-
import {
7-
Tool,
8-
LLMRequestHookPayload,
9-
LLMResponseHookPayload,
10-
AgentEventStream,
11-
ChatCompletionContentPart,
12-
} from '@tarko/agent';
13-
import {
14-
GUIExecuteResult,
15-
convertToGUIResponse,
16-
createGUIErrorResponse,
17-
} from '@tarko/shared-utils';
6+
import { Tool, LLMRequestHookPayload, ChatCompletionContentPart } from '@tarko/agent';
7+
import { createGUIErrorResponse } from '@tarko/shared-utils';
188
import { Base64ImageParser } from '@agent-infra/media-utils';
9+
import { ImageCompressor, formatBytes } from '@tarko/shared-media-utils';
1910
import { setScreenInfo } from './shared';
2011
import { OperatorManager } from './OperatorManager';
2112
import { BrowserOperator } from '@gui-agent/operator-browser';
@@ -96,22 +87,35 @@ export class GuiAgentPlugin extends AgentPlugin {
9687

9788
const operator = await this.operatorManager.getInstance();
9889
const output = await operator?.doScreenshot();
99-
if (!output) {
90+
if (!output?.base64) {
10091
this.agent.logger.error('Failed to get screenshot');
10192
return;
10293
}
10394
const base64Tool = new Base64ImageParser(output.base64);
104-
const base64Uri = base64Tool.getDataUri();
105-
if (!base64Uri) {
106-
this.agent.logger.error('Failed to get base64 image uri');
107-
return;
108-
}
95+
const originalBuffer = Buffer.from(output.base64, 'base64');
96+
const originalSize = originalBuffer.byteLength;
97+
98+
// Create image compressor with WebP format and 80% quality
99+
const compressor = new ImageCompressor({
100+
quality: 80,
101+
format: 'webp',
102+
});
103+
const compressedBuffer = await compressor.compressToBuffer(originalBuffer);
104+
const compressedBase64 = `data:image/webp;base64,${compressedBuffer.toString('base64')}`;
105+
const compressedSize = compressedBuffer.byteLength;
106+
const compressionRatio = (((originalSize - compressedSize) / originalSize) * 100).toFixed(2);
107+
108+
this.agent.logger.debug(`compression stat: `, {
109+
originalSize: formatBytes(originalSize),
110+
compressedSize: formatBytes(compressedSize),
111+
compressionRatio: `${compressionRatio}% reduction`,
112+
});
109113

110114
const content: ChatCompletionContentPart[] = [
111115
{
112116
type: 'image_url',
113117
image_url: {
114-
url: base64Uri,
118+
url: compressedBase64,
115119
},
116120
},
117121
];

multimodal/omni-tars/omni-agent/tarko.config.ts

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,4 +49,16 @@ export default {
4949
},
5050
},
5151
logLevel: LogLevel.DEBUG,
52+
webui: {
53+
logo: 'https://lf3-static.bytednsdoc.com/obj/eden-cn/zyha-aulnh/ljhwZthlaukjlkulzlp/appicon.png',
54+
subtitle: 'Offering seamless integration with a wide range of real-world tools.',
55+
welcomTitle: 'Omni Agent',
56+
welcomePrompts: [
57+
'Search for the latest GUI Agent papers',
58+
'Find information about UI TARS',
59+
'Tell me the top 5 most popular projects on ProductHunt today',
60+
'Please book me the earliest flight from Hangzhou to Shenzhen on 10.1',
61+
'What is Agent TARS',
62+
],
63+
},
5264
} as AgentAppConfig;

multimodal/pnpm-lock.yaml

Lines changed: 3 additions & 4 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)