Skip to content

Commit b1d1579

Browse files
committed
feat(gui-agent): add configurable image detail calculator
1 parent 06258c3 commit b1d1579

3 files changed

Lines changed: 51 additions & 1 deletion

File tree

multimodal/gui-agent/agent-sdk/src/GUIAgent.ts

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,11 @@ import { GUIAgentToolCallEngine } from './ToolCallEngine';
1313
import { SYSTEM_PROMPT } from './prompts';
1414
import { Base64ImageParser } from '@agent-infra/media-utils';
1515
import { Operator, BaseGUIAgent } from '@gui-agent/shared/base';
16-
import { GUIAgentConfig, NormalizeCoordinates } from '@gui-agent/shared/types';
16+
import {
17+
GUIAgentConfig,
18+
NormalizeCoordinates,
19+
ImageDetailCalculator,
20+
} from '@gui-agent/shared/types';
1721
import {
1822
assembleSystemPrompt,
1923
isSystemPromptTemplate,
@@ -23,6 +27,7 @@ import {
2327
} from '@gui-agent/shared/utils';
2428
import { GUI_ADAPTED_TOOL_NAME } from './constants';
2529
import { convertToAgentUIAction, createGUIErrorResponse } from './utils';
30+
import { defaultDetailCalculator } from './defaultImpls';
2631

2732
const defaultLogger = new ConsoleLogger('[GUIAgent]', LogLevel.DEBUG);
2833

@@ -31,6 +36,7 @@ export class GUIAgent<T extends Operator> extends BaseGUIAgent {
3136

3237
private operator: Operator | undefined;
3338
private normalizeCoordinates: NormalizeCoordinates;
39+
private detailCalculator: ImageDetailCalculator;
3440
private loopIntervalInMs: number;
3541

3642
constructor(config: GUIAgentConfig<T>) {
@@ -40,6 +46,7 @@ export class GUIAgent<T extends Operator> extends BaseGUIAgent {
4046
systemPrompt,
4147
customeActionParser,
4248
normalizeCoordinates,
49+
detailCalculator,
4350
maxLoopCount,
4451
loopIntervalInMs = 500,
4552
} = config;
@@ -69,6 +76,8 @@ export class GUIAgent<T extends Operator> extends BaseGUIAgent {
6976
});
7077
this.operator = operator;
7178
this.normalizeCoordinates = normalizeCoordinates ?? defaultNormalizeCoords;
79+
// Fall back to the built-in detail calculator when the config does not provide one
80+
this.detailCalculator = detailCalculator ?? defaultDetailCalculator;
7281
this.loopIntervalInMs = loopIntervalInMs;
7382
this.logger = this.logger.spawn('[GUIAgent]');
7483
}
@@ -163,11 +172,17 @@ export class GUIAgent<T extends Operator> extends BaseGUIAgent {
163172
return;
164173
}
165174

175+
const { width: imageWidth, height: imageHeight } = base64Tool.getDimensions() || {
176+
width: -1,
177+
height: -1,
178+
};
179+
166180
const content: ChatCompletionContentPart[] = [
167181
{
168182
type: 'image_url',
169183
image_url: {
170184
url: base64Uri,
185+
detail: this.detailCalculator(imageWidth, imageHeight),
171186
},
172187
},
173188
];
Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
/*
2+
* Copyright (c) 2025 Bytedance, Inc. and its affiliates.
3+
* SPDX-License-Identifier: Apache-2.0
4+
*/
5+
import { ImageDetailCalculator } from '@gui-agent/shared/types';
6+
7+
/**
 * Default implementation of the image detail calculation, based on pixel count.
 *
 * - up to 1,048,576 px (1024×1024): `'low'`
 * - up to 4,014,080 px (2048×1960): `'high'`
 * - anything larger: `'auto'`
 * - unknown dimensions (non-finite, zero or negative): `'auto'`
 *
 * @param width - Image width in pixels.
 * @param height - Image height in pixels.
 * @returns The detail level to attach to the image content part.
 */
export const defaultDetailCalculator: ImageDetailCalculator = (
  width: number,
  height: number,
): 'low' | 'high' | 'auto' => {
  const LOW_DETAIL_THRESHOLD = 1024 * 1024; // 1,048,576 px
  const HIGH_DETAIL_THRESHOLD = 2048 * 1960; // 4,014,080 px

  // Callers may pass a sentinel such as -1 when the image size could not be read.
  // Multiplying two negative sentinels yields a small positive "pixel count" that
  // would be misclassified as 'low', so reject invalid dimensions before multiplying.
  const hasValidDimensions =
    Number.isFinite(width) && Number.isFinite(height) && width > 0 && height > 0;
  if (!hasValidDimensions) {
    return 'auto';
  }

  const pixelCount = width * height;
  if (pixelCount <= LOW_DETAIL_THRESHOLD) {
    return 'low';
  }
  if (pixelCount <= HIGH_DETAIL_THRESHOLD) {
    return 'high';
  }
  // Larger than the high-detail threshold: defer the choice with 'auto'.
  return 'auto';
};

multimodal/gui-agent/shared/src/types/agents.ts

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,11 @@ export type ExecuteOutput = {
6464
url?: string; // url of the page
6565
} & Record<string, any>;
6666

67+
/**
68+
* Function type that maps an image's width and height to the `detail` level ('low' | 'high' | 'auto') sent with that image
69+
*/
70+
export type ImageDetailCalculator = (width: number, height: number) => 'low' | 'high' | 'auto';
71+
6772
export interface ScreenshotOutput extends ExecuteOutput {
6873
/** screenshot base64, `keep screenshot size as physical pixels` */
6974
base64: string;
@@ -113,6 +118,8 @@ export interface GUIAgentConfig<TOperator> extends AgentOptions {
113118
customeActionParser?: CustomActionParser;
114119
/** The function to normalize raw coordinates */
115120
normalizeCoordinates?: NormalizeCoordinates;
121+
/** The function to calculate detail level based on image dimensions */
122+
detailCalculator?: ImageDetailCalculator;
116123
/** Maximum number of turns for Agent to execute, @default 1000 */
117124
maxLoopCount?: number;
118125
/** Time interval between two loop iterations (in milliseconds), @default 500 */

0 commit comments

Comments
 (0)