Skip to content

Commit 1b2ad9a

Browse files
committed
feat(tarko): switch gui agent to percentage coordinates
Remove devicePixelRatio from screenshot metadata and return percentage coordinates instead of absolute coordinates from browser operator. Simplify BrowserControlRenderer to use percentage coordinates directly.

- Remove devicePixelRatio metadata from environment_input events
- Calculate percentage coordinates in browser operator execute method
- Update BrowserControlRenderer to use percentage positioning
- Remove unused getDevicePixelRatio method
1 parent 97ef7ad commit 1b2ad9a

3 files changed

Lines changed: 27 additions & 61 deletions

File tree

multimodal/agent-tars/core/src/browser/browser-gui-agent.ts

Lines changed: 8 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -196,9 +196,8 @@ wait() - Wait 5 seconds and take a scree
196196
// Get page title
197197
const title = document.title || 'Untitled Page';
198198

199-
// @ts-expect-error
200199
// Get visible text content
201-
const getVisibleText = (node) => {
200+
const getVisibleText = (node: any) => {
202201
if (node.nodeType === Node.TEXT_NODE) {
203202
return node.textContent || '';
204203
}
@@ -214,13 +213,10 @@ wait() - Wait 5 seconds and take a scree
214213

215214
let text = '';
216215
for (const child of Array.from(node.childNodes)) {
217-
// @ts-expect-error
218-
if (child.nodeType === Node.ELEMENT_NODE) {
216+
if ((child as any).nodeType === Node.ELEMENT_NODE) {
219217
text += getVisibleText(child);
220-
// @ts-expect-error
221-
} else if (child.nodeType === Node.TEXT_NODE) {
222-
// @ts-expect-error
223-
text += child.textContent || '';
218+
} else if ((child as any).nodeType === Node.TEXT_NODE) {
219+
text += (child as any).textContent || '';
224220
}
225221
}
226222

@@ -247,7 +243,7 @@ wait() - Wait 5 seconds and take a scree
247243
description: 'Page Content After Browser Action',
248244
metadata: {
249245
type: 'text',
250-
},
246+
} as AgentEventStream.TextMetadata,
251247
});
252248

253249
// Send the event
@@ -344,7 +340,7 @@ wait() - Wait 5 seconds and take a scree
344340
description: 'Browser Screenshot',
345341
metadata: {
346342
type: 'screenshot',
347-
},
343+
} as AgentEventStream.ScreenshotMetadata,
348344
});
349345

350346
return eventStream.sendEvent(event);
@@ -405,8 +401,7 @@ wait() - Wait 5 seconds and take a scree
405401
description: 'Browser Screenshot',
406402
metadata: {
407403
type: 'screenshot',
408-
devicePixelRatio: await this.getDevicePixelRatio(),
409-
},
404+
} as AgentEventStream.ScreenshotMetadata,
410405
});
411406

412407
eventStream.sendEvent(event);
@@ -603,19 +598,7 @@ wait() - Wait 5 seconds and take a scree
603598
}
604599
}
605600

606-
/**
607-
* Get the device pixel ratio from the browser page
608-
*/
609-
private async getDevicePixelRatio(): Promise<number> {
610-
try {
611-
const page = await this.getPage();
612-
const devicePixelRatio = await page.evaluate(() => window.devicePixelRatio);
613-
return devicePixelRatio || 1;
614-
} catch (error) {
615-
this.logger.warn('Failed to get device pixel ratio, defaulting to 1:', error);
616-
return 1;
617-
}
618-
}
601+
619602

620603
/**
621604
* Get access to the underlying Puppeteer page

multimodal/gui-agent/operator-browser/src/browser-operator.ts

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -297,13 +297,16 @@ export class BrowserOperator extends Operator {
297297
throw error;
298298
}
299299

300+
// Calculate percentage coordinates for GUI Agent
301+
const startXPercent = startX ? (startX * deviceScaleFactor) / screenWidth : null;
302+
const startYPercent = startY ? (startY * deviceScaleFactor) / screenHeight : null;
303+
300304
return {
301-
// Hand it over to the upper layer to avoid redundancy
302-
// @ts-expect-error fix type later
303-
startX,
304-
startY,
305+
// Return percentage coordinates instead of absolute coordinates
306+
startX: startXPercent,
307+
startY: startYPercent,
305308
action_inputs,
306-
};
309+
} as any;
307310
}
308311

309312
private async handleClick(x: number, y: number) {

multimodal/tarko/agent-web-ui/src/standalone/workspace/renderers/BrowserControlRenderer.tsx

Lines changed: 11 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -30,13 +30,11 @@ export const BrowserControlRenderer: React.FC<BrowserControlRendererProps> = ({
3030
}) => {
3131
const { activeSessionId, messages, toolResults, replayState } = useSession();
3232
const [relatedImage, setRelatedImage] = useState<string | null>(null);
33-
const [imageSize, setImageSize] = useState<{ width: number; height: number } | null>(null);
3433
const [mousePosition, setMousePosition] = useState<{ x: number; y: number } | null>(null);
3534
const [previousMousePosition, setPreviousMousePosition] = useState<{
3635
x: number;
3736
y: number;
3837
} | null>(null);
39-
const [devicePixelRatio, setDevicePixelRatio] = useState<number>(window.devicePixelRatio);
4038
const imageRef = useRef<HTMLImageElement>(null);
4139

4240
// Extract the visual operation details from panelContent
@@ -64,11 +62,11 @@ export const BrowserControlRenderer: React.FC<BrowserControlRendererProps> = ({
6462
setPreviousMousePosition(mousePosition);
6563
}
6664

67-
// Set new position if coordinates are valid
65+
// Set new position if coordinates are valid (now as percentages)
6866
if (typeof startX === 'number' && typeof startY === 'number') {
6967
setMousePosition({
70-
x: startX,
71-
y: startY,
68+
x: startX * 100, // Convert to percentage
69+
y: startY * 100, // Convert to percentage
7270
});
7371
}
7472
}
@@ -114,15 +112,6 @@ export const BrowserControlRenderer: React.FC<BrowserControlRendererProps> = ({
114112
if (imgContent && 'image_url' in imgContent && imgContent.image_url.url) {
115113
setRelatedImage(imgContent.image_url.url);
116114
foundImage = true;
117-
118-
// Extract devicePixelRatio from environment input metadata if available
119-
if (
120-
msg.metadata &&
121-
AgentEventStream.isScreenshotMetadata(msg.metadata) &&
122-
msg.metadata.devicePixelRatio
123-
) {
124-
setDevicePixelRatio(msg.metadata.devicePixelRatio);
125-
}
126115
break;
127116
}
128117
}
@@ -137,15 +126,7 @@ export const BrowserControlRenderer: React.FC<BrowserControlRendererProps> = ({
137126
}
138127
}, [activeSessionId, messages, toolCallId, environmentImage]);
139128

140-
// Handler to get image dimensions when loaded
141-
const handleImageLoad = () => {
142-
if (imageRef.current) {
143-
setImageSize({
144-
width: imageRef.current.naturalWidth,
145-
height: imageRef.current.naturalHeight,
146-
});
147-
}
148-
};
129+
149130

150131
return (
151132
<div className="space-y-6">
@@ -159,27 +140,26 @@ export const BrowserControlRenderer: React.FC<BrowserControlRendererProps> = ({
159140
src={relatedImage}
160141
alt="Browser Screenshot"
161142
className="w-full h-auto object-contain max-h-[70vh]"
162-
onLoad={handleImageLoad}
163143
/>
164144

165145
{/* Enhanced mouse cursor overlay */}
166-
{mousePosition && imageSize && (
146+
{mousePosition && (
167147
<motion.div
168148
className="absolute pointer-events-none"
169149
initial={
170150
previousMousePosition
171151
? {
172-
left: `${(previousMousePosition.x / imageSize.width) * 100 * devicePixelRatio}%`,
173-
top: `${(previousMousePosition.y / imageSize.height) * 100 * devicePixelRatio}%`,
152+
left: `${previousMousePosition.x}%`,
153+
top: `${previousMousePosition.y}%`,
174154
}
175155
: {
176-
left: `${(mousePosition.x / imageSize.width) * 100 * devicePixelRatio}%`,
177-
top: `${(mousePosition.y / imageSize.height) * 100 * devicePixelRatio}%`,
156+
left: `${mousePosition.x}%`,
157+
top: `${mousePosition.y}%`,
178158
}
179159
}
180160
animate={{
181-
left: `${(mousePosition.x / imageSize.width) * 100 * devicePixelRatio}%`,
182-
top: `${(mousePosition.y / imageSize.height) * 100 * devicePixelRatio}%`,
161+
left: `${mousePosition.x}%`,
162+
top: `${mousePosition.y}%`,
183163
}}
184164
transition={{ duration: 0.5, ease: [0.16, 1, 0.3, 1] }}
185165
style={{

0 commit comments

Comments
 (0)