Skip to content

Commit 839dc6c

Browse files
authored
feat(browser-event): support drag event (#321)
1 parent 08a159d commit 839dc6c

File tree

13 files changed

+152
-2
lines changed

13 files changed

+152
-2
lines changed

packages/midscene/src/ai-model/ui-tars-planning.ts

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,14 @@ import {
88
} from './prompt/ui-tars-planning';
99
import { call } from './service-caller';
1010

11-
type ActionType = 'click' | 'type' | 'hotkey' | 'finished' | 'scroll' | 'wait';
11+
/**
 * Union of the action-type string literals used in this module.
 * NOTE(review): presumably mirrors the `action_type` tags of the action
 * interfaces declared further down in this file — confirm when adding members.
 */
type ActionType =
12+
| 'click'
13+
| 'drag'
14+
| 'type'
15+
| 'hotkey'
16+
| 'finished'
17+
| 'scroll'
18+
| 'wait';
1219

1320
function capitalize(str: string) {
1421
return str.charAt(0).toUpperCase() + str.slice(1);
@@ -60,6 +67,18 @@ export async function vlmPlanning(options: {
6067
},
6168
param: action.thought || '',
6269
});
70+
} else if (action.action_type === 'drag') {
71+
const startPoint = getPoint(action.action_inputs.start_box, size);
72+
const endPoint = getPoint(action.action_inputs.end_box, size);
73+
transformActions.push({
74+
type: 'Drag',
75+
param: {
76+
start_box: { x: startPoint[0], y: startPoint[1] },
77+
end_box: { x: endPoint[0], y: endPoint[1] },
78+
},
79+
locate: null,
80+
thought: action.thought || '',
81+
});
6382
} else if (action.action_type === 'type') {
6483
transformActions.push({
6584
type: 'Input',
@@ -140,6 +159,14 @@ interface ClickAction extends BaseAction {
140159
};
141160
}
142161

162+
/**
 * A drag action: tagged with `action_type: 'drag'` and carrying the start
 * and end positions as strings. `vlmPlanning` passes both strings through
 * `getPoint(..., size)` and uses elements [0] and [1] of each result as x / y.
 */
interface DragAction extends BaseAction {
163+
action_type: 'drag';
164+
action_inputs: {
165+
start_box: string; // JSON string of [x, y] coordinates
166+
end_box: string; // JSON string of [x, y] coordinates
167+
};
168+
}
169+
143170
interface WaitAction extends BaseAction {
144171
action_type: 'wait';
145172
action_inputs: {
@@ -175,6 +202,7 @@ interface FinishedAction extends BaseAction {
175202

176203
export type Action =
177204
| ClickAction
205+
| DragAction
178206
| TypeAction
179207
| HotkeyAction
180208
| ScrollAction

packages/midscene/src/types.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -221,6 +221,7 @@ export interface PlanningAction<ParamType = any> {
221221
type:
222222
| 'Locate'
223223
| 'Tap'
224+
| 'Drag'
224225
| 'Hover'
225226
| 'Input'
226227
| 'KeyboardPress'

packages/web-integration/package.json

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -107,7 +107,8 @@
107107
"test": "vitest --run",
108108
"test:u": "vitest --run -u",
109109
"test:ai": "AI_TEST_TYPE=web npm run test",
110-
"test:ai:bridge": "BRIDGE_MODE=true npm run test --inspect packages/web-integration/tests/ai/bridge/agent.test.ts",
110+
"test:ai:temp": "AI_TEST_TYPE=web vitest --run tests/ai/bridge/temp.test.ts",
111+
"test:ai:bridge": "BRIDGE_MODE=true npm run test --inspect tests/ai/bridge/agent.test.ts",
111112
"test:ai:cache": "MIDSCENE_CACHE=true AI_TEST_TYPE=web npm run test",
112113
"test:ai:all": "npm run test:ai:web && npm run test:ai:native",
113114
"test:ai:native": "MIDSCENE_CACHE=true AI_TEST_TYPE=native npm run test",

packages/web-integration/src/appium/page.ts

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,8 @@ export class Page implements AbstractPage {
6363
wheel: (deltaX: number, deltaY: number) =>
6464
this.mouseWheel(deltaX, deltaY),
6565
move: (x: number, y: number) => this.mouseMove(x, y),
66+
drag: (from: { x: number; y: number }, to: { x: number; y: number }) =>
67+
this.mouseDrag(from, to),
6668
};
6769
}
6870

@@ -249,6 +251,25 @@ export class Page implements AbstractPage {
249251
]);
250252
}
251253

254+
/**
 * Performs a mouse drag from `from` to `to` by submitting a single pointer
 * action sequence through `this.browser.performActions`.
 *
 * @param from - Coordinates where the button is pressed.
 * @param to - Coordinates where the button is released.
 * @returns Resolves once `performActions` has resolved.
 */
private async mouseDrag(
255+
from: { x: number; y: number },
256+
to: { x: number; y: number },
257+
): Promise<void> {
258+
await this.browser.performActions([
259+
{
260+
type: 'pointer',
261+
id: 'mouse',
262+
parameters: { pointerType: 'mouse' },
263+
actions: [
264+
// Jump to the start point with no travel time, then press.
{ type: 'pointerMove', duration: 0, x: from.x, y: from.y },
265+
// NOTE(review): button 0 is presumably the primary (left) button per WebDriver Actions — confirm.
{ type: 'pointerDown', button: 0 },
266+
// Move to the end point while the button is held; duration 500 is presumably milliseconds — confirm.
{ type: 'pointerMove', duration: 500, x: to.x, y: to.y },
267+
{ type: 'pointerUp', button: 0 },
268+
],
269+
},
270+
]);
271+
}
272+
252273
private async mouseWheel(
253274
deltaX: number,
254275
deltaY: number,

packages/web-integration/src/bridge-mode/agent-cli-side.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,7 @@ export const getBridgePageInCliSide = (): ChromeExtensionPageCliSide => {
6363
click: bridgeCaller(MouseEvent.Click),
6464
wheel: bridgeCaller(MouseEvent.Wheel),
6565
move: bridgeCaller(MouseEvent.Move),
66+
drag: bridgeCaller(MouseEvent.Drag),
6667
};
6768
return mouse;
6869
}

packages/web-integration/src/bridge-mode/common.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ export enum MouseEvent {
2626
Click = 'mouse.click',
2727
Wheel = 'mouse.wheel',
2828
Move = 'mouse.move',
29+
Drag = 'mouse.drag',
2930
}
3031

3132
export enum KeyboardEvent {

packages/web-integration/src/bridge-mode/page-browser-side.ts

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,9 @@ export class ChromeExtensionPageBrowserSide extends ChromeExtensionProxyPage {
5555

5656
// Dispatch `mouse.<action>` bridge calls to the matching method on `this.mouse`.
if (method.startsWith(MouseEvent.PREFIX)) {
5757
// The segment after the first '.' names the mouse method to invoke.
const actionName = method.split('.')[1] as keyof MouseAction;
58+
// NOTE(review): this branch returns exactly the same expression as the
// fall-through return below, so it does not change runtime behavior.
// Presumably it exists only so TypeScript narrows `actionName` to 'drag'
// (whose parameters differ from the other mouse methods) — confirm before
// removing it.
if (actionName === 'drag') {
59+
return this.mouse[actionName].apply(this.mouse, args as any);
60+
}
5861
return this.mouse[actionName].apply(this.mouse, args as any);
5962
}
6063

packages/web-integration/src/chrome-extension/page.ts

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -425,6 +425,27 @@ export default class ChromeExtensionProxyPage implements AbstractPage {
425425
y,
426426
});
427427
},
428+
/**
 * Drags with the left mouse button from `from` to `to`.
 * Sequence: move to `from`, dispatch `mousePressed` there, move to `to`,
 * dispatch `mouseReleased` there. Both button events are sent as
 * `Input.dispatchMouseEvent` through `sendCommandToDebugger`.
 *
 * @param from - Coordinates where the button is pressed.
 * @param to - Coordinates where the button is released.
 */
drag: async (
429+
from: { x: number; y: number },
430+
to: { x: number; y: number },
431+
) => {
432+
// Position the pointer on the start point before pressing.
await this.mouse.move(from.x, from.y);
433+
await this.sendCommandToDebugger('Input.dispatchMouseEvent', {
434+
type: 'mousePressed',
435+
x: from.x,
436+
y: from.y,
437+
button: 'left',
438+
clickCount: 1,
439+
});
440+
// NOTE(review): whether this move is reported with the left button held
// depends on `this.mouse.move`, which is not visible here — confirm that
// drag targets receive the move as part of a drag.
await this.mouse.move(to.x, to.y);
441+
await this.sendCommandToDebugger('Input.dispatchMouseEvent', {
442+
type: 'mouseReleased',
443+
x: to.x,
444+
y: to.y,
445+
button: 'left',
446+
clickCount: 1,
447+
});
448+
},
428449
};
429450

430451
keyboard = {

packages/web-integration/src/common/tasks.ts

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -311,6 +311,25 @@ export class PageTaskExecutor {
311311
},
312312
};
313313
tasks.push(taskActionTap);
314+
} else if (plan.type === 'Drag') {
315+
// Execution task for a 'Drag' plan: copies `param`, `thought` and `locate`
// from the plan, and at execution time forwards the two points to
// `this.page.mouse.drag`.
const taskActionDrag: ExecutionTaskActionApply<{
316+
start_box: { x: number; y: number };
317+
end_box: { x: number; y: number };
318+
}> = {
319+
type: 'Action',
320+
subType: 'Drag',
321+
param: plan.param,
322+
thought: plan.thought,
323+
locate: plan.locate,
324+
executor: async (taskParam) => {
325+
// Fail before touching the page if either endpoint is missing.
assert(
326+
taskParam?.start_box && taskParam?.end_box,
327+
'No start_box or end_box to drag',
328+
);
329+
await this.page.mouse.drag(taskParam.start_box, taskParam.end_box);
330+
},
331+
};
332+
tasks.push(taskActionDrag);
314333
} else if (plan.type === 'Hover') {
315334
const taskActionHover: ExecutionTaskActionApply<PlanningActionParamHover> =
316335
{

packages/web-integration/src/page.ts

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,10 @@ export interface MouseAction {
1313
) => Promise<void>;
1414
wheel: (deltaX: number, deltaY: number) => Promise<void>;
1515
move: (x: number, y: number) => Promise<void>;
16+
drag: (
17+
from: { x: number; y: number },
18+
to: { x: number; y: number },
19+
) => Promise<void>;
1620
}
1721

1822
export interface KeyboardAction {
@@ -36,6 +40,10 @@ export abstract class AbstractPage {
3640
) => {},
3741
wheel: async (deltaX: number, deltaY: number) => {},
3842
move: async (x: number, y: number) => {},
43+
drag: async (
44+
from: { x: number; y: number },
45+
to: { x: number; y: number },
46+
) => {},
3947
};
4048
}
4149

0 commit comments

Comments
 (0)