Skip to content

Commit 9d5f2fb

Browse files
yuyutaotao and zhoushaw authored
feat(web-extract): extract web content as a tree (#337)
* feat: extract web content as a tree
* chore: update test data
* chore: update test data
* feat: update answer of evaluation
* chore: update test cases
* chore: remove focusing on cases
* fix: ci
* fix: put rect in html tree
* fix: CI
* fix: AI test
* fix: lint
* fix: CI
* fix: static-page compatibility
* fix: CI
* fix: map by markerId
* fix: llm planning prompt
* chore: update hash length
* chore: ignore writing dump file
* fix: lint
* fix: ci snapshot
* chore: snapshot tree in web extractor
* chore: export tree utils in core
* chore: export tree utils in core
* fix: CI
* fix: update test case and evaluation
* chore: remove unused file
* refactor(extract): modify dependencies (#358)
* refactor(extract): modify dependencies
* chore: modify files config
* chore: add indexId as key for map
---------
Co-authored-by: Zhou Xiao <[email protected]>
1 parent 175a895 commit 9d5f2fb

File tree

164 files changed

+165891
-9767
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

164 files changed

+165891
-9767
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,7 @@ test-results/
9696
playwright-report/
9797
blob-report/
9898
playwright/.cache/
99+
iife-script/
99100

100101
# Midscene.js dump files
101102
__ai_responses__/

biome.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
"**/doc_build",
1717
"*-dump.json",
1818
"test-results/**",
19+
"iife-script/**",
1920
"script_get_all_texts.tmp.js",
2021
"**/playwright-report/**",
2122
"**/todo-report.spec.ts-snapshots/**",

packages/cli/tsconfig.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
"forceConsistentCasingInFileNames": true,
88
"isolatedModules": true,
99
"jsx": "preserve",
10-
"lib": ["ESNext"],
10+
"lib": ["ESNext", "DOM"],
1111
"moduleResolution": "node",
1212
"paths": {
1313
"@/*": ["./src/*"]

packages/midscene/modern.config.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ export default defineConfig({
1010
index: 'src/index.ts',
1111
env: 'src/env.ts',
1212
utils: 'src/utils.ts',
13+
tree: 'src/tree.ts',
1314
'ai-model': 'src/ai-model/index.ts',
1415
},
1516
outDir: 'dist/lib',

packages/midscene/package.json

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,14 +13,16 @@
1313
".": "./dist/lib/index.js",
1414
"./env": "./dist/lib/env.js",
1515
"./utils": "./dist/lib/utils.js",
16-
"./ai-model": "./dist/lib/ai-model.js"
16+
"./ai-model": "./dist/lib/ai-model.js",
17+
"./tree": "./dist/lib/tree.js"
1718
},
1819
"typesVersions": {
1920
"*": {
2021
".": ["./dist/lib/types/index.d.ts"],
2122
"env": ["./dist/lib/types/env.d.ts"],
2223
"utils": ["./dist/lib/types/utils.d.ts"],
23-
"ai-model": ["./dist/lib/types/ai-model.d.ts"]
24+
"ai-model": ["./dist/lib/types/ai-model.d.ts"],
25+
"tree": ["./dist/lib/types/tree.d.ts"]
2426
}
2527
},
2628
"scripts": {

packages/midscene/src/ai-model/inspect.ts

Lines changed: 7 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ import type {
88
AIUsageInfo,
99
BaseElement,
1010
ElementById,
11+
ElementTreeNode,
1112
Size,
1213
UIContext,
1314
} from '@/types';
@@ -52,9 +53,8 @@ function transformToAbsoluteCoords(
5253
// let index = 0;
5354
export async function transformElementPositionToId(
5455
aiResult: AIElementResponse | [number, number],
55-
elementsInfo: BaseElement[],
56+
treeRoot: ElementTreeNode<BaseElement>,
5657
size: { width: number; height: number },
57-
screenshotBase64: string,
5858
) {
5959
if (Array.isArray(aiResult)) {
6060
const relativePosition = aiResult;
@@ -67,7 +67,7 @@ export async function transformElementPositionToId(
6767
);
6868

6969
const element = elementByPositionWithElementInfo(
70-
elementsInfo,
70+
treeRoot,
7171
absolutePosition,
7272
);
7373
assert(
@@ -96,7 +96,7 @@ function getQuickAnswer(
9696
| Partial<AISingleElementResponse>
9797
| Partial<AISingleElementResponseByPosition>
9898
| undefined,
99-
elementsInfo: BaseElement[],
99+
tree: ElementTreeNode<BaseElement>,
100100
elementById: ElementById,
101101
insertElementByPosition: (position: { x: number; y: number }) => BaseElement,
102102
) {
@@ -115,10 +115,7 @@ function getQuickAnswer(
115115
}
116116

117117
if ('position' in quickAnswer && quickAnswer.position) {
118-
let element = elementByPositionWithElementInfo(
119-
elementsInfo,
120-
quickAnswer.position,
121-
);
118+
let element = elementByPositionWithElementInfo(tree, quickAnswer.position);
122119
if (!element) {
123120
element = insertElementByPosition(quickAnswer.position);
124121
}
@@ -156,7 +153,7 @@ export async function AiInspectElement<
156153
// meet quick answer
157154
const quickAnswer = getQuickAnswer(
158155
options.quickAnswer,
159-
context.content,
156+
context.tree,
160157
elementById,
161158
insertElementByPosition,
162159
);
@@ -202,9 +199,8 @@ export async function AiInspectElement<
202199
return {
203200
parseResult: await transformElementPositionToId(
204201
res.content,
205-
context.content,
202+
context.tree,
206203
size,
207-
screenshotBase64,
208204
),
209205
rawResponse: res.content,
210206
elementById,
@@ -282,7 +278,6 @@ export async function AiAssert<
282278
assert(assertion, 'assertion should be a string');
283279

284280
const { screenshotBase64 } = context;
285-
const { description } = await describeUserPage(context, liteContextConfig);
286281
const systemPrompt = systemPromptToAssert();
287282

288283
const msgs: AIArgs = [

packages/midscene/src/ai-model/llm-planning.ts

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,7 @@ export async function plan(
2020
): Promise<PlanningAIResponse> {
2121
const { callAI, context } = opts || {};
2222
const { screenshotBase64, screenshotBase64WithElementMarker } = context;
23-
const { description: pageDescription, elementByPosition } =
24-
await describeUserPage(context);
23+
const { description: pageDescription } = await describeUserPage(context);
2524

2625
const systemPrompt = await systemPromptToTaskPlanning();
2726
const taskBackgroundContextText = generateTaskBackgroundContext(

packages/midscene/src/ai-model/prompt/llm-planning.ts

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@ You are a versatile professional in software UI automation. Your outstanding con
6565
6666
- All the actions you composed MUST be based on the page context information you get.
6767
- Trust the "What have been done" field about the task (if any), don't repeat actions in it.
68-
- Respond only with valid JSON. Do not write an introduction or summary or markdown prefix like \`\`\`json\`.
68+
- Respond only with valid JSON. Do not write an introduction or summary or markdown prefix like \`\`\`json\`\`\`.
6969
- If you cannot plan any action at all (i.e. empty actions array), set reason in the \`error\` field.
7070
7171
## About the \`actions\` field
@@ -80,7 +80,7 @@ type LocateParam = {locateParam}
8080
8181
Each action has a \`type\` and corresponding \`param\`. To be detailed:
8282
- type: 'Tap', tap the located element
83-
* {{ locate: {sample}, param: null }}
83+
* {{ locate: LocateParam, param: null }}
8484
- type: 'Hover', move mouse over to the located element
8585
* {{ locate: LocateParam, param: null }}
8686
- type: 'Input', replace the value in the input field
@@ -133,7 +133,10 @@ The JSON format is as follows:
133133
"furtherPlan": {{ "whatHaveDone": string, "whatToDoNext": string }} | null, // Use the same language as the user's instruction.
134134
"error"?: string // Use the same language as the user's instruction.
135135
}}
136-
Here is an example of how to decompose a task:
136+
137+
## Examples
138+
139+
### Example 1: Decompose a task
137140
138141
When a user says 'Click the language switch button, wait 1s, click "English"', the user will give you the description like this:
139142
@@ -176,7 +179,7 @@ By viewing the page screenshot and description, you should consider this and out
176179
}}
177180
}}
178181
179-
Here is another example of how to tolerate error situations only when the instruction is an "if" statement:
182+
### Example 2: Tolerate error situations only when the instruction is an "if" statement
180183
181184
If the user says "If there is a popup, close it", you should consider this and output the JSON:
182185
@@ -203,7 +206,7 @@ For contrast, if the user says "Close the popup" in this situation, you should c
203206
"furtherPlan": null
204207
}}
205208
206-
Here is an example of when task is accomplished, don't plan more actions:
209+
### Example 3: When task is accomplished, don't plan more actions
207210
208211
When the user ask to "Wait 4s", you should consider this:
209212
@@ -219,7 +222,7 @@ When the user ask to "Wait 4s", you should consider this:
219222
"furtherPlan": null // All steps have been included in the actions, so no further plan is needed
220223
}}
221224
222-
Here is an example of what NOT to do:
225+
### Example 4: What NOT to do
223226
224227
Wrong output:
225228
@@ -230,7 +233,7 @@ Wrong output:
230233
"thought": "Click the language switch button to open the language options.",
231234
"param": null,
232235
"locate": {{
233-
{sample}, // WRONG:prompt is missing
236+
{{"id": "c81c4e9a33"}}, // WRONG:prompt is missing
234237
}}
235238
}},
236239
{{

0 commit comments

Comments
 (0)