Skip to content

Commit

Permalink
feat(web-extract): extract web content as a tree (#337)
Browse files (browse the repository at this point in the history)
* feat: extract web content as a tree

* chore: update test data

* chore: update test data

* feat: update answer of evaluation

* chore: update test cases

* chore: remove focusing on cases

* fix: ci

* fix: put rect in html tree

* fix: CI

* fix: AI test

* fix: lint

* fix: CI

* fix: static-page compatibility

* fix: CI

* fix: map by markerId

* fix: llm planning prompt

* chore: update hash length

* chore: ignore writing dump file

* fix: lint

* fix: ci snapshot

* chore: snapshot tree in web extractor

* chore: export tree utils in core

* chore: export tree utils in core

* fix: CI

* fix: update test case and evaluation

* chore: remove unused file

* refactor(extract): modify dependencies (#358)

* refactor(extract): modify dependencies

* chore: modify files config

* chore: add indexId as key for map

---------

Co-authored-by: Zhou Xiao <[email protected]>
  • Loading branch information
yuyutaotao and zhoushaw authored Feb 7, 2025
1 parent 175a895 commit 9d5f2fb
Show file tree
Hide file tree
Showing 164 changed files with 165,891 additions and 9,767 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@ test-results/
playwright-report/
blob-report/
playwright/.cache/
iife-script/

# Midscene.js dump files
__ai_responses__/
Expand Down
1 change: 1 addition & 0 deletions biome.json
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
"**/doc_build",
"*-dump.json",
"test-results/**",
"iife-script/**",
"script_get_all_texts.tmp.js",
"**/playwright-report/**",
"**/todo-report.spec.ts-snapshots/**",
Expand Down
2 changes: 1 addition & 1 deletion packages/cli/tsconfig.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
"forceConsistentCasingInFileNames": true,
"isolatedModules": true,
"jsx": "preserve",
"lib": ["ESNext"],
"lib": ["ESNext", "DOM"],
"moduleResolution": "node",
"paths": {
"@/*": ["./src/*"]
Expand Down
1 change: 1 addition & 0 deletions packages/midscene/modern.config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ export default defineConfig({
index: 'src/index.ts',
env: 'src/env.ts',
utils: 'src/utils.ts',
tree: 'src/tree.ts',
'ai-model': 'src/ai-model/index.ts',
},
outDir: 'dist/lib',
Expand Down
6 changes: 4 additions & 2 deletions packages/midscene/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -13,14 +13,16 @@
".": "./dist/lib/index.js",
"./env": "./dist/lib/env.js",
"./utils": "./dist/lib/utils.js",
"./ai-model": "./dist/lib/ai-model.js"
"./ai-model": "./dist/lib/ai-model.js",
"./tree": "./dist/lib/tree.js"
},
"typesVersions": {
"*": {
".": ["./dist/lib/types/index.d.ts"],
"env": ["./dist/lib/types/env.d.ts"],
"utils": ["./dist/lib/types/utils.d.ts"],
"ai-model": ["./dist/lib/types/ai-model.d.ts"]
"ai-model": ["./dist/lib/types/ai-model.d.ts"],
"tree": ["./dist/lib/types/tree.d.ts"]
}
},
"scripts": {
Expand Down
19 changes: 7 additions & 12 deletions packages/midscene/src/ai-model/inspect.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import type {
AIUsageInfo,
BaseElement,
ElementById,
ElementTreeNode,
Size,
UIContext,
} from '@/types';
Expand Down Expand Up @@ -52,9 +53,8 @@ function transformToAbsoluteCoords(
// let index = 0;
export async function transformElementPositionToId(
aiResult: AIElementResponse | [number, number],
elementsInfo: BaseElement[],
treeRoot: ElementTreeNode<BaseElement>,
size: { width: number; height: number },
screenshotBase64: string,
) {
if (Array.isArray(aiResult)) {
const relativePosition = aiResult;
Expand All @@ -67,7 +67,7 @@ export async function transformElementPositionToId(
);

const element = elementByPositionWithElementInfo(
elementsInfo,
treeRoot,
absolutePosition,
);
assert(
Expand Down Expand Up @@ -96,7 +96,7 @@ function getQuickAnswer(
| Partial<AISingleElementResponse>
| Partial<AISingleElementResponseByPosition>
| undefined,
elementsInfo: BaseElement[],
tree: ElementTreeNode<BaseElement>,
elementById: ElementById,
insertElementByPosition: (position: { x: number; y: number }) => BaseElement,
) {
Expand All @@ -115,10 +115,7 @@ function getQuickAnswer(
}

if ('position' in quickAnswer && quickAnswer.position) {
let element = elementByPositionWithElementInfo(
elementsInfo,
quickAnswer.position,
);
let element = elementByPositionWithElementInfo(tree, quickAnswer.position);
if (!element) {
element = insertElementByPosition(quickAnswer.position);
}
Expand Down Expand Up @@ -156,7 +153,7 @@ export async function AiInspectElement<
// meet quick answer
const quickAnswer = getQuickAnswer(
options.quickAnswer,
context.content,
context.tree,
elementById,
insertElementByPosition,
);
Expand Down Expand Up @@ -202,9 +199,8 @@ export async function AiInspectElement<
return {
parseResult: await transformElementPositionToId(
res.content,
context.content,
context.tree,
size,
screenshotBase64,
),
rawResponse: res.content,
elementById,
Expand Down Expand Up @@ -282,7 +278,6 @@ export async function AiAssert<
assert(assertion, 'assertion should be a string');

const { screenshotBase64 } = context;
const { description } = await describeUserPage(context, liteContextConfig);
const systemPrompt = systemPromptToAssert();

const msgs: AIArgs = [
Expand Down
3 changes: 1 addition & 2 deletions packages/midscene/src/ai-model/llm-planning.ts
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,7 @@ export async function plan(
): Promise<PlanningAIResponse> {
const { callAI, context } = opts || {};
const { screenshotBase64, screenshotBase64WithElementMarker } = context;
const { description: pageDescription, elementByPosition } =
await describeUserPage(context);
const { description: pageDescription } = await describeUserPage(context);

const systemPrompt = await systemPromptToTaskPlanning();
const taskBackgroundContextText = generateTaskBackgroundContext(
Expand Down
17 changes: 10 additions & 7 deletions packages/midscene/src/ai-model/prompt/llm-planning.ts
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ You are a versatile professional in software UI automation. Your outstanding con
- All the actions you composed MUST be based on the page context information you get.
- Trust the "What have been done" field about the task (if any), don't repeat actions in it.
- Respond only with valid JSON. Do not write an introduction or summary or markdown prefix like \`\`\`json\`.
- Respond only with valid JSON. Do not write an introduction or summary or markdown prefix like \`\`\`json\`\`\`.
- If you cannot plan any action at all (i.e. empty actions array), set reason in the \`error\` field.
## About the \`actions\` field
Expand All @@ -80,7 +80,7 @@ type LocateParam = {locateParam}
Each action has a \`type\` and corresponding \`param\`. To be detailed:
- type: 'Tap', tap the located element
* {{ locate: {sample}, param: null }}
* {{ locate: LocateParam, param: null }}
- type: 'Hover', move mouse over to the located element
* {{ locate: LocateParam, param: null }}
- type: 'Input', replace the value in the input field
Expand Down Expand Up @@ -133,7 +133,10 @@ The JSON format is as follows:
"furtherPlan": {{ "whatHaveDone": string, "whatToDoNext": string }} | null, // Use the same language as the user's instruction.
"error"?: string // Use the same language as the user's instruction.
}}
Here is an example of how to decompose a task:
## Examples
### Example 1: Decompose a task
When a user says 'Click the language switch button, wait 1s, click "English"', the user will give you the description like this:
Expand Down Expand Up @@ -176,7 +179,7 @@ By viewing the page screenshot and description, you should consider this and out
}}
}}
Here is another example of how to tolerate error situations only when the instruction is an "if" statement:
### Example 2: Tolerate error situations only when the instruction is an "if" statement
If the user says "If there is a popup, close it", you should consider this and output the JSON:
Expand All @@ -203,7 +206,7 @@ For contrast, if the user says "Close the popup" in this situation, you should c
"furtherPlan": null
}}
Here is an example of when task is accomplished, don't plan more actions:
### Example 3: When task is accomplished, don't plan more actions
When the user ask to "Wait 4s", you should consider this:
Expand All @@ -219,7 +222,7 @@ When the user ask to "Wait 4s", you should consider this:
"furtherPlan": null // All steps have been included in the actions, so no further plan is needed
}}
Here is an example of what NOT to do:
### Example 4: What NOT to do
Wrong output:
Expand All @@ -230,7 +233,7 @@ Wrong output:
"thought": "Click the language switch button to open the language options.",
"param": null,
"locate": {{
{sample}, // WRONG:prompt is missing
{{"id": "c81c4e9a33"}}, // WRONG:prompt is missing
}}
}},
{{
Expand Down
Loading

0 comments on commit 9d5f2fb

Please sign in to comment.