diff --git a/packages/core/src/ai-model/inspect.ts b/packages/core/src/ai-model/inspect.ts index 7f616f9eb4..cc27077436 100644 --- a/packages/core/src/ai-model/inspect.ts +++ b/packages/core/src/ai-model/inspect.ts @@ -170,7 +170,14 @@ export async function AiLocateElement(options: { const targetElementDescriptionText = extraTextFromUserPrompt( targetElementDescription, ); - const userInstructionPrompt = findElementPrompt(targetElementDescriptionText); + const referenceImageNames = + typeof targetElementDescription !== 'string' + ? targetElementDescription.images?.map((img) => img.name) + : undefined; + const userInstructionPrompt = findElementPrompt( + targetElementDescriptionText, + referenceImageNames, + ); const systemPrompt = isAutoGLM(modelFamily) ? getAutoGLMLocatePrompt(modelFamily) : systemPromptToLocateElement(modelFamily); diff --git a/packages/core/src/ai-model/prompt/llm-locator.ts b/packages/core/src/ai-model/prompt/llm-locator.ts index b7af4ae835..2a4e6c56da 100644 --- a/packages/core/src/ai-model/prompt/llm-locator.ts +++ b/packages/core/src/ai-model/prompt/llm-locator.ts @@ -19,6 +19,13 @@ You are an AI assistant that helps identify UI elements. - For example: If an input field is large (both wide and tall) with a placeholder text "Please enter your comment", you should locate only the area where the placeholder text appears, not the entire input field. - This principle applies to all text-containing elements: focus on the visible text region rather than the full element container. +## When Reference Images Are Provided: +- The FIRST image in the conversation is always the MAIN screenshot — you must return the bounding box of the element found in this MAIN screenshot. +- Any images provided after the main screenshot are reference images (visual examples of the element to find). +- When the task mentions a reference image name (e.g., "参考图片1"), it refers to the provided reference image with that name — do NOT search for that name as text on the page. +- Use the reference image as a visual template and locate the element in the MAIN screenshot that visually resembles it. +- The returned bounding box coordinates must be from the MAIN screenshot, not from the reference image. + ## Output Format: \`\`\`json { @@ -49,5 +56,13 @@ When no element is found: `; } -export const findElementPrompt = (targetElementDescription: string) => - `Find: ${targetElementDescription}`; +export const findElementPrompt = ( + targetElementDescription: string, + referenceImageNames?: string[], +) => { + if (referenceImageNames?.length) { + const nameList = referenceImageNames.map((n) => `'${n}'`).join(', '); + return `Find the element in the MAIN screenshot (first image) that visually matches the reference image${referenceImageNames.length > 1 ? 's' : ''} (${nameList}) provided below.\nTask hint: ${targetElementDescription}`; + } + return `Find: ${targetElementDescription}`; +}; \ No newline at end of file diff --git a/packages/core/src/yaml/player.ts b/packages/core/src/yaml/player.ts index b0ed3236f9..9a99ba0d43 100644 --- a/packages/core/src/yaml/player.ts +++ b/packages/core/src/yaml/player.ts @@ -481,10 +481,8 @@ export class ScriptPlayer { // User YAML: aiTap: 'search input box' locatePrompt = aiTap; } else if (typeof locateObj === 'object' && locateObj?.prompt) { - // buildYamlFlowFromPlans: { aiTap: '', locate: { prompt, deepLocate, cacheable } } - const { prompt: lp, ...locateOpts } = locateObj; - locatePrompt = lp; - opts = { ...locateOpts, ...tapOptions }; + // buildYamlFlowFromPlans: { aiTap: { locate: { prompt, images, ... } } } (locate nested in aiTap) + locatePrompt = locateObj; } else { // User YAML: aiTap: { prompt: '...' } or aiTap: null + prompt: '...' locatePrompt = aiTap?.prompt || prompt || locateObj;