web-infra-dev · e790a8 · Mar 5, 2026 · Mar 9, 2026 · Mar 10, 2026 · Mar 10, 2026
diff --git a/packages/core/src/ai-model/inspect.ts b/packages/core/src/ai-model/inspect.ts
@@ -170,7 +170,14 @@ export async function AiLocateElement(options: {
   const targetElementDescriptionText = extraTextFromUserPrompt(
     targetElementDescription,
   );
-  const userInstructionPrompt = findElementPrompt(targetElementDescriptionText);
+  const referenceImageNames =
+    typeof targetElementDescription !== 'string'
+      ? targetElementDescription.images?.map((img) => img.name)
+      : undefined;
+  const userInstructionPrompt = findElementPrompt(
+    targetElementDescriptionText,
+    referenceImageNames,
+  );
   const systemPrompt = isAutoGLM(modelFamily)
     ? getAutoGLMLocatePrompt(modelFamily)
     : systemPromptToLocateElement(modelFamily);

diff --git a/packages/core/src/ai-model/prompt/llm-locator.ts b/packages/core/src/ai-model/prompt/llm-locator.ts
@@ -19,6 +19,13 @@ You are an AI assistant that helps identify UI elements.
 - For example: If an input field is large (both wide and tall) with a placeholder text "Please enter your comment", you should locate only the area where the placeholder text appears, not the entire input field.
 - This principle applies to all text-containing elements: focus on the visible text region rather than the full element container.
 
+## When Reference Images Are Provided:
+- The FIRST image in the conversation is always the MAIN screenshot — you must return the bounding box of the element found in this MAIN screenshot.
+- Any images provided after the main screenshot are reference images (visual examples of the element to find).
+- When the task mentions a reference image name (e.g., "参考图片1"), it refers to the provided reference image with that name — do NOT search for that name as text on the page.
+- Use the reference image as a visual template and locate the element in the MAIN screenshot that visually resembles it.
+- The returned bounding box coordinates must be from the MAIN screenshot, not from the reference image.
+
 ## Output Format:
 \`\`\`json
 {
@@ -49,5 +56,13 @@ When no element is found:
 `;
 }
 
-export const findElementPrompt = (targetElementDescription: string) =>
-  `Find: ${targetElementDescription}`;
+export const findElementPrompt = ( // builds the user instruction text for locating an element
+  targetElementDescription: string, // textual description of the target element
+  referenceImageNames?: string[], // optional names of reference images; omit for a text-only prompt
+) => {
+  if (referenceImageNames?.length) { // false for undefined or an empty array, so those fall through
+    const nameList = referenceImageNames.map((n) => `'${n}'`).join(', '); // single-quote each name, comma-separated
+    return `Find the element in the MAIN screenshot (first image) that visually matches the reference image${referenceImageNames.length > 1 ? 's' : ''} (${nameList}) provided below.\nTask hint: ${targetElementDescription}`;
+  }
+  return `Find: ${targetElementDescription}`; // no reference images: plain "Find: <description>" prompt
+};
diff --git a/packages/core/src/yaml/player.ts b/packages/core/src/yaml/player.ts
@@ -481,10 +481,8 @@ export class ScriptPlayer<T extends MidsceneYamlScriptEnv> {
           // User YAML: aiTap: 'search input box'
           locatePrompt = aiTap;
         } else if (typeof locateObj === 'object' && locateObj?.prompt) {
-          // buildYamlFlowFromPlans: { aiTap: '', locate: { prompt, deepLocate, cacheable } }
-          const { prompt: lp, ...locateOpts } = locateObj;
-          locatePrompt = lp;
-          opts = { ...locateOpts, ...tapOptions };
+          // buildYamlFlowFromPlans: { aiTap: { locate: { prompt, images, ... } } } (locate nested in aiTap)
+          locatePrompt = locateObj;
         } else {
           // User YAML: aiTap: { prompt: '...' } or aiTap: null + prompt: '...'
           locatePrompt = aiTap?.prompt || prompt || locateObj;