web-infra-dev · EAGzzyCSL · Apr 10, 2026
diff --git a/packages/core/src/agent/tasks.ts b/packages/core/src/agent/tasks.ts
@@ -580,19 +580,21 @@ export class TaskExecutor {
         const ifTypeRestricted = type !== 'Query';
         let demandInput = demand;
         let keyOfResult = 'result';
+        const currentScreenshotConstraint =
+          'based on the current screenshot and its contents if provided, unless the user explicitly asks to compare with reference images';
         if (ifTypeRestricted && (type === 'Assert' || type === 'WaitFor')) {
           keyOfResult = 'StatementIsTruthy';
           const booleanPrompt =
             type === 'Assert'
-              ? `Boolean, whether the following statement is true: ${demand}`
-              : `Boolean, the user wants to do some 'wait for' operation, please check whether the following statement is true: ${demand}`;
+              ? `Boolean, ${currentScreenshotConstraint}, whether the following statement is true: ${demand}`
+              : `Boolean, the user wants to do some 'wait for' operation. ${currentScreenshotConstraint}, please check whether the following statement is true: ${demand}`;
           demandInput = {
             [keyOfResult]: booleanPrompt,
           };
         } else if (ifTypeRestricted) {
           keyOfResult = type;
           demandInput = {
-            [keyOfResult]: `${type}, ${demand}`,
+            [keyOfResult]: `${type}, ${currentScreenshotConstraint}, ${demand}`,
           };
         }
 

diff --git a/packages/core/src/ai-model/inspect.ts b/packages/core/src/ai-model/inspect.ts
@@ -105,7 +105,7 @@ const promptsToChatParam = async (
       content: [
         {
           type: 'text',
-          text: 'Next, I will provide all the reference images.',
+          text: 'Next, I will provide all the reference images. These reference images are supporting context only, not the current screenshot being evaluated, unless the task explicitly asks for comparison or matching.',
         },
       ],
     });
@@ -121,7 +121,7 @@ const promptsToChatParam = async (
         content: [
           {
             type: 'text',
-            text: `this is the reference image named '${item.name}':`,
+            text: `this is the reference image named '${item.name}'. It is a reference image, not the current screenshot:`,
           },
         ],
       });
@@ -560,6 +560,11 @@ export async function AiExtractElementInfo<T>(options: {
   const userContent: ChatCompletionUserMessageParam['content'] = [];
 
   if (extractOption?.screenshotIncluded !== false) {
+    userContent.push({
+      type: 'text',
+      text: 'This is the current screenshot to evaluate. Unless <DATA_DEMAND> explicitly asks for comparison or matching against reference images, base your answer on this screenshot and its contents when provided.',
+    });
+
     userContent.push({
       type: 'image_url',
       image_url: {

diff --git a/packages/core/src/ai-model/prompt/extraction.ts b/packages/core/src/ai-model/prompt/extraction.ts
@@ -52,14 +52,18 @@ export function systemPromptToExtract() {
   return `
 You are a versatile professional in software UI design and testing. Your outstanding contributions will impact the user experience of billions of users.
 
-The user will give you a screenshot, the contents of it (optional), and some data requirements in <DATA_DEMAND>. You need to understand the user's requirements and extract the data satisfying the <DATA_DEMAND>.
+The user will give you a current screenshot to evaluate, the contents of it (optional), and some data requirements in <DATA_DEMAND>. You need to understand the user's requirements and extract the data satisfying the <DATA_DEMAND>.
+
+Base your answer on the current screenshot, and on the contents of it when provided. Treat the current screenshot and its contents as the primary source of truth for what is currently visible or true in the current state.
+
+If reference images are provided, use them only as supporting context unless <DATA_DEMAND> explicitly asks you to compare against them, match against them, or reason about them directly.
+
+Do not conclude that something exists in the current screenshot solely because it appears in a reference image. When the current screenshot or its contents conflict with a reference image, trust the current screenshot and its contents about the current state.
 
 If a key specifies a JSON data type (such as Number, String, Boolean, Object, Array), ensure the returned value strictly matches that data type.
 
 When DATA_DEMAND is a JSON object, the keys in your response must exactly match the keys in DATA_DEMAND. Do not rename, translate, or substitute any key.
 
-If the user provides multiple reference images, please carefully review the reference images with the screenshot and provide the correct answer for <DATA_DEMAND>.
-
 
 Return in the following XML format:
 <thought>the thinking process of the extraction, less than 300 words. Use ${preferredLanguage} in this field.</thought>

diff --git a/packages/core/tests/unit-test/inspect-extract-prompt.test.ts b/packages/core/tests/unit-test/inspect-extract-prompt.test.ts
@@ -0,0 +1,140 @@
+import type { IModelConfig } from '@midscene/shared/env';
+import { createFakeContext } from 'tests/utils';
+import { beforeEach, describe, expect, it, vi } from 'vitest';
+
+vi.mock('@/ai-model/service-caller/index', async () => {
+  const actual = await vi.importActual<
+    typeof import('@/ai-model/service-caller/index')
+  >('@/ai-model/service-caller/index'); // load the real module so exports not overridden below stay available
+  return {
+    ...actual,
+    AIResponseParseError: class AIResponseParseError extends Error {}, // replaced with a bare Error subclass of the same name
+    callAI: vi.fn(), // stub: the suite sets its resolved value and reads the messages it was called with
+    callAIWithObjectResponse: vi.fn(), // stubbed; not configured or asserted in this file
+    callAIWithStringResponse: vi.fn(), // stubbed; not configured or asserted in this file
+  };
+});
+
+vi.mock('@midscene/shared/img', async () => {
+  const actual = await vi.importActual<typeof import('@midscene/shared/img')>(
+    '@midscene/shared/img',
+  ); // real module; everything except preProcessImageUrl is passed through unchanged
+  return {
+    ...actual,
+    preProcessImageUrl: vi
+      .fn()
+      .mockResolvedValue('data:image/png;base64,REFERENCE'), // fixed data URL; the test expects this value as the reference image URL
+  };
+});
+
+import { AiExtractElementInfo } from '@/ai-model/inspect';
+import { callAI } from '@/ai-model/service-caller/index';
+import { preProcessImageUrl } from '@midscene/shared/img';
+
+describe('AiExtractElementInfo prompt assembly', () => {
+  const modelConfig: IModelConfig = {
+    modelFamily: 'qwen2.5-vl',
+    modelName: 'test-model',
+    modelDescription: 'test-model-desc',
+    intent: 'insight',
+  }; // model settings handed to AiExtractElementInfo in the test below
+
+  beforeEach(() => {
+    vi.clearAllMocks(); // drop recorded calls so mock.calls[0] below belongs to the current test
+    vi.mocked(callAI).mockResolvedValue({
+      content:
+        '<thought>Looks correct.</thought><data-json>{"result":true}</data-json>', // canned reply using the <thought>/<data-json> tags
+      usage: undefined,
+      reasoning_content: undefined,
+    } as any); // cast: the stub object is not type-checked against callAI's real return type
+  });
+
+  it('marks the current screenshot as primary and reference images as supporting context', async () => {
+    const context = createFakeContext(); // helper imported from tests/utils
+
+    const result = await AiExtractElementInfo<{ result: boolean }>({
+      context,
+      dataQuery: {
+        StatementIsTruthy:
+          'Boolean, based on the current screenshot and its contents if provided, unless the user explicitly asks to compare with reference images, whether the following statement is true: 有点赞按钮',
+      }, // NOTE(review): demand key is StatementIsTruthy, but the canned reply and the generic type use "result" — confirm the mismatch is intentional
+      multimodalPrompt: {
+        images: [
+          {
+            name: 'like-button',
+            url: 'https://example.com/ref.png',
+          },
+        ],
+        convertHttpImage2Base64: true, // presumably the source of the `true` second argument asserted on preProcessImageUrl — confirm in inspect.ts
+      },
+      modelConfig,
+    });
+
+    expect(result.parseResult.data).toEqual({ result: true }); // matches the canned <data-json> payload set in beforeEach
+    expect(preProcessImageUrl).toHaveBeenCalledWith(
+      'https://example.com/ref.png',
+      true,
+    ); // the reference image URL went through the mocked preprocessing helper
+
+    const msgs = vi.mocked(callAI).mock.calls[0]?.[0]; // first argument of the first callAI call
+    expect(msgs).toHaveLength(5); // system, screenshot + demand, reference intro, reference label, reference image
+    expect(msgs?.[0]).toMatchObject({
+      role: 'system',
+      content: expect.stringContaining(
+        'Base your answer on the current screenshot, and on the contents of it when provided.',
+      ),
+    }); // [0] system prompt carries the current-screenshot rule
+    expect(msgs?.[1]).toMatchObject({
+      role: 'user',
+      content: expect.arrayContaining([
+        expect.objectContaining({
+          type: 'text',
+          text: expect.stringContaining(
+            'This is the current screenshot to evaluate.',
+          ),
+        }),
+        expect.objectContaining({
+          type: 'image_url',
+          image_url: expect.objectContaining({
+            url: expect.stringMatching(/^data:image\/png;base64,/),
+          }),
+        }),
+        expect.objectContaining({
+          type: 'text',
+          text: expect.stringContaining('<DATA_DEMAND>'),
+        }),
+      ]),
+    }); // [1] screenshot label, screenshot image and <DATA_DEMAND> text; arrayContaining does not enforce their order
+    expect(msgs?.[2]).toMatchObject({
+      role: 'user',
+      content: [
+        expect.objectContaining({
+          type: 'text',
+          text: expect.stringContaining(
+            'reference images are supporting context only',
+          ),
+        }),
+      ],
+    }); // [2] text intro framing reference images as supporting context
+    expect(msgs?.[3]).toMatchObject({
+      role: 'user',
+      content: [
+        expect.objectContaining({
+          type: 'text',
+          text: "this is the reference image named 'like-button'. It is a reference image, not the current screenshot:",
+        }),
+      ],
+    }); // [3] per-image label, compared as an exact string
+    expect(msgs?.[4]).toMatchObject({
+      role: 'user',
+      content: [
+        expect.objectContaining({
+          type: 'image_url',
+          image_url: expect.objectContaining({
+            url: 'data:image/png;base64,REFERENCE',
+          }),
+        }),
+      ],
+    }); // [4] the reference image, carrying the URL returned by the mocked preProcessImageUrl
+  });
+});
diff --git a/packages/core/tests/unit-test/prompt/__snapshots__/prompt.test.ts.snap b/packages/core/tests/unit-test/prompt/__snapshots__/prompt.test.ts.snap
@@ -30,14 +30,18 @@ exports[`extract element > systemPromptToExtract 1`] = `
 "
 You are a versatile professional in software UI design and testing. Your outstanding contributions will impact the user experience of billions of users.
 
-The user will give you a screenshot, the contents of it (optional), and some data requirements in <DATA_DEMAND>. You need to understand the user's requirements and extract the data satisfying the <DATA_DEMAND>.
+The user will give you a current screenshot to evaluate, the contents of it (optional), and some data requirements in <DATA_DEMAND>. You need to understand the user's requirements and extract the data satisfying the <DATA_DEMAND>.
+
+Base your answer on the current screenshot, and on the contents of it when provided. Treat the current screenshot and its contents as the primary source of truth for what is currently visible or true in the current state.
+
+If reference images are provided, use them only as supporting context unless <DATA_DEMAND> explicitly asks you to compare against them, match against them, or reason about them directly.
+
+Do not conclude that something exists in the current screenshot solely because it appears in a reference image. When the current screenshot or its contents conflict with a reference image, trust the current screenshot and its contents about the current state.
 
 If a key specifies a JSON data type (such as Number, String, Boolean, Object, Array), ensure the returned value strictly matches that data type.
 
 When DATA_DEMAND is a JSON object, the keys in your response must exactly match the keys in DATA_DEMAND. Do not rename, translate, or substitute any key.
 
-If the user provides multiple reference images, please carefully review the reference images with the screenshot and provide the correct answer for <DATA_DEMAND>.
-
 
 Return in the following XML format:
 <thought>the thinking process of the extraction, less than 300 words. Use English in this field.</thought>

diff --git a/packages/core/tests/unit-test/tasks-null-data.test.ts b/packages/core/tests/unit-test/tasks-null-data.test.ts
@@ -173,6 +173,18 @@ describe('TaskExecutor - Null Data Handling', () => {
           uiContext: await createEmptyUIContext(),
         } as any),
       ).rejects.toThrow('Assertion failed: Could not verify assertion');
+
+      expect(mockInsight.extract).toHaveBeenCalledWith(
+        {
+          StatementIsTruthy:
+            'Boolean, based on the current screenshot and its contents if provided, unless the user explicitly asks to compare with reference images, whether the following statement is true: Page title is correct',
+        },
+        mockModelConfig,
+        {},
+        '',
+        undefined,
+        expect.anything(),
+      );
     });
 
     it('should handle valid data for WaitFor operation', async () => {
@@ -378,7 +390,8 @@ describe('TaskExecutor - Null Data Handling', () => {
 
       expect(mockInsight.extract).toHaveBeenCalledWith(
         {
-          Number: 'Number, Extract the price',
+          Number:
+            'Number, based on the current screenshot and its contents if provided, unless the user explicitly asks to compare with reference images, Extract the price',
         },
         mockModelConfig,
         {},
@@ -467,7 +480,72 @@ describe('TaskExecutor - Null Data Handling', () => {
         uiContext: await createEmptyUIContext(),
       } as any);
 
+      expect(mockInsight.extract).toHaveBeenCalledWith(
+        {
+          Number:
+            'Number, based on the current screenshot and its contents if provided, unless the user explicitly asks to compare with reference images, Extract the price',
+        },
+        mockModelConfig,
+        {},
+        '',
+        undefined,
+        expect.anything(),
+      );
       expect(result.output).toBeNull();
     });
+
+    it('should prepend current screenshot guidance for Boolean type query', async () => {
+      const mockInsight = {
+        contextRetrieverFn: vi.fn(async () => await createMockUIContext()),
+        extract: vi.fn(async () => ({
+          data: {
+            Boolean: true, // value the test later expects as the task output
+          },
+          usage: { totalTokens: 100 },
+          thought: 'The condition is satisfied in the current screenshot',
+          dump: createMockDump(
+            { Boolean: true },
+            'The condition is satisfied in the current screenshot',
+            { totalTokens: 100 },
+          ),
+        })),
+        onceDumpUpdatedFn: undefined,
+      } as any; // stand-in insight object; only the members listed here are provided
+
+      const mockModelConfig: IModelConfig = {
+        modelName: 'mock-model',
+        modelDescription: 'mock-model-description',
+        intent: 'default',
+      };
+
+      const taskExecutor = new TaskExecutor({} as any, mockInsight, {
+        actionSpace: [],
+      }); // first constructor argument is an empty placeholder object
+
+      const queryTask = await (taskExecutor as any).createTypeQueryTask(
+        'Boolean',
+        'there is a like button',
+        mockModelConfig,
+        {},
+      ); // reached through an any-cast, presumably because the method is not public — confirm in tasks.ts
+
+      const result = await queryTask.executor({}, {
+        task: queryTask,
+        uiContext: await createEmptyUIContext(),
+      } as any);
+
+      expect(mockInsight.extract).toHaveBeenCalledWith(
+        {
+          Boolean:
+            'Boolean, based on the current screenshot and its contents if provided, unless the user explicitly asks to compare with reference images, there is a like button',
+        },
+        mockModelConfig,
+        {},
+        '',
+        undefined,
+        expect.anything(),
+      ); // demand is sent under the "Boolean" key: type name, then the screenshot guidance, then the user's text
+      expect(result.output).toBe(true); // equals the Boolean value returned by the extract stub
+    });
   });
 });