Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 5 additions & 3 deletions packages/core/src/agent/tasks.ts
Original file line number Diff line number Diff line change
Expand Up @@ -580,19 +580,21 @@ export class TaskExecutor {
const ifTypeRestricted = type !== 'Query';
let demandInput = demand;
let keyOfResult = 'result';
const currentScreenshotConstraint =
'based on the current screenshot and its contents if provided, unless the user explicitly asks to compare with reference images';
if (ifTypeRestricted && (type === 'Assert' || type === 'WaitFor')) {
keyOfResult = 'StatementIsTruthy';
const booleanPrompt =
type === 'Assert'
? `Boolean, whether the following statement is true: ${demand}`
: `Boolean, the user wants to do some 'wait for' operation, please check whether the following statement is true: ${demand}`;
? `Boolean, ${currentScreenshotConstraint}, whether the following statement is true: ${demand}`
: `Boolean, the user wants to do some 'wait for' operation. ${currentScreenshotConstraint}, please check whether the following statement is true: ${demand}`;
demandInput = {
[keyOfResult]: booleanPrompt,
};
} else if (ifTypeRestricted) {
keyOfResult = type;
demandInput = {
[keyOfResult]: `${type}, ${demand}`,
[keyOfResult]: `${type}, ${currentScreenshotConstraint}, ${demand}`,
};
}

Expand Down
9 changes: 7 additions & 2 deletions packages/core/src/ai-model/inspect.ts
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ const promptsToChatParam = async (
content: [
{
type: 'text',
text: 'Next, I will provide all the reference images.',
text: 'Next, I will provide all the reference images. These reference images are supporting context only, not the current screenshot being evaluated, unless the task explicitly asks for comparison or matching.',
},
],
});
Expand All @@ -121,7 +121,7 @@ const promptsToChatParam = async (
content: [
{
type: 'text',
text: `this is the reference image named '${item.name}':`,
text: `this is the reference image named '${item.name}'. It is a reference image, not the current screenshot:`,
},
],
});
Expand Down Expand Up @@ -560,6 +560,11 @@ export async function AiExtractElementInfo<T>(options: {
const userContent: ChatCompletionUserMessageParam['content'] = [];

if (extractOption?.screenshotIncluded !== false) {
userContent.push({
type: 'text',
text: 'This is the current screenshot to evaluate. Unless <DATA_DEMAND> explicitly asks for comparison or matching against reference images, base your answer on this screenshot and its contents when provided.',
});

userContent.push({
type: 'image_url',
image_url: {
Expand Down
10 changes: 7 additions & 3 deletions packages/core/src/ai-model/prompt/extraction.ts
Original file line number Diff line number Diff line change
Expand Up @@ -52,14 +52,18 @@ export function systemPromptToExtract() {
return `
You are a versatile professional in software UI design and testing. Your outstanding contributions will impact the user experience of billions of users.

The user will give you a screenshot, the contents of it (optional), and some data requirements in <DATA_DEMAND>. You need to understand the user's requirements and extract the data satisfying the <DATA_DEMAND>.
The user will give you a current screenshot to evaluate, the contents of it (optional), and some data requirements in <DATA_DEMAND>. You need to understand the user's requirements and extract the data satisfying the <DATA_DEMAND>.

Base your answer on the current screenshot, and on the contents of it when provided. Treat the current screenshot and its contents as the primary source of truth for what is currently visible or true in the current state.

If reference images are provided, use them only as supporting context unless <DATA_DEMAND> explicitly asks you to compare against them, match against them, or reason about them directly.

Do not conclude that something exists in the current screenshot solely because it appears in a reference image. When the current screenshot or its contents conflict with a reference image, trust the current screenshot and its contents about the current state.

If a key specifies a JSON data type (such as Number, String, Boolean, Object, Array), ensure the returned value strictly matches that data type.

When DATA_DEMAND is a JSON object, the keys in your response must exactly match the keys in DATA_DEMAND. Do not rename, translate, or substitute any key.

If the user provides multiple reference images, please carefully review the reference images with the screenshot and provide the correct answer for <DATA_DEMAND>.


Return in the following XML format:
<thought>the thinking process of the extraction, less than 300 words. Use ${preferredLanguage} in this field.</thought>
Expand Down
140 changes: 140 additions & 0 deletions packages/core/tests/unit-test/inspect-extract-prompt.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
import type { IModelConfig } from '@midscene/shared/env';
import { createFakeContext } from 'tests/utils';
import { beforeEach, describe, expect, it, vi } from 'vitest';

// Partially mock the service-caller module: every real export is kept via
// vi.importActual, and only the members listed below are overridden.
// NOTE(review): Vitest hoists vi.mock calls above imports, which is presumably why the
// later `import { callAI }` in this file receives the vi.fn() stub — confirm against
// the Vitest version in use.
vi.mock('@/ai-model/service-caller/index', async () => {
const actual = await vi.importActual<
typeof import('@/ai-model/service-caller/index')
>('@/ai-model/service-caller/index');
return {
...actual,
// Replaced with a bare Error subclass carrying no extra behavior.
AIResponseParseError: class AIResponseParseError extends Error {},
// The three AI entry points become plain vi.fn() stubs; individual tests install
// return values on them (see the beforeEach that configures callAI).
callAI: vi.fn(),
callAIWithObjectResponse: vi.fn(),
callAIWithStringResponse: vi.fn(),
};
});

// Partially mock the shared image helpers: all real exports are preserved and only
// preProcessImageUrl is replaced.
vi.mock('@midscene/shared/img', async () => {
const actual = await vi.importActual<typeof import('@midscene/shared/img')>(
'@midscene/shared/img',
);
return {
...actual,
// Always resolves to a fixed data URL, so the test can recognise the reference image
// in the outgoing messages by this exact value.
preProcessImageUrl: vi
.fn()
.mockResolvedValue('data:image/png;base64,REFERENCE'),
};
});

import { AiExtractElementInfo } from '@/ai-model/inspect';
import { callAI } from '@/ai-model/service-caller/index';
import { preProcessImageUrl } from '@midscene/shared/img';

/**
 * Verifies the chat messages AiExtractElementInfo passes to callAI when a reference
 * image accompanies the data demand: the system prompt, the current-screenshot message,
 * the reference-image introduction, the per-image label, and the reference image itself.
 */
describe('AiExtractElementInfo prompt assembly', () => {
  const referenceImageUrl = 'https://example.com/ref.png';
  // Value the mocked preProcessImageUrl resolves to.
  const referenceDataUrl = 'data:image/png;base64,REFERENCE';

  const modelConfig: IModelConfig = {
    modelFamily: 'qwen2.5-vl',
    modelName: 'test-model',
    modelDescription: 'test-model-desc',
    intent: 'insight',
  };

  /** Matcher for a text content part whose `text` satisfies the given value or matcher. */
  const textPart = (text: unknown) =>
    expect.objectContaining({ type: 'text', text });

  /** Matcher for an image content part whose URL satisfies the given value or matcher. */
  const imagePart = (url: unknown) =>
    expect.objectContaining({
      type: 'image_url',
      image_url: expect.objectContaining({ url }),
    });

  beforeEach(() => {
    vi.clearAllMocks();

    // Canned model reply used by every test in this suite.
    const cannedReply = {
      content:
        '<thought>Looks correct.</thought><data-json>{"result":true}</data-json>',
      usage: undefined,
      reasoning_content: undefined,
    } as any;
    vi.mocked(callAI).mockResolvedValue(cannedReply);
  });

  it('marks the current screenshot as primary and reference images as supporting context', async () => {
    const { parseResult } = await AiExtractElementInfo<{ result: boolean }>({
      context: createFakeContext(),
      dataQuery: {
        StatementIsTruthy:
          'Boolean, based on the current screenshot and its contents if provided, unless the user explicitly asks to compare with reference images, whether the following statement is true: 有点赞按钮',
      },
      multimodalPrompt: {
        images: [{ name: 'like-button', url: referenceImageUrl }],
        convertHttpImage2Base64: true,
      },
      modelConfig,
    });

    expect(parseResult.data).toEqual({ result: true });
    expect(preProcessImageUrl).toHaveBeenCalledWith(referenceImageUrl, true);

    // First argument of the first callAI invocation: the assembled message list.
    const sentMessages = vi.mocked(callAI).mock.calls[0]?.[0];
    expect(sentMessages).toHaveLength(5);

    // One expectation per outgoing message, in the order they must be sent.
    const expectedMessages = [
      {
        role: 'system',
        content: expect.stringContaining(
          'Base your answer on the current screenshot, and on the contents of it when provided.',
        ),
      },
      {
        role: 'user',
        content: expect.arrayContaining([
          textPart(
            expect.stringContaining('This is the current screenshot to evaluate.'),
          ),
          imagePart(expect.stringMatching(/^data:image\/png;base64,/)),
          textPart(expect.stringContaining('<DATA_DEMAND>')),
        ]),
      },
      {
        role: 'user',
        content: [
          textPart(
            expect.stringContaining(
              'reference images are supporting context only',
            ),
          ),
        ],
      },
      {
        role: 'user',
        content: [
          textPart(
            "this is the reference image named 'like-button'. It is a reference image, not the current screenshot:",
          ),
        ],
      },
      {
        role: 'user',
        content: [imagePart(referenceDataUrl)],
      },
    ];

    expectedMessages.forEach((expected, index) => {
      expect(sentMessages?.[index]).toMatchObject(expected);
    });
  });
});
Original file line number Diff line number Diff line change
Expand Up @@ -30,14 +30,18 @@ exports[`extract element > systemPromptToExtract 1`] = `
"
You are a versatile professional in software UI design and testing. Your outstanding contributions will impact the user experience of billions of users.

The user will give you a screenshot, the contents of it (optional), and some data requirements in <DATA_DEMAND>. You need to understand the user's requirements and extract the data satisfying the <DATA_DEMAND>.
The user will give you a current screenshot to evaluate, the contents of it (optional), and some data requirements in <DATA_DEMAND>. You need to understand the user's requirements and extract the data satisfying the <DATA_DEMAND>.

Base your answer on the current screenshot, and on the contents of it when provided. Treat the current screenshot and its contents as the primary source of truth for what is currently visible or true in the current state.

If reference images are provided, use them only as supporting context unless <DATA_DEMAND> explicitly asks you to compare against them, match against them, or reason about them directly.

Do not conclude that something exists in the current screenshot solely because it appears in a reference image. When the current screenshot or its contents conflict with a reference image, trust the current screenshot and its contents about the current state.

If a key specifies a JSON data type (such as Number, String, Boolean, Object, Array), ensure the returned value strictly matches that data type.

When DATA_DEMAND is a JSON object, the keys in your response must exactly match the keys in DATA_DEMAND. Do not rename, translate, or substitute any key.

If the user provides multiple reference images, please carefully review the reference images with the screenshot and provide the correct answer for <DATA_DEMAND>.


Return in the following XML format:
<thought>the thinking process of the extraction, less than 300 words. Use English in this field.</thought>
Expand Down
80 changes: 79 additions & 1 deletion packages/core/tests/unit-test/tasks-null-data.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,18 @@ describe('TaskExecutor - Null Data Handling', () => {
uiContext: await createEmptyUIContext(),
} as any),
).rejects.toThrow('Assertion failed: Could not verify assertion');

expect(mockInsight.extract).toHaveBeenCalledWith(
{
StatementIsTruthy:
'Boolean, based on the current screenshot and its contents if provided, unless the user explicitly asks to compare with reference images, whether the following statement is true: Page title is correct',
},
mockModelConfig,
{},
'',
undefined,
expect.anything(),
);
});

it('should handle valid data for WaitFor operation', async () => {
Expand Down Expand Up @@ -378,7 +390,8 @@ describe('TaskExecutor - Null Data Handling', () => {

expect(mockInsight.extract).toHaveBeenCalledWith(
{
Number: 'Number, Extract the price',
Number:
'Number, based on the current screenshot and its contents if provided, unless the user explicitly asks to compare with reference images, Extract the price',
},
mockModelConfig,
{},
Expand Down Expand Up @@ -467,7 +480,72 @@ describe('TaskExecutor - Null Data Handling', () => {
uiContext: await createEmptyUIContext(),
} as any);

expect(mockInsight.extract).toHaveBeenCalledWith(
{
Number:
'Number, based on the current screenshot and its contents if provided, unless the user explicitly asks to compare with reference images, Extract the price',
},
mockModelConfig,
{},
'',
undefined,
expect.anything(),
);
expect(result.output).toBeNull();
});

// A 'Boolean' typed query must reach insight.extract with the current-screenshot
// guidance inserted between the type name and the user's demand, and the extracted
// boolean must surface as the task output.
it('should prepend current screenshot guidance for Boolean type query', async () => {
  const thought = 'The condition is satisfied in the current screenshot';

  // Stubbed extraction: reports the Boolean key as true, with a matching dump.
  const extractStub = vi.fn(async () => ({
    data: {
      Boolean: true,
    },
    usage: { totalTokens: 100 },
    thought,
    dump: createMockDump({ Boolean: true }, thought, { totalTokens: 100 }),
  }));

  const insightStub = {
    contextRetrieverFn: vi.fn(async () => await createMockUIContext()),
    extract: extractStub,
    onceDumpUpdatedFn: undefined,
  } as any;

  const modelConfig: IModelConfig = {
    modelName: 'mock-model',
    modelDescription: 'mock-model-description',
    intent: 'default',
  };

  const executor = new TaskExecutor({} as any, insightStub, {
    actionSpace: [],
  });

  // createTypeQueryTask is reached through an `any` cast, as in the sibling tests.
  const task = await (executor as any).createTypeQueryTask(
    'Boolean',
    'there is a like button',
    modelConfig,
    {},
  );

  const uiContext = await createEmptyUIContext();
  const outcome = await task.executor({}, { task, uiContext } as any);

  const expectedDemand = {
    Boolean:
      'Boolean, based on the current screenshot and its contents if provided, unless the user explicitly asks to compare with reference images, there is a like button',
  };
  expect(insightStub.extract).toHaveBeenCalledWith(
    expectedDemand,
    modelConfig,
    {},
    '',
    undefined,
    expect.anything(),
  );
  expect(outcome.output).toBe(true);
});
});
});
Loading