Skip to content

Commit 518101f

Browse files
Feat: Omni parser V2 integration (#14)
* Add omni-parser system prompt for interface interaction Introduce a detailed system prompt for the omni-parser to guide AI interaction with web and mobile interfaces. This includes step-by-step task guidelines, tool usage instructions, and contextual format definitions for accurate and consistent task execution. * Integrate OmniParser support into backend services Added interfaces for OmniParser responses and updated OmniParserService to process base64 images with coordinate calculations. Integrated OmniParser into chat and LLM-related workflows to enhance system prompts with parsing results. Refactored relevant components and ensured compatibility with existing logic. * Refactor OmniParser integration and standardize type usage Replaced `OmniParserResult` with `OmniParserResponse` across services to align with updated interfaces. Simplified and centralized OmniParser utilities, eliminating redundant functions and improving code reuse. Enhanced support for handling `OmniParserResponse` in different providers and adjusted related logic for consistency. * Refactor omni-parser integration in prompt system Update prompt logic to handle `omniParserResult` more consistently, replacing redundant utility calls and simplifying conditionals. This ensures cleaner code and improves maintainability across multiple LLM provider implementations.
1 parent eed2fdc commit 518101f

File tree

13 files changed

+260
-217
lines changed

13 files changed

+260
-217
lines changed

backend/src/controllers/chatController.ts

Lines changed: 24 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -4,16 +4,19 @@ import { StreamingSource } from "../types/stream.types";
44
import { TestcaseController } from "./testcaseController";
55
import { getLatestScreenshot, saveScreenshot } from "../utils/screenshotUtils";
66
import { ExploreActionTypes, Modes } from "../types";
7+
import omniParserService from "../services/OmniParserService";
8+
import { config } from "../config";
79

810
export class ChatController {
911
static async handleChatMessage(req: Request, res: Response): Promise<void> {
1012
try {
1113
// Get data from request body and query params
12-
const { message, history, omniParserResult } = req.body;
14+
const { message, history } = req.body;
1315
const folderPath = req.query.folderPath as string;
1416
const currentChatId = req.query.currentChatId as string;
1517
const source = req.query.source as StreamingSource | undefined;
1618
const saveScreenshots = req.query.saveScreenshots as string;
19+
const mode = req.query.mode as Modes;
1720
const type = req.query.type as ExploreActionTypes;
1821

1922
// Always reset and recreate the provider with the correct mode to prevent context bleed
@@ -25,28 +28,33 @@ export class ChatController {
2528
if (!message || !Array.isArray(history)) {
2629
res.status(400).json({
2730
status: "error",
28-
message: "Message and valid history array are required",
31+
message: "Message and valid history array are required"
2932
});
3033
return;
3134
}
3235

3336
// Get latest screenshot if available
3437
const latestScreenshot = await getLatestScreenshot(source);
38+
let omniParserResult = null;
39+
if (config.omniParser.enabled) {
40+
omniParserResult = await omniParserService.processImage(latestScreenshot.originalImage);
41+
console.log("OmniParser result:", omniParserResult);
42+
}
3543

3644
await Promise.all([
3745
folderPath &&
38-
saveScreenshots === "true" &&
39-
saveScreenshot(
40-
latestScreenshot,
41-
folderPath,
42-
currentChatId,
43-
),
46+
saveScreenshots === "true" &&
47+
saveScreenshot(
48+
latestScreenshot,
49+
folderPath,
50+
currentChatId
51+
),
4452
folderPath &&
45-
TestcaseController.downloadTestcase(
46-
history,
47-
currentChatId,
48-
folderPath
49-
),
53+
TestcaseController.downloadTestcase(
54+
history,
55+
currentChatId,
56+
folderPath
57+
),
5058
ChatService.processMessage(
5159
res,
5260
message,
@@ -55,22 +63,22 @@ export class ChatController {
5563
type,
5664
latestScreenshot,
5765
source,
58-
omniParserResult
66+
omniParserResult as any
5967
),
6068
]);
6169
} catch (error) {
6270
console.error("Chat message error:", error);
6371
res.status(500).json({
6472
status: "error",
65-
message: "Error processing chat message",
73+
message: "Error processing chat message"
6674
});
6775
}
6876
}
6977

7078
/**
 * Liveness endpoint: always replies with a fixed JSON payload.
 *
 * @param _req Incoming request (unused).
 * @param res  Response used to send the JSON body.
 */
static healthCheck(_req: Request, res: Response): void {
  const payload = {
    status: "ok",
    message: "Hurray.. Server is running!"
  };
  res.json(payload);
}
7684
}

backend/src/controllers/streamingController.ts

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,7 @@ export class StreamingController {
7373
action: "launch",
7474
url: "about:blank"
7575
};
76-
76+
7777
try {
7878
// This will simply validate the browser is available, or create a new one if not
7979
await this.streamingService.performAction(
@@ -84,17 +84,13 @@ export class StreamingController {
8484
console.error("Error ensuring browser is available:", browserError);
8585
// Continue anyway to try taking a screenshot
8686
}
87-
87+
8888
// Now attempt to take the screenshot
8989
const screenshot = await this.streamingService.takeScreenshot();
9090

9191
if (screenshot) {
92-
const imageBuffer = Buffer.from(
93-
screenshot.replace(/^data:image\/\w+;base64,/, ""),
94-
"base64",
95-
);
9692
const omniParserResults =
97-
await omniParserService.processImage(imageBuffer);
93+
await omniParserService.processImage(screenshot);
9894

9995
socket.emit("screenshot-snapshot", {
10096
image: screenshot,

backend/src/prompts/systemPrompts.prompt.ts

Lines changed: 30 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,32 +1,40 @@
11
import { StreamingSource } from "../types/stream.types";
22
import {
33
IClickableElement,
4-
IProcessedScreenshot,
4+
IProcessedScreenshot, OmniParserResponse
55
} from "../services/interfaces/BrowserService";
66
import { convertElementsToInput } from "../utils/prompt.util";
7+
import { addOmniParserResults } from "../utils/common.util";
78

89
const BASE_SYSTEM_PROMPT = (
9-
isMarkedScreenshotAvailable: boolean,
10+
isBrowser: boolean,
11+
omniParserResult: OmniParserResponse | null
1012
) => `You are factif-ai an AI agent experienced in web and mobile interface usage & testing.
1113
Make sure you understand the Environment Context. If the source is not provided, assume the default is Docker.
1214
${
13-
isMarkedScreenshotAvailable
15+
isBrowser || omniParserResult
1416
? `You will be provided with a marked screenshot where you can see elements that you can interact with and list of elements as element_list in the given format [marker_number]: html element tag details: [availability on the current viewport].
1517
Each mark in the screenshot have one unique number referred as marker_number. You are allowed to interact with marked elements only.`
1618
: ""
1719
}
1820
Scroll to explore more elements on the page if scroll is possible. Do not hallucinate.
1921
Understand the Task. split the task to steps and execute each step one by one.
2022
${
21-
isMarkedScreenshotAvailable
22-
? `
23-
Use element_list & marker_number to have an idea about available elements. Handle alert/confirmation popups if any.
24-
23+
isBrowser || omniParserResult ? `Use element_list & marker_number to have an idea about available elements. Handle alert/confirmation popups if any.` : ``
24+
}
25+
${
26+
isBrowser ? `
2527
example element_list:
2628
[0]: <button>Login</button>:[200,300]:[visible in the current viewport]
2729
[1]: <input type="text" placeholder="Username">:[125, 400]: [Not available in current viewport. Available on scroll]
2830
`
29-
: ""
31+
: omniParserResult ? `
32+
<element>
33+
<maker_number>marker number in the screenshot given</marker_number>
34+
<coordinates>center coordinate of the element. Use this value to interact with this element</coordinates>
35+
<content>text content of the element. such as label, description etc. Do not hallucinate on this. assume word by word meaning only</content>
36+
<is_intractable>boolean value denoting whether you can interact with this element or not</is_intractable>
37+
</element>` : ""
3038
}
3139
IMPORTANT: Before sending ANY response, you MUST verify it follows these rules:
3240
@@ -76,7 +84,7 @@ NEVER send a response with multiple tool uses.
7684
<action>click</action>
7785
<coordinate>450,300</coordinate>
7886
<about_this_action>Clicking on the username field</about_this_action>
79-
${isMarkedScreenshotAvailable ? `<marker_number>0<marker_number>` : ""}
87+
${isBrowser || omniParserResult ? `<marker_number>0<marker_number>` : ""}
8088
</perform_action>
8189
8290
3. Error Prevention
@@ -162,9 +170,9 @@ Parameters:
162170
- url: (optional) URL for 'launch' action
163171
* Example: <url>https://example.com</url>
164172
${
165-
isMarkedScreenshotAvailable
166-
? ``
167-
: `
173+
isBrowser || omniParserResult
174+
? ``
175+
: `
168176
- coordinate: (optional) X,Y coordinates for click/doubleClick
169177
* ONLY use coordinates from:
170178
1. Direct screenshot analysis with clear visual confirmation
@@ -174,7 +182,7 @@ Parameters:
174182
* For screenshot analysis: Describe element surroundings before identifying coordinates
175183
* For omni parser: Use provided formulas to calculate center coordinates
176184
* Example: <coordinate>450,300</coordinate>`
177-
}
185+
}
178186
- text: (optional) Text to type
179187
* Example: <text>Hello, world!</text>
180188
- key: (optional) Key to press
@@ -222,11 +230,11 @@ Usage:
222230
<perform_action>
223231
<action>Mandatory if the tool is action. action to perform. NEVER BE EMPTY</action>
224232
<url>URL to launch the browser at (optional) if action is launch then URL is mandatory</url>
225-
<coordinate>${isMarkedScreenshotAvailable ? `coordinate of the element in which the action has to perform. Coordinate will be available on the element list provided. NEVER BE EMPTY` : `x,y coordinates if the tool is click/doubleClick`}</coordinate>
233+
<coordinate>${isBrowser || omniParserResult ? `coordinate of the element in which the action has to perform. Coordinate will be available on the element list provided. NEVER BE EMPTY` : `x,y coordinates if the tool is click/doubleClick`}</coordinate>
226234
<text>provide text to type if the tool is type, key to press if the tool is keypress</text>
227235
<key>key to press if the tool is keypress</key>
228236
<about_this_action>any additional information you want to provide</about_this_action>
229-
${isMarkedScreenshotAvailable ? `<marker_number>Mandatory if the tool is action. NEVER BE EMPTY<marker_number>` : ""}
237+
${isBrowser || omniParserResult ? `<marker_number>Mandatory if the tool is action. NEVER BE EMPTY<marker_number>` : ""}
230238
</perform_action>
231239
232240
## ask_followup_question
@@ -259,17 +267,19 @@ Important Notes:
259267

260268
/**
 * Builds the system prompt sent to the LLM.
 *
 * The base prompt is generated from whether the source is the Puppeteer
 * browser and from the OmniParser result. When a source is given, an
 * "Environment Context" section is appended containing the source name and
 * whichever element lists are available.
 *
 * @param source           Streaming source; when omitted only the base prompt is returned.
 * @param omniParserResult Parsed screenshot elements from OmniParser, or null when unavailable.
 * @param imageData        Processed screenshot whose `inference` may hold clickable elements.
 * @returns The complete system prompt string.
 */
const getSystemPrompt = (
  source?: StreamingSource,
  omniParserResult: OmniParserResponse | null = null,
  imageData?: IProcessedScreenshot
): string => {
  const prompt = BASE_SYSTEM_PROMPT(
    source === "chrome-puppeteer",
    omniParserResult
  );

  if (!source) return prompt;

  // imageData is optional, so guard before reading `.length`; the previous
  // unguarded cast threw a TypeError whenever no screenshot data was passed.
  const clickableElements = imageData?.inference as
    | IClickableElement[]
    | undefined;
  const browserElementList =
    clickableElements && clickableElements.length > 0
      ? `element_list: \n${convertElementsToInput(clickableElements)}\n\n`
      : "";

  const omniElementList = omniParserResult
    ? `element_list: \n${addOmniParserResults(omniParserResult)}`
    : "";

  return `${prompt}\n\n# Environment Context\nSource: ${source}
${browserElementList}
${omniElementList}
To explore more use scroll_down or scroll_up based on your requirement.`;
};
275285

Lines changed: 53 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,39 +1,76 @@
11
import axios from "axios";
2-
import FormData from "form-data";
32
import { config } from "../config";
3+
import { OmniParserElement, OmniParserProcessedElement, OmniParserResponse } from './interfaces/BrowserService';
4+
import sharp from 'sharp';
45

56
/**
 * Client for an OmniParser server.
 *
 * Posts a base64-encoded screenshot to `<serverUrl>/parse/` and converts each
 * returned bounding box into a pixel coordinate string for the prompt layer.
 */
export class OmniParserService {
  serverUrl: string;
  enabled: boolean;

  constructor() {
    this.serverUrl = config.omniParser.serverUrl;
    this.enabled = config.omniParser.enabled;
  }

  /**
   * Sends a screenshot to OmniParser and returns the annotated image together
   * with the processed element list.
   *
   * @param base64Image Base64 image data; a leading `data:image/...;base64,`
   *                    prefix is tolerated and removed.
   * @returns The parsed response, or `null` when the service is disabled, the
   *          image is missing/empty, or any step of the processing fails.
   */
  async processImage(base64Image: string): Promise<OmniParserResponse | null> {
    // Callers may hand over a missing screenshot, so test for falsiness
    // instead of reading `.length` on a possibly-undefined value.
    if (!this.enabled || !base64Image) {
      return null;
    }

    // Screenshots may arrive as data URLs; the prefix is not base64 and would
    // corrupt both the local decode and the payload sent to the server.
    const payload = base64Image.replace(/^data:image\/\w+;base64,/, "");
    if (payload.length === 0) {
      return null;
    }

    try {
      // Decoding happens inside the try block so that an unreadable image is
      // reported and mapped to `null` like every other failure.
      const { width, height } = await sharp(
        Buffer.from(payload, "base64"),
      ).metadata();
      if (!width || !height) {
        throw new Error("Unable to read screenshot dimensions");
      }

      const url = `${this.serverUrl}/parse/`;
      const response = await axios.post(
        url,
        JSON.stringify({
          base64_image: payload,
        }),
        {
          headers: {
            "Content-Type": "application/json",
          },
        },
      );

      return {
        processedImage: response.data.som_image_base64,
        elements: this.elementCoordinateGenerate(
          response.data.parsed_content_list,
          height,
          width,
        ),
      };
    } catch (error: unknown) {
      console.error("OmniParser processing failed:", error);
      return null;
    }
  }

  /**
   * Maps raw OmniParser elements to their processed form, replacing the
   * bounding box with a pixel coordinate string.
   *
   * @param elementList Raw elements from the server response.
   * @param imageHeight Screenshot height in pixels.
   * @param imageWidth  Screenshot width in pixels.
   * @returns One processed element per input element, in the same order.
   */
  elementCoordinateGenerate(
    elementList: OmniParserElement[],
    imageHeight: number,
    imageWidth: number,
  ): OmniParserProcessedElement[] {
    return elementList.map((element) => ({
      content: element.content,
      interactivity: element.interactivity,
      coordinates: this.calculateCoordinate(
        element.bbox,
        imageHeight,
        imageWidth,
      ),
    }));
  }

  /**
   * Converts a bounding box into an `"x,y"` pixel coordinate string.
   *
   * Each bbox entry is scaled by the image width (indices 0 and 2) or height
   * (indices 1 and 3); the result is the floored midpoint of the scaled pairs.
   * NOTE(review): this is the box center only if bbox is
   * `[x1, y1, x2, y2]` expressed as fractions of the image size — confirm
   * against the OmniParser server output.
   *
   * @param bbox   Four-number bounding box.
   * @param height Image height in pixels.
   * @param width  Image width in pixels.
   */
  calculateCoordinate(bbox: number[], height: number, width: number): string {
    const left = width * bbox[0];
    const top = height * bbox[1];
    const right = width * bbox[2];
    const bottom = height * bbox[3];
    const centerX = Math.floor((left + right) / 2);
    const centerY = Math.floor((top + bottom) / 2);
    return `${centerX},${centerY}`;
  }
}
3875

3976
export default new OmniParserService();

backend/src/services/chatService.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ import { ExploreModeOpenAIProvider } from "./llm/ExploreModeOpenAIProvider";
1111
import { OpenAIProvider } from "./llm/OpenAIProvider";
1212
import { GeminiProvider } from "./llm/GeminiProvider";
1313
import { AnthropicProvider } from "./llm/AnthropicProvider";
14-
import { IProcessedScreenshot } from "./interfaces/BrowserService";
14+
import { IProcessedScreenshot, OmniParserResponse } from "./interfaces/BrowserService";
1515

1616
export class ChatService {
1717
private static provider: LLMProvider;
@@ -71,7 +71,7 @@ export class ChatService {
7171
type: ExploreActionTypes = ExploreActionTypes.EXPLORE,
7272
imageData: IProcessedScreenshot,
7373
source?: StreamingSource,
74-
omniParserResult?: OmniParserResult
74+
omniParserResult?: OmniParserResponse,
7575
): Promise<void> {
7676
// Get the current model based on provider
7777
const model = (() => {

backend/src/services/interfaces/BrowserService.ts

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,26 @@ export interface BrowserService {
1818
close(): Promise<ActionResult>;
1919
}
2020

21+
22+
/**
 * Raw element entry as received from the OmniParser server
 * (one item of the response's `parsed_content_list`).
 */
export interface OmniParserElement {
  /** Element category reported by the parser. */
  type: string;
  /**
   * Bounding box with four numbers; indices 0/2 are scaled by image width
   * and 1/3 by image height when coordinates are computed.
   */
  bbox: number[];
  /** Whether the element can be interacted with. */
  interactivity: boolean;
  /** Text content associated with the element. */
  content: string;
  /** Origin of the detection as reported by the parser. */
  source: string;
}
29+
30+
export interface OmniParserProcessedElement {
31+
interactivity: boolean;
32+
content: string;
33+
coordinates: string;
34+
}
35+
36+
export interface OmniParserResponse {
37+
processedImage: string;
38+
elements: OmniParserProcessedElement[];
39+
}
40+
2141
export type ActionType =
2242
| "click"
2343
| "type"

0 commit comments

Comments
 (0)