presidio-oss
diff --git a/‎backend/src/controllers/chatController.ts‎
Lines changed: 24 additions & 16 deletions b/‎backend/src/controllers/chatController.ts‎
Lines changed: 24 additions & 16 deletions
diff --git a/‎backend/src/controllers/streamingController.ts‎
Lines changed: 3 additions & 7 deletions b/‎backend/src/controllers/streamingController.ts‎
Lines changed: 3 additions & 7 deletions
diff --git a/‎backend/src/prompts/systemPrompts.prompt.ts‎
Lines changed: 30 additions & 20 deletions b/‎backend/src/prompts/systemPrompts.prompt.ts‎
Lines changed: 30 additions & 20 deletions
diff --git a/‎backend/src/services/OmniParserService.ts‎
Lines changed: 53 additions & 16 deletions b/‎backend/src/services/OmniParserService.ts‎
Lines changed: 53 additions & 16 deletions
diff --git a/‎backend/src/services/chatService.ts‎
Lines changed: 2 additions & 2 deletions b/‎backend/src/services/chatService.ts‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎backend/src/services/interfaces/BrowserService.ts‎
Lines changed: 20 additions & 0 deletions b/‎backend/src/services/interfaces/BrowserService.ts‎
Lines changed: 20 additions & 0 deletions
@@ -4,16 +4,19 @@ import { StreamingSource } from "../types/stream.types";
 import { TestcaseController } from "./testcaseController";
 import { getLatestScreenshot, saveScreenshot } from "../utils/screenshotUtils";
 import { ExploreActionTypes, Modes } from "../types";
+import omniParserService from "../services/OmniParserService";
+import { config } from "../config";
 
 export class ChatController {
   static async handleChatMessage(req: Request, res: Response): Promise<void> {
     try {
       // Get data from request body and query params
-      const { message, history, omniParserResult } = req.body;
+      const { message, history } = req.body;
       const folderPath = req.query.folderPath as string;
       const currentChatId = req.query.currentChatId as string;
       const source = req.query.source as StreamingSource | undefined;
       const saveScreenshots = req.query.saveScreenshots as string;
+      const mode = req.query.mode as Modes;
       const type = req.query.type as ExploreActionTypes;
 
       // Always reset and recreate the provider with the correct mode to prevent context bleed
@@ -25,28 +28,33 @@ export class ChatController {
       if (!message || !Array.isArray(history)) {
         res.status(400).json({
           status: "error",
-          message: "Message and valid history array are required",
+          message: "Message and valid history array are required"
         });
         return;
       }
 
       // Get latest screenshot if available
       const latestScreenshot = await getLatestScreenshot(source);
+      let omniParserResult = null;
+      if (config.omniParser.enabled) {
+        omniParserResult = await omniParserService.processImage(latestScreenshot.originalImage);
+        console.log("OmniParser result:", omniParserResult);
+      }
 
       await Promise.all([
         folderPath &&
-          saveScreenshots === "true" &&
-          saveScreenshot(
-            latestScreenshot,
-            folderPath,
-            currentChatId,
-          ),
+        saveScreenshots === "true" &&
+        saveScreenshot(
+          latestScreenshot,
+          folderPath,
+          currentChatId
+        ),
         folderPath &&
-          TestcaseController.downloadTestcase(
-            history,
-            currentChatId,
-            folderPath
-          ),
+        TestcaseController.downloadTestcase(
+          history,
+          currentChatId,
+          folderPath
+        ),
         ChatService.processMessage(
           res,
           message,
@@ -55,22 +63,22 @@ export class ChatController {
           type,
           latestScreenshot,
           source,
-          omniParserResult
+          omniParserResult as any
         ),
       ]);
     } catch (error) {
       console.error("Chat message error:", error);
       res.status(500).json({
         status: "error",
-        message: "Error processing chat message",
+        message: "Error processing chat message"
       });
     }
   }
 
   static healthCheck(_req: Request, res: Response): void {
     res.json({
       status: "ok",
-      message: "Hurray.. Server is running!",
+      message: "Hurray.. Server is running!"
     });
   }
 }
@@ -73,7 +73,7 @@ export class StreamingController {
         action: "launch",
         url: "about:blank"
       };
-      
+
       try {
         // This will simply validate the browser is available, or create a new one if not
         await this.streamingService.performAction(
@@ -84,17 +84,13 @@ export class StreamingController {
         console.error("Error ensuring browser is available:", browserError);
         // Continue anyway to try taking a screenshot
       }
-      
+
       // Now attempt to take the screenshot
       const screenshot = await this.streamingService.takeScreenshot();
 
       if (screenshot) {
-        const imageBuffer = Buffer.from(
-          screenshot.replace(/^data:image\/\w+;base64,/, ""),
-          "base64",
-        );
         const omniParserResults =
-          await omniParserService.processImage(imageBuffer);
+          await omniParserService.processImage(screenshot);
 
         socket.emit("screenshot-snapshot", {
           image: screenshot,
 
@@ -1,32 +1,40 @@
 import { StreamingSource } from "../types/stream.types";
 import {
   IClickableElement,
-  IProcessedScreenshot,
+  IProcessedScreenshot, OmniParserResponse
 } from "../services/interfaces/BrowserService";
 import { convertElementsToInput } from "../utils/prompt.util";
+import { addOmniParserResults } from "../utils/common.util";
 
 const BASE_SYSTEM_PROMPT = (
-  isMarkedScreenshotAvailable: boolean,
+  isBrowser: boolean,
+  omniParserResult: OmniParserResponse | null
 ) => `You are factif-ai an AI agent experienced in web and mobile interface usage & testing.
 Make sure you understand the Environment Context. If the source is not provided, assume the default is Docker.
 ${
-  isMarkedScreenshotAvailable
+  isBrowser || omniParserResult
     ? `You will be provided with a marked screenshot where you can see elements that you can interact with and list of elements as element_list in the given format [marker_number]: html element tag details: [availability on the current viewport]. 
 Each mark in the screenshot have one unique number referred as marker_number. You are allowed to interact with marked elements only.`
     : ""
 }
 Scroll to explore more elements on the page if scroll is possible. Do not hallucinate.
 Understand the Task. split the task to steps and execute each step one by one.
 ${
-  isMarkedScreenshotAvailable
-    ? `
-Use element_list & marker_number to have an idea about available elements. Handle alert/confirmation popups if any.
-
+  isBrowser || omniParserResult ? `Use element_list & marker_number to have an idea about available elements. Handle alert/confirmation popups if any.` : ``
+}
+${
+  isBrowser ? `
 example element_list: 
 [0]: <button>Login</button>:[200,300]:[visible in the current viewport] 
 [1]: <input type="text" placeholder="Username">:[125, 400]: [Not available in current viewport. Available on scroll]
 `
-    : ""
+    : omniParserResult ? `
+    <element>
+<marker_number>marker number in the screenshot given</marker_number>
+<coordinates>center coordinate of the element. Use this value to interact with this element</coordinates>
+<content>text content of the element. such as label, description etc. Do not hallucinate on this. assume word by word meaning only</content>
+<is_intractable>boolean value denoting whether you can interact with this element or not</is_intractable>
+</element>` : ""
 }
 IMPORTANT: Before sending ANY response, you MUST verify it follows these rules:
 
@@ -76,7 +84,7 @@ NEVER send a response with multiple tool uses.
       <action>click</action>
       <coordinate>450,300</coordinate>
       <about_this_action>Clicking on the username field</about_this_action>
-      ${isMarkedScreenshotAvailable ? `<marker_number>0<marker_number>` : ""}
+      ${isBrowser || omniParserResult ? `<marker_number>0</marker_number>` : ""}
     </perform_action>
 
 3. Error Prevention
@@ -162,9 +170,9 @@ Parameters:
 - url: (optional) URL for 'launch' action
     * Example: <url>https://example.com</url>
     ${
-      isMarkedScreenshotAvailable
-        ? ``
-        : `
+  isBrowser || omniParserResult
+    ? ``
+    : `
  - coordinate: (optional) X,Y coordinates for click/doubleClick
     * ONLY use coordinates from:
       1. Direct screenshot analysis with clear visual confirmation
@@ -174,7 +182,7 @@ Parameters:
     * For screenshot analysis: Describe element surroundings before identifying coordinates
     * For omni parser: Use provided formulas to calculate center coordinates
     * Example: <coordinate>450,300</coordinate>`
-    }
+}
 - text: (optional) Text to type
     * Example: <text>Hello, world!</text>
 - key: (optional) Key to press
@@ -222,11 +230,11 @@ Usage:
     <perform_action>
       <action>Mandatory if the tool is action. action to perform. NEVER BE EMPTY</action>
       <url>URL to launch the browser at (optional) if action is launch then URL is mandatory</url>
-      <coordinate>${isMarkedScreenshotAvailable ? `coordinate of the element in which the action has to perform. Coordinate will be available on the element list provided. NEVER BE EMPTY` : `x,y coordinates if the tool is click/doubleClick`}</coordinate>
+      <coordinate>${isBrowser || omniParserResult ? `coordinate of the element in which the action has to perform. Coordinate will be available on the element list provided. NEVER BE EMPTY` : `x,y coordinates if the tool is click/doubleClick`}</coordinate>
       <text>provide text to type if the tool is type, key to press if the tool is keypress</text>
       <key>key to press if the tool is keypress</key>
       <about_this_action>any additional information you want to provide</about_this_action>
-      ${isMarkedScreenshotAvailable ? `<marker_number>Mandatory if the tool is action. NEVER BE EMPTY<marker_number>` : ""}
+      ${isBrowser || omniParserResult ? `<marker_number>Mandatory if the tool is action. NEVER BE EMPTY</marker_number>` : ""}
     </perform_action>
 
 ## ask_followup_question
@@ -259,17 +267,19 @@ Important Notes:
 
 const getSystemPrompt = (
   source?: StreamingSource,
-  hasOmniParserResults: boolean = false,
-  imageData?: IProcessedScreenshot,
+  omniParserResult: OmniParserResponse | null = null,
+  imageData?: IProcessedScreenshot
 ): string => {
-  const isMarkedScreenshotAvailable =
-    hasOmniParserResults || source === "chrome-puppeteer";
-  let prompt = BASE_SYSTEM_PROMPT(isMarkedScreenshotAvailable);
+
+  let prompt = BASE_SYSTEM_PROMPT(source === "chrome-puppeteer",
+    omniParserResult
+  );
 
   if (!source) return prompt;
 
   return `${prompt}\n\n# Environment Context\nSource: ${source}
   ${(imageData?.inference as IClickableElement[]).length > 0 ? `element_list: \n${convertElementsToInput(imageData?.inference as IClickableElement[])}\n\n` : ""}
+  ${omniParserResult ? `element_list: \n${addOmniParserResults(omniParserResult)}`: ''}
    To explore more use scroll_down or scroll_up based on your requirement.`;
 };
 
 
@@ -1,39 +1,76 @@
 import axios from "axios";
-import FormData from "form-data";
 import { config } from "../config";
+import { OmniParserElement, OmniParserProcessedElement, OmniParserResponse } from './interfaces/BrowserService';
+import sharp from 'sharp';
 
 export class OmniParserService {
-  private serverUrl: string;
-  private enabled: boolean;
+  serverUrl: string;
+  enabled: boolean;
 
   constructor() {
     this.serverUrl = config.omniParser.serverUrl;
     this.enabled = config.omniParser.enabled;
   }
 
-  async processImage(imageBuffer: Buffer): Promise<any> {
-    if (!this.enabled) {
+  /** Parses a screenshot (raw base64 or data URL) via OmniParser; resolves to null when disabled, empty, or on failure. */
+  async processImage(base64Image: string): Promise<OmniParserResponse | null> {
+    if (!this.enabled || base64Image.length === 0) {
       return null;
     }
 
-    const formData = new FormData();
-    formData.append("image", imageBuffer, {
-      filename: "screenshot.png",
-      contentType: "image/png",
-    });
+    // Callers may pass a data URL; sharp and the /parse/ payload both need the
+    // bare base64, so drop any "data:image/...;base64," prefix first.
+    const imageBase64 = base64Image.replace(/^data:image\/\w+;base64,/, '');
 
     try {
-      const response = await axios.post(`${this.serverUrl}/process`, formData, {
-        headers: {
-          ...formData.getHeaders(),
+      // Decode inside the try so an unreadable image is logged and yields null.
+      const metadata = await sharp(Buffer.from(imageBase64, 'base64')).metadata();
+      const response = await axios.post(
+        `${this.serverUrl}/parse/`,
+        { base64_image: imageBase64 },
+        {
+          headers: { 'Content-Type': 'application/json' },
         },
-      });
-      return response.data;
+      );
+      return {
+        processedImage: response.data.som_image_base64,
+        elements: this.elementCoordinateGenerate(
+          response.data.parsed_content_list,
+          metadata.height as number,
+          metadata.width as number,
+        ),
+      };
     } catch (error: any) {
-      console.error("OmniParser processing failed:", error);
+      console.error('OmniParser processing failed:', error);
       return null;
     }
   }
+
+  /**
+   * Converts raw parser elements into OmniParserProcessedElement entries.
+   * content and interactivity are copied as-is; bbox is replaced by the
+   * "x,y" string that calculateCoordinate derives for the given image size.
+   * Other fields of the raw element (type, source) are dropped.
+   */
+  elementCoordinateGenerate(
+    elementList: OmniParserElement[],
+    imageHeight: number,
+    imageWidth: number,
+  ): OmniParserProcessedElement[] {
+    const toProcessed = ({ content, interactivity, bbox }: OmniParserElement) => {
+      const coordinates = this.calculateCoordinate(bbox, imageHeight, imageWidth);
+      return { content, interactivity, coordinates };
+    };
+    return elementList.map(toProcessed);
+  }
+
+  /** Returns "x,y", the floored midpoint of the box; treats bbox as [x1, y1, x2, y2] fractions of the image size (TODO confirm against parser output). */
+  calculateCoordinate(bbox: number[], height: number, width: number): string {
+    const [x1, y1, x2, y2] = bbox;
+    const centerX = Math.floor((width * x1 + width * x2) / 2);
+    const centerY = Math.floor((height * y1 + height * y2) / 2);
+    return `${centerX},${centerY}`;
+  }
 }
 
 export default new OmniParserService();
@@ -11,7 +11,7 @@ import { ExploreModeOpenAIProvider } from "./llm/ExploreModeOpenAIProvider";
 import { OpenAIProvider } from "./llm/OpenAIProvider";
 import { GeminiProvider } from "./llm/GeminiProvider";
 import { AnthropicProvider } from "./llm/AnthropicProvider";
-import { IProcessedScreenshot } from "./interfaces/BrowserService";
+import { IProcessedScreenshot, OmniParserResponse } from "./interfaces/BrowserService";
 
 export class ChatService {
   private static provider: LLMProvider;
@@ -71,7 +71,7 @@ export class ChatService {
     type: ExploreActionTypes = ExploreActionTypes.EXPLORE,
     imageData: IProcessedScreenshot,
     source?: StreamingSource,
-    omniParserResult?: OmniParserResult
+    omniParserResult?: OmniParserResponse,
   ): Promise<void> {
     // Get the current model based on provider
     const model = (() => {
 
@@ -18,6 +18,26 @@ export interface BrowserService {
   close(): Promise<ActionResult>;
 }
 
+
+export interface OmniParserElement { // raw entry of the parser response's parsed_content_list
+  type: string;
+  bbox: number[]; // indices 0-3 are read by calculateCoordinate and scaled by image width/height
+  interactivity: boolean;
+  content: string;
+  source: string;
+}
+
+export interface OmniParserProcessedElement { // built by OmniParserService.elementCoordinateGenerate
+  interactivity: boolean;
+  content: string;
+  coordinates: string; // "x,y" string produced by calculateCoordinate
+}
+
+export interface OmniParserResponse { // value resolved by OmniParserService.processImage on success
+  processedImage: string; // taken from the parser response's som_image_base64 field
+  elements: OmniParserProcessedElement[];
+}
+
 export type ActionType =
   | "click"
   | "type"