
Commit 195eaad

feat(cache): supports ui-tars model caching capability (#361)
1 parent 9d5f2fb · commit 195eaad

File tree

13 files changed: +221, -61 lines

README.md (+1)

```diff
@@ -46,6 +46,7 @@ From version v0.10.0, we support a new open-source model named [`UI-TARS`](https
 - **Support Private Deployment 🤖**: Supports private deployment of [`UI-TARS`](https://github.com/bytedance/ui-tars) model, which outperforms closed-source models like GPT-4o and Claude in UI automation scenarios while better protecting data security.
 - **Support General Models 🌟**: Supports general large models like GPT-4o and Claude, adapting to various scenario needs.
 - **Visual Reports for Debugging 🎞️**: Through our test reports and Playground, you can easily understand, replay and debug the entire process.
+- **Support Caching 🔄**: The first time you execute a task through AI, it will be cached, and subsequent executions of the same task will significantly improve execution efficiency.
 - **Completely Open Source 🔥**: Experience a whole new automation development experience, enjoy!
 - **Understand UI, JSON Format Responses 🔍**: You can specify data format requirements and receive responses in JSON format.
 - **Intuitive Assertions 🤔**: Express your assertions in natural language, and AI will understand and process them.
```

README.zh.md (+1)

```diff
@@ -47,6 +47,7 @@ Midscene.js 让 AI 成为你的浏览器操作员 🤖。只需用自然语言
 - **支持私有化部署 🤖**:支持私有化部署 [`UI-TARS`](https://github.com/bytedance/ui-tars) 模型,相比 GPT-4o、Claude 等闭源模型,不仅在 UI 自动化场景下表现更加出色,还能更好地保护数据安全。
 - **支持通用模型 🌟**:支持 GPT-4o、Claude 等通用大模型,适配多种场景需求。
 - **用可视化报告来调试 🎞️**:通过我们的测试报告和 Playground,你可以轻松理解、回放和调试整个过程。
+- **支持缓存 🔄**:首次通过 AI 执行后任务会被缓存,后续执行相同任务时可显著提升执行效率。
 - **完全开源 🔥**:体验全新的自动化开发体验,尽情享受吧!
 - **理解UI、JSON格式回答 🔍**:你可以提出关于数据格式的要求,然后得到 JSON 格式的预期回应。
 - **直观断言 🤔**:用自然语言表达你的断言,AI 会理解并处理。
```

apps/site/docs/en/caching.md → apps/site/docs/en/caching.mdx (+38, -5)

````diff
@@ -8,14 +8,47 @@ Currently, the caching capability is supported in all scenarios, and Midscene ca
 
 **Usage**
 
-```diff
-- playwright test --config=playwright.config.ts
-+ MIDSCENE_CACHE=true playwright test --config=playwright.config.ts
-```
+
+import { Tab, Tabs } from 'rspress/theme';
+
+<Tabs>
+<Tab label="Playwright">
+```diff
+- playwright test --config=playwright.config.ts
++ MIDSCENE_CACHE=true playwright test --config=playwright.config.ts
+```
+</Tab>
+<Tab label="Puppeteer">
+```diff
+- tsx demo.ts
++ MIDSCENE_CACHE=true tsx demo.ts
+```
+
+```javascript
+const mid = new PuppeteerAgent(originPage, {
+  cacheId: 'puppeteer-swag-sab', // Add cache id
+});
+```
+</Tab>
+<Tab label="Bridge Mode">
+```diff
+- tsx demo.ts
++ MIDSCENE_CACHE=true tsx demo.ts
+```
+
+```javascript
+const agent = new AgentOverChromeBridge({
+  cacheId: 'star-midscene-github', // Add cache id
+});
+```
+</Tab>
+</Tabs>
+
+
 
 **Effect**
 
-After enabling the cache, the execution time is significantly reduced, for example, from 1m16s to 23s.
+After enabling the cache, the execution time is significantly reduced, for example, from 39s to 13s.
 
 * **before**
 
````
*(Two image attachments of ~860 KB each appear at this point in the original diff view; they are not rendered here.)*
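To make the Puppeteer tab above concrete, here is a minimal end-to-end sketch. The import path, target URL, and `cacheId` value are illustrative assumptions rather than content from this commit.

```typescript
// demo.ts — run with: MIDSCENE_CACHE=true tsx demo.ts
// Minimal sketch; assumes PuppeteerAgent is exported from '@midscene/web/puppeteer'.
import puppeteer from 'puppeteer';
import { PuppeteerAgent } from '@midscene/web/puppeteer';

(async () => {
  const browser = await puppeteer.launch();
  const originPage = await browser.newPage();
  await originPage.goto('https://www.saucedemo.com'); // illustrative target page

  const mid = new PuppeteerAgent(originPage, {
    cacheId: 'puppeteer-swag-demo', // hypothetical id; any stable string keyed to this script works
  });

  // The first run asks the model to plan and locate, then persists the results;
  // later runs with MIDSCENE_CACHE=true replay the cached steps instead.
  await mid.aiAction('type "standard_user" in the username field');

  await browser.close();
})();
```

The same `MIDSCENE_CACHE=true` switch drives the Playwright and Bridge Mode tabs; only the `cacheId` wiring differs.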

apps/site/docs/zh/caching.md → apps/site/docs/zh/caching.mdx (+37, -5)

````diff
@@ -8,14 +8,46 @@ Midscene.js 提供了 AI 缓存能力,用于提升整个 AI 执行过程的稳
 
 **使用方式**
 
-```diff
-- playwright test --config=playwright.config.ts
-+ MIDSCENE_CACHE=true playwright test --config=playwright.config.ts
-```
+
+import { Tab, Tabs } from 'rspress/theme';
+
+<Tabs>
+<Tab label="Playwright">
+```diff
+- playwright test --config=playwright.config.ts
++ MIDSCENE_CACHE=true playwright test --config=playwright.config.ts
+```
+</Tab>
+<Tab label="Puppeteer">
+```diff
+- tsx demo.ts
++ MIDSCENE_CACHE=true tsx demo.ts
+```
+
+```javascript
+const mid = new PuppeteerAgent(originPage, {
+  cacheId: 'puppeteer-swag-sab', // Add cache id
+});
+```
+</Tab>
+<Tab label="Bridge Mode">
+```diff
+- tsx demo.ts
++ MIDSCENE_CACHE=true tsx demo.ts
+```
+
+```javascript
+const agent = new AgentOverChromeBridge({
+  cacheId: 'star-midscene-github', // Add cache id
+});
+```
+</Tab>
+</Tabs>
+
 
 **使用效果**
 
-通过引入缓存后,用例的执行时间大幅降低了,例如从1分16秒降低到了23秒
+通过引入缓存后,用例的执行时间大幅降低了,例如从39秒降低到了13秒
 
 * **before**
 
````

packages/web-integration/package.json (+1, -1)

```diff
@@ -106,7 +106,7 @@
     "test:u": "vitest --run -u",
     "test:ai": "AI_TEST_TYPE=web npm run test",
     "test:ai:temp": "AI_TEST_TYPE=web BRIDGE_MODE=true vitest --run tests/ai/bridge/temp.test.ts",
-    "test:ai:bridge": "BRIDGE_MODE=true npm run test --inspect tests/ai/bridge/agent.test.ts",
+    "test:ai:bridge": "MIDSCENE_CACHE=true BRIDGE_MODE=true AI_TEST_TYPE=web npm run test --inspect tests/ai/bridge/temp.test.ts",
     "test:ai:cache": "MIDSCENE_CACHE=true AI_TEST_TYPE=web npm run test",
     "test:ai:all": "npm run test:ai:web && npm run test:ai:native",
     "test:ai:native": "MIDSCENE_CACHE=true AI_TEST_TYPE=native npm run test",
```

packages/web-integration/src/bridge-mode/page-browser-side.ts (+8, -2)

```diff
@@ -103,7 +103,9 @@ export class ChromeExtensionPageBrowserSide extends ChromeExtensionProxyPage {
 
   public async connectNewTabWithUrl(
     url: string,
-    options?: BridgeConnectTabOptions,
+    options: BridgeConnectTabOptions = {
+      trackingActiveTab: true,
+    },
   ) {
     const tab = await chrome.tabs.create({ url });
     const tabId = tab.id;
@@ -117,7 +119,11 @@ export class ChromeExtensionPageBrowserSide extends ChromeExtensionProxyPage {
     }
   }
 
-  public async connectCurrentTab(options?: BridgeConnectTabOptions) {
+  public async connectCurrentTab(
+    options: BridgeConnectTabOptions = {
+      trackingActiveTab: true,
+    },
+  ) {
     const tabs = await chrome.tabs.query({ active: true, currentWindow: true });
     console.log('current tab', tabs);
     const tabId = tabs[0]?.id;
```
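The behavioral change here is that omitting `options` now opts callers into active-tab tracking. A self-contained sketch of the default-parameter pattern (stand-in types, not the real bridge classes):

```typescript
// Stand-in types that mirror the pattern in the diff above; BridgeConnectTabOptions
// and DemoPage are hypothetical, defined only to show the new default behavior.
interface BridgeConnectTabOptions {
  trackingActiveTab?: boolean;
}

class DemoPage {
  // The options parameter now defaults to { trackingActiveTab: true }
  // instead of being optional and therefore undefined when omitted.
  async connectCurrentTab(
    options: BridgeConnectTabOptions = { trackingActiveTab: true },
  ) {
    console.log('trackingActiveTab =', options.trackingActiveTab);
  }
}

(async () => {
  const page = new DemoPage();
  await page.connectCurrentTab(); // logs true: the new default
  await page.connectCurrentTab({ trackingActiveTab: false }); // explicit opt-out
})();
```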

packages/web-integration/src/chrome-extension/page.ts (+3)

```diff
@@ -23,10 +23,13 @@ function sleep(ms: number) {
   return new Promise((resolve) => setTimeout(resolve, ms));
 }
 
+declare const __VERSION__: string;
+
 export default class ChromeExtensionProxyPage implements AbstractPage {
   pageType = 'chrome-extension-proxy';
 
   public trackingActiveTab: boolean;
+  private version: string = __VERSION__;
 
   private viewportSize?: Size;
```
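`declare const __VERSION__: string` only satisfies the type checker; the actual value has to be injected when the bundle is built. As one hedged illustration (not necessarily this repository's real build setup), a bundler define step can supply it:

```typescript
// build.ts — illustrative only: one common way to provide the __VERSION__ global.
import { build } from 'esbuild';
import { readFileSync } from 'node:fs';

const pkg = JSON.parse(readFileSync('./package.json', 'utf8'));

await build({
  entryPoints: ['src/chrome-extension/page.ts'], // entry path assumed for the example
  bundle: true,
  outdir: 'dist',
  // Every occurrence of __VERSION__ is replaced with the package version string,
  // which is what backs `declare const __VERSION__: string` at runtime.
  define: { __VERSION__: JSON.stringify(pkg.version) },
});
```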

packages/web-integration/src/common/task-cache.ts (+59, -9)

```diff
@@ -1,6 +1,7 @@
 import { existsSync, readFileSync } from 'node:fs';
 import { join } from 'node:path';
 import type { AIElementIdResponse, PlanningAIResponse } from '@midscene/core';
+import type { vlmPlanning } from '@midscene/core/ai-model';
 import { getAIConfig } from '@midscene/core/env';
 import {
   getLogDirByType,
@@ -24,6 +25,19 @@ export type PlanTask = {
   response: PlanningAIResponse;
 };
 
+export type UITarsPlanTask = {
+  type: 'ui-tars-plan';
+  prompt: string;
+  pageContext: {
+    url: string;
+    size: {
+      width: number;
+      height: number;
+    };
+  };
+  response: Awaited<ReturnType<typeof vlmPlanning>>;
+};
+
 export type LocateTask = {
   type: 'locate';
   prompt: string;
@@ -37,7 +51,7 @@
   response: AIElementIdResponse;
 };
 
-export type AiTasks = Array<PlanTask | LocateTask>;
+export type AiTasks = Array<PlanTask | LocateTask | UITarsPlanTask>;
 
 export type AiTaskCache = {
   aiTasks: Array<{
@@ -46,6 +60,19 @@
   }>;
 };
 
+export type CacheGroup = {
+  readCache: <T extends 'plan' | 'locate' | 'ui-tars-plan'>(
+    pageContext: WebUIContext,
+    type: T,
+    actionPrompt: string,
+  ) => T extends 'plan'
+    ? PlanTask['response']
+    : T extends 'locate'
+      ? LocateTask['response']
+      : UITarsPlanTask['response'];
+  saveCache: (cache: UITarsPlanTask | PlanTask | LocateTask) => void;
+};
+
 export class TaskCache {
   cache: AiTaskCache;
 
@@ -66,7 +93,7 @@
     };
   }
 
-  getCacheGroupByPrompt(aiActionPrompt: string) {
+  getCacheGroupByPrompt(aiActionPrompt: string): CacheGroup {
     const { aiTasks = [] } = this.cache || { aiTasks: [] };
     const index = aiTasks.findIndex((item) => item.prompt === aiActionPrompt);
     const newCacheGroup: AiTasks = [];
@@ -75,30 +102,43 @@
       tasks: newCacheGroup,
     });
     return {
-      readCache: <T extends 'plan' | 'locate'>(
+      readCache: <T extends 'plan' | 'locate' | 'ui-tars-plan'>(
        pageContext: WebUIContext,
        type: T,
        actionPrompt: string,
      ) => {
        if (index === -1) {
-          return false;
+          return false as any;
        }
        if (type === 'plan') {
          return this.readCache(
            pageContext,
            type,
            actionPrompt,
            aiTasks[index].tasks,
-          ) as T extends 'plan' ? PlanTask['response'] : LocateTask['response'];
+          ) as PlanTask['response'];
+        }
+        if (type === 'ui-tars-plan') {
+          return this.readCache(
+            pageContext,
+            type,
+            actionPrompt,
+            aiTasks[index].tasks,
+          ) as UITarsPlanTask['response'];
        }
+
        return this.readCache(
          pageContext,
          type,
          actionPrompt,
          aiTasks[index].tasks,
-        ) as T extends 'plan' ? PlanTask['response'] : LocateTask['response'];
+        ) as T extends 'plan'
+          ? PlanTask['response']
+          : T extends 'locate'
+            ? LocateTask['response']
+            : UITarsPlanTask['response'];
      },
-      saveCache: (cache: PlanTask | LocateTask) => {
+      saveCache: (cache: PlanTask | LocateTask | UITarsPlanTask) => {
        newCacheGroup.push(cache);
        this.writeCacheToFile();
      },
@@ -127,6 +167,12 @@
     userPrompt: string,
     cacheGroup: AiTasks,
   ): PlanTask['response'];
+  readCache(
+    pageContext: WebUIContext,
+    type: 'ui-tars-plan',
+    userPrompt: string,
+    cacheGroup: AiTasks,
+  ): UITarsPlanTask['response'];
   readCache(
     pageContext: WebUIContext,
     type: 'locate',
@@ -135,10 +181,14 @@
   ): LocateTask['response'];
   readCache(
     pageContext: WebUIContext,
-    type: 'plan' | 'locate',
+    type: 'plan' | 'locate' | 'ui-tars-plan',
     userPrompt: string,
     cacheGroup: AiTasks,
-  ): PlanTask['response'] | LocateTask['response'] | false {
+  ):
+    | PlanTask['response']
+    | LocateTask['response']
+    | UITarsPlanTask['response']
+    | false {
     if (cacheGroup.length > 0) {
       const index = cacheGroup.findIndex((item) => item.prompt === userPrompt);
 
```
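Taken together, `CacheGroup` gives callers a typed read/write pair scoped to one action prompt. A hypothetical consumer might look like this; the instance setup and the `WebUIContext` import path are assumptions, not code from this commit.

```typescript
// Hypothetical consumer of the CacheGroup API introduced above.
import type { TaskCache } from './task-cache';
import type { WebUIContext } from './utils'; // assumed import path

declare const taskCache: TaskCache; // assume an instance created elsewhere
declare const pageContext: WebUIContext; // assume an existing UI context

const cacheGroup = taskCache.getCacheGroupByPrompt('click the login button');

// The generic parameter narrows the return type per task kind: 'ui-tars-plan'
// yields the vlmPlanning result, while 'plan' and 'locate' keep their shapes.
const hit = cacheGroup.readCache(pageContext, 'ui-tars-plan', 'click the login button');

if (hit) {
  // Reuse hit.actions / hit.action_summary without calling the model.
} else {
  // On a miss, compute via vlmPlanning, then persist for the next run:
  // cacheGroup.saveCache({ type: 'ui-tars-plan', prompt, pageContext: { url, size }, response });
}
```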

packages/web-integration/src/common/tasks.ts (+36, -8)

```diff
@@ -591,7 +591,10 @@ export class PageTaskExecutor {
     return task;
   }
 
-  private planningTaskToGoal(userPrompt: string) {
+  private planningTaskToGoal(
+    userPrompt: string,
+    cacheGroup: ReturnType<TaskCache['getCacheGroupByPrompt']>,
+  ) {
     const task: ExecutionTaskPlanningApply = {
       type: 'Planning',
       locate: null,
@@ -621,10 +624,30 @@
         ],
       });
       const startTime = Date.now();
-      const planResult = await vlmPlanning({
-        userInstruction: param.userPrompt,
-        conversationHistory: this.conversationHistory,
-        size: pageContext.size,
+
+      const planCache = cacheGroup.readCache(
+        pageContext,
+        'ui-tars-plan',
+        userPrompt,
+      );
+      let planResult: Awaited<ReturnType<typeof vlmPlanning>>;
+      if (planCache) {
+        planResult = planCache;
+      } else {
+        planResult = await vlmPlanning({
+          userInstruction: param.userPrompt,
+          conversationHistory: this.conversationHistory,
+          size: pageContext.size,
+        });
+      }
+      cacheGroup.saveCache({
+        type: 'ui-tars-plan',
+        pageContext: {
+          url: pageContext.url,
+          size: pageContext.size,
+        },
+        prompt: userPrompt,
+        response: planResult,
       });
       const aiCost = Date.now() - startTime;
       const { actions, action_summary } = planResult;
@@ -643,6 +666,9 @@
             whatHaveDone: '',
           },
         },
+        cache: {
+          hit: Boolean(planCache),
+        },
         aiCost,
       };
     },
@@ -738,15 +764,17 @@
       onTaskStart: options?.onTaskStart,
     });
     this.conversationHistory = [];
-
+    const cacheGroup = this.taskCache.getCacheGroupByPrompt(userPrompt);
     const isCompleted = false;
     let currentActionNumber = 0;
     const maxActionNumber = 20;
 
     while (!isCompleted && currentActionNumber < maxActionNumber) {
       currentActionNumber++;
-      const planningTask: ExecutionTaskPlanningApply =
-        this.planningTaskToGoal(userPrompt);
+      const planningTask: ExecutionTaskPlanningApply = this.planningTaskToGoal(
+        userPrompt,
+        cacheGroup,
+      );
       await taskExecutor.append(planningTask);
       const output = await taskExecutor.flush();
       if (taskExecutor.isInErrorState()) {
```
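Stripped of the surrounding executor plumbing, the planning step now follows a read-through cache pattern and reports whether it hit. A simplified, self-contained sketch of that flow; the types and callbacks are stand-ins, not the committed code:

```typescript
// PlanResult is a simplified stand-in for Awaited<ReturnType<typeof vlmPlanning>>.
type PlanResult = { actions: unknown[]; action_summary: string };

async function planWithCache(
  readCache: () => PlanResult | false, // stands in for cacheGroup.readCache(...)
  computePlan: () => Promise<PlanResult>, // stands in for vlmPlanning({ ... })
  saveCache: (response: PlanResult) => void, // stands in for cacheGroup.saveCache({ ... })
): Promise<{ result: PlanResult; cacheHit: boolean }> {
  const cached = readCache(); // 1. try the persisted plan first
  const result = cached || (await computePlan()); // 2. fall back to the model on a miss
  saveCache(result); // 3. write back, as the diff does on every pass
  return { result, cacheHit: Boolean(cached) }; // 4. the hit flag feeds the task's cache field
}
```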
