shareAI-lab
diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
Lines changed: 36 additions & 5 deletions
diff --git a/docs/en/guides/benchmarking.md b/docs/en/guides/benchmarking.md
Lines changed: 26 additions & 6 deletions
diff --git a/docs/zh-CN/guides/benchmarking.md b/docs/zh-CN/guides/benchmarking.md
Lines changed: 26 additions & 6 deletions
diff --git a/tests/benchmark/compare.ts b/tests/benchmark/compare.ts
Lines changed: 84 additions & 4 deletions
@@ -9,11 +9,13 @@ on:
         required: true
         default: both
         options:
+          - all
           - both
           - swe
+          - tau
           - tb2
       provider:
-        description: "SWE provider filter"
+        description: "SWE/TAU provider filter"
         type: choice
         required: true
         default: all
@@ -45,6 +47,7 @@ jobs:
     name: Benchmark
     runs-on: ubuntu-latest
     timeout-minutes: 360
+    if: ${{ vars.BENCHMARK_ACTION_ENABLED == '1' }}
     env:
       DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_USERNAME }}
       DOCKERHUB_TOKEN: ${{ secrets.DOCKERHUB_TOKEN }}
@@ -94,9 +97,13 @@ jobs:
       - name: Run unified benchmark command
         run: |
           mkdir -p tests/tmp
+          benchmark="${{ github.event.inputs.benchmark || 'both' }}"
+          provider="${{ github.event.inputs.provider || 'all' }}"
+          tb2_model="${{ github.event.inputs.tb2_model || 'openai/glm-5' }}"
+
           args=(
-            --benchmark=${{ inputs.benchmark }}
-            --tb2-model=${{ inputs.tb2_model }}
+            --benchmark=${benchmark}
+            --tb2-model=${tb2_model}
             --tb2-agent=oracle
             --tb2-runner=uvx
             --tb2-python=3.12
@@ -105,8 +112,8 @@ jobs:
             --output-file=tests/tmp/benchmark-report.json
           )
 
-          if [[ "${{ inputs.provider }}" != "all" && "${{ inputs.benchmark }}" != "tb2" ]]; then
-            args+=(--provider=${{ inputs.provider }})
+          if [[ "${provider}" != "all" && "${benchmark}" != "tb2" ]]; then
+            args+=(--provider=${provider})
           fi
 
           npm run test:benchmark -- "${args[@]}"
@@ -144,6 +151,24 @@ jobs:
             console.log('');
           }
 
+          if (Array.isArray(report.tau) && report.tau.length > 0) {
+            console.log('### TAU-bench');
+            console.log('');
+            console.log('| Provider / Model | Domain | Pass^1 | Avg Tokens |');
+            console.log('|---|---|---:|---:|');
+            for (const r of report.tau) {
+              const name = `${r.provider.id} / ${r.provider.model}`;
+              const domain = r.summary.domain;
+              const pass1 = `${((r.summary.pass_at_k?.[0] ?? 0) * 100).toFixed(1)}%`;
+              const observed = (r.summary.token_observed_trials ?? 0) > 0;
+              const avgTokens = observed
+                ? (r.summary.avg_tokens >= 1000 ? `${(r.summary.avg_tokens / 1000).toFixed(1)}k` : `${r.summary.avg_tokens}`)
+                : '-';
+              console.log(`| ${name} | ${domain} | ${pass1} | ${avgTokens} |`);
+            }
+            console.log('');
+          }
+
           if (report.tb2) {
             const tb2 = report.tb2;
             console.log('### Terminal Bench 2.0');
@@ -152,6 +177,11 @@ jobs:
             if (tb2.model) console.log(`- Model: \`${tb2.model}\``);
             console.log(`- Passed: **${tb2.passed}/${tb2.total}**`);
             console.log(`- Rate: **${(tb2.rate * 100).toFixed(1)}%**`);
+            if (typeof tb2.avg_total_tokens === 'number' && (tb2.token_observed_trials ?? 0) > 0) {
+              console.log(`- Avg tokens: **${tb2.avg_total_tokens}** (observed ${tb2.token_observed_trials} trials)`);
+            } else {
+              console.log(`- Avg tokens: **N/A**`);
+            }
             console.log('');
           }
           NODE
@@ -165,3 +195,4 @@ jobs:
           path: |
             tests/tmp/benchmark-report.json
             tests/tmp/jobs/*/result.json
+            tests/tmp/tau2-data/simulations/*.json
@@ -1,10 +1,12 @@
 # Benchmarking
 
-KODE SDK benchmark runner now has a single entry command and supports three targets:
+The KODE SDK benchmark runner now has a single entry command and supports multiple targets:
 
 - `swe`: SWE-bench-Verified only
+- `tau`: TAU-bench only
 - `tb2`: Terminal Bench 2.0 only
-- `both`: run both in one command
+- `both`: run every benchmark (SWE + TAU + TB2) in one command
+- `all`: alias of `both`, accepted for compatibility
 
 ## Prerequisites
 
@@ -29,6 +31,7 @@ GEMINI_MODEL_ID=gemini-3-pro-preview
 
 3. Runtime tools:
 - SWE-bench-Verified: Docker is required
+- TAU-bench: `tau2` or `uvx` is required (official TAU2 harness)
 - TB2: `harbor`, `uvx`, or Docker (runner decides by `--tb2-runner`)
 
 ## Unified Command
@@ -39,7 +42,7 @@ npm run test:benchmark -- [flags]
 
 ### Common examples
 
-Run both SWE + TB2 in one command:
+Run SWE + TAU + TB2 in one command:
 
 ```bash
 npm run test:benchmark -- \
@@ -72,12 +75,26 @@ npm run test:benchmark -- \
   --output-file=tests/tmp/tb2-report.json
 ```
 
+Run only TAU-bench (official TAU2 script + dataset):
+
+```bash
+npm run test:benchmark -- \
+  --benchmark=tau \
+  --provider=openai \
+  --tau-domain=all \
+  --num-trials=1 \
+  --output=json \
+  --output-file=tests/tmp/tau-report.json
+```
+
 ## Flags
 
 | Flag | Description | Default |
 |---|---|---|
-| `--benchmark=swe\|tb2\|both` | Which benchmark(s) to run | `both` |
-| `--provider=...` | SWE provider filter (`anthropic`, `openai`, `gemini`, etc.) | all discovered |
+| `--benchmark=swe\|tau\|tb2\|both\|all` | Which benchmark(s) to run (`both`=`all`) | `both` |
+| `--provider=...` | Provider filter for SWE/TAU (`anthropic`, `openai`, `gemini`, etc.) | all discovered |
+| `--tau-domain=airline\|retail\|all` | TAU domain filter | `all` |
+| `--num-trials=N` | TAU trials per task (Pass^k) | `1` |
 | `--tb2-model=provider/model` | TB2 model id | `BENCHMARK_TB2_MODEL` or `openai/$OPENAI_MODEL_ID` |
 | `--tb2-agent=...` | TB2 agent (`oracle`, etc.) | `oracle` |
 | `--tb2-dataset=...` | TB2 dataset id | `terminal-bench@2.0` |
@@ -92,7 +109,7 @@ npm run test:benchmark -- \
 
 ## Output
 
-With `--output=json`, one report contains both sections:
+With `--output=json`, a single report contains the `swe`, `tau`, and/or `tb2` sections selected by `--benchmark`:
 
 ```json
 {
@@ -118,5 +135,8 @@ With `--output=json`, one report contains both sections:
 ## Notes
 
 - SWE-bench is fixed to **SWE-bench-Verified**. There is no mini/full mode switch anymore.
+- TAU now runs with the official **TAU2** harness (`tau2 run ...`) from Sierra.
+- TAU user simulator can be configured with `BENCHMARK_USER_MODEL=provider/model`.
 - TB2 uses official Harbor run flow (`harbor run -d terminal-bench@2.0 -m ... -a ...`) under the selected runner.
+- TAU/TB2 token stats are extracted from the official result files when available; if a runner/agent does not emit usage data, the token figure is shown as `N/A`.
 - If Docker image pulls are slow, set `BENCHMARK_DOCKER_PROXY`.
@@ -1,10 +1,12 @@
 # Benchmarking
 
-KODE SDK 的 benchmark 入口已统一为一个命令，支持三种目标：
+KODE SDK 的 benchmark 入口已统一为一个命令，支持多个目标：
 
 - `swe`：只跑 SWE-bench-Verified
+- `tau`：只跑 TAU-bench
 - `tb2`：只跑 Terminal Bench 2.0
-- `both`：一次命令同时跑两者
+- `both`：一次命令跑 SWE + TAU + TB2
+- `all`：`both` 的兼容别名
 
 ## 前置条件
 
@@ -29,6 +31,7 @@ GEMINI_MODEL_ID=gemini-3-pro-preview
 
 3. 运行依赖：
 - SWE-bench-Verified：必须有 Docker
+- TAU-bench：需要 `tau2` 或 `uvx`（官方 TAU2 harness）
 - TB2：`harbor`、`uvx` 或 Docker（由 `--tb2-runner` 决定）
 
 ## 统一命令
@@ -39,7 +42,7 @@ npm run test:benchmark -- [参数]
 
 ### 常用示例
 
-一次命令同时跑 SWE + TB2：
+一次命令同时跑 SWE + TAU + TB2：
 
 ```bash
 npm run test:benchmark -- \
@@ -72,12 +75,26 @@ npm run test:benchmark -- \
   --output-file=tests/tmp/tb2-report.json
 ```
 
+只跑 TAU-bench（官方 TAU2 脚本与数据集）：
+
+```bash
+npm run test:benchmark -- \
+  --benchmark=tau \
+  --provider=openai \
+  --tau-domain=all \
+  --num-trials=1 \
+  --output=json \
+  --output-file=tests/tmp/tau-report.json
+```
+
 ## 参数说明
 
 | 参数 | 含义 | 默认值 |
 |---|---|---|
-| `--benchmark=swe\|tb2\|both` | 选择要跑的 benchmark | `both` |
-| `--provider=...` | SWE provider 过滤（`anthropic`、`openai`、`gemini` 等） | 自动发现全部 |
+| `--benchmark=swe\|tau\|tb2\|both\|all` | 选择要跑的 benchmark（`both`=`all`） | `both` |
+| `--provider=...` | SWE/TAU 的 provider 过滤（`anthropic`、`openai`、`gemini` 等） | 自动发现全部 |
+| `--tau-domain=airline\|retail\|all` | TAU 领域过滤 | `all` |
+| `--num-trials=N` | TAU 每个任务试验次数（Pass^k） | `1` |
 | `--tb2-model=provider/model` | TB2 模型 ID | `BENCHMARK_TB2_MODEL` 或 `openai/$OPENAI_MODEL_ID` |
 | `--tb2-agent=...` | TB2 agent（如 `oracle`） | `oracle` |
 | `--tb2-dataset=...` | TB2 数据集 ID | `terminal-bench@2.0` |
@@ -92,7 +109,7 @@ npm run test:benchmark -- \
 
 ## 输出格式
 
-使用 `--output=json` 时，单个报告同时包含 SWE 和 TB2：
+使用 `--output=json` 时，报告会按 `--benchmark` 输出 `swe`/`tau`/`tb2` 分区：
 
 ```json
 {
@@ -118,5 +135,8 @@ npm run test:benchmark -- \
 ## 说明
 
 - SWE 已固定为 **SWE-bench-Verified**，不再有 mini/full 模式参数。
+- TAU 已切换为 Sierra 官方 **TAU2** harness（`tau2 run ...`）。
+- TAU 的用户模拟模型可通过 `BENCHMARK_USER_MODEL=provider/model` 指定。
 - TB2 走官方 Harbor 流程（`harbor run -d terminal-bench@2.0 -m ... -a ...`），由 runner 包装执行。
+- TAU/TB2 的 token 统计会从官方结果文件提取；若 runner/agent 未产出 usage，则显示为 `N/A`。
 - 若 Docker 拉取镜像慢，可设置 `BENCHMARK_DOCKER_PROXY`。
@@ -1,5 +1,5 @@
 import fs from 'fs';
-import type { BenchmarkReport, SWEProviderResult, TB2Summary } from './types';
+import type { BenchmarkReport, SWEProviderResult, TAUProviderResult, TB2Summary } from './types';
 
 interface ComparisonRow {
   label: string;
@@ -11,6 +11,7 @@ interface ComparisonRow {
 
 interface ComparisonResult {
   swe: ComparisonRow[];
+  tau: ComparisonRow[];
   tb2: ComparisonRow[];
   hasRegressions: boolean;
 }
@@ -104,6 +105,58 @@ function compareSWE(oldResults: SWEProviderResult[], newResults: SWEProviderResu
   return rows;
 }
 
+/**
+ * Builds TAU comparison rows, matching results on provider id, model and domain.
+ * Each new result yields a pass^1 row (`new` when the baseline has no match) and,
+ * only when both sides report `token_observed_trials > 0`, an average-token row.
+ * Results that exist only in `oldResults` produce no rows.
+ */
+function compareTAU(oldResults: TAUProviderResult[], newResults: TAUProviderResult[]): ComparisonRow[] {
+  // Read pass^1 defensively: a report without `pass_at_k` counts as 0 instead of throwing.
+  const passAt1 = (r: TAUProviderResult): number => r.summary.pass_at_k?.[0] ?? 0;
+  const hasTokens = (r: TAUProviderResult): boolean => (r.summary.token_observed_trials ?? 0) > 0;
+  const rows: ComparisonRow[] = [];
+
+  for (const newR of newResults) {
+    const key = `${newR.provider.id}/${newR.provider.model} [${newR.summary.domain}]`;
+    const newPass1 = passAt1(newR);
+    const oldR = oldResults.find(
+      r =>
+        r.provider.id === newR.provider.id
+        && r.provider.model === newR.provider.model
+        && r.summary.domain === newR.summary.domain,
+    );
+
+    if (!oldR) {
+      rows.push({ label: `${key} [pass^1]`, oldValue: '-', newValue: fmtPct(newPass1), delta: 'new', direction: 'na' });
+      continue;
+    }
+
+    const oldPass1 = passAt1(oldR);
+    const passDelta = deltaStr(oldPass1, newPass1, 'pct');
+    rows.push({
+      label: `${key} [pass^1]`,
+      oldValue: fmtPct(oldPass1),
+      newValue: fmtPct(newPass1),
+      delta: passDelta.text,
+      direction: passDelta.dir,
+    });
+
+    if (hasTokens(oldR) && hasTokens(newR)) {
+      const tokenDelta = deltaStr(oldR.summary.avg_tokens, newR.summary.avg_tokens, 'tokens');
+      rows.push({
+        label: `${key} [tokens]`,
+        oldValue: fmtK(oldR.summary.avg_tokens),
+        newValue: fmtK(newR.summary.avg_tokens),
+        delta: tokenDelta.text,
+        direction: tokenDelta.dir,
+      });
+    }
+  }
+
+  return rows;
+}
+
 function compareTB2(oldTB2?: TB2Summary, newTB2?: TB2Summary): ComparisonRow[] {
   if (!newTB2) return [];
   if (!oldTB2) {
@@ -136,6 +189,19 @@ function compareTB2(oldTB2?: TB2Summary, newTB2?: TB2Summary): ComparisonRow[] {
     direction: newTB2.passed > oldTB2.passed ? 'better' : newTB2.passed < oldTB2.passed ? 'worse' : 'same',
   });
 
+  const oldTokObserved = (oldTB2.token_observed_trials ?? 0) > 0 && oldTB2.avg_total_tokens !== undefined;
+  const newTokObserved = (newTB2.token_observed_trials ?? 0) > 0 && newTB2.avg_total_tokens !== undefined;
+  if (oldTokObserved && newTokObserved) {
+    const tokenDelta = deltaStr(oldTB2.avg_total_tokens!, newTB2.avg_total_tokens!, 'tokens');
+    rows.push({
+      label: 'tb2 [tokens]',
+      oldValue: fmtK(oldTB2.avg_total_tokens!),
+      newValue: fmtK(newTB2.avg_total_tokens!),
+      delta: tokenDelta.text,
+      direction: tokenDelta.dir,
+    });
+  }
+
   return rows;
 }
 
@@ -145,9 +211,10 @@ export function loadReport(filePath: string): BenchmarkReport {
 
 export function compareReports(oldReport: BenchmarkReport, newReport: BenchmarkReport): ComparisonResult {
   const sweRows = compareSWE(oldReport.swe ?? [], newReport.swe ?? []);
+  const tauRows = compareTAU(oldReport.tau ?? [], newReport.tau ?? []);
   const tb2Rows = compareTB2(oldReport.tb2, newReport.tb2);
-  const hasRegressions = [...sweRows, ...tb2Rows].some(r => r.direction === 'worse');
-  return { swe: sweRows, tb2: tb2Rows, hasRegressions };
+  const hasRegressions = [...sweRows, ...tauRows, ...tb2Rows].some(r => r.direction === 'worse');
+  return { swe: sweRows, tau: tauRows, tb2: tb2Rows, hasRegressions };
 }
 
 export function printComparison(oldPath: string, newPath: string, result: ComparisonResult): void {
@@ -159,7 +226,7 @@ export function printComparison(oldPath: string, newPath: string, result: Compar
   console.log(`  Current:   ${newPath}`);
   console.log('');
 
-  const allRows = [...result.swe, ...result.tb2];
+  const allRows = [...result.swe, ...result.tau, ...result.tb2];
   if (allRows.length === 0) {
     console.log('  No comparable results found.\n');
     return;
@@ -182,6 +249,19 @@ export function printComparison(oldPath: string, newPath: string, result: Compar
     console.log('');
   }
 
+  if (result.tau.length > 0) {
+    console.log('--- TAU Comparison ---\n');
+    console.log(header);
+    console.log(sep);
+    for (const row of result.tau) {
+      const dir = row.direction === 'better' ? ' ^' : row.direction === 'worse' ? ' v' : '  ';
+      console.log(
+        `${pad(row.label, maxLabel)} | ${lpad(row.oldValue, 10)} | ${lpad(row.newValue, 10)} | ${lpad(row.delta, 12)} |${dir}`,
+      );
+    }
+    console.log('');
+  }
+
   if (result.tb2.length > 0) {
     console.log('--- TB2 Comparison ---\n');
     console.log(header);