Skip to content

Commit 850186c

Browse files
committed
feat(benchmark): restore official TAU2, add token accounting for TAU/TB2, and gate Actions run via var

- restore TAU benchmark with official TAU2 harness integration
- make --benchmark=both run SWE + TAU + TB2 (all kept as alias)
- fix workflow defaults for push/PR (avoid empty benchmark/provider/model args)
- add token extraction/aggregation for TAU and TB2 (show N/A when source has no usage)
- extend reports/comparisons/types to include TAU/TB2 token stats
- update benchmark docs (EN/ZH) and Actions summary output for TAU/TB2 token fields
- add workflow gate: run benchmark job only when vars.BENCHMARK_ACTION_ENABLED == '1'
1 parent 007e6c2 commit 850186c

10 files changed

Lines changed: 920 additions & 48 deletions

File tree

.github/workflows/benchmark.yml

Lines changed: 36 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -9,11 +9,13 @@ on:
99
required: true
1010
default: both
1111
options:
12+
- all
1213
- both
1314
- swe
15+
- tau
1416
- tb2
1517
provider:
16-
description: "SWE provider filter"
18+
description: "SWE/TAU provider filter"
1719
type: choice
1820
required: true
1921
default: all
@@ -45,6 +47,7 @@ jobs:
4547
name: Benchmark
4648
runs-on: ubuntu-latest
4749
timeout-minutes: 360
50+
if: ${{ vars.BENCHMARK_ACTION_ENABLED == '1' }}
4851
env:
4952
DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_USERNAME }}
5053
DOCKERHUB_TOKEN: ${{ secrets.DOCKERHUB_TOKEN }}
@@ -94,9 +97,13 @@ jobs:
9497
- name: Run unified benchmark command
9598
run: |
9699
mkdir -p tests/tmp
100+
benchmark="${{ github.event.inputs.benchmark || 'both' }}"
101+
provider="${{ github.event.inputs.provider || 'all' }}"
102+
tb2_model="${{ github.event.inputs.tb2_model || 'openai/glm-5' }}"
103+
97104
args=(
98-
--benchmark=${{ inputs.benchmark }}
99-
--tb2-model=${{ inputs.tb2_model }}
105+
--benchmark=${benchmark}
106+
--tb2-model=${tb2_model}
100107
--tb2-agent=oracle
101108
--tb2-runner=uvx
102109
--tb2-python=3.12
@@ -105,8 +112,8 @@ jobs:
105112
--output-file=tests/tmp/benchmark-report.json
106113
)
107114
108-
if [[ "${{ inputs.provider }}" != "all" && "${{ inputs.benchmark }}" != "tb2" ]]; then
109-
args+=(--provider=${{ inputs.provider }})
115+
if [[ "${provider}" != "all" && "${benchmark}" != "tb2" ]]; then
116+
args+=(--provider=${provider})
110117
fi
111118
112119
npm run test:benchmark -- "${args[@]}"
@@ -144,6 +151,24 @@ jobs:
144151
console.log('');
145152
}
146153
154+
if (Array.isArray(report.tau) && report.tau.length > 0) {
155+
console.log('### TAU-bench');
156+
console.log('');
157+
console.log('| Provider / Model | Domain | Pass^1 | Avg Tokens |');
158+
console.log('|---|---|---:|---:|');
159+
for (const r of report.tau) {
160+
const name = `${r.provider.id} / ${r.provider.model}`;
161+
const domain = r.summary.domain;
162+
const pass1 = `${((r.summary.pass_at_k?.[0] ?? 0) * 100).toFixed(1)}%`;
163+
const observed = (r.summary.token_observed_trials ?? 0) > 0;
164+
const avgTokens = observed
165+
? (r.summary.avg_tokens >= 1000 ? `${(r.summary.avg_tokens / 1000).toFixed(1)}k` : `${r.summary.avg_tokens}`)
166+
: '-';
167+
console.log(`| ${name} | ${domain} | ${pass1} | ${avgTokens} |`);
168+
}
169+
console.log('');
170+
}
171+
147172
if (report.tb2) {
148173
const tb2 = report.tb2;
149174
console.log('### Terminal Bench 2.0');
@@ -152,6 +177,11 @@ jobs:
152177
if (tb2.model) console.log(`- Model: \`${tb2.model}\``);
153178
console.log(`- Passed: **${tb2.passed}/${tb2.total}**`);
154179
console.log(`- Rate: **${(tb2.rate * 100).toFixed(1)}%**`);
180+
if (typeof tb2.avg_total_tokens === 'number' && (tb2.token_observed_trials ?? 0) > 0) {
181+
console.log(`- Avg tokens: **${tb2.avg_total_tokens}** (observed ${tb2.token_observed_trials} trials)`);
182+
} else {
183+
console.log(`- Avg tokens: **N/A**`);
184+
}
155185
console.log('');
156186
}
157187
NODE
@@ -165,3 +195,4 @@ jobs:
165195
path: |
166196
tests/tmp/benchmark-report.json
167197
tests/tmp/jobs/*/result.json
198+
tests/tmp/tau2-data/simulations/*.json

docs/en/guides/benchmarking.md

Lines changed: 26 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,12 @@
11
# Benchmarking
22

3-
KODE SDK benchmark runner now has a single entry command and supports three targets:
3+
KODE SDK benchmark runner now has a single entry command and supports multiple targets:
44

55
- `swe`: SWE-bench-Verified only
6+
- `tau`: TAU-bench only
67
- `tb2`: Terminal Bench 2.0 only
7-
- `both`: run both in one command
8+
- `both`: run SWE + TAU + TB2
9+
- `all`: alias of `both` (compatibility)
810

911
## Prerequisites
1012

@@ -29,6 +31,7 @@ GEMINI_MODEL_ID=gemini-3-pro-preview
2931

3032
3. Runtime tools:
3133
- SWE-bench-Verified: Docker is required
34+
- TAU-bench: `tau2` or `uvx` is required (official TAU2 harness)
3235
- TB2: `harbor`, `uvx`, or Docker (runner decides by `--tb2-runner`)
3336

3437
## Unified Command
@@ -39,7 +42,7 @@ npm run test:benchmark -- [flags]
3942

4043
### Common examples
4144

42-
Run both SWE + TB2 in one command:
45+
Run SWE + TAU + TB2 in one command:
4346

4447
```bash
4548
npm run test:benchmark -- \
@@ -72,12 +75,26 @@ npm run test:benchmark -- \
7275
--output-file=tests/tmp/tb2-report.json
7376
```
7477

78+
Run only TAU-bench (official TAU2 script + dataset):
79+
80+
```bash
81+
npm run test:benchmark -- \
82+
--benchmark=tau \
83+
--provider=openai \
84+
--tau-domain=all \
85+
--num-trials=1 \
86+
--output=json \
87+
--output-file=tests/tmp/tau-report.json
88+
```
89+
7590
## Flags
7691

7792
| Flag | Description | Default |
7893
|---|---|---|
79-
| `--benchmark=swe\|tb2\|both` | Which benchmark(s) to run | `both` |
80-
| `--provider=...` | SWE provider filter (`anthropic`, `openai`, `gemini`, etc.) | all discovered |
94+
| `--benchmark=swe\|tau\|tb2\|both\|all` | Which benchmark(s) to run (`both`=`all`) | `both` |
95+
| `--provider=...` | Provider filter for SWE/TAU (`anthropic`, `openai`, `gemini`, etc.) | all discovered |
96+
| `--tau-domain=airline\|retail\|all` | TAU domain filter | `all` |
97+
| `--num-trials=N` | TAU trials per task (Pass^k) | `1` |
8198
| `--tb2-model=provider/model` | TB2 model id | `BENCHMARK_TB2_MODEL` or `openai/$OPENAI_MODEL_ID` |
8299
| `--tb2-agent=...` | TB2 agent (`oracle`, etc.) | `oracle` |
83100
| `--tb2-dataset=...` | TB2 dataset id | `terminal-bench@2.0` |
@@ -92,7 +109,7 @@ npm run test:benchmark -- \
92109

93110
## Output
94111

95-
With `--output=json`, one report contains both sections:
112+
With `--output=json`, one report may contain `swe`, `tau`, and `tb2` sections depending on `--benchmark`.
96113

97114
```json
98115
{
@@ -118,5 +135,8 @@ With `--output=json`, one report contains both sections:
118135
## Notes
119136

120137
- SWE-bench is fixed to **SWE-bench-Verified**. There is no mini/full mode switch anymore.
138+
- TAU now runs with the official **TAU2** harness (`tau2 run ...`) from Sierra.
139+
- TAU user simulator can be configured with `BENCHMARK_USER_MODEL=provider/model`.
121140
- TB2 uses official Harbor run flow (`harbor run -d terminal-bench@2.0 -m ... -a ...`) under the selected runner.
141+
- TAU/TB2 token stats are extracted from official result files when available; if a runner/agent does not emit usage, it is shown as `N/A`.
122142
- If Docker image pulls are slow, set `BENCHMARK_DOCKER_PROXY`.

docs/zh-CN/guides/benchmarking.md

Lines changed: 26 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,12 @@
11
# Benchmarking
22

3-
KODE SDK 的 benchmark 入口已统一为一个命令,支持三种目标
3+
KODE SDK 的 benchmark 入口已统一为一个命令,支持多个目标
44

55
- `swe`:只跑 SWE-bench-Verified
6+
- `tau`:只跑 TAU-bench
67
- `tb2`:只跑 Terminal Bench 2.0
7-
- `both`:一次命令同时跑两者
8+
- `both`:一次命令跑 SWE + TAU + TB2
9+
- `all`:`both` 的兼容别名
810

911
## 前置条件
1012

@@ -29,6 +31,7 @@ GEMINI_MODEL_ID=gemini-3-pro-preview
2931

3032
3. 运行依赖:
3133
- SWE-bench-Verified:必须有 Docker
34+
- TAU-bench:需要 `tau2` 或 `uvx`(官方 TAU2 harness)
3235
- TB2:`harbor`、`uvx` 或 Docker(由 `--tb2-runner` 决定)
3336

3437
## 统一命令
@@ -39,7 +42,7 @@ npm run test:benchmark -- [参数]
3942

4043
### 常用示例
4144

42-
一次命令同时跑 SWE + TB2:
45+
一次命令同时跑 SWE + TAU + TB2:
4346

4447
```bash
4548
npm run test:benchmark -- \
@@ -72,12 +75,26 @@ npm run test:benchmark -- \
7275
--output-file=tests/tmp/tb2-report.json
7376
```
7477

78+
只跑 TAU-bench(官方 TAU2 脚本与数据集):
79+
80+
```bash
81+
npm run test:benchmark -- \
82+
--benchmark=tau \
83+
--provider=openai \
84+
--tau-domain=all \
85+
--num-trials=1 \
86+
--output=json \
87+
--output-file=tests/tmp/tau-report.json
88+
```
89+
7590
## 参数说明
7691

7792
| 参数 | 含义 | 默认值 |
7893
|---|---|---|
79-
| `--benchmark=swe\|tb2\|both` | 选择要跑的 benchmark | `both` |
80-
| `--provider=...` | SWE provider 过滤(`anthropic`、`openai`、`gemini` 等) | 自动发现全部 |
94+
| `--benchmark=swe\|tau\|tb2\|both\|all` | 选择要跑的 benchmark(`both`=`all`) | `both` |
95+
| `--provider=...` | SWE/TAU 的 provider 过滤(`anthropic`、`openai`、`gemini` 等) | 自动发现全部 |
96+
| `--tau-domain=airline\|retail\|all` | TAU 领域过滤 | `all` |
97+
| `--num-trials=N` | TAU 每个任务试验次数(Pass^k) | `1` |
8198
| `--tb2-model=provider/model` | TB2 模型 ID | `BENCHMARK_TB2_MODEL` 或 `openai/$OPENAI_MODEL_ID` |
8299
| `--tb2-agent=...` | TB2 agent(如 `oracle`) | `oracle` |
83100
| `--tb2-dataset=...` | TB2 数据集 ID | `terminal-bench@2.0` |
@@ -92,7 +109,7 @@ npm run test:benchmark -- \
92109

93110
## 输出格式
94111

95-
使用 `--output=json` 时,单个报告同时包含 SWE 和 TB2
112+
使用 `--output=json` 时,报告会按 `--benchmark` 输出 `swe`/`tau`/`tb2` 分区
96113

97114
```json
98115
{
@@ -118,5 +135,8 @@ npm run test:benchmark -- \
118135
## 说明
119136

120137
- SWE 已固定为 **SWE-bench-Verified**,不再有 mini/full 模式参数。
138+
- TAU 已切换为 Sierra 官方 **TAU2** harness(`tau2 run ...`)。
139+
- TAU 的用户模拟模型可通过 `BENCHMARK_USER_MODEL=provider/model` 指定。
121140
- TB2 走官方 Harbor 流程(`harbor run -d terminal-bench@2.0 -m ... -a ...`),由 runner 包装执行。
141+
- TAU/TB2 的 token 统计会从官方结果文件提取;若 runner/agent 未产出 usage,则显示为 `N/A`。
122142
- 若 Docker 拉取镜像慢,可设置 `BENCHMARK_DOCKER_PROXY`。

tests/benchmark/compare.ts

Lines changed: 84 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
import fs from 'fs';
2-
import type { BenchmarkReport, SWEProviderResult, TB2Summary } from './types';
2+
import type { BenchmarkReport, SWEProviderResult, TAUProviderResult, TB2Summary } from './types';
33

44
interface ComparisonRow {
55
label: string;
@@ -11,6 +11,7 @@ interface ComparisonRow {
1111

1212
interface ComparisonResult {
1313
swe: ComparisonRow[];
14+
tau: ComparisonRow[];
1415
tb2: ComparisonRow[];
1516
hasRegressions: boolean;
1617
}
@@ -104,6 +105,58 @@ function compareSWE(oldResults: SWEProviderResult[], newResults: SWEProviderResu
104105
return rows;
105106
}
106107

108+
function compareTAU(oldResults: TAUProviderResult[], newResults: TAUProviderResult[]): ComparisonRow[] {
109+
const rows: ComparisonRow[] = [];
110+
111+
for (const newR of newResults) {
112+
const key = `${newR.provider.id}/${newR.provider.model} [${newR.summary.domain}]`;
113+
const oldR = oldResults.find(
114+
r =>
115+
r.provider.id === newR.provider.id
116+
&& r.provider.model === newR.provider.model
117+
&& r.summary.domain === newR.summary.domain,
118+
);
119+
120+
if (!oldR) {
121+
const pass1 = newR.summary.pass_at_k[0] ?? 0;
122+
rows.push({
123+
label: `${key} [pass^1]`,
124+
oldValue: '-',
125+
newValue: fmtPct(pass1),
126+
delta: 'new',
127+
direction: 'na',
128+
});
129+
continue;
130+
}
131+
132+
const oldPass1 = oldR.summary.pass_at_k[0] ?? 0;
133+
const newPass1 = newR.summary.pass_at_k[0] ?? 0;
134+
const passDelta = deltaStr(oldPass1, newPass1, 'pct');
135+
rows.push({
136+
label: `${key} [pass^1]`,
137+
oldValue: fmtPct(oldPass1),
138+
newValue: fmtPct(newPass1),
139+
delta: passDelta.text,
140+
direction: passDelta.dir,
141+
});
142+
143+
const oldTokObserved = (oldR.summary.token_observed_trials ?? 0) > 0;
144+
const newTokObserved = (newR.summary.token_observed_trials ?? 0) > 0;
145+
if (oldTokObserved && newTokObserved) {
146+
const tokenDelta = deltaStr(oldR.summary.avg_tokens, newR.summary.avg_tokens, 'tokens');
147+
rows.push({
148+
label: `${key} [tokens]`,
149+
oldValue: fmtK(oldR.summary.avg_tokens),
150+
newValue: fmtK(newR.summary.avg_tokens),
151+
delta: tokenDelta.text,
152+
direction: tokenDelta.dir,
153+
});
154+
}
155+
}
156+
157+
return rows;
158+
}
159+
107160
function compareTB2(oldTB2?: TB2Summary, newTB2?: TB2Summary): ComparisonRow[] {
108161
if (!newTB2) return [];
109162
if (!oldTB2) {
@@ -136,6 +189,19 @@ function compareTB2(oldTB2?: TB2Summary, newTB2?: TB2Summary): ComparisonRow[] {
136189
direction: newTB2.passed > oldTB2.passed ? 'better' : newTB2.passed < oldTB2.passed ? 'worse' : 'same',
137190
});
138191

192+
const oldTokObserved = (oldTB2.token_observed_trials ?? 0) > 0 && oldTB2.avg_total_tokens !== undefined;
193+
const newTokObserved = (newTB2.token_observed_trials ?? 0) > 0 && newTB2.avg_total_tokens !== undefined;
194+
if (oldTokObserved && newTokObserved) {
195+
const tokenDelta = deltaStr(oldTB2.avg_total_tokens!, newTB2.avg_total_tokens!, 'tokens');
196+
rows.push({
197+
label: 'tb2 [tokens]',
198+
oldValue: fmtK(oldTB2.avg_total_tokens!),
199+
newValue: fmtK(newTB2.avg_total_tokens!),
200+
delta: tokenDelta.text,
201+
direction: tokenDelta.dir,
202+
});
203+
}
204+
139205
return rows;
140206
}
141207

@@ -145,9 +211,10 @@ export function loadReport(filePath: string): BenchmarkReport {
145211

146212
export function compareReports(oldReport: BenchmarkReport, newReport: BenchmarkReport): ComparisonResult {
147213
const sweRows = compareSWE(oldReport.swe ?? [], newReport.swe ?? []);
214+
const tauRows = compareTAU(oldReport.tau ?? [], newReport.tau ?? []);
148215
const tb2Rows = compareTB2(oldReport.tb2, newReport.tb2);
149-
const hasRegressions = [...sweRows, ...tb2Rows].some(r => r.direction === 'worse');
150-
return { swe: sweRows, tb2: tb2Rows, hasRegressions };
216+
const hasRegressions = [...sweRows, ...tauRows, ...tb2Rows].some(r => r.direction === 'worse');
217+
return { swe: sweRows, tau: tauRows, tb2: tb2Rows, hasRegressions };
151218
}
152219

153220
export function printComparison(oldPath: string, newPath: string, result: ComparisonResult): void {
@@ -159,7 +226,7 @@ export function printComparison(oldPath: string, newPath: string, result: Compar
159226
console.log(` Current: ${newPath}`);
160227
console.log('');
161228

162-
const allRows = [...result.swe, ...result.tb2];
229+
const allRows = [...result.swe, ...result.tau, ...result.tb2];
163230
if (allRows.length === 0) {
164231
console.log(' No comparable results found.\n');
165232
return;
@@ -182,6 +249,19 @@ export function printComparison(oldPath: string, newPath: string, result: Compar
182249
console.log('');
183250
}
184251

252+
if (result.tau.length > 0) {
253+
console.log('--- TAU Comparison ---\n');
254+
console.log(header);
255+
console.log(sep);
256+
for (const row of result.tau) {
257+
const dir = row.direction === 'better' ? ' ^' : row.direction === 'worse' ? ' v' : ' ';
258+
console.log(
259+
`${pad(row.label, maxLabel)} | ${lpad(row.oldValue, 10)} | ${lpad(row.newValue, 10)} | ${lpad(row.delta, 12)} |${dir}`,
260+
);
261+
}
262+
console.log('');
263+
}
264+
185265
if (result.tb2.length > 0) {
186266
console.log('--- TB2 Comparison ---\n');
187267
console.log(header);

0 commit comments

Comments
 (0)