Skip to content

Commit d26c310

Browse files
author
Aeshma-Daeva
committed
fix: harden shim tool schema reduction
1 parent d458bf4 commit d26c310

3 files changed

Lines changed: 690 additions & 65 deletions

File tree

Lines changed: 202 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,202 @@
1+
#!/usr/bin/env bash
# Benchmark harness for the OpenAI-compatible shim: builds a disposable sample
# project, runs the same prompts with OPENAI_SHIM_TOOL_MODE set to "off" and
# "minify", and summarises token/cost differences.
#
# Usage: <script> [ROOT]
#   ROOT - scratch directory (default /tmp/openclaude-shim-bench). It is
#          DELETED and recreated on every run.
set -euo pipefail

ROOT="${1:-/tmp/openclaude-shim-bench}"
RESULTS="$ROOT/results"
BASE="$ROOT/base"
OFF="$ROOT/off"
MINIFY="$ROOT/minify"

# ROOT is removed recursively below, so refuse values that would wipe the
# filesystem root, the current/parent directory, or the caller's home.
root_trimmed="${ROOT%/}"
home_trimmed="${HOME:-}"
home_trimmed="${home_trimmed%/}"
case "$root_trimmed" in
  "" | . | .. | "$home_trimmed")
    printf 'refusing to use %s as the benchmark root: it would be deleted\n' "$ROOT" >&2
    exit 1
    ;;
esac

# "--" keeps a ROOT that starts with a dash from being parsed as an option.
rm -rf -- "$ROOT"
mkdir -p -- "$BASE/src" "$BASE/docs" "$BASE/tests" "$RESULTS"
12+
13+
# Fixture: README describing the throwaway project (quoted heredoc, so nothing is expanded).
cat >"$BASE/README.md" <<'EOF'
14+
# Shim Live Test Project
15+
16+
This disposable project is used to test OpenClaude workflows against the
17+
OpenAI-compatible shim.
18+
19+
The project exposes a tiny calculator module, a config file, and notes that
20+
mention NanoGPT, DeepSeek, Qwen, and tool schema minimization.
21+
22+
Expected version: 0.4.2
23+
EOF
24+
25+
# Fixture: calculator module that the edit-code and update-test prompts operate on.
cat >"$BASE/src/calculator.js" <<'EOF'
26+
export function add(a, b) {
27+
return a + b
28+
}
29+
30+
export function multiply(a, b) {
31+
return a * b
32+
}
33+
34+
export function formatResult(label, value) {
35+
return `${label}: ${value}`
36+
}
37+
EOF
38+
39+
# Fixture: config file whose version and defaultModel the read-config prompt asks for.
cat >"$BASE/src/config.json" <<'EOF'
40+
{
41+
"name": "shim-live-test",
42+
"version": "0.4.2",
43+
"provider": "nanogpt",
44+
"defaultModel": "deepseek/deepseek-v4-pro"
45+
}
46+
EOF
47+
48+
# Fixture: notes mentioning DeepSeek, the term the search prompt looks for.
cat >"$BASE/docs/notes.md" <<'EOF'
49+
# Notes
50+
51+
- NanoGPT is used through an OpenAI-compatible endpoint.
52+
- DeepSeek V4 Pro is the default model for agentic coding work.
53+
- Qwen models can be used as alternatives.
54+
- Tool schema minimization should preserve all available tools.
55+
EOF
56+
57+
# Fixture: plain-node test script that the package's "test" script executes.
cat >"$BASE/tests/calculator.test.js" <<'EOF'
58+
import { add, multiply, formatResult } from '../src/calculator.js'
59+
60+
if (add(2, 3) !== 5) throw new Error('add failed')
61+
if (multiply(3, 4) !== 12) throw new Error('multiply failed')
62+
if (formatResult('sum', 5) !== 'sum: 5') throw new Error('format failed')
63+
64+
console.log('calculator tests passed')
65+
EOF
66+
67+
# Fixture: package manifest; "type": "module" makes node treat the .js fixtures as ES modules.
cat >"$BASE/package.json" <<'EOF'
68+
{
69+
"name": "shim-live-test",
70+
"version": "0.4.2",
71+
"type": "module",
72+
"scripts": {
73+
"test": "node tests/calculator.test.js"
74+
}
75+
}
76+
EOF
77+
78+
# Each tool mode gets its own copy of the pristine project to work in.
cp -a "$BASE" "$OFF"
79+
cp -a "$BASE" "$MINIFY"
80+
81+
# Run one prompt through the openclaude CLI and capture its output.
#
# Arguments:
#   $1 mode    - value placed in OPENAI_SHIM_TOOL_MODE for this invocation
#   $2 workdir - directory the CLI is started in
#   $3 outdir  - directory receiving <id>.json (stdout) and <id>.log (stderr)
#   $4 id      - case identifier, used for the output file names
#   $5 prompt  - prompt text handed to the CLI
#
# A non-zero exit from the CLI (or from `timeout`) does not abort the harness:
# a one-line harness_error JSON record is appended to <id>.json instead.
run_case() {
  local mode="$1"
  local workdir="$2"
  local outdir="$3"
  local id="$4"
  local prompt="$5"
  local code
  mkdir -p "$outdir"

  printf 'running %s %s\n' "$mode" "$id" >&2
  (
    cd "$workdir"
    OPENAI_SHIM_TOOL_MODE="$mode" \
      timeout 120 openclaude --bare -p --output-format json \
      --permission-mode bypassPermissions \
      --max-budget-usd 0.20 \
      "$prompt"
  ) >"$outdir/${id}.json" 2>"$outdir/${id}.log" || {
    code=$?
    # The leading newline matters: a killed or crashed CLI may leave a partial
    # line without a terminator, and the summary step parses only the last
    # non-empty line of the file. Without it the sentinel would be glued onto
    # the partial output and become unparseable.
    printf '\n{"type":"harness_error","exit_code":%s,"case":"%s"}\n' "$code" "$id" >>"$outdir/${id}.json"
  }
}
102+
103+
# Execute every benchmark prompt for one tool mode.
#
# Arguments:
#   $1 mode    - tool mode name; also the results sub-directory under $RESULTS
#   $2 workdir - project copy the prompts are run against
#
# Cases run in the listed order against the same working copy.
run_suite() {
  local mode="$1"
  local workdir="$2"
  local outdir="$RESULTS/$mode"

  # Flat table of (case id, prompt) pairs.
  local -a cases=(
    "01_arithmetic" "Reply with exactly the number: 4"
    "02_read_config" "Read src/config.json and report only the version and defaultModel."
    "03_search_deepseek" "Search the project for DeepSeek and report matching file paths only."
    "04_run_tests" "Run the test suite with npm test and report pass or fail with the key output."
    "05_create_doc" "Create docs/generated-summary.md containing one concise sentence about this project, then report the file path."
    "06_edit_code" "Edit src/calculator.js to add an exported subtract(a, b) function, then report what changed."
    "07_update_test" "Update tests/calculator.test.js to test subtract(7, 2) === 5, then run npm test and report pass or fail."
    "08_summarize_project" "Read README.md and docs/notes.md, then summarize the project in three bullets."
    "09_find_version" "Find every occurrence of 0.4.2 in this project and report file paths."
    "10_plan_next" "Inspect the project structure and propose the next two engineering tasks. Mention the files you inspected."
  )

  local idx
  for ((idx = 0; idx < ${#cases[@]}; idx += 2)); do
    run_case "$mode" "$workdir" "$outdir" "${cases[idx]}" "${cases[idx + 1]}"
  done
}
119+
120+
# Run the same suite once per shim tool mode, each against its own project copy.
run_suite off "$OFF"
121+
run_suite minify "$MINIFY"
122+
123+
node - "$RESULTS" <<'NODE'
124+
const fs = require('fs')
125+
const path = require('path')
126+
const root = process.argv[2]
127+
128+
/**
 * Load the outcome of one benchmark case.
 *
 * Only the last non-empty line of `<root>/<mode>/<id>.json` is parsed. A
 * missing file, empty file, malformed last line or non-object JSON value is
 * reported on stderr and yields a failed record with zeroed usage, so one bad
 * run cannot abort the whole summary.
 *
 * @param {string} mode - results sub-directory name (e.g. 'off', 'minify')
 * @param {string} id - case identifier (file name without `.json`)
 * @returns {{ok: boolean, input: number, output: number, cost: number,
 *            duration: number, turns: number, result: string}}
 */
function readRun(mode, id) {
  const file = path.join(root, mode, `${id}.json`)
  const failed = reason => {
    console.error(`readRun: ${file}: ${reason}`)
    return { ok: false, input: 0, output: 0, cost: 0, duration: 0, turns: 0, result: '' }
  }

  let json
  try {
    const lastLine = fs.readFileSync(file, 'utf8')
      .split(/\r?\n/)
      .filter(line => line.trim() !== '')
      .at(-1)
    if (lastLine === undefined) return failed('no output recorded')
    json = JSON.parse(lastLine)
  } catch (err) {
    return failed(err.message)
  }
  if (json === null || typeof json !== 'object') {
    return failed('last line is not a JSON object')
  }

  // Usage is taken from the first model listed in modelUsage when present;
  // otherwise the top-level usage/cost fields are used.
  const model = json.modelUsage ? Object.keys(json.modelUsage)[0] : ''
  const usage = model ? json.modelUsage[model] : {}
  return {
    ok: json.type === 'result' && json.subtype === 'success' && !json.is_error,
    input: usage.inputTokens ?? json.usage?.input_tokens ?? 0,
    output: usage.outputTokens ?? json.usage?.output_tokens ?? 0,
    cost: usage.costUSD ?? json.total_cost_usd ?? 0,
    duration: json.duration_ms ?? 0,
    turns: json.num_turns ?? 0,
    // Collapse whitespace and truncate so the text stays on one short line.
    result: String(json.result ?? '').replace(/\s+/g, ' ').slice(0, 120),
  }
}
144+
145+
// Case ids are taken from the baseline ("off") results directory; the
// matching "minify" result is looked up under the same id.
const ids = fs.readdirSync(path.join(root, 'off'))
  .filter(name => name.endsWith('.json'))
  .map(name => name.replace(/\.json$/, ''))
  .sort()

/**
 * Percentage saved going from `before` to `after`, with one decimal place.
 * Returns an empty CSV field when the ratio is undefined (baseline of zero or
 * a non-finite operand), which happens when the baseline run failed or
 * reported no usage; otherwise the column would contain NaN or -Infinity.
 */
function reductionPct(before, after) {
  if (!Number.isFinite(before) || !Number.isFinite(after) || before <= 0) return ''
  return ((before - after) / before * 100).toFixed(1)
}

const lines = [
  'case,off_input,minify_input,input_reduction_pct,off_cost,minify_cost,cost_reduction_pct,off_turns,minify_turns,off_ok,minify_ok',
]
let offInput = 0
let minInput = 0
let offCost = 0
let minCost = 0
let offOk = 0
let minOk = 0

// One CSV row per case, accumulating totals along the way.
for (const id of ids) {
  const off = readRun('off', id)
  const min = readRun('minify', id)
  offInput += off.input
  minInput += min.input
  offCost += off.cost
  minCost += min.cost
  offOk += off.ok ? 1 : 0
  minOk += min.ok ? 1 : 0
  lines.push([
    id,
    off.input,
    min.input,
    reductionPct(off.input, min.input),
    off.cost.toFixed(6),
    min.cost.toFixed(6),
    reductionPct(off.cost, min.cost),
    off.turns,
    min.turns,
    off.ok,
    min.ok,
  ].join(','))
}

// Totals row: the turn columns are left empty, the ok columns show passed/total.
lines.push([
  'TOTAL',
  offInput,
  minInput,
  reductionPct(offInput, minInput),
  offCost.toFixed(6),
  minCost.toFixed(6),
  reductionPct(offCost, minCost),
  '',
  '',
  `${offOk}/${ids.length}`,
  `${minOk}/${ids.length}`,
].join(','))

// Persist the table next to the per-mode result directories and echo it.
fs.writeFileSync(path.join(root, 'summary.csv'), `${lines.join('\n')}\n`)
console.log(lines.join('\n'))
200+
NODE
201+
202+
# Tell the operator where the per-case output, logs and summary.csv were written.
printf '\nResults written to %s\n' "$RESULTS"

0 commit comments

Comments
 (0)