|
#!/usr/bin/env bash
# Benchmark harness for the OpenAI-compatible shim's tool-schema modes.
#
# Builds a small disposable fixture project, copies it once per mode, runs the
# same prompts through openclaude with OPENAI_SHIM_TOOL_MODE=off and =minify,
# and writes a CSV comparison of the two runs.
#
# Usage: <this script> [ROOT]
#   ROOT  working directory for the benchmark (default /tmp/openclaude-shim-bench).
#         WARNING: ROOT is deleted and recreated on every run.
set -euo pipefail

ROOT="${1:-/tmp/openclaude-shim-bench}"
RESULTS="$ROOT/results"
BASE="$ROOT/base"
OFF="$ROOT/off"
MINIFY="$ROOT/minify"

# ROOT is removed recursively below, so refuse the locations where a typo
# would be catastrophic instead of silently wiping them.
case "$ROOT" in
  / | . | .. | "${HOME:-}" | "${HOME:-}/")
    printf 'refusing to use %s as benchmark root: it would be deleted\n' "$ROOT" >&2
    exit 1
    ;;
esac

# "--" keeps a ROOT that starts with "-" from being parsed as an option.
rm -rf -- "$ROOT"
mkdir -p -- "$BASE/src" "$BASE/docs" "$BASE/tests" "$RESULTS"

# All heredoc delimiters are quoted so the fixture text is written verbatim
# (no shell expansion of the template literal in calculator.js).
cat >"$BASE/README.md" <<'EOF'
# Shim Live Test Project

This disposable project is used to test OpenClaude workflows against the
OpenAI-compatible shim.

The project exposes a tiny calculator module, a config file, and notes that
mention NanoGPT, DeepSeek, Qwen, and tool schema minimization.

Expected version: 0.4.2
EOF

cat >"$BASE/src/calculator.js" <<'EOF'
export function add(a, b) {
  return a + b
}

export function multiply(a, b) {
  return a * b
}

export function formatResult(label, value) {
  return `${label}: ${value}`
}
EOF

cat >"$BASE/src/config.json" <<'EOF'
{
  "name": "shim-live-test",
  "version": "0.4.2",
  "provider": "nanogpt",
  "defaultModel": "deepseek/deepseek-v4-pro"
}
EOF

cat >"$BASE/docs/notes.md" <<'EOF'
# Notes

- NanoGPT is used through an OpenAI-compatible endpoint.
- DeepSeek V4 Pro is the default model for agentic coding work.
- Qwen models can be used as alternatives.
- Tool schema minimization should preserve all available tools.
EOF

cat >"$BASE/tests/calculator.test.js" <<'EOF'
import { add, multiply, formatResult } from '../src/calculator.js'

if (add(2, 3) !== 5) throw new Error('add failed')
if (multiply(3, 4) !== 12) throw new Error('multiply failed')
if (formatResult('sum', 5) !== 'sum: 5') throw new Error('format failed')

console.log('calculator tests passed')
EOF

cat >"$BASE/package.json" <<'EOF'
{
  "name": "shim-live-test",
  "version": "0.4.2",
  "type": "module",
  "scripts": {
    "test": "node tests/calculator.test.js"
  }
}
EOF

# Each mode gets its own copy so file edits made during one suite cannot
# influence the other.
cp -a "$BASE" "$OFF"
cp -a "$BASE" "$MINIFY"

# Run one benchmark prompt through openclaude and capture its output.
#
# Arguments:
#   $1 mode    - value exported as OPENAI_SHIM_TOOL_MODE for this run
#   $2 workdir - directory the CLI is started in
#   $3 outdir  - directory receiving <id>.json (stdout) and <id>.log (stderr)
#   $4 id      - case identifier, used as the output file stem
#   $5 prompt  - prompt text handed to openclaude
#
# A failing or timed-out run does not abort the script: a one-line
# harness_error JSON record carrying the exit code is appended to <id>.json.
run_case() {
  local mode="$1"
  local workdir="$2"
  local outdir="$3"
  local id="$4"
  local prompt="$5"
  local code # declared here so the exit status does not leak into global scope
  mkdir -p "$outdir"

  printf 'running %s %s\n' "$mode" "$id" >&2
  (
    # The subshell keeps the directory change local to this one invocation.
    cd "$workdir"
    OPENAI_SHIM_TOOL_MODE="$mode" \
      timeout 120 openclaude --bare -p --output-format json \
      --permission-mode bypassPermissions \
      --max-budget-usd 0.20 \
      "$prompt"
  ) >"$outdir/${id}.json" 2>"$outdir/${id}.log" || {
    code=$?
    # Leading newline: a killed run may leave partial output with no trailing
    # newline, and the record must start on a line of its own to stay
    # parseable as the file's last line.
    printf '\n{"type":"harness_error","exit_code":%s,"case":"%s"}\n' "$code" "$id" >>"$outdir/${id}.json"
  }
}

# Run the full list of benchmark cases for one tool mode.
#
# Arguments:
#   $1 mode    - tool mode; also names the results subdirectory ($RESULTS/<mode>)
#   $2 workdir - project copy the cases operate on
#
# Cases run in the listed order against the same workdir, so later cases see
# files created or edited by earlier ones.
run_suite() {
  local mode="$1"
  local workdir="$2"
  local outdir="$RESULTS/$mode"
  local entry

  # One "id|prompt" entry per case; neither ids nor prompts contain "|".
  local -a cases=(
    '01_arithmetic|Reply with exactly the number: 4'
    '02_read_config|Read src/config.json and report only the version and defaultModel.'
    '03_search_deepseek|Search the project for DeepSeek and report matching file paths only.'
    '04_run_tests|Run the test suite with npm test and report pass or fail with the key output.'
    '05_create_doc|Create docs/generated-summary.md containing one concise sentence about this project, then report the file path.'
    '06_edit_code|Edit src/calculator.js to add an exported subtract(a, b) function, then report what changed.'
    '07_update_test|Update tests/calculator.test.js to test subtract(7, 2) === 5, then run npm test and report pass or fail.'
    '08_summarize_project|Read README.md and docs/notes.md, then summarize the project in three bullets.'
    '09_find_version|Find every occurrence of 0.4.2 in this project and report file paths.'
    '10_plan_next|Inspect the project structure and propose the next two engineering tasks. Mention the files you inspected.'
  )

  for entry in "${cases[@]}"; do
    # Text before the first "|" is the id, everything after it is the prompt.
    run_case "$mode" "$workdir" "$outdir" "${entry%%|*}" "${entry#*|}"
  done
}

# Run the identical case list once per tool mode, each in its own project copy.
# Entries are "mode:workdir"; only the first ":" separates the two fields.
for suite in "off:$OFF" "minify:$MINIFY"; do
  run_suite "${suite%%:*}" "${suite#*:}"
done

# Compare the two result trees: one CSV row per case plus a TOTAL row, written
# to $RESULTS/summary.csv and echoed to stdout. The heredoc delimiter is
# quoted, so the shell leaves the script's backticks and ${...} untouched.
node - "$RESULTS" <<'NODE'
const fs = require('fs')
const path = require('path')
const root = process.argv[2]

/**
 * Load the metrics for one case from <root>/<mode>/<id>.json.
 *
 * Only the last non-empty line of the file is parsed. A missing, empty, or
 * unparsable file is reported on stderr and counted as a failed run with zero
 * metrics, so a single bad case cannot abort the whole summary.
 *
 * @param {string} mode - results subdirectory name ('off' or 'minify')
 * @param {string} id - case identifier (file name without .json)
 * @returns {{ok: boolean, input: number, output: number, cost: number,
 *            duration: number, turns: number, result: string}}
 */
function readRun(mode, id) {
  const file = path.join(root, mode, `${id}.json`)
  let json = {}
  try {
    const raw = fs.readFileSync(file, 'utf8').trim().split('\n').filter(Boolean).at(-1)
    const parsed = JSON.parse(raw)
    if (parsed !== null && typeof parsed === 'object') json = parsed
  } catch (err) {
    // stderr keeps the CSV on stdout clean
    console.error(`warning: unusable result file ${file}: ${err.message}`)
  }
  // NOTE(review): only the first model listed in modelUsage is counted, and its
  // costUSD takes precedence over total_cost_usd — confirm this is intended
  // for runs that involve more than one model.
  const model = json.modelUsage ? Object.keys(json.modelUsage)[0] : ''
  const usage = model ? json.modelUsage[model] : {}
  return {
    ok: json.type === 'result' && json.subtype === 'success' && !json.is_error,
    input: usage.inputTokens ?? json.usage?.input_tokens ?? 0,
    output: usage.outputTokens ?? json.usage?.output_tokens ?? 0,
    cost: usage.costUSD ?? json.total_cost_usd ?? 0,
    duration: json.duration_ms ?? 0,
    turns: json.num_turns ?? 0,
    result: String(json.result ?? '').replace(/\s+/g, ' ').slice(0, 120),
  }
}

/**
 * Percentage by which `after` is smaller than `before`, with one decimal.
 * Returns an empty CSV field when `before` is 0 (e.g. the baseline run
 * failed), instead of letting NaN or Infinity into the report.
 *
 * @param {number} before - baseline value
 * @param {number} after - value being compared against the baseline
 * @returns {string}
 */
function reductionPct(before, after) {
  if (before === 0) return ''
  return ((before - after) / before * 100).toFixed(1)
}

// The case list is taken from the 'off' results; 'minify' is expected to
// contain a file of the same name for each case.
const ids = fs.readdirSync(path.join(root, 'off'))
  .filter(name => name.endsWith('.json'))
  .map(name => name.replace(/\.json$/, ''))
  .sort()

const lines = [
  'case,off_input,minify_input,input_reduction_pct,off_cost,minify_cost,cost_reduction_pct,off_turns,minify_turns,off_ok,minify_ok',
]
let offInput = 0
let minInput = 0
let offCost = 0
let minCost = 0
let offOk = 0
let minOk = 0

for (const id of ids) {
  const off = readRun('off', id)
  const min = readRun('minify', id)
  offInput += off.input
  minInput += min.input
  offCost += off.cost
  minCost += min.cost
  offOk += off.ok ? 1 : 0
  minOk += min.ok ? 1 : 0
  lines.push([
    id,
    off.input,
    min.input,
    reductionPct(off.input, min.input),
    off.cost.toFixed(6),
    min.cost.toFixed(6),
    reductionPct(off.cost, min.cost),
    off.turns,
    min.turns,
    off.ok,
    min.ok,
  ].join(','))
}

// Turn counts are not summed, so those two columns stay empty in the TOTAL row.
lines.push([
  'TOTAL',
  offInput,
  minInput,
  reductionPct(offInput, minInput),
  offCost.toFixed(6),
  minCost.toFixed(6),
  reductionPct(offCost, minCost),
  '',
  '',
  `${offOk}/${ids.length}`,
  `${minOk}/${ids.length}`,
].join(','))

fs.writeFileSync(path.join(root, 'summary.csv'), `${lines.join('\n')}\n`)
console.log(lines.join('\n'))
NODE

printf '\nResults written to %s\n' "$RESULTS"
0 commit comments