|
| 1 | +"""Simple eval runner: invokes Claude Code with a prompt and reports time + token usage. |
| 2 | +
|
| 3 | +Usage: |
| 4 | + python claude_eval.py "your prompt here" |
| 5 | + python claude_eval.py --prompt-file path/to/prompt.txt |
| 6 | + echo "your prompt" | python claude_eval.py - |
| 7 | +""" |
| 8 | + |
| 9 | +from __future__ import annotations |
| 10 | + |
| 11 | +import argparse |
| 12 | +import contextlib |
| 13 | +import json |
| 14 | +import shutil |
| 15 | +import subprocess |
| 16 | +import sys |
| 17 | +import tempfile |
| 18 | +import time |
| 19 | +from collections.abc import Iterator |
| 20 | +from dataclasses import asdict, dataclass |
| 21 | +from datetime import datetime |
| 22 | +from pathlib import Path |
| 23 | + |
# Filesystem anchors, all derived from this script's own location so they do
# not depend on the current working directory.
_SCRIPT_DIR = Path(__file__).resolve().parent
REPO_ROOT = _SCRIPT_DIR.parent  # the directory one level above this script's directory
DEFAULT_RUNS_DIR = _SCRIPT_DIR / "runs"  # default destination for result JSON files
SKILLS_DIR = REPO_ROOT / "skills"  # scanned for <skill>/SKILL.md subdirectories
| 27 | + |
| 28 | + |
@dataclass
class EvalResult:
    """Flat record describing one Claude Code invocation.

    The declaration order of the fields is significant: it defines the
    positional constructor signature and the key order that
    ``dataclasses.asdict`` produces when the record is serialized.
    """

    # --- What was run ---
    prompt: str
    model: str | None
    effort: str | None
    skill: str | None

    # --- Timing ---
    wall_time_s: float  # seconds, as measured by the caller
    duration_ms: int | None  # optional: may be None
    duration_api_ms: int | None
    num_turns: int | None

    # --- Token accounting ---
    input_tokens: int
    output_tokens: int
    cache_creation_input_tokens: int
    cache_read_input_tokens: int
    total_input_tokens: int

    # --- Outcome ---
    total_cost_usd: float | None
    is_error: bool
    result_text: str
    session_id: str | None
| 48 | + |
| 49 | + |
def read_prompt(args: argparse.Namespace) -> str:
    """Resolve the prompt text from the parsed command-line arguments.

    Sources are tried in priority order:

    1. ``args.prompt_file`` -- the file is read as UTF-8 and stripped.
    2. Standard input -- when ``args.prompt`` is ``"-"`` or ``None``; the
       text is stripped.
    3. ``args.prompt`` itself, returned unmodified.

    An empty (or whitespace-only) prompt is rejected whichever source it came
    from, so an expensive run is never started with nothing to ask.

    Args:
        args: Namespace providing ``prompt`` and ``prompt_file`` attributes.

    Returns:
        The prompt text to send.

    Raises:
        SystemExit: If the resolved prompt is empty.
        OSError: If the prompt file cannot be read.
    """
    if args.prompt_file:
        text = Path(args.prompt_file).read_text(encoding="utf-8").strip()
        if not text:
            sys.exit(f"error: no prompt provided (prompt file '{args.prompt_file}' is empty)")
        return text

    if args.prompt is None or args.prompt == "-":
        text = sys.stdin.read().strip()
        if not text:
            sys.exit("error: no prompt provided (stdin empty)")
        return text

    # A positional prompt is passed through untouched, but must carry content.
    if not args.prompt.strip():
        sys.exit("error: no prompt provided (prompt argument is empty)")
    return args.prompt
| 59 | + |
| 60 | + |
def list_available_skills() -> list[str]:
    """Return the sorted names of the skills found under ``SKILLS_DIR``.

    A skill is any immediate subdirectory of ``SKILLS_DIR`` that contains a
    ``SKILL.md`` file. When ``SKILLS_DIR`` is not a directory the result is
    an empty list.
    """
    if not SKILLS_DIR.is_dir():
        return []

    names: list[str] = []
    for entry in SKILLS_DIR.iterdir():
        if entry.is_dir() and (entry / "SKILL.md").is_file():
            names.append(entry.name)
    names.sort()
    return names
| 67 | + |
| 68 | + |
@contextlib.contextmanager
def staged_skill_dir(skill: str | None) -> Iterator[Path | None]:
    """Yield a temporary root containing ``.claude/skills/<skill>/``, or ``None``.

    The skill directory is copied from ``SKILLS_DIR`` into a fresh temp
    directory laid out the way Claude Code's skill discovery expects, so the
    yielded path can be handed to the CLI with ``--add-dir``. Per the Claude
    Code docs:

        The `--add-dir` flag grants file access rather than configuration
        discovery, but skills are an exception: `.claude/skills/` within
        an added directory is loaded automatically.

    Staging the skill this way registers it (its name and description appear
    in the skill listing) without injecting the full body into the prompt;
    Claude loads the body only when it decides to use the skill, or when the
    skill is invoked as ``/<skill>``.

    When ``skill`` is falsy nothing is staged and ``None`` is yielded. If the
    skill has no ``SKILL.md`` the process exits with an error that lists the
    available skills. The temp directory is always removed on exit.
    """
    if skill:
        source = SKILLS_DIR / skill
        manifest = source / "SKILL.md"
        if not manifest.is_file():
            known = list_available_skills()
            hint = f" Available skills: {', '.join(known)}." if known else ""
            sys.exit(f"error: skill '{skill}' not found at {manifest}.{hint}")

        staging_root = Path(tempfile.mkdtemp(prefix="eval-skill-"))
        try:
            target = staging_root / ".claude" / "skills" / skill
            target.parent.mkdir(parents=True, exist_ok=True)
            shutil.copytree(source, target)
            yield staging_root
        finally:
            # Best-effort cleanup: never let a removal failure mask the run.
            shutil.rmtree(staging_root, ignore_errors=True)
    else:
        yield None
| 102 | + |
| 103 | + |
def run_claude(
    prompt: str,
    model: str | None,
    effort: str | None,
    skill: str | None,
    extra_args: list[str],
    yolo: bool = False,
) -> tuple[float, dict]:
    """Run ``claude -p`` once and return the elapsed time and its JSON payload.

    Args:
        prompt: Prompt text passed to ``claude -p``.
        model: Value for ``--model``; the flag is omitted when falsy.
        effort: Value for ``--effort``; the flag is omitted when falsy.
        skill: Name of a skill to stage and expose via ``--add-dir``, or
            ``None`` to run without one.
        extra_args: Additional arguments appended verbatim to the command.
        yolo: When true, pass ``--dangerously-skip-permissions``.

    Returns:
        ``(elapsed_seconds, payload)`` where ``elapsed_seconds`` is the wall
        time of the subprocess call and ``payload`` is the JSON object the
        CLI printed on stdout.

    Raises:
        SystemExit: If the ``claude`` executable is not on ``PATH``, or if
            stdout does not contain a single JSON object. In the latter case
            the child's stderr and stdout are echoed to stderr first.
    """
    claude_bin = shutil.which("claude")
    if not claude_bin:
        sys.exit("error: 'claude' CLI not found on PATH")

    with staged_skill_dir(skill) as skill_root:
        cmd = [claude_bin, "-p", prompt, "--output-format", "json"]
        if model:
            cmd += ["--model", model]
        if effort:
            cmd += ["--effort", effort]
        if skill_root is not None:
            cmd += ["--add-dir", str(skill_root)]
        if yolo:
            # Bypass all tool-permission prompts so the model can actually run
            # shell, edit files, etc. unattended. Without this, ``claude -p``
            # silently degrades to "I would have run X" because there is no
            # interactive user to approve tool calls.
            cmd += ["--dangerously-skip-permissions"]
        cmd += extra_args

        start = time.perf_counter()
        proc = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            encoding="utf-8",
            # Undecodable bytes must not raise after the (possibly long and
            # costly) run has already finished; substitute them instead.
            errors="replace",
            stdin=subprocess.DEVNULL,
        )
        elapsed = time.perf_counter() - start

    stdout = (proc.stdout or "").strip()
    try:
        payload = json.loads(stdout) if stdout else None
    except json.JSONDecodeError:
        payload = None

    # Callers index the payload like a mapping, so valid JSON that is not an
    # object (a list, a bare string, ...) is as unusable as no JSON at all.
    if isinstance(payload, dict):
        return elapsed, payload

    if proc.stderr:
        sys.stderr.write(proc.stderr)
    if stdout:
        sys.stderr.write(stdout + "\n")
    if payload is None:
        reason = "produced no JSON output"
    else:
        reason = f"produced a JSON {type(payload).__name__} instead of a result object"
    sys.exit(f"error: claude exited with code {proc.returncode} and {reason}")
| 156 | + |
| 157 | + |
def build_result(
    prompt: str,
    model: str | None,
    effort: str | None,
    skill: str | None,
    elapsed_s: float,
    payload: dict,
) -> EvalResult:
    """Combine the run settings and the CLI's JSON payload into an ``EvalResult``.

    Args:
        prompt: Prompt that was sent.
        model: Model setting used for the run, if any.
        effort: Effort setting used for the run, if any.
        skill: Skill exposed to the run, if any.
        elapsed_s: Locally measured wall time in seconds; rounded to
            milliseconds in the result.
        payload: JSON object printed by the CLI.

    Returns:
        The populated ``EvalResult``. Token counters that are missing or null
        become ``0``; ``total_input_tokens`` is the sum of the plain,
        cache-creation and cache-read input counters. A missing or null
        ``result`` becomes the empty string.
    """
    usage = payload.get("usage")
    if not isinstance(usage, dict):
        # Absent, null or malformed usage: treat every counter as zero.
        usage = {}

    def tokens(key: str) -> int:
        # ``or 0`` also covers a key that is present but null.
        return int(usage.get(key, 0) or 0)

    input_tokens = tokens("input_tokens")
    output_tokens = tokens("output_tokens")
    cache_creation = tokens("cache_creation_input_tokens")
    cache_read = tokens("cache_read_input_tokens")

    # ``dict.get(key, default)`` only applies the default when the key is
    # absent; an explicit null must also map to "" to honour the ``str`` field.
    raw_result = payload.get("result")
    result_text = "" if raw_result is None else str(raw_result)

    return EvalResult(
        prompt=prompt,
        model=model,
        effort=effort,
        skill=skill,
        wall_time_s=round(elapsed_s, 3),
        duration_ms=payload.get("duration_ms"),
        duration_api_ms=payload.get("duration_api_ms"),
        num_turns=payload.get("num_turns"),
        input_tokens=input_tokens,
        output_tokens=output_tokens,
        cache_creation_input_tokens=cache_creation,
        cache_read_input_tokens=cache_read,
        total_input_tokens=input_tokens + cache_creation + cache_read,
        total_cost_usd=payload.get("total_cost_usd"),
        is_error=bool(payload.get("is_error", False)),
        result_text=result_text,
        session_id=payload.get("session_id"),
    )
| 191 | + |
| 192 | + |
def print_human(result: EvalResult) -> None:
    """Print a human-readable summary of ``result`` to stdout.

    The report is a banner, one line per metric, then the response text. The
    prompt is previewed at no more than 120 characters (with a ``...``
    marker when cut); the reported-time and cost lines appear only when the
    corresponding values are present.
    """
    heavy_rule = "=" * 60
    light_rule = "-" * 60

    def report_lines():
        # Lazily yield one output line at a time so that each line is
        # formatted only when it is about to be printed.
        yield heavy_rule
        yield "Claude Code Eval Result"
        yield heavy_rule
        ellipsis = "..." if len(result.prompt) > 120 else ""
        yield f"Prompt: {result.prompt[:120]}{ellipsis}"
        yield f"Model: {result.model or '(default)'}"
        yield f"Effort: {result.effort or '(default)'}"
        yield f"Skill: {result.skill or '(none)'}"
        yield f"Wall time: {result.wall_time_s:.3f} s"
        if result.duration_ms is not None:
            reported_s = result.duration_ms / 1000
            api_s = (result.duration_api_ms or 0) / 1000
            yield f"Reported time: {reported_s:.3f} s (api: {api_s:.3f} s)"
        yield f"Turns: {result.num_turns}"
        yield f"Input tokens: {result.input_tokens}"
        yield f" + cache write: {result.cache_creation_input_tokens}"
        yield f" + cache read: {result.cache_read_input_tokens}"
        yield f" = total in: {result.total_input_tokens}"
        yield f"Output tokens: {result.output_tokens}"
        if result.total_cost_usd is not None:
            yield f"Cost (USD): ${result.total_cost_usd:.6f}"
        yield f"Error: {result.is_error}"
        yield light_rule
        yield "Response:"
        yield result.result_text
        yield heavy_rule

    for line in report_lines():
        print(line)
| 217 | + |
| 218 | + |
def _build_parser() -> argparse.ArgumentParser:
    """Construct the command-line parser for the eval runner."""
    parser = argparse.ArgumentParser(description="Run a prompt on Claude Code and measure time + tokens.")
    parser.add_argument("prompt", nargs="?", help="The prompt to send (use '-' to read from stdin).")
    parser.add_argument("--prompt-file", help="Read the prompt from a file.")
    parser.add_argument(
        "--model",
        default="sonnet",
        help="Model alias (e.g. sonnet, opus, haiku) or full name (e.g. claude-sonnet-4-6). Default: sonnet.",
    )
    parser.add_argument(
        "--effort",
        choices=["low", "medium", "high", "max"],
        default="high",
        help="Reasoning effort level for the session. Default: high.",
    )
    parser.add_argument(
        "--skill",
        default=None,
        help=(
            "Name of a skill under skills/ to expose to the model "
            "(it is staged into a temporary .claude/skills/ directory and "
            "passed to claude via --add-dir; its body is not injected into "
            "the prompt). "
            "Omit to run with no skill. Use --list-skills to see options."
        ),
    )
    parser.add_argument(
        "--list-skills",
        action="store_true",
        help="Print the names of available skills under skills/ and exit.",
    )
    parser.add_argument(
        "--yolo",
        "--dangerously-skip-permissions",
        dest="yolo",
        action="store_true",
        help=(
            "Pass --dangerously-skip-permissions to claude, so the model can "
            "use shell / edit / write tools without per-call approval. "
            "Required for any eval whose prompt actually wants the model to "
            "run commands (otherwise claude -p degrades to memory-only answers)."
        ),
    )
    parser.add_argument("--json", action="store_true", help="Emit machine-readable JSON to stdout instead of the human-readable summary.")
    parser.add_argument(
        "--output",
        help=(
            "Path to write the result JSON file. Defaults to "
            "eval/runs/<timestamp>-<model>-<effort>[-<skill>].json. "
            "Pass an empty string ('') to skip writing a file."
        ),
    )
    return parser


def _safe_filename_part(value: object) -> str:
    """Return ``value`` as text that is safe to embed in a single file name."""
    # Model identifiers may contain characters such as '/' or ':' that would
    # create sub-paths or be invalid in file names; map them to '_'.
    return "".join(ch if ch.isalnum() or ch in "-._" else "_" for ch in str(value))


def _default_output_path(model: str | None, effort: str | None, skill: str | None) -> Path:
    """Create the default runs directory and return a timestamped file path in it."""
    DEFAULT_RUNS_DIR.mkdir(parents=True, exist_ok=True)
    stamp = datetime.now().strftime("%Y%m%d-%H%M%S")
    parts = [stamp, _safe_filename_part(model or "default"), _safe_filename_part(effort or "default")]
    if skill:
        parts.append(_safe_filename_part(skill))
    return DEFAULT_RUNS_DIR / ("-".join(parts) + ".json")


def main(argv: list[str] | None = None) -> None:
    """Command-line entry point: run one prompt and report time and token usage.

    Parses the arguments, optionally lists the available skills and returns,
    otherwise resolves the prompt, runs the CLI once, writes the result as
    JSON to a file (unless disabled with ``--output ''``) and prints either
    that JSON (``--json``) or a human-readable summary.

    Args:
        argv: Argument list to parse instead of ``sys.argv[1:]``. Arguments
            the parser does not recognise are forwarded to ``claude``.
    """
    args, extra_args = _build_parser().parse_known_args(argv)
    # A bare "--" only separates our flags from the pass-through ones.
    extra_args = [a for a in extra_args if a != "--"]

    if args.list_skills:
        skills = list_available_skills()
        if not skills:
            print("(no skills found under skills/)")
        else:
            for name in skills:
                print(name)
        return

    prompt = read_prompt(args)

    elapsed, payload = run_claude(
        prompt, args.model, args.effort, args.skill, extra_args, yolo=args.yolo
    )
    result = build_result(prompt, args.model, args.effort, args.skill, elapsed, payload)

    serialized = json.dumps(asdict(result), indent=2)

    output_path: Path | None
    if args.output is None:
        output_path = _default_output_path(args.model, args.effort, args.skill)
    elif args.output == "":
        # Explicit opt-out: do not write a result file.
        output_path = None
    else:
        output_path = Path(args.output)
        output_path.parent.mkdir(parents=True, exist_ok=True)

    if output_path is not None:
        output_path.write_text(serialized, encoding="utf-8")

    if args.json:
        print(serialized)
    else:
        print_human(result)
        if output_path is not None:
            print(f"Saved JSON to: {output_path}")


if __name__ == "__main__":
    main()
0 commit comments