Skip to content

Commit 8dfca01

Browse files
Merge pull request #24 from amd/dholanda/eval
Initial Claude Skill Eval Harness
2 parents 725fb86 + 4ed4798 commit 8dfca01

3 files changed

Lines changed: 658 additions & 0 deletions

File tree

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,3 +8,6 @@ __pycache__/
88

99
# uv
1010
.uv-cache/
11+
12+
# Eval run artifacts
13+
eval/runs/

eval/claude_eval.py

Lines changed: 316 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,316 @@
1+
"""Simple eval runner: invokes Claude Code with a prompt and reports time + token usage.
2+
3+
Usage:
4+
python claude_eval.py "your prompt here"
5+
python claude_eval.py --prompt-file path/to/prompt.txt
6+
echo "your prompt" | python claude_eval.py -
7+
"""
8+
9+
from __future__ import annotations
10+
11+
import argparse
12+
import contextlib
13+
import json
14+
import shutil
15+
import subprocess
16+
import sys
17+
import tempfile
18+
import time
19+
from collections.abc import Iterator
20+
from dataclasses import asdict, dataclass
21+
from datetime import datetime
22+
from pathlib import Path
23+
24+
# Parent of the directory that contains this file (after resolving symlinks).
REPO_ROOT = Path(__file__).resolve().parent.parent
# Default location for saved run results: a "runs" directory next to this file.
DEFAULT_RUNS_DIR = Path(__file__).resolve().parent / "runs"
# Directory scanned for skills; a skill is a subdirectory that contains SKILL.md.
SKILLS_DIR = REPO_ROOT / "skills"
27+
28+
29+
@dataclass
class EvalResult:
    """Flat record of one Claude Code run: the request settings plus its metrics.

    Instances are created by ``build_result`` and serialized with
    ``dataclasses.asdict`` in ``main``, so the field names are also the keys
    of the saved JSON document.
    """

    # --- request settings, copied from the command-line arguments ---
    prompt: str
    model: str | None
    effort: str | None
    skill: str | None
    # --- timing ---
    # Seconds measured around the subprocess call, rounded to 3 decimals.
    wall_time_s: float
    # "duration_ms" / "duration_api_ms" / "num_turns" as found in the CLI's
    # JSON payload; None when the payload does not carry the key.
    duration_ms: int | None
    duration_api_ms: int | None
    num_turns: int | None
    # --- token usage, read from the payload's "usage" object (0 when missing) ---
    input_tokens: int
    output_tokens: int
    cache_creation_input_tokens: int
    cache_read_input_tokens: int
    # input_tokens + cache_creation_input_tokens + cache_read_input_tokens
    total_input_tokens: int
    # --- outcome ---
    total_cost_usd: float | None  # payload "total_cost_usd", None when absent
    is_error: bool  # truthiness of payload "is_error"
    result_text: str  # payload "result"
    session_id: str | None  # payload "session_id", None when absent
48+
49+
50+
def read_prompt(args: argparse.Namespace) -> str:
    """Resolve the prompt text from the parsed command-line arguments.

    Sources are tried in this order:

    1. ``args.prompt_file`` -- the file is read as UTF-8 and stripped.
    2. stdin -- when the positional prompt is ``-`` or was omitted; stripped.
    3. ``args.prompt`` -- the positional prompt, returned verbatim.

    Args:
        args: Namespace carrying the ``prompt`` and ``prompt_file`` attributes.

    Returns:
        The non-empty prompt text.

    Raises:
        SystemExit: If the selected source yields no prompt text. Every source
            is checked, so an empty file or an empty positional argument is
            rejected the same way empty stdin is.
        OSError: If the prompt file cannot be read.
    """
    if args.prompt_file:
        text = Path(args.prompt_file).read_text(encoding="utf-8").strip()
        if not text:
            sys.exit(f"error: no prompt provided (prompt file '{args.prompt_file}' is empty)")
        return text

    if args.prompt is None or args.prompt == "-":
        text = sys.stdin.read().strip()
        if not text:
            sys.exit("error: no prompt provided (stdin empty)")
        return text

    # Only test for blankness here; a non-blank positional prompt is passed
    # through unmodified, exactly as the caller typed it.
    if not args.prompt.strip():
        sys.exit("error: no prompt provided (prompt argument is empty)")
    return args.prompt
59+
60+
61+
def list_available_skills() -> list[str]:
    """Return the sorted names of the skills found under ``SKILLS_DIR``.

    A skill is any direct subdirectory of ``SKILLS_DIR`` that contains a
    ``SKILL.md`` file. The result is empty when ``SKILLS_DIR`` is not a
    directory.
    """
    names: list[str] = []
    if SKILLS_DIR.is_dir():
        for entry in SKILLS_DIR.iterdir():
            if entry.is_dir() and (entry / "SKILL.md").is_file():
                names.append(entry.name)
        names.sort()
    return names
67+
68+
69+
@contextlib.contextmanager
def staged_skill_dir(skill: str | None) -> Iterator[Path | None]:
    """Yield a temporary root laid out as ``<tmp>/.claude/skills/<skill>/...``.

    The yielded path is meant to be handed to Claude Code through
    ``--add-dir`` so that its regular skill discovery finds the skill.
    The Claude Code docs describe this behaviour as follows:

        The `--add-dir` flag grants file access rather than configuration
        discovery, but skills are an exception: `.claude/skills/` within
        an added directory is loaded automatically.

    Registering a skill this way puts its name and description into the skill
    listing without placing the full body in the prompt -- Claude loads the
    body only when it chooses to use the skill, or when it is invoked as
    ``/<skill>``.

    When ``skill`` is falsy nothing is staged and ``None`` is yielded. When
    the skill has no ``SKILL.md`` under ``SKILLS_DIR`` the process exits with
    an error that lists the available skills. The temporary directory is
    removed on exit from the ``with`` block.
    """
    if skill:
        source_dir = SKILLS_DIR / skill
        manifest = source_dir / "SKILL.md"
        if not manifest.is_file():
            known = list_available_skills()
            hint = ""
            if known:
                hint = f" Available skills: {', '.join(known)}."
            sys.exit(f"error: skill '{skill}' not found at {manifest}.{hint}")

        staging_root = Path(tempfile.mkdtemp(prefix="eval-skill-"))
        try:
            target = staging_root / ".claude" / "skills" / skill
            target.parent.mkdir(parents=True, exist_ok=True)
            shutil.copytree(source_dir, target)
            yield staging_root
        finally:
            # Best-effort cleanup: failures to delete are ignored.
            shutil.rmtree(staging_root, ignore_errors=True)
    else:
        yield None
102+
103+
104+
def _parse_json_object(text: str) -> dict | None:
    """Decode *text* as JSON and return it only if it is a JSON object.

    Returns None for empty text, invalid JSON, or any JSON value that is not
    an object (``json.loads`` also accepts lists, strings, numbers, ...).
    """
    if not text:
        return None
    try:
        decoded = json.loads(text)
    except json.JSONDecodeError:
        return None
    return decoded if isinstance(decoded, dict) else None


def run_claude(
    prompt: str,
    model: str | None,
    effort: str | None,
    skill: str | None,
    extra_args: list[str],
    yolo: bool = False,
) -> tuple[float, dict]:
    """Run the ``claude`` CLI once with ``-p`` and return its timing and JSON payload.

    Args:
        prompt: Prompt text passed as the value of ``-p``.
        model: Value for ``--model``; the flag is omitted when falsy.
        effort: Value for ``--effort``; the flag is omitted when falsy.
        skill: Name of a skill to stage via ``staged_skill_dir`` and expose
            with ``--add-dir``; no directory is added when falsy.
        extra_args: Additional arguments appended verbatim to the command.
        yolo: When true, ``--dangerously-skip-permissions`` is passed.

    Returns:
        ``(elapsed_seconds, payload)`` where ``elapsed_seconds`` is measured
        with ``time.perf_counter`` around the subprocess call and ``payload``
        is the JSON object printed on the CLI's stdout.

    Raises:
        SystemExit: If ``claude`` is not on PATH, or if its stdout is empty,
            is not valid JSON, or is JSON but not an object. In the latter
            cases the captured stderr and stdout are echoed to stderr first.
    """
    claude_bin = shutil.which("claude")
    if not claude_bin:
        sys.exit("error: 'claude' CLI not found on PATH")

    with staged_skill_dir(skill) as skill_root:
        cmd = [claude_bin, "-p", prompt, "--output-format", "json"]
        if model:
            cmd += ["--model", model]
        if effort:
            cmd += ["--effort", effort]
        if skill_root is not None:
            cmd += ["--add-dir", str(skill_root)]
        if yolo:
            # Bypass all tool-permission prompts so the model can actually run
            # shell, edit files, etc. unattended. Without this, ``claude -p``
            # silently degrades to "I would have run X" because there is no
            # interactive user to approve tool calls.
            cmd += ["--dangerously-skip-permissions"]
        cmd += extra_args

        start = time.perf_counter()
        proc = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            encoding="utf-8",
            # Undecodable bytes become U+FFFD instead of raising
            # UnicodeDecodeError and losing the whole run.
            errors="replace",
            stdin=subprocess.DEVNULL,
        )
        elapsed = time.perf_counter() - start

    stdout = (proc.stdout or "").strip()
    payload = _parse_json_object(stdout)
    if payload is None:
        # Surface whatever the CLI printed so the failure can be diagnosed.
        if proc.stderr:
            sys.stderr.write(proc.stderr)
        if stdout:
            sys.stderr.write(stdout + "\n")
        sys.exit(f"error: claude exited with code {proc.returncode} and produced no JSON output")

    return elapsed, payload
156+
157+
158+
def build_result(
    prompt: str,
    model: str | None,
    effort: str | None,
    skill: str | None,
    elapsed_s: float,
    payload: dict,
) -> EvalResult:
    """Combine the request settings and the CLI's JSON payload into an ``EvalResult``.

    Args:
        prompt: Prompt text that was sent.
        model: Model argument used for the run, if any.
        effort: Effort argument used for the run, if any.
        skill: Skill name used for the run, if any.
        elapsed_s: Measured wall time in seconds; stored rounded to 3 decimals.
        payload: JSON object produced by the CLI.

    Returns:
        An ``EvalResult``. Token counters default to 0 when missing or null,
        ``result_text`` defaults to ``""`` when missing or null, and the
        remaining optional metrics are None when the payload lacks them.
    """
    usage = payload.get("usage")
    if not isinstance(usage, dict):
        # Missing, null, or unexpectedly shaped usage: report zero tokens.
        usage = {}

    def count(key: str) -> int:
        # ``or 0`` also covers an explicit JSON null for the counter.
        return int(usage.get(key, 0) or 0)

    input_tokens = count("input_tokens")
    output_tokens = count("output_tokens")
    cache_creation = count("cache_creation_input_tokens")
    cache_read = count("cache_read_input_tokens")

    return EvalResult(
        prompt=prompt,
        model=model,
        effort=effort,
        skill=skill,
        wall_time_s=round(elapsed_s, 3),
        duration_ms=payload.get("duration_ms"),
        duration_api_ms=payload.get("duration_api_ms"),
        num_turns=payload.get("num_turns"),
        input_tokens=input_tokens,
        output_tokens=output_tokens,
        cache_creation_input_tokens=cache_creation,
        cache_read_input_tokens=cache_read,
        total_input_tokens=input_tokens + cache_creation + cache_read,
        total_cost_usd=payload.get("total_cost_usd"),
        is_error=bool(payload.get("is_error", False)),
        # A null "result" must not leak None into the str-typed field.
        result_text=payload.get("result") or "",
        session_id=payload.get("session_id"),
    )
191+
192+
193+
def print_human(result: EvalResult) -> None:
    """Print a human-readable summary of *result* to stdout.

    The prompt is shown truncated to 120 characters (followed by ``...`` when
    longer). The reported-time line and the cost line are printed only when
    the corresponding values are not None. The response text is printed last,
    between a dashed and a double rule.
    """
    heavy_rule = "=" * 60
    light_rule = "-" * 60

    prompt_preview = result.prompt[:120]
    if len(result.prompt) > 120:
        prompt_preview += "..."

    lines = [
        heavy_rule,
        "Claude Code Eval Result",
        heavy_rule,
        f"Prompt: {prompt_preview}",
        f"Model: {result.model or '(default)'}",
        f"Effort: {result.effort or '(default)'}",
        f"Skill: {result.skill or '(none)'}",
        f"Wall time: {result.wall_time_s:.3f} s",
    ]

    if result.duration_ms is not None:
        reported_s = result.duration_ms / 1000
        # A missing API duration is displayed as zero.
        api_s = (result.duration_api_ms or 0) / 1000
        lines.append(f"Reported time: {reported_s:.3f} s (api: {api_s:.3f} s)")

    lines.extend(
        [
            f"Turns: {result.num_turns}",
            f"Input tokens: {result.input_tokens}",
            f" + cache write: {result.cache_creation_input_tokens}",
            f" + cache read: {result.cache_read_input_tokens}",
            f" = total in: {result.total_input_tokens}",
            f"Output tokens: {result.output_tokens}",
        ]
    )

    if result.total_cost_usd is not None:
        lines.append(f"Cost (USD): ${result.total_cost_usd:.6f}")

    lines.extend([f"Error: {result.is_error}", light_rule, "Response:"])

    for line in lines:
        print(line)
    print(result.result_text)
    print(heavy_rule)
217+
218+
219+
def _filename_safe(part: object) -> str:
    """Return ``str(part)`` with every character outside ``[alnum-_.]`` replaced by ``_``."""
    return "".join(ch if ch.isalnum() or ch in "-_." else "_" for ch in str(part))


def main() -> None:
    """Command-line entry point.

    Parses the arguments, resolves the prompt, runs ``claude`` once, writes
    the result as JSON to a file (unless disabled with ``--output ''``), and
    prints either the JSON or a human-readable summary to stdout.

    Arguments that this parser does not recognize are forwarded to ``claude``
    unchanged, with bare ``--`` separators removed.
    """
    parser = argparse.ArgumentParser(description="Run a prompt on Claude Code and measure time + tokens.")
    parser.add_argument("prompt", nargs="?", help="The prompt to send (use '-' to read from stdin).")
    parser.add_argument("--prompt-file", help="Read the prompt from a file.")
    parser.add_argument(
        "--model",
        default="sonnet",
        help="Model alias (e.g. sonnet, opus, haiku) or full name (e.g. claude-sonnet-4-6). Default: sonnet.",
    )
    parser.add_argument(
        "--effort",
        choices=["low", "medium", "high", "max"],
        default="high",
        help="Reasoning effort level for the session. Default: high.",
    )
    parser.add_argument(
        "--skill",
        default=None,
        help=(
            "Name of a skill under skills/ to expose to the model "
            "(staged into a temporary .claude/skills/ directory that is "
            "passed to claude via --add-dir). "
            "Omit to run with no skill. Use --list-skills to see options."
        ),
    )
    parser.add_argument(
        "--list-skills",
        action="store_true",
        help="Print the names of available skills under skills/ and exit.",
    )
    parser.add_argument(
        "--yolo",
        "--dangerously-skip-permissions",
        dest="yolo",
        action="store_true",
        help=(
            "Pass --dangerously-skip-permissions to claude, so the model can "
            "use shell / edit / write tools without per-call approval. "
            "Required for any eval whose prompt actually wants the model to "
            "run commands (otherwise claude -p degrades to memory-only answers)."
        ),
    )
    parser.add_argument(
        "--json",
        action="store_true",
        help="Emit machine-readable JSON to stdout instead of the human-readable summary.",
    )
    parser.add_argument(
        "--output",
        help=(
            "Path to write the result JSON file. Defaults to "
            "eval/runs/<timestamp>-<model>-<effort>[-<skill>].json. "
            "Pass an empty string ('') to skip writing a file."
        ),
    )
    args, extra_args = parser.parse_known_args()
    extra_args = [a for a in extra_args if a != "--"]

    if args.list_skills:
        skills = list_available_skills()
        if not skills:
            print("(no skills found under skills/)")
        else:
            for name in skills:
                print(name)
        return

    prompt = read_prompt(args)

    elapsed, payload = run_claude(
        prompt, args.model, args.effort, args.skill, extra_args, yolo=args.yolo
    )
    result = build_result(prompt, args.model, args.effort, args.skill, elapsed, payload)

    serialized = json.dumps(asdict(result), indent=2)

    output_path: Path | None
    if args.output is None:
        DEFAULT_RUNS_DIR.mkdir(parents=True, exist_ok=True)
        stamp = datetime.now().strftime("%Y%m%d-%H%M%S")
        # Model and skill are free-form user input; sanitize them so a "/" or
        # ":" cannot produce an invalid or nested path. Effort is restricted
        # by ``choices`` and needs no sanitizing.
        name_parts = [stamp, _filename_safe(args.model), args.effort]
        if args.skill:
            name_parts.append(_filename_safe(args.skill))
        output_path = DEFAULT_RUNS_DIR / ("-".join(name_parts) + ".json")
    elif args.output == "":
        output_path = None
    else:
        output_path = Path(args.output)
        output_path.parent.mkdir(parents=True, exist_ok=True)

    if output_path is not None:
        output_path.write_text(serialized, encoding="utf-8")

    if args.json:
        print(serialized)
    else:
        print_human(result)
        if output_path is not None:
            print(f"Saved JSON to: {output_path}")
313+
314+
315+
# Run the command-line interface only when this file is executed as a script.
if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)