Skip to content

Commit d61e131

Browse files
Add skill behavioral testing harness (#54)
1 parent 511f266 commit d61e131

6 files changed

Lines changed: 432 additions & 0 deletions

File tree

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,3 +11,6 @@ __pycache__/
1111

1212
# Eval run artifacts
1313
eval/runs/
14+
15+
# Behavioral matrix results
16+
eval/behavioral/results/

eval/behavioral/conftest.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
"""pytest wiring for the behavioral harness.
2+
3+
Adds this directory to ``sys.path`` so tests can ``from harness import ...``,
4+
and runs a one-time API preflight so the (expensive) behavioral runs fail
5+
fast with a clear message when the `claude` API isn't reachable -- e.g.
6+
when you're not connected to the network that can reach it.
7+
"""
8+
9+
from __future__ import annotations
10+
11+
import sys
12+
from pathlib import Path
13+
14+
import pytest
15+
16+
sys.path.insert(0, str(Path(__file__).resolve().parent))
17+
18+
from harness import DEFAULT_MODEL, check_api_reachable # noqa: E402
19+
20+
21+
@pytest.fixture(scope="session", autouse=True)
def _require_api_reachable() -> None:
    """Fail the suite up front if the `claude` API can't be reached.

    Session-scoped and autouse, so the preflight is requested automatically
    and evaluated once per pytest session rather than once per test.
    """
    ok, detail = check_api_reachable(DEFAULT_MODEL)
    if not ok:
        # pytrace=False: the failure is environmental, so show only the
        # message instead of a traceback pointing into this fixture.
        pytest.fail(
            f"claude API not reachable -- are you on the right network? ({detail})",
            pytrace=False,
        )

eval/behavioral/harness.py

Lines changed: 348 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,348 @@
1+
"""Behavioral-test harness for repo skills (local, pytest-based, non-CI).
2+
3+
A behavioral test runs a skill-driven prompt through the agent **once**, then
asserts what the agent *should* and *should not* have done. Tests read like:

    from harness import claude

    def test_image_generation():
        with claude("sonnet", skill="local-ai-use") as agent:
            run = agent.prompt("Use local AI, then generate a cat to out.png.")

            # Deterministic checks (cheap, fail fast).
            run.logs_contains("local-ai-use")
            run.workspace_contains("out.png")

            # Natural-language expectations (graded by an LLM judge).
            run.should("Download the SD-Turbo model")
            run.should_not("Use the GenerateImage tool")

`claude(model, skill=...)` returns an `Agent` context manager. Entering it
stages an isolated temp workspace (skill copied under
`<tmp>/.claude/skills/<skill>/`); leaving it deletes that workspace. `prompt()`
runs the agent once with tool permissions bypassed and returns a `Run`.

Every assertion on `Run` raises `AssertionError` on failure and prints a
`[PASS]`/`[FAIL]` line for visibility under `-s`.
27+
"""
28+
29+
from __future__ import annotations
30+
31+
import json
32+
import os
33+
import re
34+
import shutil
35+
import subprocess
36+
import sys
37+
import tempfile
38+
from pathlib import Path
39+
40+
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
41+
from claude_eval import SKILLS_DIR # noqa: E402
42+
43+
# Harness defaults; each can be overridden through the environment variable
# named in the lookup.
DEFAULT_SKILL = os.environ.get("BEHAVIORAL_SKILL", "local-ai-use")
DEFAULT_MODEL = os.environ.get("BEHAVIORAL_MODEL", "sonnet")
DEFAULT_EFFORT = os.environ.get("BEHAVIORAL_EFFORT", "high")
46+
47+
48+
def _claude_env() -> dict[str, str]:
    """Build the environment for `claude` subprocesses.

    Disables the CLI's internal retry loop by default so a network/auth
    problem (e.g. not connected to the network that can reach the API)
    fails fast instead of being retried into a long, confusing hang. The
    caller can still override by exporting ``CLAUDE_CODE_MAX_RETRIES``.

    Returns:
        A copy of ``os.environ`` (the process environment is not mutated)
        with ``CLAUDE_CODE_MAX_RETRIES`` set to ``"0"`` unless already set.
    """
    env = dict(os.environ)
    # setdefault keeps any value the caller exported.
    env.setdefault("CLAUDE_CODE_MAX_RETRIES", "0")
    return env
59+
60+
61+
def check_api_reachable(model: str | None = DEFAULT_MODEL, timeout: int = 60) -> tuple[bool, str]:
    """Preflight: confirm the `claude` CLI can actually reach the API.

    Runs a trivial prompt with retries disabled so an unreachable API fails
    fast. This is meant to be called once before the (expensive) behavioral
    runs so the caller can bail out early when off-network.

    Args:
        model: Model name passed via ``--model``; omitted when falsy.
        timeout: Seconds to wait for the preflight subprocess.

    Returns:
        ``(ok, detail)``. ``detail`` is ``"ok"`` on success, otherwise a
        short human-readable reason (at most 500 characters of CLI output).
    """
    claude_bin = shutil.which("claude")
    if not claude_bin:
        return False, "'claude' CLI not found on PATH"

    cmd = [claude_bin, "-p", "Reply with the single word: ok", "--output-format", "json"]
    if model:
        cmd += ["--model", model]

    try:
        proc = subprocess.run(
            cmd, capture_output=True, text=True, encoding="utf-8",
            stdin=subprocess.DEVNULL, timeout=timeout, env=_claude_env(),
        )
    except subprocess.TimeoutExpired:
        return False, f"API preflight timed out after {timeout}s (is the network reachable?)"
    except OSError as exc:
        # A launch failure is still "not reachable"; keep the (ok, detail)
        # contract instead of raising out of the preflight.
        return False, f"could not launch 'claude': {exc}"

    if proc.returncode != 0:
        # Strip each stream before falling through, so whitespace-only
        # stderr does not yield an empty detail.
        detail = (
            (proc.stderr or "").strip()
            or (proc.stdout or "").strip()
            or f"exit code {proc.returncode}"
        )
        return False, detail[:500]
    return True, "ok"
89+
90+
91+
def _stage_workspace(skill: str) -> Path:
    """Copy ``skill`` into an isolated temp workspace and return its path.

    The skill directory is copied to ``<workspace>/.claude/skills/<skill>``.

    Args:
        skill: Name of a directory under ``SKILLS_DIR`` containing ``SKILL.md``.

    Returns:
        Path of the newly created temp workspace; the caller owns cleanup.

    Raises:
        FileNotFoundError: If ``SKILLS_DIR/<skill>/SKILL.md`` is not a file.
    """
    skill_src = SKILLS_DIR / skill
    if not (skill_src / "SKILL.md").is_file():
        raise FileNotFoundError(f"skill '{skill}' not found at {skill_src / 'SKILL.md'}")

    workspace = Path(tempfile.mkdtemp(prefix=f"behavioral-{skill}-"))
    try:
        dest = workspace / ".claude" / "skills" / skill
        dest.parent.mkdir(parents=True, exist_ok=True)
        shutil.copytree(skill_src, dest)
    except Exception:
        # The caller never receives the path on failure, so remove the
        # half-built workspace here rather than leaking it.
        shutil.rmtree(workspace, ignore_errors=True)
        raise
    return workspace
102+
103+
104+
def _run_agent(
    prompt_text: str,
    workspace: Path,
    model: str | None,
    effort: str | None,
    timeout: float | None = None,
) -> list[dict]:
    """Run the agent once in ``workspace`` and return the stream-json events.

    Args:
        prompt_text: Prompt passed to ``claude -p``.
        workspace: Directory used as the subprocess cwd and passed via
            ``--add-dir``.
        model: Model name passed via ``--model``; omitted when falsy.
        effort: Value passed via ``--effort``; omitted when falsy.
        timeout: Optional limit in seconds for the run. ``None`` (the
            default) waits indefinitely.

    Returns:
        The JSON objects parsed from stdout, one per line, in order. Lines
        that are not valid JSON, or that parse to something other than an
        object, are skipped.

    Raises:
        RuntimeError: If the `claude` CLI is not on PATH, the run exceeds
            ``timeout``, or no parseable event was produced.
    """
    claude_bin = shutil.which("claude")
    if not claude_bin:
        raise RuntimeError("'claude' CLI not found on PATH")

    cmd = [
        claude_bin, "-p", prompt_text,
        "--output-format", "stream-json", "--verbose",
        "--dangerously-skip-permissions",
        "--add-dir", str(workspace),
    ]
    if model:
        cmd += ["--model", model]
    if effort:
        cmd += ["--effort", effort]

    try:
        proc = subprocess.run(
            cmd, cwd=str(workspace), capture_output=True, text=True,
            encoding="utf-8", stdin=subprocess.DEVNULL, env=_claude_env(),
            timeout=timeout,
        )
    except subprocess.TimeoutExpired as exc:
        raise RuntimeError(f"claude agent run timed out after {timeout}s") from exc

    events: list[dict] = []
    for line in (proc.stdout or "").splitlines():
        line = line.strip()
        if not line:
            continue
        try:
            event = json.loads(line)
        except json.JSONDecodeError:
            continue
        # Consumers call ``.get`` on every event, so a bare JSON scalar or
        # array on a line must not be kept as an event.
        if isinstance(event, dict):
            events.append(event)

    if not events:
        raise RuntimeError(
            f"claude exited with code {proc.returncode} and produced no "
            f"parseable stream-json output. stderr:\n{proc.stderr}"
        )
    return events
142+
143+
144+
def _walk(obj: object, tool_uses: list[tuple[str, str]], tool_results: list[str]) -> None:
    """Collect (tool name, tool input) pairs and tool-result text from events.

    Recursively visits every dict and list nested in ``obj`` and appends to
    the two output lists in place.

    Args:
        obj: A parsed JSON value (an event or any nested part of one).
        tool_uses: Receives ``(name, input-as-JSON-string)`` for each dict
            whose ``"type"`` is ``"tool_use"``.
        tool_results: Receives the text of each dict whose ``"type"`` is
            ``"tool_result"``: the ``content`` string itself, or the
            ``"text"`` of each dict item when ``content`` is a list.
    """
    if isinstance(obj, dict):
        otype = obj.get("type")
        if otype == "tool_use":
            tool_uses.append(
                (str(obj.get("name", "")), json.dumps(obj.get("input", {}), ensure_ascii=False))
            )
        elif otype == "tool_result":
            content = obj.get("content")
            if isinstance(content, str):
                tool_results.append(content)
            elif isinstance(content, list):
                for c in content:
                    if isinstance(c, dict) and isinstance(c.get("text"), str):
                        tool_results.append(c["text"])
        # Always descend: tool blocks sit nested inside message content.
        for v in obj.values():
            _walk(v, tool_uses, tool_results)
    elif isinstance(obj, list):
        for v in obj:
            _walk(v, tool_uses, tool_results)
163+
164+
165+
def _list_workspace_files(workspace: Path) -> list[str]:
    """List the files under ``workspace``, excluding anything under ``.claude``.

    Args:
        workspace: Root directory to scan recursively.

    Returns:
        Paths relative to ``workspace``, with ``/`` as separator, in the
        sorted order of the underlying paths. Directories are not listed.
    """
    files: list[str] = []
    for p in sorted(workspace.rglob("*")):
        rel = p.relative_to(workspace)
        # Skip any path with a ``.claude`` component (the staged skill copy
        # lives there).
        if ".claude" in rel.parts:
            continue
        if p.is_file():
            files.append(str(rel).replace("\\", "/"))
    return files
173+
174+
175+
def _parse_judge_verdict(verdict_text: str) -> tuple[bool, str]:
    """Extract ``(passed, reason)`` from the judge's textual reply.

    A successfully graded verdict always yields a reason starting with
    ``"llm_judge: "``; every failure to obtain a verdict yields ``False``
    with a differently prefixed reason.
    """
    match = re.search(r"\{.*\}", verdict_text, re.DOTALL)
    if not match:
        return False, f"llm_judge gave no JSON verdict: {verdict_text[:200]!r}"
    try:
        verdict = json.loads(match.group(0))
    except json.JSONDecodeError:
        return False, f"llm_judge verdict not valid JSON: {match.group(0)[:200]!r}"

    passed = verdict.get("pass")
    # Accept the quoted forms "true"/"false"; plain bool() would treat the
    # non-empty string "false" as a pass.
    if isinstance(passed, str) and passed.strip().lower() in ("true", "false"):
        passed = passed.strip().lower() == "true"
    if not isinstance(passed, bool):
        return False, f"llm_judge verdict has no boolean 'pass': {match.group(0)[:200]!r}"

    reason = str(verdict.get("reason", "")).strip() or "(no reason given)"
    return passed, f"llm_judge: {reason}"


def _grade_with_llm(
    statement: str,
    run: "Run",
    judge_model: str | None,
    timeout: int = 180,
) -> tuple[bool, str]:
    """Ask a grader LLM whether ``statement`` is TRUE given the run's evidence.

    The grader may read files in the workspace (e.g. open out.png), so the
    workspace is added and tool permissions are bypassed for the grader too.

    Args:
        statement: Natural-language claim about the run to evaluate.
        run: The run under test; its ``workspace``, ``files``,
            ``tool_names``, ``result_text`` and ``command_text`` are used
            as evidence.
        judge_model: Model name passed via ``--model``; omitted when falsy.
        timeout: Seconds to wait for the grader subprocess.

    Returns:
        ``(passed, reason)``. When the judge produced a verdict, ``reason``
        starts with ``"llm_judge: "``. When grading could not be completed
        (CLI missing, launch failure, timeout, unparseable verdict),
        ``passed`` is ``False`` and ``reason`` describes the problem.
    """
    claude_bin = shutil.which("claude")
    if not claude_bin:
        return False, "llm_judge skipped: 'claude' CLI not on PATH"

    # Bound the prompt size: keep only the head of the tool transcript.
    cmd_text = run.command_text
    if len(cmd_text) > 4000:
        cmd_text = cmd_text[:4000] + "\n...[truncated]..."
    evidence = (
        f"Files in workspace: {run.files or 'none'}\n"
        f"Tools the agent used: {sorted(run.tool_names) or 'none'}\n"
        f"--- Agent final message ---\n{run.result_text[:1500]}\n"
        f"--- Transcript commands/outputs (truncated) ---\n{cmd_text}\n"
    )
    prompt_text = (
        "You are grading whether a coding agent's run satisfied a specific "
        "expectation. Decide if the following statement is TRUE based on the "
        "evidence and (if needed) by reading files in the provided workspace "
        f"directory: {run.workspace}\n\n"
        f"STATEMENT TO EVALUATE:\n{statement}\n\n"
        f"EVIDENCE:\n{evidence}\n\n"
        "Respond with ONLY a single-line JSON object and nothing else: "
        '{"pass": true|false, "reason": "<one short sentence>"}'
    )
    cmd = [
        claude_bin, "-p", prompt_text,
        "--output-format", "json",
        "--dangerously-skip-permissions",
        "--add-dir", str(run.workspace),
    ]
    if judge_model:
        cmd += ["--model", judge_model]

    try:
        proc = subprocess.run(
            cmd, capture_output=True, text=True, encoding="utf-8",
            stdin=subprocess.DEVNULL, timeout=timeout, env=_claude_env(),
        )
    except subprocess.TimeoutExpired:
        return False, f"llm_judge timed out after {timeout}s"
    except OSError as exc:
        return False, f"llm_judge skipped: could not launch 'claude': {exc}"

    raw = (proc.stdout or "").strip()
    try:
        payload = json.loads(raw)
    except json.JSONDecodeError:
        # Not the JSON envelope; search the raw output for a verdict.
        verdict_text = raw
    else:
        result = payload.get("result") if isinstance(payload, dict) else None
        # A missing or non-string "result" must not reach re.search.
        verdict_text = result if isinstance(result, str) else ""

    return _parse_judge_verdict(verdict_text)
238+
239+
240+
class Run:
    """The captured result of one agent run, with inline-asserting checks.

    Each check prints a ``[PASS]``/``[FAIL]`` line and raises ``AssertionError``
    on failure, so the owning pytest test fails at that line. Checks return
    ``self`` so they can be chained.
    """

    # Prefix `_grade_with_llm` puts on the reason when the judge actually
    # returned a verdict; its error paths (skipped, timed out, no/invalid
    # JSON) use other wording.
    _JUDGE_VERDICT_PREFIX = "llm_judge: "

    def __init__(self, *, workspace: Path, events: list[dict], judge_model: str | None) -> None:
        """Index ``events`` from a run that executed in ``workspace``.

        Args:
            workspace: Directory the agent ran in.
            events: Parsed stream-json events of the run.
            judge_model: Model used for ``should``/``should_not`` grading.
        """
        tool_uses: list[tuple[str, str]] = []
        tool_results: list[str] = []
        for ev in events:
            _walk(ev, tool_uses, tool_results)

        # The last "result" event carrying a string wins.
        result_text = ""
        for ev in events:
            if ev.get("type") == "result" and isinstance(ev.get("result"), str):
                result_text = ev["result"]

        self.workspace = workspace
        self.judge_model = judge_model
        self.files = _list_workspace_files(workspace)
        self.tool_names = {name for name, _ in tool_uses if name}
        self.result_text = result_text

        # `command_text` is what the agent actually did (tool inputs + outputs),
        # used by the judge so the agent's prose ("I won't call DALL-E") cannot
        # create false signals.
        self.command_text = "\n".join([inp for _, inp in tool_uses] + tool_results)

        # `logs` is the full raw transcript, searchable for skill activation,
        # tool names, command strings, etc.
        self.logs = "\n".join(json.dumps(ev, ensure_ascii=False) for ev in events)

    def logs_contains(self, text: str) -> "Run":
        """Assert that ``text`` occurs in the raw transcript (case-insensitive)."""
        ok = text.lower() in self.logs.lower()
        self._report(ok, "logs_contains", f"transcript contains '{text}'")
        return self

    def workspace_contains(self, path: str) -> "Run":
        """Assert that ``path`` (relative to the workspace) is an existing file."""
        ok = (self.workspace / path).is_file()
        detail = f"workspace contains '{path}'"
        if not ok:
            detail += f" (files: {self.files or 'none'})"
        self._report(ok, "workspace_contains", detail)
        return self

    def should(self, statement: str) -> "Run":
        """Assert that the LLM judge finds ``statement`` true of this run."""
        observed, reason = _grade_with_llm(statement, self, self.judge_model)
        self._report(observed, "should", f"{statement} -- {reason}")
        return self

    def should_not(self, statement: str) -> "Run":
        """Assert that the LLM judge finds ``statement`` false of this run.

        A judge that could not grade also reports ``False``; that must fail
        the check rather than be read as "the agent did not do it".
        """
        observed, reason = _grade_with_llm(statement, self, self.judge_model)
        graded = reason.startswith(self._JUDGE_VERDICT_PREFIX)
        self._report(graded and not observed, "should_not", f"{statement} -- {reason}")
        return self

    def _report(self, passed: bool, kind: str, detail: str) -> None:
        """Print the ``[PASS]``/``[FAIL]`` line and raise on failure."""
        print(f"  [{'PASS' if passed else 'FAIL'}] ({kind}) {detail}", flush=True)
        # Raise explicitly: a bare ``assert`` is stripped under ``python -O``.
        if not passed:
            raise AssertionError(f"({kind}) {detail}")
299+
300+
301+
class Agent:
    """A single agent session bound to an isolated, skill-staged workspace.

    Use as a context manager so the temp workspace is always cleaned up::

        with claude("sonnet", skill="local-ai-use") as agent:
            run = agent.prompt("...")
    """

    def __init__(
        self,
        model: str | None = DEFAULT_MODEL,
        *,
        skill: str = DEFAULT_SKILL,
        effort: str | None = DEFAULT_EFFORT,
    ) -> None:
        """Record the session settings; no workspace exists until entered.

        Args:
            model: Model used for the agent run and for judging its result.
            skill: Name of the skill staged into the workspace on entry.
            effort: Effort setting forwarded to the agent run.
        """
        self.model = model
        self.skill = skill
        self.effort = effort
        # Set by __enter__, cleared by __exit__.
        self.workspace: Path | None = None

    def __enter__(self) -> "Agent":
        """Stage the skill into a fresh temp workspace."""
        self.workspace = _stage_workspace(self.skill)
        return self

    def __exit__(self, *exc) -> None:
        """Delete the workspace (best effort); exceptions are not suppressed."""
        if self.workspace is not None:
            shutil.rmtree(self.workspace, ignore_errors=True)
            self.workspace = None

    def prompt(self, text: str) -> Run:
        """Run ``text`` through the agent once and return a Run to assert on.

        Raises:
            RuntimeError: If called outside the ``with`` block.
        """
        if self.workspace is None:
            raise RuntimeError("Agent.prompt() must be called inside a 'with' block")

        print(f"\n[behavioral] skill='{self.skill}' model='{self.model}': {text}", flush=True)
        events = _run_agent(text, self.workspace, self.model, self.effort)
        # The same model that ran the agent also judges the run.
        return Run(workspace=self.workspace, events=events, judge_model=self.model)
339+
340+
341+
def claude(
    model: str | None = DEFAULT_MODEL,
    *,
    skill: str = DEFAULT_SKILL,
    effort: str | None = DEFAULT_EFFORT,
) -> Agent:
    """Factory for a Claude-backed `Agent` (the only agent backend today).

    Args:
        model: Model for the agent run.
        skill: Skill to stage into the agent's workspace.
        effort: Effort setting for the agent run.

    Returns:
        An un-entered `Agent`; use it in a ``with`` statement.
    """
    return Agent(model, skill=skill, effort=effort)

eval/behavioral/pytest.ini

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
[pytest]
2+
# Behavioral tests are slow (they run a real agent + local models) and noisy,
3+
# so default to showing prints. Point pytest at this dir to pick up conftest.py.
4+
addopts = -s -ra
5+
testpaths = tests
6+
python_files = test_*.py
7+
python_functions = test_*

eval/behavioral/requirements.txt

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
# Behavioral tests need pytest. The harness itself uses only the standard
2+
# library; pytest is the test runner / reporter.
3+
#
4+
# pip install -r eval/behavioral/requirements.txt
5+
pytest>=8.0

0 commit comments

Comments
 (0)