|
| 1 | +"""Behavioral-test harness for repo skills (local, pytest-based, non-CI). |
| 2 | +
|
| 3 | +A behavioral test runs a skill-driven prompt through the agent **once**, then |
| 4 | +asserts what the agent *should* and *should not* have done. Tests read like: |
| 5 | +
|
| 6 | + from harness import claude |
| 7 | +
|
| 8 | + def test_image_generation(): |
| 9 | + with claude("sonnet", skill="local-ai-use") as agent: |
| 10 | + run = agent.prompt("Use local AI, then generate a cat to out.png.") |
| 11 | +
|
| 12 | + # Deterministic checks (cheap, fail fast). |
| 13 | + run.logs_contains("local-ai-use") |
| 14 | + run.workspace_contains("out.png") |
| 15 | +
|
| 16 | + # Natural-language expectations (graded by an LLM judge). |
| 17 | + run.should("Download the SD-Turbo model") |
| 18 | + run.should_not("Use the GenerateImage tool") |
| 19 | +
|
| 20 | +`claude(model, skill=...)` returns an `Agent` context manager. Entering it |
| 21 | +stages an isolated temp workspace (skill copied under |
| 22 | +`<tmp>/.claude/skills/<skill>/`); leaving it deletes that workspace. `prompt()` |
| 23 | +runs the agent once with tool permissions bypassed and returns a `Run`. |
| 24 | +
|
| 25 | +Every assertion on `Run` raises `AssertionError` on failure and prints a |
| 26 | +`[PASS]`/`[FAIL]` line for visibility under `-s`. |
| 27 | +""" |
| 28 | + |
| 29 | +from __future__ import annotations |
| 30 | + |
| 31 | +import json |
| 32 | +import os |
| 33 | +import re |
| 34 | +import shutil |
| 35 | +import subprocess |
| 36 | +import sys |
| 37 | +import tempfile |
| 38 | +from pathlib import Path |
| 39 | + |
| 40 | +sys.path.insert(0, str(Path(__file__).resolve().parent.parent)) |
| 41 | +from claude_eval import SKILLS_DIR # noqa: E402 |
| 42 | + |
# Suite-wide defaults. Each one can be overridden from the environment so a
# local run can target another skill/model/effort without editing the tests.
DEFAULT_SKILL = os.getenv("BEHAVIORAL_SKILL", "local-ai-use")
DEFAULT_MODEL = os.getenv("BEHAVIORAL_MODEL", "sonnet")
DEFAULT_EFFORT = os.getenv("BEHAVIORAL_EFFORT", "high")
| 46 | + |
| 47 | + |
| 48 | +def _claude_env() -> dict[str, str]: |
| 49 | + """Environment for `claude` subprocesses. |
| 50 | +
|
| 51 | + Disable the CLI's internal retry loop by default so a network/auth |
| 52 | + problem (e.g. not connected to the network that can reach the API) |
| 53 | + fails fast instead of being retried into a long, confusing hang. The |
| 54 | + caller can still override by exporting ``CLAUDE_CODE_MAX_RETRIES``. |
| 55 | + """ |
| 56 | + env = dict(os.environ) |
| 57 | + env.setdefault("CLAUDE_CODE_MAX_RETRIES", "0") |
| 58 | + return env |
| 59 | + |
| 60 | + |
def check_api_reachable(model: str | None = DEFAULT_MODEL, timeout: int = 60) -> tuple[bool, str]:
    """Preflight: confirm the `claude` CLI can actually reach the API.

    Runs a trivial prompt (with the environment from ``_claude_env()``, i.e.
    retries disabled by default) so an unreachable API fails fast. Meant to be
    called once before the expensive behavioral runs so the suite can skip
    cleanly when off-network.

    Args:
        model: Optional model name forwarded as ``--model``; omitted if falsy.
        timeout: Seconds to wait for the CLI before giving up.

    Returns:
        ``(ok, detail)``. ``detail`` is ``"ok"`` on success, otherwise a short
        human-readable reason (at most 500 characters). This function reports
        launch failures and timeouts through the return value rather than
        raising.
    """
    claude_bin = shutil.which("claude")
    if not claude_bin:
        return False, "'claude' CLI not found on PATH"

    cmd = [claude_bin, "-p", "Reply with the single word: ok", "--output-format", "json"]
    if model:
        cmd += ["--model", model]

    try:
        proc = subprocess.run(
            cmd, capture_output=True, text=True, encoding="utf-8",
            stdin=subprocess.DEVNULL, timeout=timeout, env=_claude_env(),
        )
    except subprocess.TimeoutExpired:
        return False, f"API preflight timed out after {timeout}s (is the network reachable?)"
    except OSError as err:
        # The binary was found but could not be executed (permissions, bad
        # interpreter, ...). Keep the "(ok, detail)" contract instead of
        # crashing the preflight.
        return False, f"failed to launch 'claude': {err}"

    if proc.returncode != 0:
        # Strip each stream *before* choosing, so a whitespace-only stderr
        # does not hide a useful stdout message or yield an empty detail.
        detail = (
            (proc.stderr or "").strip()
            or (proc.stdout or "").strip()
            or f"exit code {proc.returncode}"
        )
        return False, detail[:500]
    return True, "ok"
| 89 | + |
| 90 | + |
def _stage_workspace(skill: str) -> Path:
    """Copy ``skill`` into an isolated temp workspace and return its path.

    The skill directory ``SKILLS_DIR / skill`` is copied to
    ``<workspace>/.claude/skills/<skill>/`` inside a fresh temporary
    directory. The caller owns the returned directory and must delete it.

    Raises:
        FileNotFoundError: if the skill has no ``SKILL.md``.
        OSError: if the copy fails; the partially staged temp directory is
            removed before the error propagates.
    """
    skill_src = SKILLS_DIR / skill
    if not (skill_src / "SKILL.md").is_file():
        raise FileNotFoundError(f"skill '{skill}' not found at {skill_src / 'SKILL.md'}")

    # NOTE(review): the workspace directory name embeds the skill name, so any
    # transcript text that mentions the workspace path also contains the skill
    # name. Keep that in mind when asserting on the skill name via a substring
    # search of the transcript.
    workspace = Path(tempfile.mkdtemp(prefix=f"behavioral-{skill}-"))
    try:
        dest = workspace / ".claude" / "skills" / skill
        dest.parent.mkdir(parents=True, exist_ok=True)
        shutil.copytree(skill_src, dest)
    except BaseException:
        # Nobody else knows about this directory yet, so clean it up here
        # (including on Ctrl-C) rather than leaking it in the temp dir.
        shutil.rmtree(workspace, ignore_errors=True)
        raise
    return workspace
| 102 | + |
| 103 | + |
| 104 | +def _run_agent(prompt_text: str, workspace: Path, model: str | None, effort: str | None) -> list[dict]: |
| 105 | + """Run the agent once in ``workspace`` and return the stream-json events.""" |
| 106 | + claude_bin = shutil.which("claude") |
| 107 | + if not claude_bin: |
| 108 | + raise RuntimeError("'claude' CLI not found on PATH") |
| 109 | + |
| 110 | + cmd = [ |
| 111 | + claude_bin, "-p", prompt_text, |
| 112 | + "--output-format", "stream-json", "--verbose", |
| 113 | + "--dangerously-skip-permissions", |
| 114 | + "--add-dir", str(workspace), |
| 115 | + ] |
| 116 | + if model: |
| 117 | + cmd += ["--model", model] |
| 118 | + if effort: |
| 119 | + cmd += ["--effort", effort] |
| 120 | + |
| 121 | + proc = subprocess.run( |
| 122 | + cmd, cwd=str(workspace), capture_output=True, text=True, |
| 123 | + encoding="utf-8", stdin=subprocess.DEVNULL, env=_claude_env(), |
| 124 | + ) |
| 125 | + |
| 126 | + events: list[dict] = [] |
| 127 | + for line in (proc.stdout or "").splitlines(): |
| 128 | + line = line.strip() |
| 129 | + if not line: |
| 130 | + continue |
| 131 | + try: |
| 132 | + events.append(json.loads(line)) |
| 133 | + except json.JSONDecodeError: |
| 134 | + continue |
| 135 | + |
| 136 | + if not events: |
| 137 | + raise RuntimeError( |
| 138 | + f"claude exited with code {proc.returncode} and produced no " |
| 139 | + f"parseable stream-json output. stderr:\n{proc.stderr}" |
| 140 | + ) |
| 141 | + return events |
| 142 | + |
| 143 | + |
| 144 | +def _walk(obj, tool_uses, tool_results) -> None: |
| 145 | + """Collect (tool name, tool input) pairs and tool-result text from events.""" |
| 146 | + if isinstance(obj, dict): |
| 147 | + otype = obj.get("type") |
| 148 | + if otype == "tool_use": |
| 149 | + tool_uses.append((str(obj.get("name", "")), json.dumps(obj.get("input", {}), ensure_ascii=False))) |
| 150 | + elif otype == "tool_result": |
| 151 | + content = obj.get("content") |
| 152 | + if isinstance(content, str): |
| 153 | + tool_results.append(content) |
| 154 | + elif isinstance(content, list): |
| 155 | + for c in content: |
| 156 | + if isinstance(c, dict) and isinstance(c.get("text"), str): |
| 157 | + tool_results.append(c["text"]) |
| 158 | + for v in obj.values(): |
| 159 | + _walk(v, tool_uses, tool_results) |
| 160 | + elif isinstance(obj, list): |
| 161 | + for v in obj: |
| 162 | + _walk(v, tool_uses, tool_results) |
| 163 | + |
| 164 | + |
| 165 | +def _list_workspace_files(workspace: Path) -> list[str]: |
| 166 | + files: list[str] = [] |
| 167 | + for p in sorted(workspace.rglob("*")): |
| 168 | + if ".claude" in p.relative_to(workspace).parts: |
| 169 | + continue |
| 170 | + if p.is_file(): |
| 171 | + files.append(str(p.relative_to(workspace)).replace("\\", "/")) |
| 172 | + return files |
| 173 | + |
| 174 | + |
| 175 | +def _grade_with_llm(statement: str, run: "Run", judge_model: str | None) -> tuple[bool, str]: |
| 176 | + """Ask a grader LLM whether ``statement`` is TRUE given the run's evidence. |
| 177 | +
|
| 178 | + The grader may read files in the workspace (e.g. open out.png), so the |
| 179 | + workspace is added and tool permissions are bypassed for the grader too. |
| 180 | + """ |
| 181 | + claude_bin = shutil.which("claude") |
| 182 | + if not claude_bin: |
| 183 | + return False, "llm_judge skipped: 'claude' CLI not on PATH" |
| 184 | + |
| 185 | + cmd_text = run.command_text |
| 186 | + if len(cmd_text) > 4000: |
| 187 | + cmd_text = cmd_text[:4000] + "\n...[truncated]..." |
| 188 | + evidence = ( |
| 189 | + f"Files in workspace: {run.files or 'none'}\n" |
| 190 | + f"Tools the agent used: {sorted(run.tool_names) or 'none'}\n" |
| 191 | + f"--- Agent final message ---\n{run.result_text[:1500]}\n" |
| 192 | + f"--- Transcript commands/outputs (truncated) ---\n{cmd_text}\n" |
| 193 | + ) |
| 194 | + prompt_text = ( |
| 195 | + "You are grading whether a coding agent's run satisfied a specific " |
| 196 | + "expectation. Decide if the following statement is TRUE based on the " |
| 197 | + "evidence and (if needed) by reading files in the provided workspace " |
| 198 | + f"directory: {run.workspace}\n\n" |
| 199 | + f"STATEMENT TO EVALUATE:\n{statement}\n\n" |
| 200 | + f"EVIDENCE:\n{evidence}\n\n" |
| 201 | + "Respond with ONLY a single-line JSON object and nothing else: " |
| 202 | + '{"pass": true|false, "reason": "<one short sentence>"}' |
| 203 | + ) |
| 204 | + cmd = [ |
| 205 | + claude_bin, "-p", prompt_text, |
| 206 | + "--output-format", "json", |
| 207 | + "--dangerously-skip-permissions", |
| 208 | + "--add-dir", str(run.workspace), |
| 209 | + ] |
| 210 | + if judge_model: |
| 211 | + cmd += ["--model", judge_model] |
| 212 | + |
| 213 | + try: |
| 214 | + proc = subprocess.run( |
| 215 | + cmd, capture_output=True, text=True, encoding="utf-8", |
| 216 | + stdin=subprocess.DEVNULL, timeout=180, env=_claude_env(), |
| 217 | + ) |
| 218 | + except subprocess.TimeoutExpired: |
| 219 | + return False, "llm_judge timed out after 180s" |
| 220 | + |
| 221 | + try: |
| 222 | + payload = json.loads((proc.stdout or "").strip()) |
| 223 | + verdict_text = payload.get("result", "") if isinstance(payload, dict) else "" |
| 224 | + except json.JSONDecodeError: |
| 225 | + verdict_text = (proc.stdout or "").strip() |
| 226 | + |
| 227 | + match = re.search(r"\{.*\}", verdict_text, re.DOTALL) |
| 228 | + if not match: |
| 229 | + return False, f"llm_judge gave no JSON verdict: {verdict_text[:200]!r}" |
| 230 | + try: |
| 231 | + verdict = json.loads(match.group(0)) |
| 232 | + except json.JSONDecodeError: |
| 233 | + return False, f"llm_judge verdict not valid JSON: {match.group(0)[:200]!r}" |
| 234 | + |
| 235 | + passed = bool(verdict.get("pass")) |
| 236 | + reason = str(verdict.get("reason", "")).strip() or "(no reason given)" |
| 237 | + return passed, f"llm_judge: {reason}" |
| 238 | + |
| 239 | + |
class Run:
    """The captured result of one agent run, with inline-asserting checks.

    Each check prints a ``[PASS]``/``[FAIL]`` line and raises ``AssertionError``
    on failure, so the owning pytest test fails at that line. Checks return
    ``self`` so they can be chained.
    """

    def __init__(self, *, workspace: Path, events: list[dict], judge_model: str | None) -> None:
        """Index ``events`` and snapshot the workspace file list.

        Args:
            workspace: Directory the agent ran in.
            events: Parsed stream-json events from the run.
            judge_model: Model used by ``should`` / ``should_not`` grading.
        """
        tool_uses: list[tuple[str, str]] = []
        tool_results: list[str] = []
        for ev in events:
            _walk(ev, tool_uses, tool_results)

        # The last "result" event carrying a string wins. Non-dict entries
        # are skipped so a stray scalar cannot crash construction.
        result_text = ""
        for ev in events:
            if isinstance(ev, dict) and ev.get("type") == "result" and isinstance(ev.get("result"), str):
                result_text = ev["result"]

        self.workspace = workspace
        self.judge_model = judge_model
        # Snapshot taken at construction time; used for failure messages and
        # as judge evidence.
        self.files = _list_workspace_files(workspace)
        self.tool_names = {name for name, _ in tool_uses if name}
        self.result_text = result_text

        # `command_text` is what the agent actually did (tool inputs + outputs),
        # used by the judge so the agent's prose ("I won't call DALL-E") cannot
        # create false signals.
        self.command_text = "\n".join([inp for _, inp in tool_uses] + tool_results)

        # `logs` is the full raw transcript, searchable for skill activation,
        # tool names, command strings, etc.
        self.logs = "\n".join(json.dumps(ev, ensure_ascii=False) for ev in events)

    def logs_contains(self, text: str) -> "Run":
        """Assert that ``text`` occurs in the raw transcript (case-insensitive)."""
        ok = text.lower() in self.logs.lower()
        self._report(ok, "logs_contains", f"transcript contains '{text}'")
        return self

    def workspace_contains(self, path: str) -> "Run":
        """Assert that ``path`` (relative to the workspace) is an existing file."""
        ok = (self.workspace / path).is_file()
        detail = f"workspace contains '{path}'"
        if not ok:
            detail += f" (files: {self.files or 'none'})"
        self._report(ok, "workspace_contains", detail)
        return self

    def should(self, statement: str) -> "Run":
        """Assert that the LLM judge finds ``statement`` TRUE for this run."""
        observed, reason = _grade_with_llm(statement, self, self.judge_model)
        self._report(observed, "should", f"{statement} -- {reason}")
        return self

    def should_not(self, statement: str) -> "Run":
        """Assert that the LLM judge finds ``statement`` FALSE for this run.

        A judge failure (missing CLI, timeout, unparseable answer) is reported
        by the grader as ``False``. Negating that blindly would turn every
        infrastructure problem into a passing ``should_not``, so the check
        only passes when the judge actually delivered a verdict.
        """
        observed, reason = _grade_with_llm(statement, self, self.judge_model)
        judged = self._is_verdict(reason)
        if not judged:
            reason += " (no verdict, so the statement cannot be confirmed false)"
        self._report(judged and not observed, "should_not", f"{statement} -- {reason}")
        return self

    @staticmethod
    def _is_verdict(reason: str) -> bool:
        """Tell a real judge verdict from a grading failure.

        The grader prefixes genuine verdicts with ``"llm_judge: "``; its
        failure messages start with ``"llm_judge "`` followed by a description
        (``skipped``, ``timed out``, ...), never with the colon form.
        """
        return reason.startswith("llm_judge: ")

    def _report(self, passed: bool, kind: str, detail: str) -> None:
        """Print the outcome line and raise ``AssertionError`` if it failed."""
        print(f"  [{'PASS' if passed else 'FAIL'}] ({kind}) {detail}", flush=True)
        # Raise explicitly: a bare ``assert`` is stripped under ``python -O``,
        # which would silently turn every check into a no-op.
        if not passed:
            raise AssertionError(f"({kind}) {detail}")
| 299 | + |
| 300 | + |
class Agent:
    """One agent session tied to an isolated workspace with the skill staged.

    Intended to be used as a context manager, which guarantees the temporary
    workspace is removed afterwards::

        with claude("sonnet", skill="local-ai-use") as agent:
            run = agent.prompt("...")
    """

    def __init__(
        self,
        model: str | None = DEFAULT_MODEL,
        *,
        skill: str = DEFAULT_SKILL,
        effort: str | None = DEFAULT_EFFORT,
    ) -> None:
        """Remember the run settings; no workspace exists until ``__enter__``."""
        self.model, self.skill, self.effort = model, skill, effort
        self.workspace: Path | None = None

    def __enter__(self) -> "Agent":
        """Stage the skill into a fresh temp workspace."""
        self.workspace = _stage_workspace(self.skill)
        return self

    def __exit__(self, *exc) -> None:
        """Delete the staged workspace, if there is one."""
        if self.workspace is None:
            return
        shutil.rmtree(self.workspace, ignore_errors=True)
        self.workspace = None

    def prompt(self, text: str) -> Run:
        """Send ``text`` to the agent once and wrap the outcome in a ``Run``.

        Raises:
            RuntimeError: when called outside the ``with`` block, i.e. while
                no workspace is staged.
        """
        staged = self.workspace
        if staged is None:
            raise RuntimeError("Agent.prompt() must be called inside a 'with' block")

        banner = f"\n[behavioral] skill='{self.skill}' model='{self.model}': {text}"
        print(banner, flush=True)
        transcript = _run_agent(text, staged, self.model, self.effort)
        # The judge uses the same model as the agent under test.
        return Run(workspace=staged, events=transcript, judge_model=self.model)
| 339 | + |
| 340 | + |
def claude(
    model: str | None = DEFAULT_MODEL,
    *,
    skill: str = DEFAULT_SKILL,
    effort: str | None = DEFAULT_EFFORT,
) -> Agent:
    """Create a Claude-backed `Agent` (currently the only agent backend).

    The arguments are passed straight through to `Agent`; use the result as a
    context manager.
    """
    agent = Agent(model, skill=skill, effort=effort)
    return agent
0 commit comments