Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .github/workflows/behavioral.yml
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,8 @@ jobs:
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
# Lets the harness default to this skill if a test relies on the env.
BEHAVIORAL_SKILL: ${{ matrix.skill }}
# Cost cap: sonnet only. The harness also enforces this under CI.
BEHAVIORAL_MODEL: sonnet
run: |
set -euo pipefail
test_file="tests/test_$(echo '${{ matrix.skill }}' | tr '-' '_').py"
Expand Down
30 changes: 29 additions & 1 deletion eval/behavioral/harness.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,32 @@ def test_image_generation():
# Default model and effort level, each overridable through its
# BEHAVIORAL_* environment variable.
DEFAULT_MODEL = os.getenv("BEHAVIORAL_MODEL", "sonnet")
DEFAULT_EFFORT = os.getenv("BEHAVIORAL_EFFORT", "high")

# Cost cap for automated runs. A behavioral run makes real cloud calls (the
# agent run plus the LLM judge), so a workflow that selects an expensive model
# can quietly run up a large bill. Under CI the model is therefore pinned to
# sonnet, and there is deliberately no override.
AUTOMATED_MODEL = "sonnet"
# Environment-variable values (compared after strip + lower-casing) that count
# as "enabled".
_TRUTHY = {"1", "true", "yes", "on"}


def _is_automated_env() -> bool:
    """Report whether this process is running under CI / an automated workflow.

    Returns True as soon as either the ``CI`` or the ``GITHUB_ACTIONS``
    environment variable holds a truthy value (GitHub Actions sets both);
    returns False when neither does. Values are compared after stripping
    whitespace and lower-casing.
    """
    for var_name in ("CI", "GITHUB_ACTIONS"):
        raw_value = os.environ.get(var_name, "")
        if raw_value.strip().lower() in _TRUTHY:
            return True
    return False


def _enforce_model_policy(model: str | None) -> str | None:
    """Apply the automated-run model cap to ``model``.

    Outside an automated environment the argument is returned unchanged.
    Under CI, any model whose name does not contain "sonnet"
    (case-insensitive) is replaced by ``AUTOMATED_MODEL`` and a notice is
    printed; sonnet-family names are returned as given.

    Args:
        model: Requested model name, or None for "no explicit model".

    Returns:
        The model name to actually use, or None when None was passed in.
    """
    # NOTE(review): None is passed through even in CI, so the caller ends up
    # with no explicit model -- confirm that whatever default then applies
    # cannot exceed the cap.
    if model is None:
        return model
    if not _is_automated_env():
        return model
    # Substring match, so full identifiers that include "sonnet" are accepted.
    if "sonnet" in model.lower():
        return model

    notice = (
        f"[behavioral] automated run: coercing model '{model}' -> "
        f"'{AUTOMATED_MODEL}' to cap token usage."
    )
    print(notice, flush=True)
    return AUTOMATED_MODEL


def _claude_env() -> dict[str, str]:
"""Environment for `claude` subprocesses.
Expand All @@ -70,6 +96,7 @@ def check_api_reachable(model: str | None = DEFAULT_MODEL, timeout: int = 60) ->
if not claude_bin:
return False, "'claude' CLI not found on PATH"

model = _enforce_model_policy(model)
cmd = [claude_bin, "-p", "Reply with the single word: ok", "--output-format", "json"]
if model:
cmd += ["--model", model]
Expand Down Expand Up @@ -314,7 +341,8 @@ def __init__(
skill: str = DEFAULT_SKILL,
effort: str | None = DEFAULT_EFFORT,
) -> None:
self.model = model
# Coerce here so the agent run and the LLM judge share the capped model.
self.model = _enforce_model_policy(model)
self.skill = skill
self.effort = effort
self.workspace: Path | None = None
Expand Down
Loading