Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion AGENTS.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
- Omitted matrix dimensions default to `agent_instructions=default`, `skills=none`, `mcp=none`, and `environment=base`. These defaults affect variant names and output paths.
- Tasks are only discovered one level deep under `tasks_dir`, sorted alphabetically. Every direct child directory must contain a non-empty `task.md` and an `env/` directory (which may be empty); `verify.sh` is optional, but must be executable if present.
- Trial workspaces are assembled in a fixed order: copy `env/`, then apply environment overlay, then copy `AGENTS.md`. Later steps override earlier files on conflicts.
- With `[reviewer]`, Calibra runs `swival` CLI instead of Session mode and skips `verify.sh`. `verified` comes from reviewer outcomes, and reports add `review_rounds` plus `reviewer_verdict`.
- With `[reviewer]`, Calibra runs the `swival` CLI instead of Session mode and skips `verify.sh` by default. Set `verify = true` under `[reviewer]` to also run `verify.sh`; the reviewer and the script must both pass for a trial to be verified. Reports add `review_rounds` plus `reviewer_verdict`. Verify scripts receive `CALIBRA_*` environment variables (one per matrix dimension, plus `CALIBRA_VARIANT` for the full label).
- `--resume` only skips a trial when `config_hash`, `task`, `variant`, and `repeat` all match the existing JSON. Config changes intentionally invalidate prior results.
- Trial JSON lives at `results/<campaign>/<task>/<variant>_<repeat>.json`. Analysis writes `summary.json`, `summary.md`, and `summary.csv`, and web/cache code depends on that layout.
- Every trial report must include a `calibra` block with `config_hash`, `task`, `variant`, `repeat`, `trial_seed`, `wall_time_s`, and `attempts`. Other modules read those fields back during resume, analysis, and web rendering.
Expand Down
7 changes: 6 additions & 1 deletion calibra/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ class ConfigError(Exception):
class ReviewerConfig:
command: str
max_rounds: int = 5
verify: bool = False


@dataclass
Expand Down Expand Up @@ -415,8 +416,12 @@ def load_campaign(path: str | Path) -> Campaign:
raise ConfigError(f"[reviewer] command executable not found: {executable}")
resolved_exe = str(candidate)

rev_verify = reviewer_raw.get("verify", False)
if not isinstance(rev_verify, bool):
raise ConfigError("[reviewer] verify must be a boolean")

rev_command_resolved = shlex.join([resolved_exe] + tokens[1:])
reviewer = ReviewerConfig(command=rev_command_resolved, max_rounds=rev_max_rounds)
reviewer = ReviewerConfig(command=rev_command_resolved, max_rounds=rev_max_rounds, verify=rev_verify)

config_hash = compute_config_hash(raw)

Expand Down
32 changes: 29 additions & 3 deletions calibra/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,11 +132,18 @@ def setup_workspace(spec: TrialSpec, variant: Variant) -> Path:
return tmpdir


def run_verify(verify_script: Path, workdir: Path) -> bool:
def run_verify(verify_script: Path, workdir: Path, spec: "TrialSpec | None" = None) -> bool:
env = os.environ.copy()
if spec:
dims = spec.variant.dim_labels()
for key, val in dims.items():
env[f"CALIBRA_{key.upper()}"] = val
env["CALIBRA_VARIANT"] = spec.variant.label
try:
result = subprocess.run(
[str(verify_script)],
cwd=str(workdir),
env=env,
timeout=30,
capture_output=True,
)
Expand Down Expand Up @@ -178,6 +185,7 @@ def _resolve_mcp_servers(spec: TrialSpec) -> dict | None:
"seed": "--seed",
"trace_dir": "--trace-dir",
"command_middleware": "--command-middleware",
"aws_profile": "--aws-profile",
}
_CLI_REPEAT_MAP = {
"skills_dir": "--skills-dir",
Expand Down Expand Up @@ -364,7 +372,7 @@ def _run_session():

verified = None
if spec.task.verify_script:
verified = run_verify(spec.task.verify_script, tmpdir)
verified = run_verify(spec.task.verify_script, tmpdir, spec)

failure_class = classify_failure(None, report, False, verified=verified)

Expand Down Expand Up @@ -415,6 +423,11 @@ def run_trial_cli(
env, xdg_dir = _make_isolated_env()
report_path = tmpdir / ".calibra-report.json"

reviewer_command = campaign.reviewer.command
criteria_path = spec.task.env_dir.parent / "criteria.md"
if criteria_path.is_file():
reviewer_command += f" --verify {criteria_path}"

(tmpdir / "swival.toml").unlink(missing_ok=True)

try:
Expand All @@ -435,7 +448,7 @@ def run_trial_cli(
"--report",
str(report_path),
"--reviewer",
campaign.reviewer.command,
reviewer_command,
"--max-review-rounds",
str(campaign.reviewer.max_rounds),
]
Expand All @@ -447,6 +460,12 @@ def run_trial_cli(
else:
argv.append("--no-mcp")

if spec.variant.skills.skills_dirs:
for skills_dir in spec.variant.skills.skills_dirs:
argv.extend(["--skills-dir", str(skills_dir)])
else:
argv.append("--no-skills")

Comment on lines +463 to +468
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If this is already defined in [session], we will end up with duplicate flags.

yolo, session_opts = _resolve_yolo(merged_session_opts or {})
if yolo:
argv.append("--yolo")
Expand Down Expand Up @@ -484,6 +503,13 @@ def run_trial_cli(
verdict = _reviewer_verdict(report)
verified = verdict

if spec.task.verify_script and not timed_out and campaign.reviewer.verify:
script_passed = run_verify(spec.task.verify_script, tmpdir, spec)
if verified is None:
verified = script_passed
else:
verified = verified and script_passed

failure_class = _classify_cli_failure(exit_code, stderr_text, report, timed_out, verified)

return TrialResult(
Expand Down
4 changes: 3 additions & 1 deletion docs.md/configuration.md
Original file line number Diff line number Diff line change
Expand Up @@ -176,17 +176,19 @@ Each variant gets a label by joining dimension labels with underscores in a fixe

## [reviewer] section

Enables Swival's reviewer feature. When configured, Calibra runs trials via the `swival` CLI instead of the Session API, passing `--reviewer` and `--report` flags. The reviewer receives the task prompt (`task.md`) as its goal — the same prompt the agent was given — so it can judge whether the agent's work satisfies the original task. The reviewer command runs after each agent answer; exit 0 means accept, exit 1 means retry with feedback, exit 2+ means error (treated as unverified). When a reviewer is active, `verify.sh` is skipped - the reviewer determines pass/fail.
Enables Swival's reviewer feature. When configured, Calibra runs trials via the `swival` CLI instead of the Session API, passing `--reviewer` and `--report` flags. The reviewer receives the task prompt (`task.md`) as its goal — the same prompt the agent was given — so it can judge whether the agent's work satisfies the original task. The reviewer command runs after each agent answer; exit 0 means accept, exit 1 means retry with feedback, exit 2+ means error (treated as unverified). When a reviewer is active, `verify.sh` is skipped by default; set `verify = true` to run both (both must pass). If a task has a `criteria.md`, Calibra appends `--verify <path>` to the reviewer command so the reviewer receives the pass criteria.

| Field | Type | Default | Description |
| ------------ | ------ | ---------- | -------------------------------------------------------- |
| `command` | string | *required* | Shell command for the reviewer executable |
| `max_rounds` | int | `5` | Maximum retry rounds (0 = run reviewer once, no retries) |
| `verify` | bool | `false` | Also run `verify.sh`; both must pass for `verified=true` |

```toml
[reviewer]
command = "./review.sh"
max_rounds = 3
verify = true
```

The `command` is parsed with `shlex.split`, so arguments with spaces must be quoted. The first token is resolved as an executable (via `which` or relative to the config file directory). If the `[reviewer]` section is present, `command` must be provided - an empty section is an error.
Expand Down
2 changes: 1 addition & 1 deletion docs.md/running.md
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,7 @@ The CLI invocation includes `--report` (to get the full JSON report with timelin

To prevent user or project config from leaking into trials, Calibra sets `XDG_CONFIG_HOME` to an empty temp directory (preventing `~/.config/swival/config.toml` from loading), deletes any `swival.toml` that may have been copied from the task's `env/` or overlay, and passes `--no-mcp` unless the variant has explicit MCP config.

After the subprocess completes, Calibra reads the report JSON, determines `verified` from the last review event in the timeline (exit 0 = true, exit 1 = false, exit 2+ = null), and classifies failures using the report data first with a stderr fallback. `verify.sh` is skipped in reviewer mode.
After the subprocess completes, Calibra reads the report JSON, determines `verified` from the last review event in the timeline (exit 0 = true, exit 1 = false, exit 2+ = null), and classifies failures using the report data first with a stderr fallback. `verify.sh` is skipped in reviewer mode by default (see `[reviewer] verify`).

## Monitoring progress

Expand Down
3 changes: 2 additions & 1 deletion docs.md/tasks.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ tasks/my-task/
env/ # Initial workspace files (required, can be empty)
verify.sh # Verification script (optional)
meta.toml # Task metadata (optional)
criteria.md # Reviewer criteria (optional)
```

## task.md: the prompt
Expand Down Expand Up @@ -85,7 +86,7 @@ The entire `env/` tree is copied into a temporary directory for each trial, so t

An optional executable shell script that checks whether the agent succeeded. Calibra runs it in the trial's workspace directory after the agent finishes. Exit code 0 means pass, anything else means fail. The script has a 30-second timeout; if the timeout expires or the script can't be executed (e.g., missing interpreter), it counts as a fail. The script's stdout and stderr are captured but not stored in the report. If `verify.sh` is not present, the trial won't have a `verified` field and pass rates can't be computed.

When a campaign has a `[reviewer]` configured, `verify.sh` is skipped - the reviewer determines pass/fail instead. Tasks can include both `verify.sh` and be used with reviewer campaigns; the campaign config controls which verification method is used.
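When a campaign has a `[reviewer]` configured, `verify.sh` is skipped by default — the reviewer determines pass/fail. Set `verify = true` under `[reviewer]` to run `verify.sh` as well; the reviewer and the script must both pass for a trial to be verified. In that mode the script is not run if the trial timed out, and if the reviewer produced no verdict (for example, it exited with an error), the script's result alone determines `verified`.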

Make sure the script is executable:

Expand Down