Swival · jfoote · May 5, 2026 · jedisct1 · May 6, 2026
diff --git a/AGENTS.md b/AGENTS.md
@@ -16,7 +16,7 @@
 - Omitted matrix dimensions default to `agent_instructions=default`, `skills=none`, `mcp=none`, and `environment=base`. These defaults affect variant names and output paths.
 - Tasks are only discovered one level deep under `tasks_dir`, sorted alphabetically. Every direct child directory must contain non-empty `task.md`, `env/`, and executable `verify.sh` if present.
 - Trial workspaces are assembled in a fixed order: copy `env/`, then apply environment overlay, then copy `AGENTS.md`. Later steps override earlier files on conflicts.
-- With `[reviewer]`, Calibra runs `swival` CLI instead of Session mode and skips `verify.sh`. `verified` comes from reviewer outcomes, and reports add `review_rounds` plus `reviewer_verdict`.
+- With `[reviewer]`, Calibra runs the `swival` CLI instead of Session mode and skips `verify.sh` by default, so `verified` comes from the reviewer outcome. Set `verify = true` under `[reviewer]` to run both; both must pass for a trial to be verified. Reports add `review_rounds` plus `reviewer_verdict`. In both Session and reviewer modes, verify scripts receive `CALIBRA_*` environment variables (one per matrix dimension, plus `CALIBRA_VARIANT` for the full label).
 - `--resume` only skips a trial when `config_hash`, `task`, `variant`, and `repeat` all match the existing JSON. Config changes intentionally invalidate prior results.
 - Trial JSON lives at `results/<campaign>/<task>/<variant>_<repeat>.json`. Analysis writes `summary.json`, `summary.md`, and `summary.csv`, and web/cache code depends on that layout.
 - Every trial report must include a `calibra` block with `config_hash`, `task`, `variant`, `repeat`, `trial_seed`, `wall_time_s`, and `attempts`. Other modules read those fields back during resume, analysis, and web rendering.

diff --git a/calibra/config.py b/calibra/config.py
@@ -24,6 +24,7 @@ class ConfigError(Exception):
 class ReviewerConfig:
     command: str
     max_rounds: int = 5
+    verify: bool = False
 
 
 @dataclass
@@ -415,8 +416,12 @@ def load_campaign(path: str | Path) -> Campaign:
                 raise ConfigError(f"[reviewer] command executable not found: {executable}")
             resolved_exe = str(candidate)
 
+        rev_verify = reviewer_raw.get("verify", False)
+        if not isinstance(rev_verify, bool):
+            raise ConfigError("[reviewer] verify must be a boolean")
+
         rev_command_resolved = shlex.join([resolved_exe] + tokens[1:])
-        reviewer = ReviewerConfig(command=rev_command_resolved, max_rounds=rev_max_rounds)
+        reviewer = ReviewerConfig(command=rev_command_resolved, max_rounds=rev_max_rounds, verify=rev_verify)
 
     config_hash = compute_config_hash(raw)
 

diff --git a/calibra/runner.py b/calibra/runner.py
@@ -132,11 +132,18 @@ def setup_workspace(spec: TrialSpec, variant: Variant) -> Path:
     return tmpdir
 
 
-def run_verify(verify_script: Path, workdir: Path) -> bool:
+def run_verify(verify_script: Path, workdir: Path, spec: "TrialSpec | None" = None) -> bool:
+    env = os.environ.copy()
+    if spec:
+        dims = spec.variant.dim_labels()
+        for key, val in dims.items():
+            env[f"CALIBRA_{key.upper()}"] = val
+        env["CALIBRA_VARIANT"] = spec.variant.label
     try:
         result = subprocess.run(
             [str(verify_script)],
             cwd=str(workdir),
+            env=env,
             timeout=30,
             capture_output=True,
         )
@@ -178,6 +185,7 @@ def _resolve_mcp_servers(spec: TrialSpec) -> dict | None:
     "seed": "--seed",
     "trace_dir": "--trace-dir",
     "command_middleware": "--command-middleware",
+    "aws_profile": "--aws-profile",
 }
 _CLI_REPEAT_MAP = {
     "skills_dir": "--skills-dir",
@@ -364,7 +372,7 @@ def _run_session():
 
         verified = None
         if spec.task.verify_script:
-            verified = run_verify(spec.task.verify_script, tmpdir)
+            verified = run_verify(spec.task.verify_script, tmpdir, spec)
 
         failure_class = classify_failure(None, report, False, verified=verified)
 
@@ -415,6 +423,11 @@ def run_trial_cli(
     env, xdg_dir = _make_isolated_env()
     report_path = tmpdir / ".calibra-report.json"
 
+    reviewer_command = campaign.reviewer.command
+    criteria_path = spec.task.env_dir.parent / "criteria.md"
+    if criteria_path.is_file():
+        reviewer_command += f" --verify {criteria_path}"
+
     (tmpdir / "swival.toml").unlink(missing_ok=True)
 
     try:
@@ -435,7 +448,7 @@ def run_trial_cli(
             "--report",
             str(report_path),
             "--reviewer",
-            campaign.reviewer.command,
+            reviewer_command,
             "--max-review-rounds",
             str(campaign.reviewer.max_rounds),
         ]
@@ -447,6 +460,12 @@ def run_trial_cli(
         else:
             argv.append("--no-mcp")
 
+        if spec.variant.skills.skills_dirs:
+            for skills_dir in spec.variant.skills.skills_dirs:
+                argv.extend(["--skills-dir", str(skills_dir)])
+        else:
+            argv.append("--no-skills")
+
         yolo, session_opts = _resolve_yolo(merged_session_opts or {})
         if yolo:
             argv.append("--yolo")
@@ -484,6 +503,13 @@ def run_trial_cli(
             verdict = _reviewer_verdict(report)
             verified = verdict
 
+        if spec.task.verify_script and not timed_out and campaign.reviewer.verify:
+            script_passed = run_verify(spec.task.verify_script, tmpdir, spec)
+            if verified is None:
+                verified = script_passed
+            else:
+                verified = verified and script_passed
+
         failure_class = _classify_cli_failure(exit_code, stderr_text, report, timed_out, verified)
 
         return TrialResult(

diff --git a/docs.md/configuration.md b/docs.md/configuration.md
@@ -176,17 +176,19 @@ Each variant gets a label by joining dimension labels with underscores in a fixe
 
 ## [reviewer] section
 
-Enables Swival's reviewer feature. When configured, Calibra runs trials via the `swival` CLI instead of the Session API, passing `--reviewer` and `--report` flags. The reviewer receives the task prompt (`task.md`) as its goal — the same prompt the agent was given — so it can judge whether the agent's work satisfies the original task. The reviewer command runs after each agent answer; exit 0 means accept, exit 1 means retry with feedback, exit 2+ means error (treated as unverified). When a reviewer is active, `verify.sh` is skipped - the reviewer determines pass/fail.
+Enables Swival's reviewer feature. When configured, Calibra runs trials via the `swival` CLI instead of the Session API, passing `--reviewer` and `--report` flags. The reviewer receives the task prompt (`task.md`) as its goal — the same prompt the agent was given — so it can judge whether the agent's work satisfies the original task. The reviewer command runs after each agent answer; exit 0 means accept, exit 1 means retry with feedback, exit 2+ means error (treated as unverified). When a reviewer is active, `verify.sh` is skipped by default; set `verify = true` to run both (both must pass). If a task directory contains a `criteria.md` file (next to `env/`), Calibra appends `--verify <path>` to the reviewer command so the reviewer receives the pass criteria.
 
 | Field        | Type   | Default    | Description                                              |
 | ------------ | ------ | ---------- | -------------------------------------------------------- |
 | `command`    | string | *required* | Shell command for the reviewer executable                |
 | `max_rounds` | int    | `5`        | Maximum retry rounds (0 = run reviewer once, no retries) |
+| `verify`     | bool   | `false`    | Also run `verify.sh`; both must pass for `verified=true` |
 
 ```toml
 [reviewer]
 command = "./review.sh"
 max_rounds = 3
+verify = true
 ```
 
 The `command` is parsed with `shlex.split`, so arguments with spaces must be quoted. The first token is resolved as an executable (via `which` or relative to the config file directory). If the `[reviewer]` section is present, `command` must be provided - an empty section is an error.

diff --git a/docs.md/running.md b/docs.md/running.md
@@ -149,7 +149,7 @@ The CLI invocation includes `--report` (to get the full JSON report with timelin
 
 To prevent user or project config from leaking into trials, Calibra sets `XDG_CONFIG_HOME` to an empty temp directory (preventing `~/.config/swival/config.toml` from loading), deletes any `swival.toml` that may have been copied from the task's `env/` or overlay, and passes `--no-mcp` unless the variant has explicit MCP config.
 
-After the subprocess completes, Calibra reads the report JSON, determines `verified` from the last review event in the timeline (exit 0 = true, exit 1 = false, exit 2+ = null), and classifies failures using the report data first with a stderr fallback. `verify.sh` is skipped in reviewer mode.
+After the subprocess completes, Calibra reads the report JSON, determines `verified` from the last review event in the timeline (exit 0 = true, exit 1 = false, exit 2+ = null), and classifies failures using the report data first with a stderr fallback. `verify.sh` is skipped in reviewer mode by default (see `[reviewer] verify`).
 
 ## Monitoring progress
 

diff --git a/docs.md/tasks.md b/docs.md/tasks.md
@@ -10,6 +10,7 @@ tasks/my-task/
   env/          # Initial workspace files (required, can be empty)
   verify.sh     # Verification script (optional)
   meta.toml     # Task metadata (optional)
+  criteria.md   # Reviewer criteria (optional)
 ```
 
 ## task.md: the prompt
@@ -85,7 +86,7 @@ The entire `env/` tree is copied into a temporary directory for each trial, so t
 
 An optional executable shell script that checks whether the agent succeeded. Calibra runs it in the trial's workspace directory after the agent finishes. Exit code 0 means pass, anything else means fail. The script has a 30-second timeout; if the timeout expires or the script can't be executed (e.g., missing interpreter), it counts as a fail. The script's stdout and stderr are captured but not stored in the report. If `verify.sh` is not present, the trial won't have a `verified` field and pass rates can't be computed.
 
-When a campaign has a `[reviewer]` configured, `verify.sh` is skipped - the reviewer determines pass/fail instead. Tasks can include both `verify.sh` and be used with reviewer campaigns; the campaign config controls which verification method is used.
+When a campaign has a `[reviewer]` section configured, `verify.sh` is skipped by default — the reviewer determines pass/fail. Set `verify = true` under `[reviewer]` to run both; both must pass for a trial to be verified.
 
 Make sure the script is executable: