sipyourdrink-ltd
diff --git a/‎docs/api/supervisor.md‎
Lines changed: 156 additions & 0 deletions b/‎docs/api/supervisor.md‎
Lines changed: 156 additions & 0 deletions
diff --git a/‎src/bernstein/cli/commands/fleet_cmd.py‎
Lines changed: 30 additions & 0 deletions b/‎src/bernstein/cli/commands/fleet_cmd.py‎
Lines changed: 30 additions & 0 deletions
diff --git a/‎src/bernstein/cli/commands/status_cmd.py‎
Lines changed: 50 additions & 0 deletions b/‎src/bernstein/cli/commands/status_cmd.py‎
Lines changed: 50 additions & 0 deletions
@@ -0,0 +1,156 @@
+# Supervisor surface
+
+This document describes the JSON shapes the `bernstein supervisor`
+command emits. Two surfaces are documented:
+
+1. **Aggregated supervisor snapshot** - the body returned by
+   `bernstein supervisor status --json` and embedded as the
+   `supervisor` field in `bernstein status --json` (an empty object
+   there when the aggregator fails).
+2. **Signed escalation receipt** - the envelope persisted under
+   `.sdd/runtime/supervisor/receipts/` whenever the supervisor or the
+   operator escalates a stalled worker.
+
+Both shapes are versioned via an explicit `schema_version` field.
+
+## Supervisor snapshot
+
+```jsonc
+{
+  "schema_version": "1.0.0",
+  "generated_ts": 1700000000.0,
+  "stuck_count": 2,
+  "oldest_stall_age_s": 95.0,
+  "workers": [
+    {
+      "worker_id": "abc123def456",
+      "session_id": "sess-abc123",
+      "role": "backend",
+      "task_id": "t-12",
+      "worktree_id": "wt-007",
+      "last_heartbeat_age_s": 42.0,
+      "is_stuck": false,
+      "stall_reason": "unknown",
+      "recommended_action": "inspect",
+      "respawn_budget_remaining": 3,
+      "stuck_since_ts": null,
+      "details": {"status": "working"}
+    }
+    // ...remaining workers (including the two stuck ones) elided
+  ]
+}
+```
+
+### Fields
+
+| Field | Type | Description |
+|-------|------|-------------|
+| `schema_version` | string | Aggregator schema version. Currently `1.0.0`. |
+| `generated_ts` | float | Unix timestamp the snapshot was captured. |
+| `stuck_count` | integer | Number of workers with `is_stuck=true`. |
+| `oldest_stall_age_s` | float \| null | Age, in seconds, of the oldest currently-stuck worker; `null` when no worker is stuck or no stall timestamp is available. |
+| `workers[].worker_id` | string | Operator-decodable worker handle. |
+| `workers[].session_id` | string | Adapter session id. |
+| `workers[].role` | string | Worker role (`manager`, `backend`, `qa`, ...). |
+| `workers[].task_id` | string | Current task id, or empty string. |
+| `workers[].worktree_id` | string | Worktree the worker is running in. |
+| `workers[].last_heartbeat_age_s` | float \| null | Seconds since the last heartbeat; `null` when none recorded. |
+| `workers[].is_stuck` | bool | True iff at least one detector classifies the row as stuck. |
+| `workers[].stall_reason` | string | One of `manager_no_children`, `watchdog_model_question`, `respawn_budget_exhausted`, `heartbeat_stale`, `no_progress`, or `unknown`. |
+| `workers[].recommended_action` | string | One of `respawn`, `escalate`, `park`, `inspect`. Deterministic over the chain slice (see below). |
+| `workers[].respawn_budget_remaining` | integer | Respawns remaining under the session's budget. |
+| `workers[].stuck_since_ts` | float \| null | Unix timestamp the stall first fired; `null` when not known. |
+| `workers[].details` | object | Free-form detector context. The aggregator currently includes `status` (raw agent status). |
+
+## Escalation receipt envelope
+
+```jsonc
+{
+  "schema_version": "1.0.0",
+  "worker_id": "abc123def456",
+  "worktree_id": "wt-007",
+  "session_id": "sess-abc123",
+  "stall_reason": "manager_no_children",
+  "recommended_action": "escalate",
+  "audit_entries": [
+    {
+      "event_type": "stalled_manager",
+      "session_id": "sess-abc123",
+      "details": {"runtime_s": 120.0, "hook_event_count": 12}
+    }
+  ],
+  "identity": {
+    "install_rev": "abc123def4567890",
+    "keyid": "...64 hex chars...",
+    "run_id": "run-2026-05-21-001"
+  },
+  "prev_chain_digest": "...64 hex chars...",
+  "payload_digest": "...64 hex chars...",
+  "signature_b64": "...base64 Ed25519 signature...",
+  "details": {
+    "operator_reason": "wedged on credential rotation",
+    "respawn_budget_remaining": 0
+  }
+}
+```
+
+### Receipt fields
+
+| Field | Type | Description |
+|-------|------|-------------|
+| `schema_version` | string | Receipt schema version. Currently `1.0.0`. |
+| `worker_id` | string | Stable worker identifier. |
+| `worktree_id` | string | Worktree the worker was running in. |
+| `session_id` | string | Adapter session id. |
+| `stall_reason` | string | Structured stall reason - same vocabulary as the aggregator. |
+| `recommended_action` | string | Deterministic action - same vocabulary as the aggregator. |
+| `audit_entries` | array of object | Captured chain slice (default 16 trailing entries) leading up to the stall. |
+| `identity.install_rev` | string | Operator-decodable install fingerprint. |
+| `identity.keyid` | string | sha256 of the Ed25519 public key (hex). |
+| `identity.run_id` | string | Orchestrator run id, when known. |
+| `prev_chain_digest` | string | HMAC of the previous audit-chain entry. Links the receipt into the tamper-evident audit log. |
+| `payload_digest` | string | sha256 of the canonical signing payload. Lets verifiers detect a swapped signature blob. |
+| `signature_b64` | string | base64-encoded Ed25519 signature over the canonical payload. |
+| `details` | object | Free-form context. The CLI populates `operator_reason` and `respawn_budget_remaining`. |
+
+### Determinism contract
+
+`recommended_action` is a **pure function** of the receipt's
+`(stall_reason, audit_entries, details.respawn_budget_remaining)`. The
+function
+
+* never reads files or environment,
+* never opens a socket,
+* never reads a wall clock.
+
+Two operators handed the same receipt bytes (or independently
+reassembled receipts from the same chain prefix) compute a
+byte-identical `recommended_action`. The contract is enforced by the
+unit test
+`tests/unit/test_supervisor_receipt.py::test_recommended_action_determinism`,
+which drives the same chain slice through the function from two
+different temp dirs and asserts equality.
+
+### Cross-worktree fence
+
+Every receipt asserts that the stuck session never crossed worktree
+boundaries during the stall window. An audit entry whose
+`event_type` ends in `.resolved` or starts with `cross_worktree.` and
+references the stuck `session_id` from a sibling `worktree_id` is a
+fence violation and aborts receipt assembly. Verifiers re-run the same
+check from the receipt bytes alone, so a tampered audit slice that
+smuggled a leak past assembly fails verification.
+
+### Verification
+
+The standalone verifier loads only the public side of the install
+Ed25519 keypair (`<workdir>/.sdd/runtime/supervisor/install.key.pub`,
+PEM-encoded). It
+
+1. recomputes `payload_digest` over the canonical signing bytes and
+   asserts byte-equality with the receipt's `payload_digest`,
+2. re-asserts the cross-worktree fence,
+3. re-derives `recommended_action` from the embedded slice and
+   asserts equality with the receipt's `recommended_action`,
+4. verifies the Ed25519 signature over the canonical bytes.
+
+A receipt that survives all four checks is byte-portable: any auditor
+holding the install's public key can validate it offline without
+contacting the orchestrator.
@@ -137,6 +137,36 @@ def _fallback_table_render(aggregator: FleetAggregator, config: FleetConfig) ->
     _console.print(table)
     _console.print(format_footer(config, rows, total))
 
+    supervisor_line = _fleet_supervisor_summary_line()
+    if supervisor_line:
+        _console.print(f"[dim]{supervisor_line}[/dim]")
+
+
+def _fleet_supervisor_summary_line() -> str:
+    """Return the stuck-count summary across the fleet's primary workspace.
+
+    The fleet view aggregates many projects but a single operator sits
+    inside one workspace, so we surface the supervisor snapshot for that
+    workspace as the most actionable signal. Returns an empty string on
+    any aggregator failure so the fleet command never errors here.
+    Failures are logged so an operator-visible drop can be debugged from
+    the orchestrator log without restarting the fleet view.
+    """
+    try:
+        from pathlib import Path as _Path
+
+        from bernstein.core.defaults import AGENT
+        from bernstein.core.orchestration.supervisor_aggregator import (
+            aggregator_snapshot,
+            format_summary_line,
+        )
+
+        snapshot = aggregator_snapshot(_Path.cwd(), heartbeat_stale_s=AGENT.heartbeat_stale_s)
+    except Exception:  # pragma: no cover - fleet renderer must never raise
+        logger.exception("fleet supervisor-summary aggregation failed")
+        return ""
+    return format_summary_line(snapshot)
+
 
 def _parse_bind(bind: str) -> tuple[str, int]:
     text = bind.strip()
 
@@ -3,6 +3,7 @@
 from __future__ import annotations
 
 import json
+import logging
 import os
 import sys
 import time
@@ -24,6 +25,8 @@
 from bernstein.core.agent_discovery import AgentCapabilities, DiscoveryResult, discover_agents_cached
 from bernstein.tui.worker_badges import format_worker_badge, get_badge_for_worker
 
+logger = logging.getLogger(__name__)
+
 _NOT_AUTHENTICATED_MSG = "not authenticated"
 
 _STORAGE_BACKEND_LABEL = "Storage backend"
@@ -184,6 +187,11 @@ def status(as_json: bool, no_color: bool, view_mode: str | None) -> None:
     if snapshots:
         data["rate_limit_meters"] = snapshots
 
+    # Attach the supervisor summary (stuck-count + oldest-stall age).
+    # Operators reading ``bernstein status`` should not have to remember
+    # the dedicated supervisor command to spot a wedged worker.
+    data["supervisor"] = _supervisor_status_summary(Path.cwd())
+
     if as_json or is_json():
         print_json(data)
         return
@@ -200,6 +208,48 @@ def status(as_json: bool, no_color: bool, view_mode: str | None) -> None:
 
     render_status(data, console=con, view_config=vc)
 
+    supervisor_line = _supervisor_summary_line(Path.cwd())
+    if supervisor_line:
+        con.print(f"[dim]{supervisor_line}[/dim]")
+
+
+def _supervisor_status_summary(workdir: Path) -> dict[str, Any]:
+    """Return the supervisor stuck-count summary for ``bernstein status --json``.
+
+    Returns an empty dict if the aggregator raises - the command must
+    never fail on a missing or malformed runtime tree. Failures are
+    logged so an operator can correlate an empty summary with a real
+    cause instead of treating silence as healthy.
+    """
+    try:
+        from bernstein.core.defaults import AGENT
+        from bernstein.core.orchestration.supervisor_aggregator import (
+            aggregator_snapshot,
+            snapshot_to_dict,
+        )
+
+        snapshot = aggregator_snapshot(workdir, heartbeat_stale_s=AGENT.heartbeat_stale_s)
+    except Exception:  # pragma: no cover - status must never error on this
+        logger.exception("supervisor status summary failed")
+        return {}
+    return snapshot_to_dict(snapshot)
+
+
+def _supervisor_summary_line(workdir: Path) -> str:
+    """Return the one-line supervisor summary string for the human view."""
+    try:
+        from bernstein.core.defaults import AGENT
+        from bernstein.core.orchestration.supervisor_aggregator import (
+            aggregator_snapshot,
+            format_summary_line,
+        )
+
+        snapshot = aggregator_snapshot(workdir, heartbeat_stale_s=AGENT.heartbeat_stale_s)
+    except Exception:  # pragma: no cover - status must never error on this
+        logger.exception("supervisor summary line render failed")
+        return ""
+    return format_summary_line(snapshot)
+
 
 # ---------------------------------------------------------------------------
 # ps - process visibility