diff --git a/cmd/gc/city_runtime.go b/cmd/gc/city_runtime.go index 8fb14641c..1d4fdd6cf 100644 --- a/cmd/gc/city_runtime.go +++ b/cmd/gc/city_runtime.go @@ -39,12 +39,16 @@ type CityRuntime struct { buildFn func(*config.City, runtime.Provider, beads.Store) DesiredStateResult buildFnWithSessionBeads func(*config.City, runtime.Provider, beads.Store, map[string]beads.Store, *sessionBeadSnapshot, *sessionReconcilerTraceCycle) DesiredStateResult - dops drainOps - ct crashTracker - it idleTracker - wg wispGC + dops drainOps + ct crashTracker + it idleTracker + wg wispGC + // stuck is owned by the tick goroutine; written in reloadConfigTraced, read in runStuckSweep — no additional locking required. + stuck stuckTracker od orderDispatcher - trace *sessionReconcilerTraceManager + // runner is owned by the tick goroutine; written in reloadConfigTraced, read in runStuckSweep — no additional locking required. + runner beads.CommandRunner + trace *sessionReconcilerTraceManager rec events.Recorder cs *controllerState // nil when controller-managed bead stores are unavailable @@ -126,6 +130,32 @@ func newCityRuntime(p CityRuntimeParams) *CityRuntime { p.Cfg.Daemon.WispTTLDuration(), bdCommandRunnerForCity(p.CityPath)) } + stuck, stuckErr := newStuckTracker(p.Cfg.Daemon) + if stuckErr != nil { + fmt.Fprintf(p.Stderr, "%s: stuck sweep disabled: %v\n", //nolint:errcheck // best-effort stderr + firstNonEmpty(p.LogPrefix, "gc start"), stuckErr) + stuck = noopStuckTracker{} + } + if p.Cfg.Daemon.StuckSweepEnabled() && stuckErr == nil { + if m, ok := stuck.(*memoryStuckTracker); ok { + if len(m.patterns) == 0 { + // AC1/AC3: progress-mismatch axis only — regex axis disabled. 
+ fmt.Fprintf(p.Stderr, //nolint:errcheck // best-effort stderr + "%s: stuck_sweep enabled: regex-axis=disabled (no patterns) progress-mismatch-axis=active threshold=%s label=%s\n", + firstNonEmpty(p.LogPrefix, "gc start"), + m.wispThresholdDuration(), m.warrantLabelOrDefault()) + } else { + fmt.Fprintf(p.Stderr, //nolint:errcheck // best-effort stderr + "%s: stuck_sweep enabled: regex-axis=enabled (patterns=%d) progress-mismatch-axis=active threshold=%s peek_lines=%d label=%s\n", + firstNonEmpty(p.LogPrefix, "gc start"), + len(m.patterns), m.wispThresholdDuration(), + m.peekLinesOrDefault(), m.warrantLabelOrDefault()) + } + } + } + + runner := bdCommandRunnerForCity(p.CityPath) + // Clear stale halt file from a previous session so a service restart // always begins in the running state. An operator who wants the halt // to survive a restart can re-issue "gc halt" after startup. @@ -171,7 +201,9 @@ func newCityRuntime(p CityRuntimeParams) *CityRuntime { ct: ct, it: it, wg: wg, + stuck: stuck, od: od, + runner: runner, trace: newSessionReconcilerTraceManager(p.CityPath, p.CityName, p.Stderr), rec: p.Rec, poolSessions: p.PoolSessions, @@ -485,6 +517,14 @@ func (cr *CityRuntime) tick( return } + // Stuck-agent sweep: controller-level non-LLM liveness check. Runs + // inside the halt gate (already gated above) and after wispGC so it + // sees freshly-GC'd wisp state. + cr.runStuckSweep(ctx, time.Now()) + if ctx.Err() != nil { + return + } + // Order dispatch. 
if cr.od != nil { cr.od.dispatch(ctx, cityRoot, time.Now()) @@ -661,6 +701,27 @@ func (cr *CityRuntime) reloadConfigTraced( cr.wg = nil } + wasStuckEnabled := cr.cfg.Daemon.StuckSweepEnabled() + if nextStuck, err := newStuckTracker(nextCfg.Daemon); err != nil { + fmt.Fprintf(cr.stderr, "%s: stuck sweep disabled on reload: %v\n", cr.logPrefix, err) //nolint:errcheck // best-effort stderr + cr.stuck = noopStuckTracker{} + } else { + cr.stuck = nextStuck + } + // AC11: emit a state-change log when stuck sweep enabled/disabled + // transitions across reload so operators can see the feature flip. + if nowEnabled := nextCfg.Daemon.StuckSweepEnabled(); nowEnabled != wasStuckEnabled { + fmt.Fprintf(cr.stderr, "%s: stuck sweep config reloaded: enabled=%v\n", //nolint:errcheck // best-effort stderr + cr.logPrefix, nowEnabled) + } + // Note: cr.stuck and cr.runner are assigned prior to the + // serviceStateMu-protected cfg/sp swap below. They are read by + // runStuckSweep under the same tick goroutine that holds the controller + // loop, so no additional locking is required; the ordering preserves the + // invariant that a sweep observes either the old (cfg, stuck, runner) or + // the new triple, never a mix. + cr.runner = bdCommandRunnerForCity(cityRoot) + cr.od = buildOrderDispatcher(cityRoot, nextCfg, bdCommandRunnerForCity(cityRoot), cr.rec, cr.stderr) cr.serviceStateMu.Lock() diff --git a/cmd/gc/cmd_doctor.go b/cmd/gc/cmd_doctor.go index f8a9b8eab..59e3baebd 100644 --- a/cmd/gc/cmd_doctor.go +++ b/cmd/gc/cmd_doctor.go @@ -76,6 +76,7 @@ func doDoctor(fix, verbose bool, stdout, stderr io.Writer) int { d.Register(doctor.NewBuiltinPackFamilyCheck(cfg, cityPath)) d.Register(doctor.NewConfigSemanticsCheck(cfg, filepath.Join(cityPath, "city.toml"))) d.Register(doctor.NewDurationRangeCheck(cfg)) + d.Register(doctor.NewStuckSweepCheck(cfg)) } // System formulas check. 
diff --git a/cmd/gc/cmd_start.go b/cmd/gc/cmd_start.go index 29865d728..debd9ad7f 100644 --- a/cmd/gc/cmd_start.go +++ b/cmd/gc/cmd_start.go @@ -514,6 +514,13 @@ func doStartStandalone(args []string, controllerMode bool, stdout, stderr io.Wri return 1 } + // AC10: pre-flight compile of stuck_error_patterns. Invalid regex must + // abort startup so operators discover misconfiguration at gc start. + if err := config.ValidateStuckPatterns(cfg); err != nil { + fmt.Fprintf(stderr, "gc start: %v\n", err) //nolint:errcheck // best-effort stderr + return 1 + } + // Validate install_agent_hooks (workspace + all agents). if ih := cfg.Workspace.InstallAgentHooks; len(ih) > 0 { if err := hooks.Validate(ih); err != nil { diff --git a/cmd/gc/cmd_supervisor.go b/cmd/gc/cmd_supervisor.go index f790e2ffe..02ff7a645 100644 --- a/cmd/gc/cmd_supervisor.go +++ b/cmd/gc/cmd_supervisor.go @@ -1454,6 +1454,13 @@ func prepareCityForSupervisor(cityPath, cityName string, cfg *config.City, stder return fmt.Errorf("validate agents: %w", err) } + // AC10: pre-flight compile of stuck_error_patterns. + if err := runStep("validating_stuck_patterns", func() error { + return config.ValidateStuckPatterns(cfg) + }); err != nil { + return fmt.Errorf("validate stuck patterns: %w", err) + } + // Validate install_agent_hooks (workspace + all agents). 
if err := runStep("validating_hooks", func() error { if ih := cfg.Workspace.InstallAgentHooks; len(ih) > 0 { diff --git a/cmd/gc/stuck_sweep.go b/cmd/gc/stuck_sweep.go new file mode 100644 index 000000000..dd18104d7 --- /dev/null +++ b/cmd/gc/stuck_sweep.go @@ -0,0 +1,194 @@ +package main + +import ( + "context" + "fmt" + "strings" + "time" + + "github.com/gastownhall/gascity/internal/beads" + "github.com/gastownhall/gascity/internal/events" + "github.com/gastownhall/gascity/internal/telemetry" +) + +// runStuckSweep iterates controller-owned running sessions, checks each +// against the stuckTracker predicate, and files a warrant bead for any +// session flagged as stuck. Idempotent per target: an existing open warrant +// for the same session suppresses duplicates. +// +// Fail-open semantics: +// - One bd list --json shellout for wisp freshness per sweep. Failure +// aborts the whole sweep (whole-sweep fail-open, not per-session). +// - Peek / GetLastActivity errors skip the session and continue. +// - agent.stuck event emitted ONLY on successful warrant Create. +func (cr *CityRuntime) runStuckSweep(ctx context.Context, now time.Time) { + if cr == nil || cr.stuck == nil { + return + } + // Short-circuit when stuck_sweep=false or stuck_warrant_label resolves + // empty: StuckSweepEnabled() covers both cases. The noop tracker path + // (noopStuckTracker) is never reached in M3 — the sweep is gated here + // before any bd shellout. Zero patterns no longer results in noop; + // the progress-mismatch axis remains active when patterns are absent. + if !cr.cfg.Daemon.StuckSweepEnabled() { + return + } + if cr.sp == nil { + return + } + if ctx.Err() != nil { + return + } + // Defense-in-depth halt gate (E14/R7): tick() already checks halt before + // calling runStuckSweep, but we re-check here so direct unit invocations + // honor the same ordering as production. 
+ if cr.halt.check(cr.cityPath, cr.stderr) { + return + } + + runner := cr.runner + if runner == nil { + runner = bdCommandRunnerForCity(cr.cityPath) + } + freshnessBySession, err := sweepWispFreshness(cr.cityPath, runner) + if err != nil { + fmt.Fprintf(cr.stderr, "%s: stuck sweep: %v (fail-open this tick)\n", //nolint:errcheck // best-effort stderr + cr.logPrefix, err) + return + } + + running, err := cr.sp.ListRunning("") + if err != nil { + fmt.Fprintf(cr.stderr, "%s: stuck sweep: list running: %v\n", cr.logPrefix, err) //nolint:errcheck // best-effort stderr + return + } + + store := cr.cityBeadStore() + // Route all tracker inputs through the stuckTracker interface and config + // accessors rather than a concrete-type cast: this keeps alternative + // implementations (e.g., noopStuckTracker) viable and keeps sweep behavior + // aligned with the config source of truth. + peekLines := cr.cfg.Daemon.StuckPeekLinesOrDefault() + warrantLabel := cr.cfg.Daemon.StuckWarrantLabelOrDefault() + + for _, session := range running { + if ctx.Err() != nil { + return + } + + freshness, hasWisp := freshnessBySession[session] + if !hasWisp { + continue + } + + paneOutput, peekErr := cr.sp.Peek(session, peekLines) + if peekErr != nil { + fmt.Fprintf(cr.stderr, "%s: stuck sweep: peek %s: %v\n", cr.logPrefix, session, peekErr) //nolint:errcheck // best-effort stderr + continue + } + + lastActivity, actErr := cr.sp.GetLastActivity(session) + if actErr != nil { + // GetLastActivity error ⇒ insufficient evidence; fail-open. + continue + } + + stuck, reason, matchedPattern := cr.stuck.checkStuck(session, paneOutput, + freshness.updatedAt, lastActivity, now) + if !stuck { + continue + } + + // Idempotence: suppress if an open warrant already targets this + // session. Reads store, not tracker state, so it survives reload. + // Fail-open direction: on store error, treat as "already warranted" + // to avoid duplicate flood while the store is transiently unavailable. 
+ if store != nil { + already, lookupErr := hasOpenStuckWarrant(store, warrantLabel, session) + if lookupErr != nil { + fmt.Fprintf(cr.stderr, "%s: stuck sweep: warrant lookup %s: %v (skipping)\n", //nolint:errcheck // best-effort stderr + cr.logPrefix, session, lookupErr) + continue + } + if already { + continue + } + } + + // Race check: if the process is already dead, the crashTracker + // will handle it; do not file a warrant. + if !cr.sp.ProcessAlive(session, nil) { + continue + } + + if store == nil { + // No bead store available; log detection for observability. + fmt.Fprintf(cr.stderr, "%s: stuck detected (no store): session=%s reason=%s\n", //nolint:errcheck // best-effort stderr + cr.logPrefix, session, reason) + continue + } + + wispAge := now.Sub(freshness.updatedAt).Round(time.Second).String() + warrant := beads.Bead{ + Title: "stuck:" + session, + Type: "warrant", + Labels: []string{warrantLabel}, + Metadata: map[string]string{ + "target": session, + "reason": reason, + "requester": "controller", + "matched_pattern": matchedPattern, + "wisp_id": freshness.id, + "wisp_age": wispAge, + }, + } + if _, createErr := store.Create(warrant); createErr != nil { + fmt.Fprintf(cr.stderr, "%s: stuck sweep: filing warrant for %s: %v\n", //nolint:errcheck // best-effort stderr + cr.logPrefix, session, createErr) + continue + } + + if cr.rec != nil { + cr.rec.Record(events.Event{ + Type: events.AgentStuck, + Actor: "controller", + Subject: session, + Message: reason, + }) + } + // axis is a low-cardinality summary: "regex" or "progress_mismatch". + // The session attribute is intentionally included (consistent with + // RecordAgentIdleKill) and may have high cardinality in large fleets — + // operators should be aware. Full reason lives on the warrant bead. 
+ axis := "regex" + if matchedPattern == "" { + axis = "progress_mismatch" + } + telemetry.RecordAgentStuckWarrant(ctx, session, axis) + fmt.Fprintf(cr.stderr, "%s: stuck warrant filed: session=%s reason=%s\n", //nolint:errcheck // best-effort stderr + cr.logPrefix, session, reason) + } +} + +// hasOpenStuckWarrant reports whether the store already contains an open +// warrant bead with the given label and metadata.target matching session. +// +// Fail-open direction (R6): on ListByLabel error, returns (true, err). Treating +// an unknown-state query as "already warranted" prevents duplicate-warrant +// floods when the store is transiently unavailable. The caller should skip +// warrant creation (and optionally log the error) on error returns. +func hasOpenStuckWarrant(store beads.Store, label, session string) (bool, error) { + beadsList, err := store.ListByLabel(label, 0) + if err != nil { + return true, err + } + for _, b := range beadsList { + if b.Type != "warrant" { + continue + } + if strings.TrimSpace(b.Metadata["target"]) == session { + return true, nil + } + } + return false, nil +} diff --git a/cmd/gc/stuck_sweep_test.go b/cmd/gc/stuck_sweep_test.go new file mode 100644 index 000000000..2d2caa8d1 --- /dev/null +++ b/cmd/gc/stuck_sweep_test.go @@ -0,0 +1,251 @@ +package main + +import ( + "bytes" + "context" + "encoding/json" + "fmt" + "io" + "strings" + "testing" + "time" + + "github.com/gastownhall/gascity/internal/beads" + "github.com/gastownhall/gascity/internal/config" + "github.com/gastownhall/gascity/internal/events" + "github.com/gastownhall/gascity/internal/runtime" +) + +// stuckSweepFreshWisps marshals a canned set of in-progress wisp beads +// into the shape emitted by `bd list --json`, so sweepWispFreshness +// can decode them without a real bd subprocess. 
+func stuckSweepFreshnessRunner(t *testing.T, entries []freshnessEntry) beads.CommandRunner { + t.Helper() + out, err := json.Marshal(entries) + if err != nil { + t.Fatalf("marshal freshness entries: %v", err) + } + return func(_, _ string, _ ...string) ([]byte, error) { + return out, nil + } +} + +// newStuckSweepRuntime builds a minimal CityRuntime wired with a stuck +// tracker, a fake session provider, an in-memory bead store, and a +// recording event recorder. Each call site supplies the runner. +func newStuckSweepRuntime(t *testing.T, daemon config.DaemonConfig, runner beads.CommandRunner, rec events.Recorder) (*CityRuntime, *runtime.Fake, *beads.MemStore) { + t.Helper() + tr, err := newStuckTracker(daemon) + if err != nil { + t.Fatalf("newStuckTracker: %v", err) + } + sp := runtime.NewFake() + store := beads.NewMemStore() + cr := &CityRuntime{ + cityPath: t.TempDir(), + cityName: "test-city", + cfg: &config.City{Daemon: daemon}, + sp: sp, + standaloneCityStore: store, + stuck: tr, + runner: runner, + rec: rec, + stdout: io.Discard, + stderr: io.Discard, + logPrefix: "gc start", + } + return cr, sp, store +} + +// enabledDaemon returns a DaemonConfig with the stuck sweep enabled and +// a single pattern so StuckSweepEnabled() reports true. +func enabledDaemon() config.DaemonConfig { + peekLines := 50 + return config.DaemonConfig{ + StuckSweep: true, + StuckErrorPatterns: []string{`(?i)rate limit`}, + StuckWispThreshold: "10m", + StuckPeekLines: &peekLines, + StuckWarrantLabel: "pool:dog", + } +} + +// TestStuckSweep_NoAgentsConfigured asserts SDK self-sufficiency (AC16/E21): +// with zero [[agent]] entries the sweep completes cleanly — no panic, +// no warrants — provided ListRunning is empty. +func TestStuckSweep_NoAgentsConfigured(t *testing.T) { + // ListRunning will be empty since no sessions are Start()-ed. 
+ runner := stuckSweepFreshnessRunner(t, nil) + var rec memRecorder + cr, _, store := newStuckSweepRuntime(t, enabledDaemon(), runner, &rec) + // No [[agent]] entries in cfg.Agents — zero SDK-role dependency. + if len(cr.cfg.Agents) != 0 { + t.Fatalf("precondition: Agents must be empty, got %d", len(cr.cfg.Agents)) + } + + cr.runStuckSweep(context.Background(), time.Now()) + + warrants, err := store.ListByLabel("pool:dog", 0) + if err != nil { + t.Fatalf("ListByLabel: %v", err) + } + if len(warrants) != 0 { + t.Fatalf("no-agents sweep must file zero warrants, got %d", len(warrants)) + } + if rec.hasType(events.AgentStuck) { + t.Fatal("no-agents sweep must emit zero agent.stuck events") + } +} + +// TestStuckSweep_IdempotenceAcrossTicks asserts E11: running the sweep +// twice against the same stuck session produces exactly one warrant. +func TestStuckSweep_IdempotenceAcrossTicks(t *testing.T) { + now := time.Now() + wispOpenedAt := now.Add(-30 * time.Minute) // stale beyond 10m threshold + entries := []freshnessEntry{{ + ID: "bd-1", + UpdatedAt: wispOpenedAt.Format(time.RFC3339), + Metadata: map[string]string{"session_name": "worker-1"}, + }} + runner := stuckSweepFreshnessRunner(t, entries) + var rec memRecorder + cr, sp, store := newStuckSweepRuntime(t, enabledDaemon(), runner, &rec) + + if err := sp.Start(context.Background(), "worker-1", runtime.Config{}); err != nil { + t.Fatalf("sp.Start: %v", err) + } + sp.SetPeekOutput("worker-1", "HTTP 429: rate limit exceeded\n") + sp.SetActivity("worker-1", now.Add(-20*time.Minute)) + + cr.runStuckSweep(context.Background(), now) + cr.runStuckSweep(context.Background(), now.Add(time.Second)) + + warrants, err := store.ListByLabel("pool:dog", 0) + if err != nil { + t.Fatalf("ListByLabel: %v", err) + } + if len(warrants) != 1 { + t.Fatalf("expected exactly 1 warrant after two sweeps, got %d", len(warrants)) + } + if got := warrants[0].Metadata["target"]; got != "worker-1" { + t.Fatalf("warrant target = %q, want worker-1", 
got) + } +} + +// TestStuckSweep_HaltGateSuppresses asserts E14: with the halt flag set, +// tick() short-circuits before the sweep runs, so no warrant is filed +// even when a genuinely stuck session is present. +func TestStuckSweep_HaltGateSuppresses(t *testing.T) { + now := time.Now() + wispOpenedAt := now.Add(-30 * time.Minute) + entries := []freshnessEntry{{ + ID: "bd-1", + UpdatedAt: wispOpenedAt.Format(time.RFC3339), + Metadata: map[string]string{"session_name": "worker-1"}, + }} + runner := stuckSweepFreshnessRunner(t, entries) + var rec memRecorder + cr, sp, store := newStuckSweepRuntime(t, enabledDaemon(), runner, &rec) + + if err := sp.Start(context.Background(), "worker-1", runtime.Config{}); err != nil { + t.Fatalf("sp.Start: %v", err) + } + sp.SetPeekOutput("worker-1", "rate limit exceeded") + sp.SetActivity("worker-1", now.Add(-20*time.Minute)) + + // Write the halt file so the in-sweep defense-in-depth halt gate + // short-circuits. We then call runStuckSweep directly and assert no + // warrant is filed — exercising the halt gate rather than skipping it. + if err := writeHaltFile(cr.cityPath); err != nil { + t.Fatalf("writeHaltFile: %v", err) + } + cr.runStuckSweep(context.Background(), now) + + warrants, err := store.ListByLabel("pool:dog", 0) + if err != nil { + t.Fatalf("ListByLabel: %v", err) + } + if len(warrants) != 0 { + t.Fatalf("halted tick must file zero warrants, got %d", len(warrants)) + } + if rec.hasType(events.AgentStuck) { + t.Fatal("halted tick must emit zero agent.stuck events") + } +} + +// TestStuckSweep_WispQueryFailsOpenSweep asserts E24/AC18: when the +// bd-list runner returns an error, the whole sweep fails open — no +// panic, no warrants filed. 
+func TestStuckSweep_WispQueryFailsOpenSweep(t *testing.T) { + var stderr bytes.Buffer + failingRunner := func(_, _ string, _ ...string) ([]byte, error) { + return nil, fmt.Errorf("bd unavailable") + } + var rec memRecorder + cr, sp, store := newStuckSweepRuntime(t, enabledDaemon(), failingRunner, &rec) + cr.stderr = &stderr + + if err := sp.Start(context.Background(), "worker-1", runtime.Config{}); err != nil { + t.Fatalf("sp.Start: %v", err) + } + sp.SetPeekOutput("worker-1", "rate limit exceeded") + sp.SetActivity("worker-1", time.Now().Add(-time.Hour)) + + // Must not panic. + cr.runStuckSweep(context.Background(), time.Now()) + + warrants, err := store.ListByLabel("pool:dog", 0) + if err != nil { + t.Fatalf("ListByLabel: %v", err) + } + if len(warrants) != 0 { + t.Fatalf("fail-open sweep must file zero warrants, got %d", len(warrants)) + } + if rec.hasType(events.AgentStuck) { + t.Fatal("fail-open sweep must emit zero agent.stuck events") + } + if !strings.Contains(stderr.String(), "stuck sweep:") { + t.Fatalf("expected warn log on stderr, got %q", stderr.String()) + } +} + +// TestStuckSweep_EventOnCreateSuccess asserts AC17: an agent.stuck event +// is emitted exactly once when a warrant bead is successfully created. 
+func TestStuckSweep_EventOnCreateSuccess(t *testing.T) { + now := time.Now() + wispOpenedAt := now.Add(-30 * time.Minute) + entries := []freshnessEntry{{ + ID: "bd-1", + UpdatedAt: wispOpenedAt.Format(time.RFC3339), + Metadata: map[string]string{"session_name": "worker-1"}, + }} + runner := stuckSweepFreshnessRunner(t, entries) + var rec memRecorder + cr, sp, store := newStuckSweepRuntime(t, enabledDaemon(), runner, &rec) + + if err := sp.Start(context.Background(), "worker-1", runtime.Config{}); err != nil { + t.Fatalf("sp.Start: %v", err) + } + sp.SetPeekOutput("worker-1", "HTTP 429: rate limit exceeded\n") + sp.SetActivity("worker-1", now.Add(-20*time.Minute)) + + cr.runStuckSweep(context.Background(), now) + + // Exactly one warrant, exactly one agent.stuck event on Create success. + warrants, err := store.ListByLabel("pool:dog", 0) + if err != nil { + t.Fatalf("ListByLabel: %v", err) + } + if len(warrants) != 1 { + t.Fatalf("expected exactly 1 warrant, got %d", len(warrants)) + } + count := 0 + for _, e := range rec.events { + if e.Type == events.AgentStuck && e.Subject == "worker-1" { + count++ + } + } + if count != 1 { + t.Fatalf("expected exactly 1 agent.stuck event on Create success, got %d", count) + } +} diff --git a/cmd/gc/stuck_tracker.go b/cmd/gc/stuck_tracker.go new file mode 100644 index 000000000..52be3e6b6 --- /dev/null +++ b/cmd/gc/stuck_tracker.go @@ -0,0 +1,181 @@ +package main + +import ( + "fmt" + "regexp" + "sort" + "strings" + "time" + + "github.com/gastownhall/gascity/internal/config" +) + +// stuckTracker is a pure predicate that classifies a session as stuck or +// not-stuck based on cognition-independent signals supplied by the caller: +// recent pane output, the session's in-progress wisp updated-at time, the +// session's last I/O-activity time, and the current wall-clock time. +// +// Following the nil-guard convention used by idleTracker, crashTracker, and +// wispGC, the tracker does no I/O and reads no global clock. 
The CityRuntime +// wrapper is responsible for collecting inputs (via runtime.Provider.Peek, +// runtime.Provider.GetLastActivity, and a bd list shellout for wisp +// freshness) and for acting on a positive result (filing a warrant bead, +// emitting the agent.stuck event). +// +// The pure-predicate shape matches R10 of the design brief and enables +// deterministic table-driven tests. +type stuckTracker interface { + // checkStuck returns (true, reason) if the session should be flagged as + // stuck, or (false, "") otherwise. The reason string is a short + // human-readable explanation suitable for embedding in a warrant's + // metadata and the agent.stuck event message. + // + // matchedPattern is the regex source string of the first pattern that + // matched paneOutput, or a sorted comma-joined list when multiple + // patterns matched. Empty when the positive result derives from the + // progress-mismatch axis alone. + checkStuck( + session string, + paneOutput string, + wispUpdatedAt time.Time, + lastActivity time.Time, + now time.Time, + ) (stuck bool, reason string, matchedPattern string) +} + +// noopStuckTracker is returned when the sweep is disabled. It always reports +// not-stuck so callers can use it without nil-guarding. +type noopStuckTracker struct{} + +func (noopStuckTracker) checkStuck(string, string, time.Time, time.Time, time.Time) (bool, string, string) { + return false, "", "" +} + +// memoryStuckTracker is the production implementation. Compiled regexes are +// immutable after construction (R13 / round-2 disagreement resolved): the +// slice is never mutated; reload builds a fresh tracker. +type memoryStuckTracker struct { + wispThreshold time.Duration + patterns []*regexp.Regexp + peekLines int + warrantLabel string +} + +// wispThresholdDuration returns the configured staleness threshold. Exposed +// so the CityRuntime wrapper can decide how far back to Peek or adjust its +// own behavior without re-reading config. 
+func (m *memoryStuckTracker) wispThresholdDuration() time.Duration { + return m.wispThreshold +} + +// peekLinesOrDefault returns the configured peek-lines setting. +func (m *memoryStuckTracker) peekLinesOrDefault() int { + return m.peekLines +} + +// warrantLabelOrDefault returns the configured warrant label. +func (m *memoryStuckTracker) warrantLabelOrDefault() string { + return m.warrantLabel +} + +// newStuckTracker constructs the tracker from daemon config. Returns a +// noopStuckTracker when the sweep is disabled or misconfigured. Returns an +// error when any pattern fails to compile; callers should treat this as a +// fail-fast startup error (E3). +func newStuckTracker(d config.DaemonConfig) (stuckTracker, error) { + if !d.StuckSweepEnabled() { + return noopStuckTracker{}, nil + } + patterns := make([]*regexp.Regexp, 0, len(d.StuckErrorPatterns)) + for i, src := range d.StuckErrorPatterns { + re, err := regexp.Compile(src) + if err != nil { + return nil, fmt.Errorf( + "invalid regex in [daemon].stuck_error_patterns[%d] %q: %w", + i, src, err) + } + patterns = append(patterns, re) + } + return &memoryStuckTracker{ + wispThreshold: d.StuckWispThresholdDuration(), + patterns: patterns, + peekLines: d.StuckPeekLinesOrDefault(), + warrantLabel: d.StuckWarrantLabelOrDefault(), + }, nil +} + +// checkStuck implements the default "convergent evidence" composition: +// a session is stuck when its open wisp is stale AND at least one of the +// following holds: +// +// - an error pattern matches recent pane output; or +// - the session's last-activity time is older than the wisp +// UpdatedAt by more than the wisp-staleness threshold (i.e., the +// agent has produced I/O but no progress since the wisp opened). +// +// Fail-open semantics live in the caller: zero timestamps, empty pane +// output, or missing wisp-freshness data result in false (not stuck). 
+func (m *memoryStuckTracker) checkStuck( + _ string, + paneOutput string, + wispUpdatedAt time.Time, + lastActivity time.Time, + now time.Time, +) (bool, string, string) { + if wispUpdatedAt.IsZero() { + return false, "", "" + } + wispAge := now.Sub(wispUpdatedAt) + if wispAge <= m.wispThreshold { + return false, "", "" + } + + matched := m.matchPatterns(paneOutput) + progressMismatch := false + if !lastActivity.IsZero() && lastActivity.Before(wispUpdatedAt) { + // Activity strictly older than wisp open — agent is producing no + // I/O against the in-progress wisp. + if now.Sub(lastActivity) > m.wispThreshold { + progressMismatch = true + } + } + + if matched == "" && !progressMismatch { + return false, "", "" + } + + reason := buildStuckReason(wispAge, matched, progressMismatch) + return true, reason, matched +} + +// matchPatterns returns a deterministic, comma-joined string of regex +// sources that match paneOutput, or "" if none match. The list is sorted +// so repeated evaluations produce identical metadata. +func (m *memoryStuckTracker) matchPatterns(paneOutput string) string { + if paneOutput == "" || len(m.patterns) == 0 { + return "" + } + var hits []string + for _, re := range m.patterns { + if re.MatchString(paneOutput) { + hits = append(hits, re.String()) + } + } + if len(hits) == 0 { + return "" + } + sort.Strings(hits) + return strings.Join(hits, ",") +} + +// buildStuckReason formats a short human-readable reason string. 
+func buildStuckReason(wispAge time.Duration, matched string, progressMismatch bool) string { + parts := []string{fmt.Sprintf("wisp stale %s", wispAge.Round(time.Second))} + if matched != "" { + parts = append(parts, fmt.Sprintf("pattern %q", matched)) + } + if progressMismatch { + parts = append(parts, "no progress since wisp opened") + } + return "stuck: " + strings.Join(parts, "; ") +} diff --git a/cmd/gc/stuck_tracker_test.go b/cmd/gc/stuck_tracker_test.go new file mode 100644 index 000000000..d13a03453 --- /dev/null +++ b/cmd/gc/stuck_tracker_test.go @@ -0,0 +1,269 @@ +package main + +import ( + "strings" + "testing" + "time" + + "github.com/gastownhall/gascity/internal/config" +) + +// doStuckTracker constructs a memoryStuckTracker for tests with explicit +// daemon config knobs. Mirrors the do*() convention used by idle/crash +// tracker tests for readable call-sites. +func doStuckTracker(t *testing.T, patterns []string) stuckTracker { + t.Helper() + peekLines := 50 + d := config.DaemonConfig{ + StuckSweep: true, + StuckWispThreshold: (10 * time.Minute).String(), + StuckErrorPatterns: patterns, + StuckPeekLines: &peekLines, + StuckWarrantLabel: "pool:dog", + } + tr, err := newStuckTracker(d) + if err != nil { + t.Fatalf("newStuckTracker: %v", err) + } + return tr +} + +func TestStuckTracker_DisabledReturnsNoop(t *testing.T) { + // StuckSweep=false → noopStuckTracker, always reports not-stuck. 
+ tr, err := newStuckTracker(config.DaemonConfig{}) + if err != nil { + t.Fatalf("newStuckTracker: %v", err) + } + if _, ok := tr.(noopStuckTracker); !ok { + t.Fatalf("expected noopStuckTracker when disabled, got %T", tr) + } + now := time.Now() + stuck, _, _ := tr.checkStuck("s1", "ERROR: boom", now.Add(-time.Hour), now.Add(-time.Hour), now) + if stuck { + t.Fatal("noop tracker must never report stuck") + } +} + +func TestStuckTracker_EmptyPatternsKeepsTracker(t *testing.T) { + // Post-M3: StuckSweep=true with no patterns is valid — regex axis + // is disabled but progress-mismatch axis remains live, so we still + // construct a memoryStuckTracker with an empty patterns slice. + tr, err := newStuckTracker(config.DaemonConfig{ + StuckSweep: true, + StuckWarrantLabel: "pool:dog", + }) + if err != nil { + t.Fatalf("newStuckTracker: %v", err) + } + m, ok := tr.(*memoryStuckTracker) + if !ok { + t.Fatalf("expected *memoryStuckTracker when patterns empty, got %T", tr) + } + if len(m.patterns) != 0 { + t.Fatalf("expected zero compiled patterns, got %d", len(m.patterns)) + } +} + +func TestStuckTracker_EmptyPatternsProgressMismatchFires(t *testing.T) { + // Post-M3: with no patterns, the progress-mismatch axis still classifies + // a stale wisp + stale activity as stuck. 
+ tr, err := newStuckTracker(config.DaemonConfig{ + StuckSweep: true, + StuckWispThreshold: (10 * time.Minute).String(), + StuckWarrantLabel: "pool:dog", + }) + if err != nil { + t.Fatalf("newStuckTracker: %v", err) + } + now := time.Now() + wispOpened := now.Add(-30 * time.Minute) + lastActivity := wispOpened.Add(-30 * time.Minute) // older than wisp, well past threshold + stuck, reason, matched := tr.checkStuck("s1", "quiet pane", wispOpened, lastActivity, now) + if !stuck { + t.Fatalf("expected stuck via progress-mismatch with empty patterns; reason=%q", reason) + } + if matched != "" { + t.Fatalf("expected empty matched pattern, got %q", matched) + } + if !strings.Contains(reason, "no progress") { + t.Fatalf("reason should mention progress: %q", reason) + } +} + +func TestStuckTracker_InvalidRegexFailsFast(t *testing.T) { + _, err := newStuckTracker(config.DaemonConfig{ + StuckSweep: true, + StuckErrorPatterns: []string{"fine", "(unclosed"}, + StuckWarrantLabel: "pool:dog", + }) + if err == nil { + t.Fatal("expected error for invalid regex, got nil") + } + if !strings.Contains(err.Error(), "stuck_error_patterns[1]") { + t.Fatalf("error should identify bad index: %v", err) + } +} + +func TestStuckTracker_StalePlusPatternDetected(t *testing.T) { + tr := doStuckTracker(t, []string{`(?i)rate limit`}) + now := time.Now() + // Wisp opened 30m ago (stale); activity recent so no progress-mismatch. 
+ stuck, reason, matched := tr.checkStuck( + "s1", + "hello\nrate limit exceeded\n", + now.Add(-30*time.Minute), + now.Add(-30*time.Second), + now, + ) + if !stuck { + t.Fatalf("expected stuck; reason=%q", reason) + } + if matched == "" { + t.Fatal("expected matched pattern to be populated") + } + if !strings.Contains(reason, "pattern") { + t.Fatalf("reason should mention pattern: %q", reason) + } +} + +func TestStuckTracker_StalePlusProgressMismatchDetected(t *testing.T) { + // No pattern match but last-activity is older than the wisp UpdatedAt + // by more than the threshold → stuck on the progress-mismatch axis. + tr := doStuckTracker(t, []string{`never-matches-xyz`}) + now := time.Now() + wispOpened := now.Add(-30 * time.Minute) + lastActivity := wispOpened.Add(-30 * time.Minute) // older than wisp, > threshold old overall + stuck, reason, matched := tr.checkStuck("s1", "quiet pane", wispOpened, lastActivity, now) + if !stuck { + t.Fatalf("expected stuck via progress-mismatch; reason=%q", reason) + } + if matched != "" { + t.Fatalf("no pattern should have matched, got %q", matched) + } + if !strings.Contains(reason, "no progress") { + t.Fatalf("reason should mention progress: %q", reason) + } +} + +func TestStuckTracker_FreshWispNotDetected(t *testing.T) { + tr := doStuckTracker(t, []string{`(?i)rate limit`}) + now := time.Now() + // Wisp opened 2 minutes ago — fresh — even with an error pattern present. + stuck, _, _ := tr.checkStuck( + "s1", + "rate limit exceeded", + now.Add(-2*time.Minute), + now.Add(-1*time.Minute), + now, + ) + if stuck { + t.Fatal("fresh wisp must not be flagged even on pattern match") + } +} + +func TestStuckTracker_StaleButNoPatternAndActivityAlignedNotDetected(t *testing.T) { + tr := doStuckTracker(t, []string{`never-matches-xyz`}) + now := time.Now() + wispOpened := now.Add(-30 * time.Minute) + // Activity AFTER wisp open → no progress-mismatch. 
+ lastActivity := now.Add(-30 * time.Second) + stuck, _, _ := tr.checkStuck("s1", "still working", wispOpened, lastActivity, now) + if stuck { + t.Fatal("stale wisp with aligned activity and no pattern must not be flagged") + } +} + +func TestStuckTracker_ZeroWispUpdatedAtFailOpen(t *testing.T) { + tr := doStuckTracker(t, []string{`(?i)error`}) + now := time.Now() + stuck, _, _ := tr.checkStuck("s1", "error error", time.Time{}, now.Add(-time.Hour), now) + if stuck { + t.Fatal("zero wispUpdatedAt must fail-open (not stuck)") + } +} + +func TestStuckTracker_BoundaryEqualsThresholdNotDetected(t *testing.T) { + // E16: strict > against threshold. + tr := doStuckTracker(t, []string{`(?i)error`}) + now := time.Now() + wispOpened := now.Add(-10 * time.Minute) // exactly at threshold + stuck, _, _ := tr.checkStuck("s1", "error", wispOpened, now.Add(-time.Second), now) + if stuck { + t.Fatal("wisp age exactly at threshold must not be flagged (strict >)") + } +} + +func TestStuckTracker_MultiplePatternsDeterministicJoin(t *testing.T) { + // E12: multiple matches → sorted comma-joined matched string. + tr := doStuckTracker(t, []string{`zeta`, `alpha`, `beta`}) + now := time.Now() + stuck, _, matched := tr.checkStuck( + "s1", + "alpha beta zeta together", + now.Add(-20*time.Minute), + now.Add(-time.Second), + now, + ) + if !stuck { + t.Fatal("expected stuck") + } + if matched != "alpha,beta,zeta" { + t.Fatalf("matched should be sorted-joined, got %q", matched) + } +} + +func TestStuckTracker_EmptyPaneAndNoMismatchNotDetected(t *testing.T) { + tr := doStuckTracker(t, []string{`(?i)error`}) + now := time.Now() + wispOpened := now.Add(-30 * time.Minute) + // Empty pane, activity aligned with wisp (no mismatch). 
+ stuck, _, _ := tr.checkStuck("s1", "", wispOpened, now.Add(-time.Second), now) + if stuck { + t.Fatal("empty pane with no progress-mismatch must not be flagged") + } +} + +func TestStuckTracker_PeekLinesAndLabelAccessors(t *testing.T) { + d := config.DaemonConfig{ + StuckSweep: true, + StuckErrorPatterns: []string{`x`}, + StuckPeekLines: nil, // nil → default + } + tr, err := newStuckTracker(d) + if err != nil { + t.Fatalf("newStuckTracker: %v", err) + } + m, ok := tr.(*memoryStuckTracker) + if !ok { + t.Fatalf("expected *memoryStuckTracker, got %T", tr) + } + if m.peekLinesOrDefault() != config.DefaultStuckPeekLines { + t.Fatalf("peekLines default not applied: %d", m.peekLinesOrDefault()) + } + if m.warrantLabelOrDefault() != config.DefaultStuckWarrantLabel { + t.Fatalf("warrantLabel default not applied: %q", m.warrantLabelOrDefault()) + } + if m.wispThresholdDuration() <= 0 { + t.Fatalf("wispThreshold default not applied: %v", m.wispThresholdDuration()) + } +} + +func TestStuckSweepEnabled_RequiresAllPreconditions(t *testing.T) { + cases := []struct { + name string + d config.DaemonConfig + want bool + }{ + {"all unset", config.DaemonConfig{}, false}, + {"flag off", config.DaemonConfig{StuckErrorPatterns: []string{"x"}}, false}, + {"flag+label (no patterns)", config.DaemonConfig{StuckSweep: true, StuckWarrantLabel: "pool:dog"}, true}, + {"flag+pattern (default label applies)", config.DaemonConfig{StuckSweep: true, StuckErrorPatterns: []string{"x"}}, true}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + if got := tc.d.StuckSweepEnabled(); got != tc.want { + t.Fatalf("got %v, want %v", got, tc.want) + } + }) + } +} diff --git a/cmd/gc/wisp_freshness.go b/cmd/gc/wisp_freshness.go new file mode 100644 index 000000000..eda97920c --- /dev/null +++ b/cmd/gc/wisp_freshness.go @@ -0,0 +1,63 @@ +package main + +import ( + "encoding/json" + "fmt" + "time" + + "github.com/gastownhall/gascity/internal/beads" +) + +// wispFreshness is an opaque 
snapshot of one session's most-recently-updated +// open wisp. It lives inside cmd/gc so raw bd JSON types (updated_at etc.) +// do not leak across the stuckTracker boundary. +type wispFreshness struct { + id string + updatedAt time.Time +} + +// freshnessEntry is the minimal JSON shape parsed from bd list --json. +// beads.Bead does not expose updated_at, so the stuck sweep parses the raw +// bd output rather than extending the Bead struct (rule-of-three not met). +type freshnessEntry struct { + ID string `json:"id"` + UpdatedAt string `json:"updated_at"` + Metadata map[string]string `json:"metadata"` +} + +// sweepWispFreshness queries bd for all in-progress wisps in one shellout and +// groups them by session_name, returning the most-recently-updated entry per +// session. One call per sweep (not per session) per the design brief. +// +// Returns an empty map and nil error when there are no in-progress wisps. +// On shellout or parse error the entire sweep fails open (whole-map fail); +// callers must treat the error as a signal to skip the tick. +func sweepWispFreshness(cityPath string, runner beads.CommandRunner) (map[string]wispFreshness, error) { + out, err := runner(cityPath, "bd", "list", "--json", "--limit=0", + "--status=in_progress", "--type=wisp") + if err != nil { + return nil, fmt.Errorf("listing in-progress wisps: %w", err) + } + var entries []freshnessEntry + if err := json.Unmarshal(out, &entries); err != nil { + return nil, fmt.Errorf("parsing wisp list: %w", err) + } + result := make(map[string]wispFreshness, len(entries)) + for _, e := range entries { + sn := e.Metadata["session_name"] + if sn == "" { + continue + } + updated, err := time.Parse(time.RFC3339, e.UpdatedAt) + if err != nil { + continue + } + // Tiebreaker: when multiple open wisps map to one session (legal + // during handoff), pick the most-recent updated_at. 
+ if cur, ok := result[sn]; ok && !updated.After(cur.updatedAt) { + continue + } + result[sn] = wispFreshness{id: e.ID, updatedAt: updated} + } + return result, nil +} diff --git a/cmd/gc/wisp_freshness_test.go b/cmd/gc/wisp_freshness_test.go new file mode 100644 index 000000000..b467529b7 --- /dev/null +++ b/cmd/gc/wisp_freshness_test.go @@ -0,0 +1,125 @@ +package main + +import ( + "errors" + "strings" + "testing" + "time" +) + +// fakeFreshnessRunner returns a beads.CommandRunner closure that always returns the +// given output and error, ignoring its inputs. This pins production parsing +// behavior against a literal-JSON fixture. +func fakeFreshnessRunner(out []byte, err error) func(string, string, ...string) ([]byte, error) { + return func(string, string, ...string) ([]byte, error) { + return out, err + } +} + +// TestSweepWispFreshness_LiteralJSONFieldNames pins the JSON tag names that +// production code reads from `bd list --json`: `id`, `updated_at`, and +// `metadata.session_name`. If any of these tags are renamed without updating +// production parsing, this regression test fails. +func TestSweepWispFreshness_LiteralJSONFieldNames(t *testing.T) { + const fixture = `[{"id":"bd-1","updated_at":"2026-01-01T12:00:00Z","metadata":{"session_name":"worker-1"}}]` + got, err := sweepWispFreshness("/tmp/city", fakeFreshnessRunner([]byte(fixture), nil)) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + wf, ok := got["worker-1"] + if !ok { + t.Fatalf("expected entry for session worker-1, got %v", got) + } + if wf.id != "bd-1" { + t.Errorf("id: got %q want %q", wf.id, "bd-1") + } + want, _ := time.Parse(time.RFC3339, "2026-01-01T12:00:00Z") + if !wf.updatedAt.Equal(want) { + t.Errorf("updatedAt: got %v want %v", wf.updatedAt, want) + } +} + +// TestSweepWispFreshness_MostRecentWins verifies that when multiple wisps +// map to one session, the entry with the most-recent updated_at wins. 
+func TestSweepWispFreshness_MostRecentWins(t *testing.T) { + const fixture = `[ + {"id":"bd-old","updated_at":"2026-01-01T10:00:00Z","metadata":{"session_name":"s"}}, + {"id":"bd-new","updated_at":"2026-01-01T12:00:00Z","metadata":{"session_name":"s"}}, + {"id":"bd-mid","updated_at":"2026-01-01T11:00:00Z","metadata":{"session_name":"s"}} + ]` + got, err := sweepWispFreshness("/tmp/city", fakeFreshnessRunner([]byte(fixture), nil)) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if got["s"].id != "bd-new" { + t.Fatalf("expected most-recent bd-new to win, got %q", got["s"].id) + } +} + +// TestSweepWispFreshness_EmptySessionSkipped verifies entries with empty +// session_name are dropped. +func TestSweepWispFreshness_EmptySessionSkipped(t *testing.T) { + const fixture = `[{"id":"bd-1","updated_at":"2026-01-01T12:00:00Z","metadata":{"session_name":""}}]` + got, err := sweepWispFreshness("/tmp/city", fakeFreshnessRunner([]byte(fixture), nil)) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if len(got) != 0 { + t.Fatalf("expected empty map, got %v", got) + } +} + +// TestSweepWispFreshness_MalformedUpdatedAtSkipped verifies entries with an +// unparseable updated_at are dropped (not propagated as an error). +func TestSweepWispFreshness_MalformedUpdatedAtSkipped(t *testing.T) { + const fixture = `[{"id":"bd-1","updated_at":"not-a-date","metadata":{"session_name":"s"}}]` + got, err := sweepWispFreshness("/tmp/city", fakeFreshnessRunner([]byte(fixture), nil)) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if len(got) != 0 { + t.Fatalf("expected empty map for malformed updated_at, got %v", got) + } +} + +// TestSweepWispFreshness_RunnerErrorReturnsError verifies that a runner +// failure surfaces as a wrapped error and a nil map. 
+func TestSweepWispFreshness_RunnerErrorReturnsError(t *testing.T) { + got, err := sweepWispFreshness("/tmp/city", fakeFreshnessRunner(nil, errors.New("bd exploded"))) + if err == nil { + t.Fatal("expected error, got nil") + } + if got != nil { + t.Fatalf("expected nil map on runner error, got %v", got) + } + if !strings.Contains(err.Error(), "listing in-progress wisps") { + t.Fatalf("expected wrapped error context, got %v", err) + } +} + +// TestSweepWispFreshness_UnmarshalErrorReturnsError verifies that non-JSON +// output surfaces as a wrapped parse error. +func TestSweepWispFreshness_UnmarshalErrorReturnsError(t *testing.T) { + got, err := sweepWispFreshness("/tmp/city", fakeFreshnessRunner([]byte("not json"), nil)) + if err == nil { + t.Fatal("expected unmarshal error, got nil") + } + if got != nil { + t.Fatalf("expected nil map on unmarshal error, got %v", got) + } + if !strings.Contains(err.Error(), "parsing wisp list") { + t.Fatalf("expected wrapped error context, got %v", err) + } +} + +// TestSweepWispFreshness_EmptyArrayReturnsEmptyMap verifies an empty JSON +// array yields an empty map and a nil error. 
+func TestSweepWispFreshness_EmptyArrayReturnsEmptyMap(t *testing.T) { + got, err := sweepWispFreshness("/tmp/city", fakeFreshnessRunner([]byte("[]"), nil)) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if len(got) != 0 { + t.Fatalf("expected empty map, got %v", got) + } +} diff --git a/docs/reference/cli.md b/docs/reference/cli.md index 36d413cdb..88455d28c 100644 --- a/docs/reference/cli.md +++ b/docs/reference/cli.md @@ -29,10 +29,12 @@ gc [flags] | [gc convoy](#gc-convoy) | Manage convoys — graphs of related work | | [gc dashboard](#gc-dashboard) | Web dashboard for monitoring the city | | [gc doctor](#gc-doctor) | Check workspace health | +| [gc dolt](#gc-dolt) | Commands from the dolt pack | | [gc event](#gc-event) | Event operations | | [gc events](#gc-events) | Show the event log | | [gc formula](#gc-formula) | Manage and inspect formulas | | [gc graph](#gc-graph) | Show dependency graph for beads | +| [gc halt](#gc-halt) | Pause the supervisor reconciliation tick | | [gc handoff](#gc-handoff) | Send handoff mail and restart this session | | [gc help](#gc-help) | Help about any command | | [gc hook](#gc-hook) | Check for available work (use --inject for Stop hook output) | @@ -672,6 +674,107 @@ gc doctor | `--fix` | bool | | attempt to fix issues automatically | | `-v`, `--verbose` | bool | | show extra diagnostic details | +## gc dolt + +Commands from the dolt pack + +``` +gc dolt +``` + +| Subcommand | Description | +|------------|-------------| +| [gc dolt cleanup](#gc-dolt-cleanup) | Find and remove orphaned Dolt databases | +| [gc dolt health](#gc-dolt-health) | Check Dolt data-plane health | +| [gc dolt list](#gc-dolt-list) | List Dolt databases | +| [gc dolt logs](#gc-dolt-logs) | Tail the Dolt server log file | +| [gc dolt recover](#gc-dolt-recover) | Recover Dolt from read-only state | +| [gc dolt rollback](#gc-dolt-rollback) | List or restore from migration backups | +| [gc dolt sql](#gc-dolt-sql) | Open an interactive Dolt SQL 
shell | +| [gc dolt start](#gc-dolt-start) | Start the Dolt server if not already running | +| [gc dolt status](#gc-dolt-status) | Check if the Dolt server is running | +| [gc dolt sync](#gc-dolt-sync) | Push databases to configured remotes | + +## gc dolt cleanup + +Find and remove orphaned Dolt databases + +``` +gc dolt cleanup +``` + +## gc dolt health + +Check Dolt data-plane health + +``` +gc dolt health +``` + +## gc dolt list + +List Dolt databases + +``` +gc dolt list +``` + +## gc dolt logs + +Tail the Dolt server log file + +``` +gc dolt logs +``` + +## gc dolt recover + +Recover Dolt from read-only state + +``` +gc dolt recover +``` + +## gc dolt rollback + +List or restore from migration backups + +``` +gc dolt rollback +``` + +## gc dolt sql + +Open an interactive Dolt SQL shell + +``` +gc dolt sql +``` + +## gc dolt start + +Start the Dolt server if not already running + +``` +gc dolt start +``` + +## gc dolt status + +Check if the Dolt server is running + +``` +gc dolt status +``` + +## gc dolt sync + +Push databases to configured remotes + +``` +gc dolt sync +``` + ## gc event Event operations @@ -834,6 +937,24 @@ gc graph gc-42 # expand convoy children | `--mermaid` | bool | | output Mermaid.js flowchart | | `--tree` | bool | | output Unicode dependency tree | +## gc halt + +Halt the supervisor reconciliation tick for a city by creating +a flag file at <city>/.gc/runtime/halt. While the flag is present the +supervisor loop skips tick work (no session wakes, no convergence, +no order dispatch) but keeps the process alive, logs, and control +socket responsive. + +This is a soft circuit breaker for emergencies: it stops disk thrash +from a runaway reconciler without requiring "systemctl stop". The +supervisor process itself is not killed. + +Idempotent. Use "gc resume" to clear the flag. + +``` +gc halt [path] +``` + ## gc handoff Convenience command for context handoff. 
@@ -1318,6 +1439,8 @@ gc restart [path] ## gc resume Resume a suspended city by clearing workspace.suspended in city.toml. +Also clears the halt flag file (if any) created by "gc halt", so this +is the single verb that takes a city out of every soft-pause state. Restores normal operation: the reconciler will spawn agents again and gc hook/prime will return work. Use "gc agent resume" to resume @@ -1364,6 +1487,10 @@ Use --prefix to set the bead ID prefix explicitly (default: derived from name). Use --start-suspended to add the rig in a suspended state (dormant-by-default). The rig's agents won't spawn until explicitly resumed with "gc rig resume". +Use --adopt to register a directory that already has a fully initialized +.beads/ directory (must include both metadata.json and config.yaml). +Skips beads init; the git repo check remains informational. + ``` gc rig add [flags] ``` @@ -1376,10 +1503,12 @@ gc rig add /path/to/project gc rig add /path/to/project --prefix r1 gc rig add ./my-project --include packs/gastown gc rig add ./my-project --include packs/gastown --start-suspended + gc rig add /path/to/existing --adopt ``` | Flag | Type | Default | Description | |------|------|---------|-------------| +| `--adopt` | bool | | adopt existing .beads/ directory (skip init) | | `--include` | string | | pack directory for rig agents | | `--name` | string | | rig name (default: directory basename) | | `--prefix` | string | | bead ID prefix (default: derived from name) | diff --git a/docs/reference/config.md b/docs/reference/config.md index fed4f2a10..25e202f11 100644 --- a/docs/reference/config.md +++ b/docs/reference/config.md @@ -247,6 +247,11 @@ DaemonConfig holds controller daemon settings. | `drift_drain_timeout` | string | | `2m` | DriftDrainTimeout is the maximum time to wait for an agent to acknowledge a drain signal during a config-drift restart. If the agent doesn't ack within this window, the controller force-kills and restarts it. 
Duration string (e.g., "2m", "5m"). Defaults to "2m". | | `observe_paths` | []string | | | ObservePaths lists extra directories to search for Claude JSONL session files (e.g., aimux session paths). The default search path (~/.claude/projects/) is always included. | | `probe_concurrency` | integer | | `8` | ProbeConcurrency bounds the number of concurrent bd subprocess probes issued by the pool scale_check and work_query paths. bd serializes on a shared dolt sql-server, so unbounded parallelism causes contention. Nil (unset) defaults to 8. Set higher for workspaces with a fast dedicated dolt server, or lower to reduce contention on slow storage. | +| `stuck_sweep` | boolean | | | StuckSweep enables the controller-level stuck-agent sweep. When true, the controller periodically evaluates cognition-independent signals (pane content, last-activity time, wisp freshness) and files a warrant bead for sessions that appear wedged. Off by default. | +| `stuck_wisp_threshold` | string | | `10m` | StuckWispThreshold is the age at which an in-progress wisp is considered "stale" for stuck-sweep evaluation. Duration string (e.g., "5m", "10m", "1h"). Defaults to "10m" when StuckSweep is enabled. | +| `stuck_error_patterns` | []string | | | StuckErrorPatterns is a list of Go regexp patterns. If any pattern matches the recent pane output of an agent with a stale wisp, the session is considered stuck. Empty by default (SDK ships no vendor strings); example patterns live in the gastown pack. | +| `stuck_peek_lines` | integer | | `50` | StuckPeekLines is the number of lines of pane scrollback to inspect per session during the sweep. Clamped to [1, 2000]. Nil or zero defaults to 50. | +| `stuck_warrant_label` | string | | `pool:dog` | StuckWarrantLabel is the beads label applied to warrants filed by the sweep. Configurable so downstream formulas can route warrants without hardcoding a specific pool name. Defaults to "pool:dog". 
| ## DoltConfig diff --git a/docs/schema/city-schema.json b/docs/schema/city-schema.json index 5fff678e2..4732b31e9 100644 --- a/docs/schema/city-schema.json +++ b/docs/schema/city-schema.json @@ -943,6 +943,32 @@ "type": "integer", "description": "ProbeConcurrency bounds the number of concurrent bd subprocess probes\nissued by the pool scale_check and work_query paths. bd serializes on\na shared dolt sql-server, so unbounded parallelism causes contention.\nNil (unset) defaults to 8. Set higher for workspaces with a fast\ndedicated dolt server, or lower to reduce contention on slow storage.", "default": 8 + }, + "stuck_sweep": { + "type": "boolean", + "description": "StuckSweep enables the controller-level stuck-agent sweep. When true,\nthe controller periodically evaluates cognition-independent signals\n(pane content, last-activity time, wisp freshness) and files a warrant\nbead for sessions that appear wedged. Off by default." + }, + "stuck_wisp_threshold": { + "type": "string", + "description": "StuckWispThreshold is the age at which an in-progress wisp is\nconsidered \"stale\" for stuck-sweep evaluation. Duration string\n(e.g., \"5m\", \"10m\", \"1h\"). Defaults to \"10m\" when StuckSweep is\nenabled.", + "default": "10m" + }, + "stuck_error_patterns": { + "items": { + "type": "string" + }, + "type": "array", + "description": "StuckErrorPatterns is a list of Go regexp patterns. If any pattern\nmatches the recent pane output of an agent with a stale wisp, the\nsession is considered stuck. Empty by default (SDK ships no vendor\nstrings); example patterns live in the gastown pack." + }, + "stuck_peek_lines": { + "type": "integer", + "description": "StuckPeekLines is the number of lines of pane scrollback to inspect\nper session during the sweep. Clamped to [1, 2000]. Nil or zero\ndefaults to 50.", + "default": 50 + }, + "stuck_warrant_label": { + "type": "string", + "description": "StuckWarrantLabel is the beads label applied to warrants filed by\nthe sweep. 
Configurable so downstream formulas can route warrants\nwithout hardcoding a specific pool name. Defaults to \"pool:dog\".", + "default": "pool:dog" } }, "additionalProperties": false, diff --git a/engdocs/architecture/controller.md b/engdocs/architecture/controller.md index 03f57dae9..b3e00b782 100644 --- a/engdocs/architecture/controller.md +++ b/engdocs/architecture/controller.md @@ -3,7 +3,7 @@ title: "Controller" --- -> Last verified against code: 2026-03-01 +> Last verified against code: 2026-04-14 ## Summary @@ -45,7 +45,7 @@ automations, and garbage-collects expired wisps. parallel via goroutines. - **Nil-Guard Tracker Pattern**: Optional subsystems (crash tracker, idle - tracker, wisp GC, order dispatcher) follow a nil-means-disabled + tracker, wisp GC, stuck tracker, order dispatcher) follow a nil-means-disabled convention. Callers check `if tracker != nil` before use. This avoids conditional plumbing and keeps the loop body clean. @@ -85,6 +85,7 @@ gc start --foreground │ ├─ buildAgents(cfg) → evaluate pools in parallel │ ├─ doReconcileAgents() │ ├─ wispGC.runGC() + │ ├─ runStuckSweep() ← controller-level stuck detection │ └─ orderDispatcher.dispatch() │ └─ shutdown: @@ -119,7 +120,15 @@ Each tick of `controllerLoop()` (`cmd/gc/controller.go:268-320`) performs: `wisp_ttl` both set), queries closed molecules via `bd list` and deletes those older than the TTL cutoff. -5. **Order dispatch** (`ad.dispatch()`): Evaluates gate conditions +5. **Stuck sweep** (`runStuckSweep()`): If `[daemon].stuck_sweep` is + enabled, inspects each running session for wedge signals (stale + in-progress wisp + pane error pattern or no-progress-since-wisp-open) + and files a warrant bead (`type=warrant`, configurable label) on + detection. Emits `agent.stuck` on successful Create. Idempotent per + target. See [Health Patrol](health-patrol.md) for signal composition + and the known-limitation on `updated_at` parsing. + +6. 
**Order dispatch** (`ad.dispatch()`): Evaluates gate conditions for all non-manual orders. See [Health Patrol](health-patrol.md) for gate evaluation and dispatch details. @@ -243,6 +252,9 @@ All controller implementation lives in `cmd/gc/`: | `cmd/gc/order_dispatch.go` | `orderDispatcher` interface, `memoryOrderDispatcher`, `buildOrderDispatcher()` | | `cmd/gc/crash_tracker.go` | `crashTracker` interface, `memoryCrashTracker` | | `cmd/gc/idle_tracker.go` | `idleTracker` interface, `memoryIdleTracker` | +| `cmd/gc/stuck_tracker.go` | `stuckTracker` interface, `memoryStuckTracker` (pure convergent-evidence predicate) | +| `cmd/gc/stuck_sweep.go` | `CityRuntime.runStuckSweep()` — tick wrapper, warrant filing, idempotence | +| `cmd/gc/wisp_freshness.go` | `wispFreshness` opaque snapshot, `sweepWispFreshness()` bd-list parser | | `cmd/gc/cmd_agent_drain.go` | `drainOps` interface, `providerDrainOps` (session metadata-backed drain signals) | Supporting packages: @@ -266,6 +278,11 @@ restart_window = "1h" # sliding window for restart counting (default: 1h) shutdown_timeout = "5s" # grace period before force-kill (default: 5s) wisp_gc_interval = "5m" # wisp GC run frequency (disabled if unset) wisp_ttl = "24h" # how long closed wisps survive (disabled if unset) +stuck_sweep = false # opt-in controller-level stuck detection +stuck_wisp_threshold = "10m" # wisp staleness threshold (default 10m) +stuck_error_patterns = [] # Go regexp sources; empty disables pattern axis +stuck_peek_lines = 50 # pane lines inspected (clamped [1,2000]) +stuck_warrant_label = "pool:dog" # label applied to warrant beads ``` Session provider selection (affects all controller session operations): diff --git a/engdocs/architecture/health-patrol.md b/engdocs/architecture/health-patrol.md index 6fc82f692..1a831491a 100644 --- a/engdocs/architecture/health-patrol.md +++ b/engdocs/architecture/health-patrol.md @@ -3,7 +3,7 @@ title: "Health Patrol" --- -> Last verified against code: 2026-03-18 +> Last 
verified against code: 2026-04-14 ## Summary @@ -52,6 +52,15 @@ are child specs, and "let it crash" is realized through GUPP + beads ## Architecture +### Concurrency Model + +All tracker state (`cr.stuck`, `cr.runner`, `cr.ct`, `cr.it`, `cr.wg`) is +owned by the single tick goroutine in `controllerLoop()`. Trackers are +written in `reloadConfigTraced` and read in the per-tick sweep functions +on the same goroutine; no additional synchronization is required while +this single-goroutine model holds. If trackers are ever read off-tick, +add appropriate synchronization at that time. + The Health Patrol is not a standalone subsystem with its own package. It is composed from several collaborating components wired together inside the controller loop in `cmd/gc/controller.go`. The controller @@ -89,6 +98,11 @@ use): │ └──────────────┬──────────────┘ │ │ ▼ │ │ ┌─────────────────────────────┐ │ + │ │ runStuckSweep() │ │ + │ │ (stuckTracker + warrants) │ │ + │ └──────────────┬──────────────┘ │ + │ ▼ │ + │ ┌─────────────────────────────┐ │ │ │ orderDispatcher │ │ │ │ .dispatch() │ │ │ └─────────────────────────────┘ │ @@ -102,8 +116,8 @@ A single controller tick proceeds as follows: 1. **Config reload** (conditional). If the `dirty` atomic flag is set (via fsnotify debounce on config directory changes), `tryReloadConfig()` re-parses `city.toml` with includes and patches. - If the reload succeeds, the crash tracker, idle tracker, wisp GC, and - order dispatcher are all rebuilt from the new config. + If the reload succeeds, the crash tracker, idle tracker, wisp GC, stuck + tracker, and order dispatcher are all rebuilt from the new config. 2. **Agent list build**. `buildFn(cfg)` re-evaluates the desired agent set, including pool `check` commands for elastic scaling. @@ -115,7 +129,15 @@ A single controller tick proceeds as follows: 4. **Wisp GC**. If enabled, purges expired closed molecules older than `wisp_ttl`. -5. **Order dispatch** (`ad.dispatch()`). 
Evaluates all non-manual +5. **Stuck sweep** (`runStuckSweep()`). If enabled, inspects each running + session for wedge signals — stale in-progress wisp combined with either + a pane error-pattern match or a progress mismatch (last activity older + than wisp open) — and files an idempotent warrant bead (labeled per + `stuck_warrant_label`) for any session classified as stuck, emitting + `agent.stuck` on successful create. Detects cognition-broken agents + via convergent evidence without LLM involvement. + +6. **Order dispatch** (`ad.dispatch()`). Evaluates all non-manual order gates. For each due order, creates a tracking bead synchronously (to prevent re-fire), then dispatches in a goroutine. @@ -177,6 +199,15 @@ waves with bounded parallelism. `runtime.Provider.GetLastActivity()` and compares against per-agent timeout durations. +- **`stuckTracker`** (`cmd/gc/stuck_tracker.go`): Pure predicate for + cognition-independent wedge detection. Production impl + `memoryStuckTracker` composes stale-wisp with pattern-match and + progress-mismatch signals to classify a session as stuck without + LLM involvement. The CityRuntime wrapper (`runStuckSweep()`) collects + inputs via `runtime.Provider.Peek`, `runtime.Provider.GetLastActivity`, + and `sweepWispFreshness()` (bd list --json), files warrant beads on + detection, and emits `agent.stuck` events only on successful Create. + - **`reconcileOps`** (`cmd/gc/reconcile.go`): Interface for session-level operations needed by reconciliation: `listRunning()`, `storeConfigHash()`, `configHash()`. Backed by @@ -267,7 +298,7 @@ Health Patrol follows Erlang/OTP patterns mapped to Gas City: |---|---| | `internal/config` | Parses `DaemonConfig` for patrol interval, max restarts, restart window, shutdown timeout. Provides `Revision()` for config reload detection. | | `internal/runtime` | `Provider` interface for Start/Stop/IsRunning/ListRunning/GetLastActivity/SetMeta/GetMeta. `ConfigFingerprint()` for drift detection. 
| -| `internal/events` | `Recorder` interface for emitting lifecycle events (`agent.started`, `agent.stopped`, `agent.crashed`, `agent.quarantined`, `agent.idle_killed`, `agent.suspended`, `controller.started`, `controller.stopped`, `order.fired`, `order.completed`, `order.failed`). `Provider` interface for event gate queries. | +| `internal/events` | `Recorder` interface for emitting lifecycle events (`agent.started`, `agent.stopped`, `agent.crashed`, `agent.quarantined`, `agent.idle_killed`, `agent.stuck`, `agent.suspended`, `controller.started`, `controller.stopped`, `order.fired`, `order.completed`, `order.failed`). `Provider` interface for event gate queries. | | `internal/beads` | `Store` interface for order tracking beads (create, update, list by label). `CommandRunner` for bd CLI invocation. | | `internal/orders` | `Scan()` to discover orders from formula layers. `CheckGate()` to evaluate gate conditions. `Order` struct for dispatch metadata. | | `internal/agent` | `Agent` interface wrapping config + session provider for `Start()`/`Stop()`/`IsRunning()`/`SessionName()` operations. 
| @@ -288,6 +319,9 @@ All Health Patrol implementation lives in `cmd/gc/`: | `cmd/gc/reconcile.go` | `reconcileOps` interface, `doReconcileAgents()` (4-state reconciliation + parallel starts + orphan cleanup), `doStopOrphans()` | | `cmd/gc/crash_tracker.go` | `crashTracker` interface, `memoryCrashTracker` (in-memory restart history with sliding window pruning) | | `cmd/gc/idle_tracker.go` | `idleTracker` interface, `memoryIdleTracker` (per-agent timeout + GetLastActivity query) | +| `cmd/gc/stuck_tracker.go` | `stuckTracker` interface, `memoryStuckTracker` (pure convergent-evidence predicate) | +| `cmd/gc/stuck_sweep.go` | `CityRuntime.runStuckSweep()` — tick wrapper, warrant filing, idempotence | +| `cmd/gc/wisp_freshness.go` | `wispFreshness` opaque snapshot, `sweepWispFreshness()` bd-list parser | | `cmd/gc/order_dispatch.go` | `orderDispatcher` interface, `memoryOrderDispatcher` (gate evaluation, exec dispatch, wisp dispatch, tracking bead lifecycle) | | `internal/config/config.go` | `DaemonConfig` struct with `PatrolIntervalDuration()`, `MaxRestartsOrDefault()`, `RestartWindowDuration()`, `ShutdownTimeoutDuration()` | | `internal/config/revision.go` | `Revision()` (SHA-256 bundle hash of all config sources + pack dirs), `WatchDirs()` | @@ -308,12 +342,19 @@ restart_window = "1h" # sliding window for restart counting (default: 1h) shutdown_timeout = "5s" # grace period before force-kill on shutdown (default: 5s) wisp_gc_interval = "5m" # how often to purge expired wisps (disabled if unset) wisp_ttl = "24h" # how long closed wisps survive (disabled if unset) +stuck_sweep = false # opt-in controller-level stuck detection +stuck_wisp_threshold = "10m" # wisp staleness threshold (default 10m) +stuck_error_patterns = [] # Go regexp sources; empty disables pattern axis +stuck_peek_lines = 50 # pane lines inspected (clamped [1,2000]) +stuck_warrant_label = "pool:dog" # label applied to warrant beads [orders] skip = ["noisy-order"] # order names to exclude from 
dispatch max_timeout = "120s" # hard cap on per-order timeout (default: uncapped) ``` +Run `gc doctor daemon-stuck-sweep` to inspect the effective stuck-sweep configuration. + Per-agent idle timeout is configured on individual `[[agent]]` entries: ```toml @@ -332,6 +373,8 @@ Each Health Patrol component has dedicated unit tests: | `cmd/gc/reconcile_test.go` | All four reconciliation states (not running/healthy/orphan/drifted), parallel starts, zombie capture, crash loop quarantine integration, idle restart, pool drain, suspended agent handling | | `cmd/gc/crash_tracker_test.go` | Sliding window pruning, quarantine threshold, clear history, nil-guard (disabled tracker) | | `cmd/gc/idle_tracker_test.go` | Timeout detection, zero time handling, per-agent timeout configuration, nil-guard | +| `cmd/gc/stuck_tracker_test.go` | Pure predicate truth table for convergent-evidence classification (stale-wisp × pattern-match × progress-mismatch), pattern compile errors, disabled/noop paths | +| `cmd/gc/stuck_sweep_test.go` | `runStuckSweep()` tick wrapper: no-agents SDK self-sufficiency, idempotence across ticks, halt-gate suppression, wisp-query fail-open, `agent.stuck` emission on successful warrant create | | `cmd/gc/order_dispatch_test.go` | Gate evaluation (cooldown, cron, condition, event, manual), exec dispatch, wisp dispatch, tracking bead creation, timeout capping, rig-scoped orders | All tests use in-memory fakes (`runtime.Fake`, `events.Discard`, @@ -340,6 +383,23 @@ stubbed `ExecRunner`) with no external infrastructure dependencies. See ## Known Limitations +- **Stuck sweep reads `updated_at` via `bd list --json`**: `beads.Bead` + has no `UpdatedAt` field. The CityRuntime wrapper + (`sweepWispFreshness()`) parses the raw `bd list --json` `updated_at` + once per sweep and builds an opaque `wispFreshness` struct private to + `cmd/gc`. 
This is an intentional pragmatic workaround pending a + broader Provider review (rule-of-three not met; we only need + `updated_at` here). Raw bd JSON types do not leak past the tracker + boundary. + +- **Multi-wisp tiebreaker**: When multiple open wisps map to a single + session (legal during handoff), `sweepWispFreshness()` picks the + most-recent `updated_at` as the session's freshness source. + +- **`agent.stuck` emitted only on successful warrant Create**: If + `store.Create` errors the event is not recorded; idempotence remains + intact because no warrant exists to suppress, so the next tick retries. + - **No cascading restarts**: Erlang/OTP supports `one_for_all` and `rest_for_one` restart strategies. Gas City currently implements only `one_for_one` (restart the dead agent, nothing else). There is no diff --git a/internal/config/config.go b/internal/config/config.go index adfcdbc88..886dee92e 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -988,6 +988,30 @@ type DaemonConfig struct { // Nil (unset) defaults to 8. Set higher for workspaces with a fast // dedicated dolt server, or lower to reduce contention on slow storage. ProbeConcurrency *int `toml:"probe_concurrency,omitempty" jsonschema:"default=8"` + + // StuckSweep enables the controller-level stuck-agent sweep. When true, + // the controller periodically evaluates cognition-independent signals + // (pane content, last-activity time, wisp freshness) and files a warrant + // bead for sessions that appear wedged. Off by default. + StuckSweep bool `toml:"stuck_sweep,omitempty"` + // StuckWispThreshold is the age at which an in-progress wisp is + // considered "stale" for stuck-sweep evaluation. Duration string + // (e.g., "5m", "10m", "1h"). Defaults to "10m" when StuckSweep is + // enabled. + StuckWispThreshold string `toml:"stuck_wisp_threshold,omitempty" jsonschema:"default=10m"` + // StuckErrorPatterns is a list of Go regexp patterns. 
If any pattern + // matches the recent pane output of an agent with a stale wisp, the + // session is considered stuck. Empty by default (SDK ships no vendor + // strings); example patterns live in the gastown pack. + StuckErrorPatterns []string `toml:"stuck_error_patterns,omitempty"` + // StuckPeekLines is the number of lines of pane scrollback to inspect + // per session during the sweep. Clamped to [1, 2000]. Nil or zero + // defaults to 50. + StuckPeekLines *int `toml:"stuck_peek_lines,omitempty" jsonschema:"default=50"` + // StuckWarrantLabel is the beads label applied to warrants filed by + // the sweep. Configurable so downstream formulas can route warrants + // without hardcoding a specific pool name. Defaults to "pool:dog". + StuckWarrantLabel string `toml:"stuck_warrant_label,omitempty" jsonschema:"default=pool:dog"` } // PatrolIntervalDuration returns the patrol interval as a time.Duration. @@ -1101,6 +1125,68 @@ func (d *DaemonConfig) WispGCEnabled() bool { return d.WispGCIntervalDuration() > 0 && d.WispTTLDuration() > 0 } +// defaultStuckWispThreshold is the fallback stale-wisp age used when +// StuckSweep is enabled but StuckWispThreshold is unset or unparseable. +const defaultStuckWispThreshold = 10 * time.Minute + +// DefaultStuckPeekLines is the default number of pane lines inspected per +// session during the sweep when StuckPeekLines is unset or out of range. +const DefaultStuckPeekLines = 50 + +// DefaultStuckWarrantLabel is the default beads label for warrants filed +// by the stuck-agent sweep. +const DefaultStuckWarrantLabel = "pool:dog" + +// maxStuckPeekLines caps pane reads to a sane upper bound to prevent +// pathologically large scrollback captures per tick. +const maxStuckPeekLines = 2000 + +// StuckWispThresholdDuration returns the stuck-sweep wisp staleness +// threshold as a time.Duration. Defaults to 10m when empty or +// unparseable. 
+func (d *DaemonConfig) StuckWispThresholdDuration() time.Duration {
+	if d.StuckWispThreshold == "" {
+		return defaultStuckWispThreshold
+	}
+	dur, err := time.ParseDuration(d.StuckWispThreshold)
+	if err != nil {
+		return defaultStuckWispThreshold
+	}
+	return dur
+}
+
+// StuckPeekLinesOrDefault returns the sweep peek-lines setting, clamped
+// to [1, maxStuckPeekLines]. Nil, zero, or negative values return the default.
+func (d *DaemonConfig) StuckPeekLinesOrDefault() int {
+	if d.StuckPeekLines == nil || *d.StuckPeekLines <= 0 {
+		return DefaultStuckPeekLines
+	}
+	if *d.StuckPeekLines > maxStuckPeekLines {
+		return maxStuckPeekLines
+	}
+	return *d.StuckPeekLines
+}
+
+// StuckWarrantLabelOrDefault returns the configured warrant label, or
+// the default "pool:dog" when empty.
+func (d *DaemonConfig) StuckWarrantLabelOrDefault() string {
+	if d.StuckWarrantLabel == "" {
+		return DefaultStuckWarrantLabel
+	}
+	return d.StuckWarrantLabel
+}
+
+// StuckSweepEnabled reports whether the stuck-agent sweep is active.
+// In practice this reduces to the StuckSweep opt-in flag alone: the
+// warrant-label clause below is defensive only, because
+// StuckWarrantLabelOrDefault falls back to a non-empty default and can
+// never return "". Patterns remain optional: zero patterns disables
+// the regex axis only, leaving the progress-mismatch axis live.
+func (d *DaemonConfig) StuckSweepEnabled() bool {
+	return d.StuckSweep &&
+		d.StuckWarrantLabelOrDefault() != ""
+}
+
 // FormulasDir returns the formulas directory, defaulting to "formulas".
func (c *City) FormulasDir() string { if c.Formulas.Dir != "" { diff --git a/internal/config/validate_durations.go b/internal/config/validate_durations.go index 2c06184eb..b53ad43da 100644 --- a/internal/config/validate_durations.go +++ b/internal/config/validate_durations.go @@ -17,7 +17,7 @@ func ValidateDurations(cfg *City, source string) []string { } if _, err := time.ParseDuration(value); err != nil { warnings = append(warnings, fmt.Sprintf( - "%s: %s %s = %q is not a valid duration: %v", + "%s: invalid duration for %s.%s: %q: %v", source, context, field, value, err)) } } @@ -46,6 +46,7 @@ func ValidateDurations(cfg *City, source string) []string { check("[daemon]", "wisp_gc_interval", cfg.Daemon.WispGCInterval) check("[daemon]", "wisp_ttl", cfg.Daemon.WispTTL) check("[daemon]", "drift_drain_timeout", cfg.Daemon.DriftDrainTimeout) + check("[daemon]", "stuck_wisp_threshold", cfg.Daemon.StuckWispThreshold) // Orders config durations. check("[orders]", "max_timeout", cfg.Orders.MaxTimeout) diff --git a/internal/config/validate_stuck_patterns.go b/internal/config/validate_stuck_patterns.go new file mode 100644 index 000000000..d3606aed8 --- /dev/null +++ b/internal/config/validate_stuck_patterns.go @@ -0,0 +1,27 @@ +package config + +import ( + "fmt" + "regexp" +) + +// ValidateStuckPatterns returns a non-nil error if any entry in +// cfg.Daemon.StuckErrorPatterns fails regexp.Compile. The error is indexed +// ("stuck_error_patterns[]: ") so operators can locate the +// offending pattern in city.toml. +// +// This is a hard-error pre-flight check invoked on the same path as +// ValidateAgents: an invalid regex must cause `gc start` to exit non-zero +// rather than silently disabling the feature (AC10). The in-process +// newStuckTracker path remains as defense-in-depth. 
+func ValidateStuckPatterns(cfg *City) error { + if cfg == nil { + return nil + } + for i, src := range cfg.Daemon.StuckErrorPatterns { + if _, err := regexp.Compile(src); err != nil { + return fmt.Errorf("stuck_error_patterns[%d]: %w", i, err) + } + } + return nil +} diff --git a/internal/doctor/checks_semantic.go b/internal/doctor/checks_semantic.go index 21dba4eb7..4fdbf9a2b 100644 --- a/internal/doctor/checks_semantic.go +++ b/internal/doctor/checks_semantic.go @@ -235,3 +235,59 @@ func (c *ConfigSemanticsCheck) CanFix() bool { return false } // Fix is a no-op. func (c *ConfigSemanticsCheck) Fix(_ *CheckContext) error { return nil } + +// --- Stuck sweep configuration check --- + +// StuckSweepCheck reports the effective stuck-sweep configuration for +// operator visibility. It surfaces which axes are active (regex, +// progress-mismatch) and the resolved threshold/label without inferring +// any judgment about whether the configuration is "right" — the only +// warning case is a structural misconfig (flag on, label unset/empty). +type StuckSweepCheck struct { + cfg *config.City +} + +// NewStuckSweepCheck creates a check for stuck-sweep configuration. +func NewStuckSweepCheck(cfg *config.City) *StuckSweepCheck { + return &StuckSweepCheck{cfg: cfg} +} + +// Name returns the check identifier. +func (c *StuckSweepCheck) Name() string { return "daemon-stuck-sweep" } + +// Run reports the effective stuck-sweep configuration. 
+func (c *StuckSweepCheck) Run(_ *CheckContext) *CheckResult {
+	r := &CheckResult{Name: c.Name()}
+	d := &c.cfg.Daemon
+	if !d.StuckSweep {
+		r.Status = StatusOK
+		r.Message = "stuck sweep disabled"
+		return r
+	}
+	if !d.StuckSweepEnabled() { // NOTE(review): currently unreachable — StuckWarrantLabelOrDefault never returns ""; kept as defense-in-depth
+		r.Status = StatusWarning
+		r.Message = "stuck sweep misconfigured: stuck_warrant_label is empty"
+		r.FixHint = "Set stuck_warrant_label in [daemon] config to enable the stuck-agent sweep"
+		return r
+	}
+	patterns := len(d.StuckErrorPatterns)
+	threshold := d.StuckWispThresholdDuration()
+	label := d.StuckWarrantLabelOrDefault()
+	r.Status = StatusOK
+	if patterns == 0 {
+		r.Message = fmt.Sprintf(
+			"stuck sweep active: regex axis disabled (no patterns), progress-mismatch axis enabled; threshold=%s label=%s",
+			threshold, label)
+	} else {
+		r.Message = fmt.Sprintf(
+			"stuck sweep active: regex axis enabled (%d pattern(s)), progress-mismatch axis enabled; threshold=%s label=%s",
+			patterns, threshold, label)
+	}
+	return r
+}
+
+// CanFix returns false — stuck-sweep config must be corrected by the user.
+func (c *StuckSweepCheck) CanFix() bool { return false }
+
+// Fix is a no-op.
+func (c *StuckSweepCheck) Fix(_ *CheckContext) error { return nil } diff --git a/internal/doctor/checks_semantic_test.go b/internal/doctor/checks_semantic_test.go index 70877f720..55b8f1c5d 100644 --- a/internal/doctor/checks_semantic_test.go +++ b/internal/doctor/checks_semantic_test.go @@ -308,3 +308,51 @@ func TestHumanSize(t *testing.T) { } } } + +// --- StuckSweepCheck --- + +func TestStuckSweepCheck_Disabled(t *testing.T) { + cfg := &config.City{Daemon: config.DaemonConfig{StuckSweep: false}} + r := NewStuckSweepCheck(cfg).Run(&CheckContext{}) + if r.Status != StatusOK { + t.Fatalf("status = %d, want OK", r.Status) + } + if !strings.Contains(r.Message, "disabled") { + t.Fatalf("message should mention disabled: %q", r.Message) + } +} + +func TestStuckSweepCheck_EnabledWithPatterns(t *testing.T) { + cfg := &config.City{Daemon: config.DaemonConfig{ + StuckSweep: true, + StuckErrorPatterns: []string{"x", "y"}, + StuckWarrantLabel: "pool:dog", + }} + r := NewStuckSweepCheck(cfg).Run(&CheckContext{}) + if r.Status != StatusOK { + t.Fatalf("status = %d, want OK; msg=%s", r.Status, r.Message) + } + if !strings.Contains(r.Message, "regex axis enabled") { + t.Fatalf("message should mention regex axis enabled: %q", r.Message) + } + if !strings.Contains(r.Message, "progress-mismatch axis enabled") { + t.Fatalf("message should mention progress-mismatch axis: %q", r.Message) + } +} + +func TestStuckSweepCheck_EnabledNoPatterns(t *testing.T) { + cfg := &config.City{Daemon: config.DaemonConfig{ + StuckSweep: true, + StuckWarrantLabel: "pool:dog", + }} + r := NewStuckSweepCheck(cfg).Run(&CheckContext{}) + if r.Status != StatusOK { + t.Fatalf("status = %d, want OK; msg=%s", r.Status, r.Message) + } + if !strings.Contains(r.Message, "regex axis disabled") { + t.Fatalf("message should report regex axis disabled: %q", r.Message) + } + if !strings.Contains(r.Message, "progress-mismatch axis enabled") { + t.Fatalf("message should report progress-mismatch active: %q", r.Message) + 
} +} diff --git a/internal/events/events.go b/internal/events/events.go index 79d8a27cc..0405aa54f 100644 --- a/internal/events/events.go +++ b/internal/events/events.go @@ -34,6 +34,7 @@ const ( SessionUndrained = "session.undrained" SessionQuarantined = "session.quarantined" SessionIdleKilled = "session.idle_killed" + AgentStuck = "agent.stuck" SessionSuspended = "session.suspended" SessionUpdated = "session.updated" ConvoyCreated = "convoy.created" diff --git a/internal/telemetry/recorder.go b/internal/telemetry/recorder.go index 5f4e296a9..cd08e1a3a 100644 --- a/internal/telemetry/recorder.go +++ b/internal/telemetry/recorder.go @@ -26,17 +26,18 @@ const ( // recorderInstruments holds all lazy-initialized OTel metric instruments. type recorderInstruments struct { // Counters — Phase 1 (11) - agentStartTotal metric.Int64Counter - agentStopTotal metric.Int64Counter - agentCrashTotal metric.Int64Counter - agentQuarantineTotal metric.Int64Counter - agentIdleKillTotal metric.Int64Counter - reconcileCycleTotal metric.Int64Counter - nudgeTotal metric.Int64Counter - configReloadTotal metric.Int64Counter - controllerTotal metric.Int64Counter - bdTotal metric.Int64Counter - slingTotal metric.Int64Counter + agentStartTotal metric.Int64Counter + agentStopTotal metric.Int64Counter + agentCrashTotal metric.Int64Counter + agentQuarantineTotal metric.Int64Counter + agentIdleKillTotal metric.Int64Counter + agentStuckWarrantTotal metric.Int64Counter + reconcileCycleTotal metric.Int64Counter + nudgeTotal metric.Int64Counter + configReloadTotal metric.Int64Counter + controllerTotal metric.Int64Counter + bdTotal metric.Int64Counter + slingTotal metric.Int64Counter // Counters — Phase 2 (4) poolSpawnTotal metric.Int64Counter @@ -86,6 +87,9 @@ func initInstruments() { inst.agentIdleKillTotal, _ = m.Int64Counter("gc.agent.idle_kills.total", metric.WithDescription("Total agent idle timeout restarts"), ) + inst.agentStuckWarrantTotal, _ = m.Int64Counter("gc.agent.stuck_warrants.total", 
+ metric.WithDescription("Total stuck-agent warrant beads filed"), + ) inst.reconcileCycleTotal, _ = m.Int64Counter("gc.reconcile.cycles.total", metric.WithDescription("Total reconciliation cycles"), ) @@ -273,6 +277,25 @@ func RecordAgentIdleKill(ctx context.Context, agentName string) { ) } +// RecordAgentStuckWarrant records a stuck-agent warrant bead being filed (metrics + log event). +// axis is one of "regex" or "progress_mismatch" — a low-cardinality summary of which axis fired. +// The session attribute is intentionally included (consistent with RecordAgentIdleKill) and +// may have high cardinality in large fleets — operators should be aware. The full warrant reason +// is captured on the warrant bead itself. +func RecordAgentStuckWarrant(ctx context.Context, session, axis string) { + initInstruments() + inst.agentStuckWarrantTotal.Add(ctx, 1, + metric.WithAttributes( + attribute.String("session", session), + attribute.String("axis", axis), + ), + ) + emit(ctx, "agent.stuck_warrant", otellog.SeverityInfo, + otellog.String("session", session), + otellog.String("axis", axis), + ) +} + // RecordReconcileCycle records a reconciliation cycle with counts (metrics + log event). func RecordReconcileCycle(ctx context.Context, started, stopped, skipped int) { initInstruments()