Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
71 changes: 66 additions & 5 deletions cmd/gc/city_runtime.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,12 +39,16 @@ type CityRuntime struct {
buildFn func(*config.City, runtime.Provider, beads.Store) DesiredStateResult
buildFnWithSessionBeads func(*config.City, runtime.Provider, beads.Store, map[string]beads.Store, *sessionBeadSnapshot, *sessionReconcilerTraceCycle) DesiredStateResult

dops drainOps
ct crashTracker
it idleTracker
wg wispGC
dops drainOps
ct crashTracker
it idleTracker
wg wispGC
// stuck is owned by the tick goroutine; written in reloadConfigTraced, read in runStuckSweep — no additional locking required.
stuck stuckTracker
od orderDispatcher
trace *sessionReconcilerTraceManager
// runner is owned by the tick goroutine; written in reloadConfigTraced, read in runStuckSweep — no additional locking required.
runner beads.CommandRunner
trace *sessionReconcilerTraceManager

rec events.Recorder
cs *controllerState // nil when controller-managed bead stores are unavailable
Expand Down Expand Up @@ -126,6 +130,32 @@ func newCityRuntime(p CityRuntimeParams) *CityRuntime {
p.Cfg.Daemon.WispTTLDuration(), bdCommandRunnerForCity(p.CityPath))
}

stuck, stuckErr := newStuckTracker(p.Cfg.Daemon)
if stuckErr != nil {
fmt.Fprintf(p.Stderr, "%s: stuck sweep disabled: %v\n", //nolint:errcheck // best-effort stderr
firstNonEmpty(p.LogPrefix, "gc start"), stuckErr)
stuck = noopStuckTracker{}
}
if p.Cfg.Daemon.StuckSweepEnabled() && stuckErr == nil {
if m, ok := stuck.(*memoryStuckTracker); ok {
if len(m.patterns) == 0 {
// AC1/AC3: progress-mismatch axis only — regex axis disabled.
fmt.Fprintf(p.Stderr, //nolint:errcheck // best-effort stderr
"%s: stuck_sweep enabled: regex-axis=disabled (no patterns) progress-mismatch-axis=active threshold=%s label=%s\n",
firstNonEmpty(p.LogPrefix, "gc start"),
m.wispThresholdDuration(), m.warrantLabelOrDefault())
} else {
fmt.Fprintf(p.Stderr, //nolint:errcheck // best-effort stderr
"%s: stuck_sweep enabled: regex-axis=enabled (patterns=%d) progress-mismatch-axis=active threshold=%s peek_lines=%d label=%s\n",
firstNonEmpty(p.LogPrefix, "gc start"),
len(m.patterns), m.wispThresholdDuration(),
m.peekLinesOrDefault(), m.warrantLabelOrDefault())
}
}
}

runner := bdCommandRunnerForCity(p.CityPath)

// Clear stale halt file from a previous session so a service restart
// always begins in the running state. An operator who wants the halt
// to survive a restart can re-issue "gc halt" after startup.
Expand Down Expand Up @@ -171,7 +201,9 @@ func newCityRuntime(p CityRuntimeParams) *CityRuntime {
ct: ct,
it: it,
wg: wg,
stuck: stuck,
od: od,
runner: runner,
trace: newSessionReconcilerTraceManager(p.CityPath, p.CityName, p.Stderr),
rec: p.Rec,
poolSessions: p.PoolSessions,
Expand Down Expand Up @@ -485,6 +517,14 @@ func (cr *CityRuntime) tick(
return
}

// Stuck-agent sweep: controller-level non-LLM liveness check. Runs
// inside the halt gate (already gated above) and after wispGC so it
// sees freshly-GC'd wisp state.
cr.runStuckSweep(ctx, time.Now())
if ctx.Err() != nil {
return
}

// Order dispatch.
if cr.od != nil {
cr.od.dispatch(ctx, cityRoot, time.Now())
Expand Down Expand Up @@ -661,6 +701,27 @@ func (cr *CityRuntime) reloadConfigTraced(
cr.wg = nil
}

wasStuckEnabled := cr.cfg.Daemon.StuckSweepEnabled()
if nextStuck, err := newStuckTracker(nextCfg.Daemon); err != nil {
fmt.Fprintf(cr.stderr, "%s: stuck sweep disabled on reload: %v\n", cr.logPrefix, err) //nolint:errcheck // best-effort stderr
cr.stuck = noopStuckTracker{}
} else {
cr.stuck = nextStuck
}
// AC11: emit a state-change log when stuck sweep enabled/disabled
// transitions across reload so operators can see the feature flip.
if nowEnabled := nextCfg.Daemon.StuckSweepEnabled(); nowEnabled != wasStuckEnabled {
fmt.Fprintf(cr.stderr, "%s: stuck sweep config reloaded: enabled=%v\n", //nolint:errcheck // best-effort stderr
cr.logPrefix, nowEnabled)
}
// Note: cr.stuck and cr.runner are assigned prior to the
// serviceStateMu-protected cfg/sp swap below. They are read by
// runStuckSweep under the same tick goroutine that holds the controller
// loop, so no additional locking is required; the ordering preserves the
// invariant that a sweep observes either the old (cfg, stuck, runner) or
// the new triple, never a mix.
cr.runner = bdCommandRunnerForCity(cityRoot)

cr.od = buildOrderDispatcher(cityRoot, nextCfg, bdCommandRunnerForCity(cityRoot), cr.rec, cr.stderr)

cr.serviceStateMu.Lock()
Expand Down
1 change: 1 addition & 0 deletions cmd/gc/cmd_doctor.go
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ func doDoctor(fix, verbose bool, stdout, stderr io.Writer) int {
d.Register(doctor.NewBuiltinPackFamilyCheck(cfg, cityPath))
d.Register(doctor.NewConfigSemanticsCheck(cfg, filepath.Join(cityPath, "city.toml")))
d.Register(doctor.NewDurationRangeCheck(cfg))
d.Register(doctor.NewStuckSweepCheck(cfg))
}

// System formulas check.
Expand Down
7 changes: 7 additions & 0 deletions cmd/gc/cmd_start.go
Original file line number Diff line number Diff line change
Expand Up @@ -514,6 +514,13 @@ func doStartStandalone(args []string, controllerMode bool, stdout, stderr io.Wri
return 1
}

// AC10: pre-flight compile of stuck_error_patterns. Invalid regex must
// abort startup so operators discover misconfiguration at gc start.
if err := config.ValidateStuckPatterns(cfg); err != nil {
fmt.Fprintf(stderr, "gc start: %v\n", err) //nolint:errcheck // best-effort stderr
return 1
}

// Validate install_agent_hooks (workspace + all agents).
if ih := cfg.Workspace.InstallAgentHooks; len(ih) > 0 {
if err := hooks.Validate(ih); err != nil {
Expand Down
7 changes: 7 additions & 0 deletions cmd/gc/cmd_supervisor.go
Original file line number Diff line number Diff line change
Expand Up @@ -1454,6 +1454,13 @@ func prepareCityForSupervisor(cityPath, cityName string, cfg *config.City, stder
return fmt.Errorf("validate agents: %w", err)
}

// AC10: pre-flight compile of stuck_error_patterns.
if err := runStep("validating_stuck_patterns", func() error {
return config.ValidateStuckPatterns(cfg)
}); err != nil {
return fmt.Errorf("validate stuck patterns: %w", err)
}

// Validate install_agent_hooks (workspace + all agents).
if err := runStep("validating_hooks", func() error {
if ih := cfg.Workspace.InstallAgentHooks; len(ih) > 0 {
Expand Down
194 changes: 194 additions & 0 deletions cmd/gc/stuck_sweep.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,194 @@
package main

import (
"context"
"fmt"
"strings"
"time"

"github.com/gastownhall/gascity/internal/beads"
"github.com/gastownhall/gascity/internal/events"
"github.com/gastownhall/gascity/internal/telemetry"
)

// runStuckSweep iterates controller-owned running sessions, checks each
// against the stuckTracker predicate, and files a warrant bead for any
// session flagged as stuck. Idempotent per target: an existing open warrant
// for the same session suppresses duplicates.
//
// Fail-open semantics:
// - One bd list --json shellout for wisp freshness per sweep. Failure
// aborts the whole sweep (whole-sweep fail-open, not per-session).
// - Peek / GetLastActivity errors skip the session and continue.
// - agent.stuck event emitted ONLY on successful warrant Create.
func (cr *CityRuntime) runStuckSweep(ctx context.Context, now time.Time) {
if cr == nil || cr.stuck == nil {
return
}
// Short-circuit when stuck_sweep=false or stuck_warrant_label resolves
// empty: StuckSweepEnabled() covers both cases. The noop tracker path
// (noopStuckTracker) is never reached in M3 — the sweep is gated here
// before any bd shellout. Zero patterns no longer results in noop;
// the progress-mismatch axis remains active when patterns are absent.
if !cr.cfg.Daemon.StuckSweepEnabled() {
return
}
if cr.sp == nil {
return
}
if ctx.Err() != nil {
return
}
// Defense-in-depth halt gate (E14/R7): tick() already checks halt before
// calling runStuckSweep, but we re-check here so direct unit invocations
// honor the same ordering as production.
if cr.halt.check(cr.cityPath, cr.stderr) {
return
}

runner := cr.runner
if runner == nil {
runner = bdCommandRunnerForCity(cr.cityPath)
}
freshnessBySession, err := sweepWispFreshness(cr.cityPath, runner)
if err != nil {
fmt.Fprintf(cr.stderr, "%s: stuck sweep: %v (fail-open this tick)\n", //nolint:errcheck // best-effort stderr
cr.logPrefix, err)
return
}

running, err := cr.sp.ListRunning("")
if err != nil {
fmt.Fprintf(cr.stderr, "%s: stuck sweep: list running: %v\n", cr.logPrefix, err) //nolint:errcheck // best-effort stderr
return
}

store := cr.cityBeadStore()
// Route all tracker inputs through the stuckTracker interface and config
// accessors rather than a concrete-type cast: this keeps alternative
// implementations (e.g., noopStuckTracker) viable and keeps sweep behavior
// aligned with the config source of truth.
peekLines := cr.cfg.Daemon.StuckPeekLinesOrDefault()
warrantLabel := cr.cfg.Daemon.StuckWarrantLabelOrDefault()

for _, session := range running {
if ctx.Err() != nil {
return
}

freshness, hasWisp := freshnessBySession[session]
if !hasWisp {
continue
}

paneOutput, peekErr := cr.sp.Peek(session, peekLines)
if peekErr != nil {
fmt.Fprintf(cr.stderr, "%s: stuck sweep: peek %s: %v\n", cr.logPrefix, session, peekErr) //nolint:errcheck // best-effort stderr
continue
}

lastActivity, actErr := cr.sp.GetLastActivity(session)
if actErr != nil {
// GetLastActivity error ⇒ insufficient evidence; fail-open.
continue
}

stuck, reason, matchedPattern := cr.stuck.checkStuck(session, paneOutput,
freshness.updatedAt, lastActivity, now)
if !stuck {
continue
}

// Idempotence: suppress if an open warrant already targets this
// session. Reads store, not tracker state, so it survives reload.
// Fail-open direction: on store error, treat as "already warranted"
// to avoid duplicate flood while the store is transiently unavailable.
if store != nil {
already, lookupErr := hasOpenStuckWarrant(store, warrantLabel, session)
if lookupErr != nil {
fmt.Fprintf(cr.stderr, "%s: stuck sweep: warrant lookup %s: %v (skipping)\n", //nolint:errcheck // best-effort stderr
cr.logPrefix, session, lookupErr)
continue
}
if already {
continue
}
}

// Race check: if the process is already dead, the crashTracker
// will handle it; do not file a warrant.
if !cr.sp.ProcessAlive(session, nil) {
continue
}

if store == nil {
// No bead store available; log detection for observability.
fmt.Fprintf(cr.stderr, "%s: stuck detected (no store): session=%s reason=%s\n", //nolint:errcheck // best-effort stderr
cr.logPrefix, session, reason)
continue
}

wispAge := now.Sub(freshness.updatedAt).Round(time.Second).String()
warrant := beads.Bead{
Title: "stuck:" + session,
Type: "warrant",
Labels: []string{warrantLabel},
Metadata: map[string]string{
"target": session,
"reason": reason,
"requester": "controller",
"matched_pattern": matchedPattern,
"wisp_id": freshness.id,
"wisp_age": wispAge,
},
}
if _, createErr := store.Create(warrant); createErr != nil {
fmt.Fprintf(cr.stderr, "%s: stuck sweep: filing warrant for %s: %v\n", //nolint:errcheck // best-effort stderr
cr.logPrefix, session, createErr)
continue
}

if cr.rec != nil {
cr.rec.Record(events.Event{
Type: events.AgentStuck,
Actor: "controller",
Subject: session,
Message: reason,
})
}
// axis is a low-cardinality summary: "regex" or "progress_mismatch".
// The session attribute is intentionally included (consistent with
// RecordAgentIdleKill) and may have high cardinality in large fleets —
// operators should be aware. Full reason lives on the warrant bead.
axis := "regex"
if matchedPattern == "" {
axis = "progress_mismatch"
}
telemetry.RecordAgentStuckWarrant(ctx, session, axis)
fmt.Fprintf(cr.stderr, "%s: stuck warrant filed: session=%s reason=%s\n", //nolint:errcheck // best-effort stderr
cr.logPrefix, session, reason)
}
}

// hasOpenStuckWarrant reports whether the store already contains an open
// warrant bead with the given label and metadata.target matching session.
//
// Fail-open direction (R6): on ListByLabel error, returns (true, err). Treating
// an unknown-state query as "already warranted" prevents duplicate-warrant
// floods when the store is transiently unavailable. The caller should skip
// warrant creation (and optionally log the error) on error returns.
func hasOpenStuckWarrant(store beads.Store, label, session string) (bool, error) {
beadsList, err := store.ListByLabel(label, 0)
if err != nil {
return true, err
}
for _, b := range beadsList {
if b.Type != "warrant" {
continue
}
if strings.TrimSpace(b.Metadata["target"]) == session {
return true, nil
}
}
return false, nil
}
Loading
Loading