Skip to content

Commit fc4b9de

Browse files
steveyegge and claude committed
fix: use tmux for agent liveness in daemon checks (gt-zecmc)
Complete the "discover, don't track" refactoring:
- checkGUPPViolations: use tmux.IsClaudeRunning() instead of agent_state
- checkOrphanedWork: derive dead agents from tmux, not agent_state=dead
- assessStaleness: rely on HasActiveSession (tmux), not agent_state

Non-observable states (stuck, awaiting-gate) are still respected.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
1 parent 9729e05 commit fc4b9de

2 files changed

Lines changed: 57 additions & 61 deletions

File tree

internal/daemon/lifecycle.go

Lines changed: 43 additions & 46 deletions
Original file line number | Diff line number | Diff line change
@@ -759,9 +759,14 @@ func (d *Daemon) checkRigGUPPViolations(rigName string) {
759759
continue // No hooked work - no GUPP violation possible
760760
}
761761

762-
// Check if agent is actively working
763-
if agent.AgentState == "working" || agent.AgentState == "running" {
764-
// Check when the agent bead was last updated
762+
// Per gt-zecmc: derive running state from tmux, not agent_state
763+
// Extract polecat name from agent ID (gt-polecat-<rig>-<name> -> <name>)
764+
polecatName := strings.TrimPrefix(agent.ID, prefix)
765+
sessionName := fmt.Sprintf("gt-%s-%s", rigName, polecatName)
766+
767+
// Check if tmux session exists and Claude is running
768+
if d.tmux.IsClaudeRunning(sessionName) {
769+
// Session is alive - check if it's been stuck too long
765770
updatedAt, err := time.Parse(time.RFC3339, agent.UpdatedAt)
766771
if err != nil {
767772
continue
@@ -803,69 +808,61 @@ Action needed: Check if agent is alive and responsive. Consider restarting if st
803808

804809
// checkOrphanedWork looks for work assigned to dead agents.
805810
// Orphaned work needs to be reassigned or the agent needs to be restarted.
811+
// Per gt-zecmc: derive agent liveness from tmux, not agent_state.
806812
func (d *Daemon) checkOrphanedWork() {
807-
// Get list of dead agents
808-
deadAgents := d.getDeadAgents()
809-
if len(deadAgents) == 0 {
810-
return
811-
}
812-
813-
// For each dead agent, check if they have hooked work
814-
// Use HookBead from database column directly (not parsed from description)
815-
for _, agent := range deadAgents {
816-
if agent.HookBead == "" {
817-
continue // No hooked work to orphan
818-
}
819-
820-
d.logger.Printf("Orphaned work detected: agent %s is dead but has hook_bead=%s",
821-
agent.ID, agent.HookBead)
822-
823-
// Determine the rig from the agent ID (gt-polecat-<rig>-<name>)
824-
rigName := d.extractRigFromAgentID(agent.ID)
825-
if rigName != "" {
826-
d.notifyWitnessOfOrphanedWork(rigName, agent.ID, agent.HookBead)
827-
}
813+
// Check all polecat agents with hooked work
814+
rigs := d.getKnownRigs()
815+
for _, rigName := range rigs {
816+
d.checkRigOrphanedWork(rigName)
828817
}
829818
}
830819

831-
// deadAgentInfo holds info about a dead agent for orphaned work detection.
832-
type deadAgentInfo struct {
833-
ID string
834-
HookBead string // Read from database column, not description
835-
}
836-
837-
// getDeadAgents returns all agent beads with state=dead.
838-
func (d *Daemon) getDeadAgents() []deadAgentInfo {
820+
// checkRigOrphanedWork checks polecats in a specific rig for orphaned work.
821+
func (d *Daemon) checkRigOrphanedWork(rigName string) {
839822
cmd := exec.Command("bd", "list", "--type=agent", "--json")
840823
cmd.Dir = d.config.TownRoot
841824

842825
output, err := cmd.Output()
843826
if err != nil {
844-
return nil
827+
return
845828
}
846829

847830
var agents []struct {
848-
ID string `json:"id"`
849-
Type string `json:"issue_type"`
850-
HookBead string `json:"hook_bead"` // Read from database column
851-
AgentState string `json:"agent_state"` // Read from database column
831+
ID string `json:"id"`
832+
HookBead string `json:"hook_bead"`
852833
}
853834

854835
if err := json.Unmarshal(output, &agents); err != nil {
855-
return nil
836+
return
856837
}
857838

858-
var dead []deadAgentInfo
839+
prefix := "gt-polecat-" + rigName + "-"
859840
for _, agent := range agents {
860-
if agent.AgentState == "dead" {
861-
dead = append(dead, deadAgentInfo{
862-
ID: agent.ID,
863-
HookBead: agent.HookBead,
864-
})
841+
// Only check polecats for this rig
842+
if !strings.HasPrefix(agent.ID, prefix) {
843+
continue
865844
}
866-
}
867845

868-
return dead
846+
// No hooked work = nothing to orphan
847+
if agent.HookBead == "" {
848+
continue
849+
}
850+
851+
// Check if tmux session is alive (derive state from tmux, not bead)
852+
polecatName := strings.TrimPrefix(agent.ID, prefix)
853+
sessionName := fmt.Sprintf("gt-%s-%s", rigName, polecatName)
854+
855+
// Session running = not orphaned (work is being processed)
856+
if d.tmux.IsClaudeRunning(sessionName) {
857+
continue
858+
}
859+
860+
// Session dead but has hooked work = orphaned!
861+
d.logger.Printf("Orphaned work detected: agent %s session is dead but has hook_bead=%s",
862+
agent.ID, agent.HookBead)
863+
864+
d.notifyWitnessOfOrphanedWork(rigName, agent.ID, agent.HookBead)
865+
}
869866
}
870867

871868
// extractRigFromAgentID extracts the rig name from a polecat agent ID.

internal/polecat/manager.go

Lines changed: 14 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -866,40 +866,39 @@ func countCommitsBehind(g *git.Git, defaultBranch string) int {
866866
}
867867

868868
// assessStaleness determines if a polecat should be cleaned up.
869+
// Per gt-zecmc: uses tmux state (HasActiveSession) rather than agent_state
870+
// since observable states (running, done, idle) are no longer recorded in beads.
869871
func assessStaleness(info *StalenessInfo, threshold int) (bool, string) {
870872
// Never clean up if there's uncommitted work
871873
if info.HasUncommittedWork {
872874
return false, "has uncommitted work"
873875
}
874876

875-
// If session is active, not stale
877+
// If session is active, not stale (tmux is source of truth for liveness)
876878
if info.HasActiveSession {
877879
return false, "session active"
878880
}
879881

880-
// No active session - check other indicators
882+
// No active session - this polecat is a cleanup candidate
883+
// Check for reasons to keep it:
881884

882-
// If agent reports "running" state but no session, that's suspicious
883-
// but give benefit of doubt (session may have just died)
884-
if info.AgentState == "running" {
885-
return false, "agent reports running (session may be restarting)"
885+
// Check for non-observable states that indicate intentional pause
886+
// (stuck, awaiting-gate are still stored in beads per gt-zecmc)
887+
if info.AgentState == "stuck" || info.AgentState == "awaiting-gate" {
888+
return false, fmt.Sprintf("agent_state=%s (intentional pause)", info.AgentState)
886889
}
887890

888-
// If agent reports "done" or "idle", it's a cleanup candidate
889-
if info.AgentState == "done" || info.AgentState == "idle" {
890-
return true, fmt.Sprintf("agent_state=%s, no active session", info.AgentState)
891-
}
892-
893-
// Way behind main is a strong staleness signal
891+
// No session and way behind main = stale
894892
if info.CommitsBehind >= threshold {
895893
return true, fmt.Sprintf("%d commits behind main, no active session", info.CommitsBehind)
896894
}
897895

898-
// No agent bead and no session - likely abandoned
896+
// No session and no agent bead = abandoned, clean up
899897
if info.AgentState == "" {
900898
return true, "no agent bead, no active session"
901899
}
902900

903-
// Default: not enough evidence to consider stale
904-
return false, "insufficient staleness indicators"
901+
// No session but has agent bead without special state = clean up
902+
// (The session is the source of truth for liveness)
903+
return true, "no active session"
905904
}

0 commit comments

Comments
 (0)