Skip to content

Commit 99bb517

Browse files
fix: daemon idle reaper checks work bead assignee before killing polecats
The idle reaper checked agent bead hook_bead to determine if a polecat had active work, but updateAgentHookBead was made a no-op (declaring work bead assignee as authoritative). This caused the reaper to kill working polecats whose agent bead hook_bead pointed to stale/closed beads from previous swarms. Add hasAssignedOpenWork() fallback: before killing a polecat as "working-no-hook", query bd list for beads assigned to this polecat with hooked/in_progress/open status. If any exist, the polecat has active work and should not be reaped. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 7dfcdab commit 99bb517

File tree

1 file changed

+34
-0
lines changed

1 file changed

+34
-0
lines changed

internal/daemon/daemon.go

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2192,6 +2192,29 @@ func (d *Daemon) isBeadClosed(beadID string) bool {
21922192
return issues[0].Status == "closed"
21932193
}
21942194

2195+
// hasAssignedOpenWork checks if any work bead is assigned to the given polecat
2196+
// with a non-terminal status (hooked, in_progress, or open). This is the
2197+
// authoritative source of polecat work — the sling code sets status=hooked +
2198+
// assignee on the work bead, but no longer maintains the agent bead's hook_bead
2199+
// field (updateAgentHookBead is a no-op). Without this fallback, the idle reaper
2200+
// kills working polecats whose agent bead hook_bead is stale.
2201+
func (d *Daemon) hasAssignedOpenWork(rigName, assignee string) bool {
2202+
for _, status := range []string{"hooked", "in_progress", "open"} {
2203+
cmd := exec.Command(d.bdPath, "list", "--rig="+rigName, "--assignee="+assignee, "--status="+status, "--json") //nolint:gosec // G204: args are constructed internally
2204+
cmd.Dir = d.config.TownRoot
2205+
cmd.Env = os.Environ()
2206+
output, err := cmd.Output()
2207+
if err != nil {
2208+
continue
2209+
}
2210+
var issues []json.RawMessage
2211+
if json.Unmarshal(output, &issues) == nil && len(issues) > 0 {
2212+
return true
2213+
}
2214+
}
2215+
return false
2216+
}
2217+
21952218
// notifyWitnessOfCrashedPolecat notifies the witness when a polecat crash is detected.
21962219
// The stuck-agent-dog plugin handles context-aware restart decisions.
21972220
func (d *Daemon) notifyWitnessOfCrashedPolecat(rigName, polecatName, hookBead string) {
@@ -2300,6 +2323,17 @@ func (d *Daemon) reapIdlePolecat(rigName, polecatName string, timeout time.Durat
23002323
return
23012324
}
23022325

2326+
// Fallback: agent bead hook_bead may be stale (updateAgentHookBead is a
2327+
// no-op since the sling code declared work bead assignee as authoritative).
2328+
// Before killing, check if any work bead is assigned to this polecat with
2329+
// a non-terminal status. This prevents the reaper from killing polecats
2330+
// whose agent bead hook_bead points to a closed bead from a previous swarm
2331+
// while the polecat is actively working on a newly-slung bead.
2332+
assignee := fmt.Sprintf("%s/polecats/%s", rigName, polecatName)
2333+
if d.hasAssignedOpenWork(rigName, assignee) {
2334+
return
2335+
}
2336+
23032337
// No hooked work + stale heartbeat = idle polecat
23042338
d.killIdlePolecat(rigName, polecatName, sessionName, staleDuration, timeout, "working-no-hook")
23052339
}

0 commit comments

Comments
 (0)