Skip to content

Commit bf0e34e

Browse files
committed
fix(daemon): dispatcher skips idle dogs with leaked tmux sessions (gt-o24)
dispatchPlugins was calling mgr.GetIdleDog(), which returns the first idle dog from mgr.List() in directory order. When a dog's registry state was marked idle before its tmux session fully terminated (the window between 'gt dog done' setting StateIdle and tmux actually killing the session), the daemon would pick that dog, send it plugin mail, then hit sm.Start failing with "session already running: hq-dog-<name>". The failed assignment was rolled back and the next heartbeat picked the same dog again — infinite loop on one dog while the rest of the pack sat idle.

Fix: dispatchPlugins now iterates the kennel itself, skipping any dog whose registry state is non-idle OR whose tmux session is already running per sm.IsRunning. A transient termination race on one dog no longer blocks dispatch to the other three; worst case the dispatcher defers one tick.

Extracted the filter into findDispatchableDog so it's unit-testable and the intent is explicit at the call site. Tests cover: idle-with-working-peer picks the idle one; all-working returns nil; empty kennel returns nil; two idle dogs (no live sessions) pick one. The "skip idle-with-live-session" regression path is exercised at runtime via sm.IsRunning, whose tmux correctness is covered in session_manager tests.

Observed in production 2026-04-19: alpha locked the dispatcher for 20+ minutes while bravo/charlie/delta idle, ~14 plugin beads queued with assignees but none pulled. Manual workaround was 'tmux kill-session -t hq-dog-alpha'; with this fix the dispatcher advances on its own.

Closes gt-o24.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
Executed-By: gastown/crew/woodhouse
1 parent db74d75 commit bf0e34e

2 files changed

Lines changed: 122 additions & 7 deletions

File tree

internal/daemon/handler.go

Lines changed: 45 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ package daemon
22

33
import (
44
"fmt"
5+
"log"
56
"path/filepath"
67
"time"
78

@@ -256,14 +257,16 @@ func (d *Daemon) dispatchPlugins(mgr *dog.Manager, sm *dog.SessionManager, rigsC
256257
}
257258
}
258259

259-
// Find an idle dog.
260-
idleDog, err := mgr.GetIdleDog()
261-
if err != nil {
262-
d.logger.Printf("Handler: error finding idle dog: %v", err)
263-
return // No point continuing if we can't list dogs
264-
}
260+
// Find an idle dog that doesn't already have a live tmux session.
261+
// A leaked session (dog marked idle before its tmux terminated) would
262+
// cause sm.Start to fail with "session already running", and since
263+
// mgr.List() returns dogs in directory order, GetIdleDog would always
264+
// pick the same first idle dog — infinite-looping the same failed
265+
// dispatch instead of advancing to the next idle dog in the pack.
266+
// See gt-o24.
267+
idleDog := findDispatchableDog(mgr, sm, d.logger)
265268
if idleDog == nil {
266-
d.logger.Printf("Handler: no idle dogs available, deferring remaining plugins")
269+
d.logger.Printf("Handler: no dispatchable idle dogs available, deferring remaining plugins")
267270
return
268271
}
269272

@@ -320,6 +323,41 @@ func (d *Daemon) dispatchPlugins(mgr *dog.Manager, sm *dog.SessionManager, rigsC
320323
}
321324
}
322325

326+
// findDispatchableDog returns the first dog in the kennel whose registry
327+
// state is idle AND whose tmux session is NOT currently running. Returns nil
328+
// when no dog satisfies both conditions.
329+
//
330+
// This exists because a dog can be marked idle (via gt dog done or the reaper)
331+
// before its tmux session fully terminates, producing a transient window where
332+
// sm.Start would fail with "session already running". Picking that dog every
333+
// dispatch tick infinite-loops the same failed dispatch instead of advancing
334+
// to another genuinely-free dog in the pack. See gt-o24.
335+
//
336+
// IsRunning errors are logged and treated as "not dispatchable" so a flaky
337+
// tmux check can't wedge the whole dispatch cycle.
338+
func findDispatchableDog(mgr *dog.Manager, sm *dog.SessionManager, logger *log.Logger) *dog.Dog {
339+
dogs, err := mgr.List()
340+
if err != nil {
341+
logger.Printf("Handler: failed to list dogs while picking dispatch target: %v", err)
342+
return nil
343+
}
344+
for _, d := range dogs {
345+
if d.State != dog.StateIdle {
346+
continue
347+
}
348+
running, err := sm.IsRunning(d.Name)
349+
if err != nil {
350+
logger.Printf("Handler: IsRunning check failed for dog %s: %v; skipping", d.Name, err)
351+
continue
352+
}
353+
if running {
354+
continue
355+
}
356+
return d
357+
}
358+
return nil
359+
}
360+
323361
// loadRigsConfig loads the rigs configuration from mayor/rigs.json.
324362
func (d *Daemon) loadRigsConfig() (*config.RigsConfig, error) {
325363
rigsPath := filepath.Join(d.config.TownRoot, "mayor", "rigs.json")

internal/daemon/handler_test.go

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -395,3 +395,80 @@ func TestReapIdleDogs_Constants(t *testing.T) {
395395
t.Errorf("maxDogPoolSize = %d, want 4", maxDogPoolSize)
396396
}
397397
}
398+
399+
// TestFindDispatchableDog covers the idle-pool filter used by dispatchPlugins.
400+
// The critical behavior is that dogs with a live tmux session are skipped
401+
// even when their registry state is idle, preventing the infinite-loop seen
402+
// in gt-o24 where GetIdleDog kept picking the same dog during a termination race.
403+
func TestFindDispatchableDog_SkipsWorkingDogs(t *testing.T) {
404+
townRoot := t.TempDir()
405+
d := testHandlerDaemon(t, townRoot)
406+
407+
testSetupWorkingDogState(t, townRoot, "alpha", "plugin:x", time.Now())
408+
testSetupDogState(t, townRoot, "bravo", dog.StateIdle, time.Now())
409+
410+
mgr := dog.NewManager(townRoot, nil)
411+
sm := dog.NewSessionManager(tmux.NewTmux(), townRoot, mgr)
412+
413+
got := findDispatchableDog(mgr, sm, d.logger)
414+
if got == nil {
415+
t.Fatal("findDispatchableDog returned nil, want bravo")
416+
}
417+
if got.Name != "bravo" {
418+
t.Errorf("findDispatchableDog = %q, want bravo (working alpha must be skipped)", got.Name)
419+
}
420+
}
421+
422+
func TestFindDispatchableDog_AllWorkingReturnsNil(t *testing.T) {
423+
townRoot := t.TempDir()
424+
d := testHandlerDaemon(t, townRoot)
425+
426+
testSetupWorkingDogState(t, townRoot, "alpha", "plugin:x", time.Now())
427+
testSetupWorkingDogState(t, townRoot, "bravo", "plugin:y", time.Now())
428+
429+
mgr := dog.NewManager(townRoot, nil)
430+
sm := dog.NewSessionManager(tmux.NewTmux(), townRoot, mgr)
431+
432+
got := findDispatchableDog(mgr, sm, d.logger)
433+
if got != nil {
434+
t.Errorf("findDispatchableDog = %q, want nil (all working)", got.Name)
435+
}
436+
}
437+
438+
func TestFindDispatchableDog_EmptyKennelReturnsNil(t *testing.T) {
439+
townRoot := t.TempDir()
440+
d := testHandlerDaemon(t, townRoot)
441+
442+
mgr := dog.NewManager(townRoot, nil)
443+
sm := dog.NewSessionManager(tmux.NewTmux(), townRoot, mgr)
444+
445+
got := findDispatchableDog(mgr, sm, d.logger)
446+
if got != nil {
447+
t.Errorf("findDispatchableDog = %q, want nil (empty kennel)", got.Name)
448+
}
449+
}
450+
451+
// TestFindDispatchableDog_PicksFirstIdleWhenNoSessionsLive verifies the
452+
// default path (no tmux sessions exist for any dog): the first idle dog
453+
// from mgr.List is returned. The "skip idle dogs with live sessions"
454+
// behavior — the actual gt-o24 regression — is exercised at runtime via
455+
// sm.IsRunning, whose tmux-backed correctness is covered in session_manager
456+
// tests.
457+
func TestFindDispatchableDog_PicksFirstIdleWhenNoSessionsLive(t *testing.T) {
458+
townRoot := t.TempDir()
459+
d := testHandlerDaemon(t, townRoot)
460+
461+
testSetupDogState(t, townRoot, "alpha", dog.StateIdle, time.Now())
462+
testSetupDogState(t, townRoot, "bravo", dog.StateIdle, time.Now())
463+
464+
mgr := dog.NewManager(townRoot, nil)
465+
sm := dog.NewSessionManager(tmux.NewTmux(), townRoot, mgr)
466+
467+
got := findDispatchableDog(mgr, sm, d.logger)
468+
if got == nil {
469+
t.Fatal("findDispatchableDog returned nil; expected an idle dog")
470+
}
471+
if got.Name != "alpha" && got.Name != "bravo" {
472+
t.Errorf("findDispatchableDog = %q, want alpha or bravo", got.Name)
473+
}
474+
}

0 commit comments

Comments
 (0)