Skip to content

Commit 2eabd12

Browse files
steveyeggequad341claude
committed
fix(sling): kill session unconditionally when reusing idle polecat
When gt sling reuses an idle polecat, any existing tmux session is now killed unconditionally (not just when stale). Previously, a non-stale session caused ReuseIdlePolecat to return ErrSessionRunning, forcing allocation of a new polecat. But heartbeat freshness can race with session lifecycle (compact/resume hooks refresh the heartbeat while Claude sits idle at a dead prompt), leaving the old session alive and new work undiscovered. Also cleans up the heartbeat file after killing to prevent stale data from confusing SessionManager.Start. Closes #2936 Co-Authored-By: quad341 <quad341@users.noreply.github.com> Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 32507e2 commit 2eabd12

3 files changed

Lines changed: 227 additions & 21 deletions

File tree

internal/cmd/polecat_spawn.go

Lines changed: 5 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@
22
package cmd
33

44
import (
5-
"errors"
65
"fmt"
76
"os"
87
"os/exec"
@@ -187,16 +186,12 @@ func SpawnPolecatForSling(rigName string, opts SlingSpawnOptions) (*SpawnedPolec
187186
}
188187
reuseOK := false
189188
if _, err := polecatMgr.ReuseIdlePolecat(polecatName, addOpts); err != nil {
190-
if errors.Is(err, polecat.ErrSessionRunning) {
191-
fmt.Printf(" Idle polecat %s still has a live session, allocating new...\n", polecatName)
189+
// Branch-only reuse failed — try full worktree repair as fallback
190+
fmt.Printf(" Branch-only reuse failed for idle polecat %s: %v, trying full repair...\n", polecatName, err)
191+
if _, err := polecatMgr.RepairWorktreeWithOptions(polecatName, true, addOpts); err != nil {
192+
fmt.Printf(" Full repair also failed for %s: %v, allocating new...\n", polecatName, err)
192193
} else {
193-
// Branch-only reuse failed — try full worktree repair as fallback
194-
fmt.Printf(" Branch-only reuse failed for idle polecat %s: %v, trying full repair...\n", polecatName, err)
195-
if _, err := polecatMgr.RepairWorktreeWithOptions(polecatName, true, addOpts); err != nil {
196-
fmt.Printf(" Full repair also failed for %s: %v, allocating new...\n", polecatName, err)
197-
} else {
198-
reuseOK = true
199-
}
194+
reuseOK = true
200195
}
201196
} else {
202197
reuseOK = true

internal/polecat/manager.go

Lines changed: 18 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1510,18 +1510,25 @@ func (m *Manager) ReuseIdlePolecat(name string, opts AddOptions) (*Polecat, erro
15101510
return nil, ErrPolecatNotFound
15111511
}
15121512

1513-
// Revalidate session state under the polecat lock. A prior dispatcher may
1514-
// have observed this polecat as idle, but by the time reuse begins the tmux
1515-
// session may still be alive or may have revived.
1516-
if running, stale := m.polecatSessionState(name); running {
1517-
if stale {
1518-
sessionName := session.PolecatSessionName(session.PrefixFor(m.rig.Name), name)
1519-
if err := m.tmux.KillSessionWithProcesses(sessionName); err != nil {
1520-
return nil, fmt.Errorf("killing stale session %s: %w", sessionName, err)
1521-
}
1522-
} else {
1523-
return nil, ErrSessionRunning
1513+
// Kill any existing session unconditionally before reuse.
1514+
// The polecat was found idle (no hooked work), so even a "live" session is
1515+
// just Claude sitting at a dead ❯ prompt from the previous task. Leaving it
1516+
// alive prevents StartSession from creating a fresh session with a proper
1517+
// gt prime --hook cycle to discover the newly hooked bead.
1518+
//
1519+
// Previously, non-stale sessions returned ErrSessionRunning here, causing the
1520+
// caller to allocate a new polecat. But heartbeat freshness can race with
1521+
// session lifecycle (e.g. a compact/resume hook refreshes the heartbeat while
1522+
// the session is functionally idle), leaving the old session alive and the
1523+
// new work undiscovered.
1524+
if running, _ := m.polecatSessionState(name); running {
1525+
sessionName := session.PolecatSessionName(session.PrefixFor(m.rig.Name), name)
1526+
if err := m.tmux.KillSessionWithProcesses(sessionName); err != nil {
1527+
return nil, fmt.Errorf("killing existing session %s for reuse: %w", sessionName, err)
15241528
}
1529+
// Remove stale heartbeat so SessionManager.Start doesn't see leftover data.
1530+
townRoot := filepath.Dir(m.rig.Path)
1531+
RemoveSessionHeartbeat(townRoot, sessionName)
15251532
}
15261533

15271534
// Get worktree path (must already exist for reuse)

internal/polecat/manager_test.go

Lines changed: 204 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
package polecat
22

33
import (
4+
"errors"
45
"fmt"
56
"os"
67
"os/exec"
@@ -17,6 +18,7 @@ import (
1718
"github.com/steveyegge/gastown/internal/rig"
1819
"github.com/steveyegge/gastown/internal/session"
1920
"github.com/steveyegge/gastown/internal/testutil"
21+
"github.com/steveyegge/gastown/internal/tmux"
2022
)
2123

2224
// installMockBd places a fake bd binary in PATH that handles the commands
@@ -1692,3 +1694,205 @@ func TestAllocateAndAdd_NoDuplicateNames(t *testing.T) {
16921694
}
16931695
}
16941696
}
1697+
1698+
// TestReuseIdlePolecat_KillsLiveSession verifies that ReuseIdlePolecat kills
1699+
// an existing live (non-stale) tmux session instead of returning ErrSessionRunning.
1700+
// This is the regression test for the sling-reuse-stale-session bug: idle polecats
1701+
// with a live Claude session at a dead ❯ prompt must have their session killed so
1702+
// StartSession can create a fresh session with a proper gt prime --hook cycle.
1703+
func TestReuseIdlePolecat_KillsLiveSession(t *testing.T) {
1704+
if runtime.GOOS == "windows" {
1705+
t.Skip("tmux not supported on Windows")
1706+
}
1707+
if _, err := exec.LookPath("tmux"); err != nil {
1708+
t.Skip("tmux not installed")
1709+
}
1710+
1711+
townRoot := t.TempDir()
1712+
rigName := "testreuse"
1713+
rigPath := filepath.Join(townRoot, rigName)
1714+
polecatName := "toast"
1715+
1716+
// Create minimal polecat directory structure
1717+
polecatDir := filepath.Join(rigPath, "polecats", polecatName)
1718+
if err := os.MkdirAll(polecatDir, 0755); err != nil {
1719+
t.Fatalf("mkdir polecat dir: %v", err)
1720+
}
1721+
1722+
// Register a unique prefix for session naming
1723+
reg := session.NewPrefixRegistry()
1724+
reg.Register("gt", rigName)
1725+
old := session.DefaultRegistry()
1726+
session.SetDefaultRegistry(reg)
1727+
t.Cleanup(func() { session.SetDefaultRegistry(old) })
1728+
1729+
tm := tmux.NewTmux()
1730+
r := &rig.Rig{Name: rigName, Path: rigPath}
1731+
mgr := NewManager(r, git.NewGit(rigPath), tm)
1732+
1733+
// Create a live tmux session (simulates Claude sitting at ❯ after gt done)
1734+
sessMgr := NewSessionManager(tm, r)
1735+
sessionName := sessMgr.SessionName(polecatName)
1736+
if err := tm.NewSessionWithCommand(sessionName, townRoot, "sleep 300"); err != nil {
1737+
t.Fatalf("create tmux session: %v", err)
1738+
}
1739+
t.Cleanup(func() { _ = tm.KillSessionWithProcesses(sessionName) })
1740+
1741+
// Write a fresh heartbeat (simulating a session that just finished gt done
1742+
// but hasn't gone stale yet — this is the exact scenario that previously
1743+
// caused ReuseIdlePolecat to return ErrSessionRunning)
1744+
TouchSessionHeartbeat(townRoot, sessionName)
1745+
1746+
// Verify session is alive and heartbeat exists
1747+
running, err := tm.HasSession(sessionName)
1748+
if err != nil || !running {
1749+
t.Fatalf("precondition: session %s should be running", sessionName)
1750+
}
1751+
if hb := ReadSessionHeartbeat(townRoot, sessionName); hb == nil {
1752+
t.Fatal("precondition: heartbeat should exist")
1753+
}
1754+
1755+
// Call ReuseIdlePolecat — it will kill the session, then fail on worktree
1756+
// operations (no real git repo). The important thing is it does NOT return
1757+
// ErrSessionRunning.
1758+
_, reuseErr := mgr.ReuseIdlePolecat(polecatName, AddOptions{})
1759+
1760+
// Verify it did NOT return ErrSessionRunning (the old buggy behavior)
1761+
if errors.Is(reuseErr, ErrSessionRunning) {
1762+
t.Fatalf("ReuseIdlePolecat returned ErrSessionRunning for live session — "+
1763+
"this is the sling-reuse-stale-session bug: idle polecats with live "+
1764+
"sessions must have their session killed, not rejected")
1765+
}
1766+
1767+
// We expect an error from later steps (worktree not found), but not from session handling
1768+
if reuseErr == nil {
1769+
t.Fatal("expected error from worktree operations (test has no real git repo)")
1770+
}
1771+
if !strings.Contains(reuseErr.Error(), "worktree") {
1772+
t.Logf("ReuseIdlePolecat error (expected worktree-related): %v", reuseErr)
1773+
}
1774+
1775+
// Verify the session was killed
1776+
running, _ = tm.HasSession(sessionName)
1777+
if running {
1778+
t.Error("session should have been killed by ReuseIdlePolecat")
1779+
}
1780+
1781+
// Verify heartbeat was cleaned up
1782+
if hb := ReadSessionHeartbeat(townRoot, sessionName); hb != nil {
1783+
t.Error("heartbeat should have been removed after session kill")
1784+
}
1785+
}
1786+
1787+
// TestReuseIdlePolecat_KillsStaleSession verifies that ReuseIdlePolecat also
1788+
// handles the stale-session case correctly (regression: the original code path
1789+
// that worked before the fix should still work after).
1790+
func TestReuseIdlePolecat_KillsStaleSession(t *testing.T) {
1791+
if runtime.GOOS == "windows" {
1792+
t.Skip("tmux not supported on Windows")
1793+
}
1794+
if _, err := exec.LookPath("tmux"); err != nil {
1795+
t.Skip("tmux not installed")
1796+
}
1797+
1798+
townRoot := t.TempDir()
1799+
rigName := "teststale"
1800+
rigPath := filepath.Join(townRoot, rigName)
1801+
polecatName := "marmalade"
1802+
1803+
polecatDir := filepath.Join(rigPath, "polecats", polecatName)
1804+
if err := os.MkdirAll(polecatDir, 0755); err != nil {
1805+
t.Fatalf("mkdir polecat dir: %v", err)
1806+
}
1807+
1808+
reg := session.NewPrefixRegistry()
1809+
reg.Register("gt", rigName)
1810+
old := session.DefaultRegistry()
1811+
session.SetDefaultRegistry(reg)
1812+
t.Cleanup(func() { session.SetDefaultRegistry(old) })
1813+
1814+
tm := tmux.NewTmux()
1815+
r := &rig.Rig{Name: rigName, Path: rigPath}
1816+
mgr := NewManager(r, git.NewGit(rigPath), tm)
1817+
1818+
sessMgr := NewSessionManager(tm, r)
1819+
sessionName := sessMgr.SessionName(polecatName)
1820+
if err := tm.NewSessionWithCommand(sessionName, townRoot, "sleep 300"); err != nil {
1821+
t.Fatalf("create tmux session: %v", err)
1822+
}
1823+
t.Cleanup(func() { _ = tm.KillSessionWithProcesses(sessionName) })
1824+
1825+
// Write a STALE heartbeat (old timestamp)
1826+
dir := filepath.Join(townRoot, ".runtime", "heartbeats")
1827+
if err := os.MkdirAll(dir, 0755); err != nil {
1828+
t.Fatal(err)
1829+
}
1830+
oldTime := time.Now().Add(-10 * time.Minute).UTC()
1831+
data := []byte(`{"timestamp":"` + oldTime.Format(time.RFC3339Nano) + `","state":"exiting"}`)
1832+
if err := os.WriteFile(filepath.Join(dir, sessionName+".json"), data, 0644); err != nil {
1833+
t.Fatal(err)
1834+
}
1835+
1836+
_, reuseErr := mgr.ReuseIdlePolecat(polecatName, AddOptions{})
1837+
1838+
// Should not return ErrSessionRunning
1839+
if errors.Is(reuseErr, ErrSessionRunning) {
1840+
t.Fatal("ReuseIdlePolecat should not return ErrSessionRunning for stale session")
1841+
}
1842+
1843+
// Session should be killed
1844+
running, _ := tm.HasSession(sessionName)
1845+
if running {
1846+
t.Error("stale session should have been killed")
1847+
}
1848+
1849+
// Heartbeat should be cleaned up
1850+
if hb := ReadSessionHeartbeat(townRoot, sessionName); hb != nil {
1851+
t.Error("heartbeat should have been removed after stale session kill")
1852+
}
1853+
}
1854+
1855+
// TestReuseIdlePolecat_NoSessionNoop verifies that ReuseIdlePolecat proceeds
1856+
// normally when there's no existing session (the most common reuse case: session
1857+
// was already killed by the Witness or expired).
1858+
func TestReuseIdlePolecat_NoSessionNoop(t *testing.T) {
1859+
if runtime.GOOS == "windows" {
1860+
t.Skip("tmux not supported on Windows")
1861+
}
1862+
if _, err := exec.LookPath("tmux"); err != nil {
1863+
t.Skip("tmux not installed")
1864+
}
1865+
1866+
townRoot := t.TempDir()
1867+
rigName := "testnoop"
1868+
rigPath := filepath.Join(townRoot, rigName)
1869+
polecatName := "jam"
1870+
1871+
polecatDir := filepath.Join(rigPath, "polecats", polecatName)
1872+
if err := os.MkdirAll(polecatDir, 0755); err != nil {
1873+
t.Fatalf("mkdir polecat dir: %v", err)
1874+
}
1875+
1876+
reg := session.NewPrefixRegistry()
1877+
reg.Register("gt", rigName)
1878+
old := session.DefaultRegistry()
1879+
session.SetDefaultRegistry(reg)
1880+
t.Cleanup(func() { session.SetDefaultRegistry(old) })
1881+
1882+
tm := tmux.NewTmux()
1883+
r := &rig.Rig{Name: rigName, Path: rigPath}
1884+
mgr := NewManager(r, git.NewGit(rigPath), tm)
1885+
1886+
// No tmux session, no heartbeat — the common idle case
1887+
_, reuseErr := mgr.ReuseIdlePolecat(polecatName, AddOptions{})
1888+
1889+
// Should not return ErrSessionRunning
1890+
if errors.Is(reuseErr, ErrSessionRunning) {
1891+
t.Fatal("ReuseIdlePolecat should not return ErrSessionRunning when no session exists")
1892+
}
1893+
1894+
// Error should be from later steps (worktree ops), not session handling
1895+
if reuseErr == nil {
1896+
t.Fatal("expected error from worktree operations")
1897+
}
1898+
}

0 commit comments

Comments
 (0)