Skip to content

Commit 49c6975

Browse files
committed
fix: reap zombie processes
1 parent ad31ae8 commit 49c6975

3 files changed

Lines changed: 99 additions & 0 deletions

File tree

src/app/process.go

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -187,6 +187,14 @@ func (p *Process) waitForStdOutErr() {
187187
ctx, cancel := context.WithCancel(context.Background())
188188
if p.procConf.IsDaemon {
189189
ctx, cancel = context.WithTimeout(context.Background(), time.Duration(p.procConf.LaunchTimeout)*time.Second)
190+
} else if p.isState(types.ProcessStateTerminating) {
191+
// In terminating state, never block forever on stdout/stderr drain.
192+
// Reaching command.Wait() is required to reap child processes.
193+
timeoutSec := p.procConf.ShutDownParams.ShutDownTimeout
194+
if timeoutSec == UndefinedShutdownTimeoutSec {
195+
timeoutSec = DefaultShutdownTimeoutSec
196+
}
197+
ctx, cancel = context.WithTimeout(context.Background(), time.Duration(timeoutSec)*time.Second)
190198
}
191199
defer cancel()
192200
if p.stdOutDone != nil {

src/app/project_runner.go

Lines changed: 14 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -195,6 +195,20 @@ func (p *ProjectRunner) runProcess(config *types.ProcessConfig) {
195195
procLog = pclog.NewLogBuffer(0)
196196
}
197197
procState, _ := p.GetProcessState(config.ReplicaName)
198+
// Recover from stale "Terminating" state snapshots (for example when a
199+
// process ended as a zombie and never transitioned to Completed). Reusing
200+
// that snapshot would cause Process.run() to short-circuit immediately.
201+
if procState != nil &&
202+
procState.Status == types.ProcessStateTerminating &&
203+
!procState.IsRunning {
204+
log.Warn().
205+
Str("process", config.ReplicaName).
206+
Msg("Resetting stale terminating state before start")
207+
procState = types.NewProcessState(config)
208+
p.statesMutex.Lock()
209+
p.processStates[config.ReplicaName] = procState
210+
p.statesMutex.Unlock()
211+
}
198212
isMain := config.Name == p.mainProcess
199213
hasMain := p.mainProcess != ""
200214
printLogs := !hasMain && !p.isTuiOn && !p.project.MCPServer.IsStdio()

src/app/system_test.go

Lines changed: 77 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1431,6 +1431,83 @@ func TestSystem_ConcurrentRestartRaceCondition(t *testing.T) {
14311431
}
14321432
}
14331433

1434+
// TestSystem_StartProcessResetsStaleTerminatingState builds a one-process
// project, forces that process's stored state to Status == Terminating with
// IsRunning == false, calls StartProcess, and then polls GetProcessState until
// the status becomes Running or Completed. The test fails if neither status is
// observed within the polling budget.
//
// NOTE(review): this function is shown as a rendered diff; the bare "NNNN+"
// lines between statements are diff gutter line numbers, not Go code.
func TestSystem_StartProcessResetsStaleTerminatingState(t *testing.T) {
1435+
testProcess := "stale_terminating"
1436+
shell := command.DefaultShellConfig()
1437+
1438+
// Single process that runs a sleep command through the default shell and is
// configured with RestartPolicyNo.
project := &types.Project{
1439+
Processes: map[string]types.ProcessConfig{
1440+
testProcess: {
1441+
Name: testProcess,
1442+
ReplicaName: testProcess,
1443+
Executable: shell.ShellCommand,
1444+
Args: []string{shell.ShellArgument, getSleepCommand(1.0)},
1445+
RestartPolicy: types.RestartPolicyConfig{
1446+
Restart: types.RestartPolicyNo,
1447+
},
1448+
},
1449+
},
1450+
ShellConfig: shell,
1451+
}
1452+
1453+
runner, err := NewProjectRunner(&ProjectOpts{
1454+
project: project,
1455+
processesToRun: []string{},
1456+
noDeps: false,
1457+
mainProcess: "",
1458+
mainProcessArgs: []string{},
1459+
isTuiOn: false,
1460+
})
1461+
if err != nil {
1462+
// NOTE(review): reports with t.Error and returns, whereas later failures in
// this test use t.Fatalf.
t.Error(err.Error())
1463+
return
1464+
}
1465+
1466+
// Simulate stale state from a prior broken termination.
1467+
runner.statesMutex.Lock()
1468+
runner.processStates[testProcess].Status = types.ProcessStateTerminating
1469+
runner.processStates[testProcess].IsRunning = false
1470+
runner.statesMutex.Unlock()
1471+
// Replace the running and done process maps with fresh empty maps, each
// under its own mutex, before starting the process.
runner.runProcMutex.Lock()
1472+
runner.runningProcesses = make(map[string]*Process)
1473+
runner.runProcMutex.Unlock()
1474+
runner.doneProcMutex.Lock()
1475+
runner.doneProcesses = make(map[string]*Process)
1476+
runner.doneProcMutex.Unlock()
1477+
runner.logger = pclog.NewNilLogger()
1478+
1479+
if err := runner.StartProcess(testProcess); err != nil {
1480+
t.Fatalf("failed to start process: %v", err)
1481+
}
1482+
1483+
// Poll the process state up to 200 times with a 10ms sleep between attempts
// (roughly 2 seconds in total), remembering the last status seen.
var lastStatus string
1484+
for attempts := range 200 {
1485+
state, stateErr := runner.GetProcessState(testProcess)
1486+
if stateErr != nil {
1487+
t.Fatalf("failed to get process state: %v", stateErr)
1488+
}
1489+
lastStatus = state.Status
1490+
if state.Status == types.ProcessStateRunning || state.Status == types.ProcessStateCompleted {
1491+
break
1492+
}
1493+
time.Sleep(10 * time.Millisecond)
1494+
// On the final attempt, fail here rather than falling out of the loop.
if attempts == 199 {
1495+
t.Fatalf("process failed to leave stale Terminating state, last status=%s", state.Status)
1496+
}
1497+
}
1498+
1499+
// NOTE(review): the loop above exits only via break (status Running or
// Completed) or via t.Fatalf, so this check looks unreachable as a failure
// path — confirm whether it is intended purely as a safety net.
if lastStatus == types.ProcessStateTerminating {
1500+
t.Fatalf("expected process to recover from stale Terminating state, got %s", lastStatus)
1501+
}
1502+
1503+
// Cleanup if still running.
1504+
if runner.getRunningProcess(testProcess) != nil {
1505+
if err := runner.StopProcess(testProcess); err != nil {
1506+
t.Fatalf("failed to stop process: %v", err)
1507+
}
1508+
}
1509+
}
1510+
14341511
func TestReadinessProbeRestart(t *testing.T) {
14351512
proc := &types.ProcessConfig{
14361513
Name: "test",

0 commit comments

Comments
 (0)