Skip to content

Commit b946474

Browse files
committed
feature: zombie process reaper
Add a background goroutine that continuously reaps any children that get re-parented to the crossplane-opentofu-provider process. Signed-off-by: Sebastian Trebitz <sebastian@nephosolutions.com>
1 parent 49c1422 commit b946474

6 files changed

Lines changed: 315 additions & 0 deletions

File tree

cmd/provider/main.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ import (
4545
"github.com/upbound/provider-opentofu/internal/controller/gc"
4646
namespacedcontroller "github.com/upbound/provider-opentofu/internal/controller/namespaced"
4747
"github.com/upbound/provider-opentofu/internal/features"
48+
"github.com/upbound/provider-opentofu/internal/reaper"
4849
)
4950

5051
func init() {
@@ -75,6 +76,11 @@ func main() {
7576
// https://github.com/kubernetes-sigs/controller-runtime/pull/2317
7677
ctrl.SetLogger(zl)
7778

79+
// Start zombie-process reaper. tofu init spawns git subprocesses; when
80+
// tofu exits those grandchildren are re-parented to us. Without reaping
81+
// they accumulate as <defunct> entries in the process table.
82+
kingpin.FatalIfError(reaper.Start(log), "Cannot start zombie process reaper")
83+
7884
log.Debug("Starting",
7985
"sync-period", syncInterval.String(),
8086
"poll-interval", pollInterval.String(),

internal/reaper/reaper.go

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
/*
2+
SPDX-FileCopyrightText: 2026 NephoSolutions srl <https://nephosolutions.com>
3+
4+
SPDX-License-Identifier: Apache-2.0
5+
*/
6+
7+
// Package reaper implements zombie-process reaping for the provider process.
8+
// When the provider is PID 1 (or a subreaper), any grandchild processes that
9+
// finish but whose direct parent has already exited are re-parented to this
10+
// process. Without explicit reaping they accumulate as <defunct> (zombie)
11+
// entries in the process table.
12+
package reaper
13+
14+
import (
15+
"os"
16+
"os/signal"
17+
"syscall"
18+
19+
"github.com/crossplane/crossplane-runtime/v2/pkg/logging"
20+
)
21+
22+
// Start registers this process as a subreaper so that orphaned grandchildren
23+
// are re-parented to it, then launches a background goroutine that reaps them
24+
// as they exit. It returns immediately; reaping continues until ctx is done.
25+
func Start(log logging.Logger) error {
26+
if err := setSubreaper(); err != nil {
27+
return err
28+
}
29+
30+
go reapLoop(log)
31+
return nil
32+
}
33+
34+
// reapLoop listens for SIGCHLD and calls waitpid in a non-blocking loop to
35+
// collect every zombie that has been re-parented to us.
36+
func reapLoop(log logging.Logger) {
37+
ch := make(chan os.Signal, 1)
38+
signal.Notify(ch, syscall.SIGCHLD)
39+
40+
for range ch {
41+
reapAll(log)
42+
}
43+
}
44+
45+
// reapAll drains all pending zombie children in a tight loop.
46+
func reapAll(log logging.Logger) {
47+
for {
48+
var ws syscall.WaitStatus
49+
pid, err := syscall.Wait4(-1, &ws, syscall.WNOHANG, nil)
50+
if pid <= 0 || err != nil {
51+
// No more children to reap (ECHILD) or nothing ready yet.
52+
return
53+
}
54+
log.Debug("Reaped zombie child process", "pid", pid, "exitStatus", ws.ExitStatus())
55+
}
56+
}
Lines changed: 182 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,182 @@
1+
//go:build linux
2+
3+
/*
4+
SPDX-FileCopyrightText: 2026 NephoSolutions srl <https://nephosolutions.com>
5+
6+
SPDX-License-Identifier: Apache-2.0
7+
*/
8+
9+
package reaper
10+
11+
import (
12+
"fmt"
13+
"os"
14+
"os/exec"
15+
"syscall"
16+
"testing"
17+
"time"
18+
19+
"github.com/crossplane/crossplane-runtime/v2/pkg/logging"
20+
)
21+
22+
// TestSetSubreaper verifies that prctl(PR_SET_CHILD_SUBREAPER) succeeds.
23+
func TestSetSubreaper(t *testing.T) {
24+
if err := setSubreaper(); err != nil {
25+
t.Fatalf("setSubreaper() returned unexpected error: %v", err)
26+
}
27+
}
28+
29+
// TestSetSubreaperIdempotent verifies that calling setSubreaper multiple times
30+
// does not return an error.
31+
func TestSetSubreaperIdempotent(t *testing.T) {
32+
for i := 0; i < 3; i++ {
33+
if err := setSubreaper(); err != nil {
34+
t.Fatalf("setSubreaper() call %d returned unexpected error: %v", i+1, err)
35+
}
36+
}
37+
}
38+
39+
// TestReapAllReapsDirectChild verifies that reapAll successfully waits for and
40+
// removes a finished direct child, so that it does not remain as a zombie.
41+
func TestReapAllReapsDirectChild(t *testing.T) {
42+
cmd := exec.Command("/bin/true")
43+
if err := cmd.Start(); err != nil {
44+
t.Fatalf("failed to start child process: %v", err)
45+
}
46+
childPID := cmd.Process.Pid
47+
48+
// Wait until the child has actually exited (it becomes a zombie at this
49+
// point because we have not called cmd.Wait yet).
50+
waitForZombie(t, childPID)
51+
52+
reapAll(logging.NewNopLogger())
53+
54+
// After reaping, /proc/<pid> should no longer exist.
55+
if _, err := os.Stat(fmt.Sprintf("/proc/%d", childPID)); !os.IsNotExist(err) {
56+
t.Errorf("expected /proc/%d to be gone after reapAll, but it still exists", childPID)
57+
}
58+
}
59+
60+
// TestReapAllReapsMultipleChildren verifies that reapAll drains multiple zombie
61+
// children in a single call (the internal loop runs until WNOHANG returns
62+
// pid <= 0).
63+
func TestReapAllReapsMultipleChildren(t *testing.T) {
64+
const childCount = 5
65+
pids := make([]int, 0, childCount)
66+
67+
for i := 0; i < childCount; i++ {
68+
cmd := exec.Command("/bin/true")
69+
if err := cmd.Start(); err != nil {
70+
t.Fatalf("failed to start child process %d: %v", i, err)
71+
}
72+
pids = append(pids, cmd.Process.Pid)
73+
}
74+
75+
// Wait for all children to become zombies.
76+
for _, pid := range pids {
77+
waitForZombie(t, pid)
78+
}
79+
80+
reapAll(logging.NewNopLogger())
81+
82+
for _, pid := range pids {
83+
if _, err := os.Stat(fmt.Sprintf("/proc/%d", pid)); !os.IsNotExist(err) {
84+
t.Errorf("expected /proc/%d to be gone after reapAll, but it still exists", pid)
85+
}
86+
}
87+
}
88+
89+
// TestReapAllAfterSIGKILL verifies that reapAll correctly reaps a child that
90+
// was terminated with SIGKILL (non-zero exit status path).
91+
func TestReapAllAfterSIGKILL(t *testing.T) {
92+
cmd := exec.Command("/bin/sleep", "60")
93+
if err := cmd.Start(); err != nil {
94+
t.Fatalf("failed to start child process: %v", err)
95+
}
96+
childPID := cmd.Process.Pid
97+
98+
if err := cmd.Process.Signal(syscall.SIGKILL); err != nil {
99+
t.Fatalf("failed to send SIGKILL: %v", err)
100+
}
101+
102+
waitForZombie(t, childPID)
103+
104+
reapAll(logging.NewNopLogger())
105+
106+
if _, err := os.Stat(fmt.Sprintf("/proc/%d", childPID)); !os.IsNotExist(err) {
107+
t.Errorf("expected /proc/%d to be gone after reapAll, but it still exists", childPID)
108+
}
109+
}
110+
111+
// TestStartReapsChildAfterExit verifies the full Start() flow: the background
112+
// goroutine responds to SIGCHLD and reaps a finished child without any manual
113+
// wait call.
114+
func TestStartReapsChildAfterExit(t *testing.T) {
115+
if err := Start(logging.NewNopLogger()); err != nil {
116+
t.Fatalf("Start() returned unexpected error: %v", err)
117+
}
118+
119+
cmd := exec.Command("/bin/true")
120+
if err := cmd.Start(); err != nil {
121+
t.Fatalf("failed to start child process: %v", err)
122+
}
123+
childPID := cmd.Process.Pid
124+
125+
// Poll until the SIGCHLD handler reaps the child or the deadline expires.
126+
deadline := time.Now().Add(5 * time.Second)
127+
for time.Now().Before(deadline) {
128+
if _, err := os.Stat(fmt.Sprintf("/proc/%d", childPID)); os.IsNotExist(err) {
129+
return // reaped successfully
130+
}
131+
time.Sleep(50 * time.Millisecond)
132+
}
133+
134+
t.Errorf("child process %d was not reaped within the deadline", childPID)
135+
}
136+
137+
// waitForZombie polls /proc/<pid>/status until the process state field is 'Z'
138+
// (zombie), confirming it has exited but not yet been waited on.
139+
func waitForZombie(t *testing.T, pid int) {
140+
t.Helper()
141+
deadline := time.Now().Add(5 * time.Second)
142+
for time.Now().Before(deadline) {
143+
state, err := readProcessState(pid)
144+
if err != nil {
145+
// /proc entry already gone — acceptable but unexpected at this point.
146+
return
147+
}
148+
if state == 'Z' {
149+
return
150+
}
151+
time.Sleep(10 * time.Millisecond)
152+
}
153+
t.Fatalf("process %d did not become a zombie within the deadline", pid)
154+
}
155+
156+
// readProcessState returns the single-character state byte from the
157+
// "State:" line of /proc/<pid>/status (e.g. 'R', 'S', 'Z').
158+
func readProcessState(pid int) (byte, error) {
159+
data, err := os.ReadFile(fmt.Sprintf("/proc/%d/status", pid))
160+
if err != nil {
161+
return 0, err
162+
}
163+
// Each line has the form "Field:\tvalue\n".
164+
// The State line looks like: "State:\tZ (zombie)"
165+
start := 0
166+
for i, b := range data {
167+
if b != '\n' {
168+
continue
169+
}
170+
line := string(data[start:i])
171+
start = i + 1
172+
if len(line) < 7 || line[:6] != "State:" {
173+
continue
174+
}
175+
for _, ch := range line[6:] {
176+
if ch != '\t' && ch != ' ' {
177+
return byte(ch), nil
178+
}
179+
}
180+
}
181+
return 0, fmt.Errorf("State field not found in /proc/%d/status", pid)
182+
}

internal/reaper/reaper_test.go

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
/*
2+
SPDX-FileCopyrightText: 2026 NephoSolutions srl <https://nephosolutions.com>
3+
4+
SPDX-License-Identifier: Apache-2.0
5+
*/
6+
7+
package reaper
8+
9+
import (
10+
"testing"
11+
"time"
12+
13+
"github.com/crossplane/crossplane-runtime/v2/pkg/logging"
14+
)
15+
16+
// TestReapAllNoChildren verifies that reapAll returns immediately when there
17+
// are no children to reap (WNOHANG behaviour: pid <= 0 on first call).
18+
func TestReapAllNoChildren(t *testing.T) {
19+
done := make(chan struct{})
20+
go func() {
21+
defer close(done)
22+
reapAll(logging.NewNopLogger())
23+
}()
24+
25+
select {
26+
case <-done:
27+
// passed: reapAll returned without blocking
28+
case <-time.After(2 * time.Second):
29+
t.Fatal("reapAll blocked when no children were present")
30+
}
31+
}
32+
33+
// TestReapAllDoesNotPanic verifies that reapAll does not panic when called
34+
// multiple times in succession on a process with no children.
35+
func TestReapAllDoesNotPanic(t *testing.T) {
36+
log := logging.NewNopLogger()
37+
for i := 0; i < 5; i++ {
38+
reapAll(log)
39+
}
40+
}

internal/reaper/subreaper_linux.go

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
/*
2+
SPDX-FileCopyrightText: 2026 NephoSolutions srl <https://nephosolutions.com>
3+
4+
SPDX-License-Identifier: Apache-2.0
5+
*/
6+
7+
package reaper
8+
9+
import (
10+
"golang.org/x/sys/unix"
11+
)
12+
13+
// setSubreaper uses prctl(PR_SET_CHILD_SUBREAPER) to make the current process
14+
// the subreaper for orphaned grandchildren on Linux.
15+
func setSubreaper() error {
16+
return unix.Prctl(unix.PR_SET_CHILD_SUBREAPER, 1, 0, 0, 0)
17+
}

internal/reaper/subreaper_other.go

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
//go:build !linux
2+
3+
/*
4+
SPDX-FileCopyrightText: 2026 NephoSolutions srl <https://nephosolutions.com>
5+
6+
SPDX-License-Identifier: Apache-2.0
7+
*/
8+
9+
package reaper
10+
11+
// setSubreaper is a no-op on non-Linux platforms.
12+
func setSubreaper() error {
13+
return nil
14+
}

0 commit comments

Comments
 (0)