Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions cmd/provider/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ import (
"github.com/upbound/provider-opentofu/internal/controller/gc"
namespacedcontroller "github.com/upbound/provider-opentofu/internal/controller/namespaced"
"github.com/upbound/provider-opentofu/internal/features"
"github.com/upbound/provider-opentofu/internal/reaper"
)

func init() {
Expand Down Expand Up @@ -75,6 +76,11 @@ func main() {
// https://github.com/kubernetes-sigs/controller-runtime/pull/2317
ctrl.SetLogger(zl)

// Start zombie-process reaper. tofu init spawns git subprocesses; when
// tofu exits those grandchildren are re-parented to us. Without reaping
// they accumulate as <defunct> entries in the process table.
kingpin.FatalIfError(reaper.Start(log), "Cannot start zombie process reaper")

log.Debug("Starting",
"sync-period", syncInterval.String(),
"poll-interval", pollInterval.String(),
Expand Down
56 changes: 56 additions & 0 deletions internal/reaper/reaper.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
/*
SPDX-FileCopyrightText: 2026 NephoSolutions srl <https://nephosolutions.com>

SPDX-License-Identifier: Apache-2.0
*/

// Package reaper implements zombie-process reaping for the provider process.
// When the provider is PID 1 (or a subreaper), any grandchild processes that
// finish but whose direct parent has already exited are re-parented to this
// process. Without explicit reaping they accumulate as <defunct> (zombie)
// entries in the process table.
package reaper

import (
"os"
"os/signal"
"syscall"

"github.com/crossplane/crossplane-runtime/v2/pkg/logging"
)

// Start asks setSubreaper to register this process as a child subreaper and,
// if that succeeds, launches reapLoop in a background goroutine. It returns
// right after launching the goroutine. When setSubreaper fails, its error is
// returned unchanged and no goroutine is started.
//
// NOTE(review): Start takes no context and exposes no stop mechanism, so the
// reaping goroutine keeps running for the rest of the process lifetime.
func Start(log logging.Logger) error {
	err := setSubreaper()
	if err == nil {
		go reapLoop(log)
	}
	return err
}

// reapLoop subscribes to SIGCHLD and calls reapAll every time the signal is
// delivered. It does not return: the signal channel is never closed.
//
// One sweep is performed immediately after the subscription is in place.
// Start runs this function in a goroutine, so a child can exit before
// signal.Notify has been called; no notification is queued for that exit and,
// without the initial sweep, the zombie would linger until some later child
// exit happened to trigger another pass.
func reapLoop(log logging.Logger) {
	// A buffer of one is enough: signal delivery does not block, so extra
	// SIGCHLDs that arrive while one is pending are dropped, and a single
	// pending notification leads to a sweep that collects every exited child.
	ch := make(chan os.Signal, 1)
	signal.Notify(ch, syscall.SIGCHLD)

	// Collect children that exited before the subscription became active.
	reapAll(log)

	for range ch {
		reapAll(log)
	}
}

// reapAll collects every child that has already exited, without blocking.
// It keeps issuing a non-blocking wait for "any child" (pid -1, WNOHANG) and
// logs each reaped PID at debug level. The loop ends on the first call that
// returns an error (for example ECHILD when there are no children) or a
// non-positive PID (children exist, but none has exited yet).
//
// NOTE(review): waiting on pid -1 also collects direct children of this
// process, not only re-parented ones. Confirm that code elsewhere which waits
// on its own child processes tolerates having the exit status taken here.
func reapAll(log logging.Logger) {
	var status syscall.WaitStatus
	for {
		reaped, err := syscall.Wait4(-1, &status, syscall.WNOHANG, nil)
		if err != nil || reaped <= 0 {
			return
		}
		// ExitStatus reports -1 when the child did not exit normally
		// (e.g. it was killed by a signal).
		log.Debug("Reaped zombie child process", "pid", reaped, "exitStatus", status.ExitStatus())
	}
}
182 changes: 182 additions & 0 deletions internal/reaper/reaper_linux_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,182 @@
//go:build linux

/*
SPDX-FileCopyrightText: 2026 NephoSolutions srl <https://nephosolutions.com>

SPDX-License-Identifier: Apache-2.0
*/

package reaper

import (
"fmt"
"os"
"os/exec"
"syscall"
"testing"
"time"

"github.com/crossplane/crossplane-runtime/v2/pkg/logging"
)

// TestSetSubreaper checks that a single call to setSubreaper reports no
// error.
func TestSetSubreaper(t *testing.T) {
	err := setSubreaper()
	if err != nil {
		t.Fatalf("setSubreaper() returned unexpected error: %v", err)
	}
}

// TestSetSubreaperIdempotent calls setSubreaper three times in a row and
// fails on the first call that returns an error, reporting its 1-based
// ordinal.
func TestSetSubreaperIdempotent(t *testing.T) {
	const attempts = 3
	for call := 1; call <= attempts; call++ {
		if err := setSubreaper(); err != nil {
			t.Fatalf("setSubreaper() call %d returned unexpected error: %v", call, err)
		}
	}
}

// TestReapAllReapsDirectChild starts /bin/true, waits for it to turn into a
// zombie, runs reapAll, and then expects the child's /proc/<pid> directory to
// have disappeared.
func TestReapAllReapsDirectChild(t *testing.T) {
	child := exec.Command("/bin/true")
	if err := child.Start(); err != nil {
		t.Fatalf("failed to start child process: %v", err)
	}
	pid := child.Process.Pid

	// cmd.Wait is deliberately never called, so once the child exits it
	// stays a zombie until reapAll collects it.
	waitForZombie(t, pid)

	reapAll(logging.NewNopLogger())

	// A reaped process no longer has a /proc entry.
	_, statErr := os.Stat(fmt.Sprintf("/proc/%d", pid))
	if os.IsNotExist(statErr) {
		return
	}
	t.Errorf("expected /proc/%d to be gone after reapAll, but it still exists", pid)
}

// TestReapAllReapsMultipleChildren starts five /bin/true children, waits for
// each of them to become a zombie, and checks that one reapAll call removes
// all of their /proc entries.
func TestReapAllReapsMultipleChildren(t *testing.T) {
	const childCount = 5

	spawned := make([]int, childCount)
	for idx := range spawned {
		child := exec.Command("/bin/true")
		if err := child.Start(); err != nil {
			t.Fatalf("failed to start child process %d: %v", idx, err)
		}
		spawned[idx] = child.Process.Pid
	}

	// Every child must have exited before the single sweep below.
	for _, pid := range spawned {
		waitForZombie(t, pid)
	}

	reapAll(logging.NewNopLogger())

	for _, pid := range spawned {
		_, statErr := os.Stat(fmt.Sprintf("/proc/%d", pid))
		if os.IsNotExist(statErr) {
			continue
		}
		t.Errorf("expected /proc/%d to be gone after reapAll, but it still exists", pid)
	}
}

// TestReapAllAfterSIGKILL starts a long-running /bin/sleep, kills it with
// SIGKILL, waits for it to become a zombie, and checks that reapAll removes
// its /proc entry just as it does for a child that exited on its own.
func TestReapAllAfterSIGKILL(t *testing.T) {
	sleeper := exec.Command("/bin/sleep", "60")
	if err := sleeper.Start(); err != nil {
		t.Fatalf("failed to start child process: %v", err)
	}
	pid := sleeper.Process.Pid

	killErr := sleeper.Process.Signal(syscall.SIGKILL)
	if killErr != nil {
		t.Fatalf("failed to send SIGKILL: %v", killErr)
	}

	waitForZombie(t, pid)

	reapAll(logging.NewNopLogger())

	_, statErr := os.Stat(fmt.Sprintf("/proc/%d", pid))
	if os.IsNotExist(statErr) {
		return
	}
	t.Errorf("expected /proc/%d to be gone after reapAll, but it still exists", pid)
}

// TestStartReapsChildAfterExit exercises Start end to end: after Start has
// returned, a /bin/true child is launched and the test polls /proc for up to
// five seconds, expecting the child's entry to vanish without the test ever
// waiting on the child itself.
func TestStartReapsChildAfterExit(t *testing.T) {
	if err := Start(logging.NewNopLogger()); err != nil {
		t.Fatalf("Start() returned unexpected error: %v", err)
	}

	child := exec.Command("/bin/true")
	if err := child.Start(); err != nil {
		t.Fatalf("failed to start child process: %v", err)
	}
	pid := child.Process.Pid

	const (
		timeout  = 5 * time.Second
		interval = 50 * time.Millisecond
	)
	// Re-check every interval until the entry is gone or the timeout passes.
	for stop := time.Now().Add(timeout); time.Now().Before(stop); time.Sleep(interval) {
		if _, err := os.Stat(fmt.Sprintf("/proc/%d", pid)); os.IsNotExist(err) {
			return // the background reaper collected the child
		}
	}

	t.Errorf("child process %d was not reaped within the deadline", pid)
}

// waitForZombie blocks until readProcessState reports state 'Z' for pid,
// polling every 10ms for at most five seconds. It also returns as soon as the
// state cannot be read at all (for instance because the /proc entry is
// already gone). If neither happens before the deadline the test is failed
// with t.Fatalf.
func waitForZombie(t *testing.T, pid int) {
	t.Helper()

	const (
		timeout  = 5 * time.Second
		interval = 10 * time.Millisecond
	)
	for stop := time.Now().Add(timeout); time.Now().Before(stop); time.Sleep(interval) {
		state, err := readProcessState(pid)
		// An unreadable status is treated like success: there is nothing
		// left to wait for.
		if err != nil || state == 'Z' {
			return
		}
	}
	t.Fatalf("process %d did not become a zombie within the deadline", pid)
}

// readProcessState reads /proc/<pid>/status and returns the first character
// that follows the "State:" label once tabs and spaces are skipped, e.g. 'R',
// 'S' or 'Z' for a line such as "State:\tZ (zombie)".
//
// It returns the read error if the file cannot be read, and a descriptive
// error if no newline-terminated "State:" line carrying a value is found.
func readProcessState(pid int) (byte, error) {
	raw, err := os.ReadFile(fmt.Sprintf("/proc/%d/status", pid))
	if err != nil {
		return 0, err
	}

	const label = "State:"
	lineStart := 0
	for end := 0; end < len(raw); end++ {
		if raw[end] != '\n' {
			continue
		}
		// raw[lineStart:end] is one complete line without its newline.
		entry := string(raw[lineStart:end])
		lineStart = end + 1

		// The line must start with the label and have something after it.
		if len(entry) <= len(label) || entry[:len(label)] != label {
			continue
		}
		for _, r := range entry[len(label):] {
			if r == '\t' || r == ' ' {
				continue
			}
			return byte(r), nil
		}
	}
	return 0, fmt.Errorf("State field not found in /proc/%d/status", pid)
}
40 changes: 40 additions & 0 deletions internal/reaper/reaper_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
/*
SPDX-FileCopyrightText: 2026 NephoSolutions srl <https://nephosolutions.com>

SPDX-License-Identifier: Apache-2.0
*/

package reaper

import (
"testing"
"time"

"github.com/crossplane/crossplane-runtime/v2/pkg/logging"
)

// TestReapAllNoChildren runs reapAll in a separate goroutine and fails if the
// call has not returned within two seconds, guarding against reapAll blocking
// when there is nothing to reap.
func TestReapAllNoChildren(t *testing.T) {
	finished := make(chan struct{})
	go func() {
		defer close(finished)
		reapAll(logging.NewNopLogger())
	}()

	timeout := time.NewTimer(2 * time.Second)
	defer timeout.Stop()

	select {
	case <-finished:
		// reapAll came back on its own: success.
	case <-timeout.C:
		t.Fatal("reapAll blocked when no children were present")
	}
}

// TestReapAllDoesNotPanic invokes reapAll five times back to back with a
// no-op logger. The test has no assertions; it passes as long as none of the
// calls panics.
func TestReapAllDoesNotPanic(t *testing.T) {
	nop := logging.NewNopLogger()
	for remaining := 5; remaining > 0; remaining-- {
		reapAll(nop)
	}
}
17 changes: 17 additions & 0 deletions internal/reaper/subreaper_linux.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
/*
SPDX-FileCopyrightText: 2026 NephoSolutions srl <https://nephosolutions.com>

SPDX-License-Identifier: Apache-2.0
*/

package reaper

import (
"golang.org/x/sys/unix"
)

// setSubreaper issues prctl(PR_SET_CHILD_SUBREAPER, 1) for the current
// process, so that on Linux orphaned descendants are re-parented to it, and
// returns whatever error the prctl call reports.
func setSubreaper() error {
	// A non-zero second argument switches the subreaper attribute on.
	const enable = 1
	return unix.Prctl(unix.PR_SET_CHILD_SUBREAPER, enable, 0, 0, 0)
}
14 changes: 14 additions & 0 deletions internal/reaper/subreaper_other.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
//go:build !linux

/*
SPDX-FileCopyrightText: 2026 NephoSolutions srl <https://nephosolutions.com>

SPDX-License-Identifier: Apache-2.0
*/

package reaper

// setSubreaper does nothing on platforms other than Linux and always reports
// success, so callers can invoke it unconditionally.
func setSubreaper() error {
	// No subreaper registration is attempted in this build.
	return nil
}