From 75972a8f245afcac91e70e2f956cbd8740393677 Mon Sep 17 00:00:00 2001 From: Mateusz Gozdek Date: Mon, 2 Mar 2026 19:28:47 +0100 Subject: [PATCH 1/2] Revert "test/testenv: run dial-stdio with Pdeathsig and process group" This reverts commit 5fa64841a4f6574b0ec093aaef18bfd675a33e09. Signed-off-by: Mateusz Gozdek --- test/testenv/buildx.go | 38 +++++++------------------------------- 1 file changed, 7 insertions(+), 31 deletions(-) diff --git a/test/testenv/buildx.go b/test/testenv/buildx.go index fdc205f44..bed6011bc 100644 --- a/test/testenv/buildx.go +++ b/test/testenv/buildx.go @@ -11,10 +11,8 @@ import ( "net" "os" "os/exec" - "runtime" "strings" "sync" - "syscall" "testing" "time" @@ -106,15 +104,6 @@ func (b *BuildxEnv) dialStdio(ctx context.Context) error { // the buildx dial-stdio process from cleaning up its resources properly. cmd := exec.Command("docker", args...) cmd.Env = os.Environ() - cmd.SysProcAttr = &syscall.SysProcAttr{ - // Put the child in its own process group so we can kill the entire - // group (docker + docker-buildx plugin) during cleanup. - Setpgid: true, - // Send SIGTERM to the child process when the parent (test process) dies. - // This prevents dial-stdio processes from being orphaned when the test - // suite is interrupted or crashes. - Pdeathsig: syscall.SIGTERM, - } c1, c2 := net.Pipe() cmd.Stdin = c1 @@ -128,29 +117,16 @@ func (b *BuildxEnv) dialStdio(ctx context.Context) error { ww := io.MultiWriter(w, errBuf) cmd.Stderr = ww - // processDone is closed when cmd.Wait() returns, signaling the cleanup - // function that the process has exited. - processDone := make(chan struct{}) - go func() { - // Lock this goroutine to its OS thread for the lifetime of the child process. - // Pdeathsig delivers the signal when the *thread* that forked the child exits, - // not when the process exits. Without locking, the Go runtime may destroy the - // thread that called cmd.Start(), prematurely delivering SIGTERM to the child. - runtime.LockOSThread() - defer runtime.UnlockOSThread() - - if err := cmd.Start(); err != nil { - // Propagate the start error through the stderr pipe so the - // scanner below will surface it via scanner.Err(). - w.CloseWithError(err) - return - } + if err := cmd.Start(); err != nil { + return nil, err + } + chWait := make(chan struct{}) + go func() { err := cmd.Wait() - close(processDone) c1.Close() // pkgerrors.Wrap will return nil if err is nil, otherwise it will give - // us a wrapped error with the buffered stderr from the command. + // us a wrapped error with the buffered stderr from he command. w.CloseWithError(pkgerrors.Wrapf(err, "%s", errBuf)) }() @@ -184,7 +160,7 @@ func (b *BuildxEnv) dialStdio(ctx context.Context) error { } select { - case <-processDone: + case <-chWait: case <-time.After(10 * time.Second): // If it still doesn't exit, force kill cmd.Process.Kill() //nolint:errcheck // Force kill if it doesn't exit after interrupt From 19aa2aa754bad5280858c65c5f94e1121487a772 Mon Sep 17 00:00:00 2001 From: Mateusz Gozdek Date: Mon, 2 Mar 2026 08:54:28 +0100 Subject: [PATCH 2/2] test/testenv: let Docker CLI shut down dial-stdio gracefully After looking into buildx and Docker CLI plugin runtime implementation, eventually I got to this code, which seems to be properly cleaning up the dial-stdio process when test suite gets interrupted or panics, without using process group and threads locking, which are OS specific. Integration tests now explicitly close client connection which makes dial-stdio process to exit on its own gracefully, even without passing the interrupt signal, which I think is an elegant solution. Closes #974 Signed-off-by: Mateusz Gozdek --- test/main_test.go | 6 +++++ test/testenv/buildx.go | 56 +++++++++++++++++++++++++----------------- 2 files changed, 39 insertions(+), 23 deletions(-) diff --git a/test/main_test.go b/test/main_test.go index 42f688c46..6b12b6529 100644 --- a/test/main_test.go +++ b/test/main_test.go @@ -96,6 +96,12 @@ func TestMain(m *testing.M) { cancel() }() + defer func() { + if err := testEnv.Close(); err != nil { + fmt.Fprintln(os.Stderr, "Error closing test environment:", err) + } + }() + if err := testEnv.Load(ctx, phonyRef, fixtures.PhonyFrontend); err != nil { panic(err) } diff --git a/test/testenv/buildx.go b/test/testenv/buildx.go index bed6011bc..57b22ee12 100644 --- a/test/testenv/buildx.go +++ b/test/testenv/buildx.go @@ -54,6 +54,20 @@ func (b *BuildxEnv) WithBuilder(builder string) *BuildxEnv { return b } +// Close closes the underlying buildkit client connection, which triggers +// cleanup of the dial-stdio process. +func (b *BuildxEnv) Close() error { + b.mu.Lock() + defer b.mu.Unlock() + + if b.client != nil { + err := b.client.Close() + b.client = nil + return err + } + return nil +} + // Load loads the output of the specified [gwclient.BuildFunc] into the buildkit instance. func (b *BuildxEnv) Load(ctx context.Context, id string, f gwclient.BuildFunc) error { if b.refs == nil { @@ -85,10 +99,7 @@ func (c *connCloseWrapper) Close() error { if c.close != nil { c.close() } - if err := c.Conn.Close(); err != nil { - return err - } - return nil + return c.Conn.Close() } func (b *BuildxEnv) dialStdio(ctx context.Context) error { @@ -105,9 +116,9 @@ func (b *BuildxEnv) dialStdio(ctx context.Context) error { cmd := exec.Command("docker", args...) cmd.Env = os.Environ() - c1, c2 := net.Pipe() - cmd.Stdin = c1 - cmd.Stdout = c1 + dialStdioConn, clientConn := net.Pipe() + cmd.Stdin = dialStdioConn + cmd.Stdout = dialStdioConn // Use a pipe to check when the connection is actually complete // Also write all of stderr to an error buffer so we can have more details @@ -121,12 +132,15 @@ func (b *BuildxEnv) dialStdio(ctx context.Context) error { return nil, err } + // chWait is closed when cmd.Wait() returns, signaling the cleanup + // function that the process has exited. chWait := make(chan struct{}) go func() { err := cmd.Wait() - c1.Close() + close(chWait) + dialStdioConn.Close() // pkgerrors.Wrap will return nil if err is nil, otherwise it will give - // us a wrapped error with the buffered stderr from he command. + // us a wrapped error with the buffered stderr from the command. w.CloseWithError(pkgerrors.Wrapf(err, "%s", errBuf)) }() @@ -149,28 +163,26 @@ func (b *BuildxEnv) dialStdio(ctx context.Context) error { } out := &connCloseWrapper{ - Conn: c2, + Conn: clientConn, close: sync.OnceFunc(func() { - // Send 2 interrupt signals to the process to ensure it exits gracefully - // This is how buildx/docker plugins handle termination - - cmd.Process.Signal(os.Interrupt) //nolint:errcheck // We don't care about this error, we are going to send another one anyway - if err := cmd.Process.Signal(os.Interrupt); err != nil { - cmd.Process.Kill() //nolint:errcheck // Force kill if interrupt fails - } + // Close the stdin/stdout pipe to the process. + // This causes stdin EOF in buildx's dial-stdio, which triggers + // closeWrite(conn) on the buildkit connection and should start + // the chain reaction for docker CLI process to exit. + dialStdioConn.Close() select { case <-chWait: case <-time.After(10 * time.Second): - // If it still doesn't exit, force kill - cmd.Process.Kill() //nolint:errcheck // Force kill if it doesn't exit after interrupt + // Safety net: force kill if still running. + cmd.Process.Kill() //nolint:errcheck + <-chWait } }), } return out, nil })) - if err != nil { return err } @@ -331,9 +343,7 @@ func (b *BuildxEnv) RunTest(ctx context.Context, t *testing.T, f TestFunc, opts t.Fatalf("%+v", err) } - var ( - ch chan *client.SolveStatus - ) + var ch chan *client.SolveStatus if cfg.SolveStatusFn != nil { ch = make(chan *client.SolveStatus, 1)