Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 4 additions & 10 deletions test/cli/autoconf/swarm_connect_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@ package autoconf

import (
"testing"
"time"

"github.com/ipfs/kubo/test/cli/harness"
"github.com/stretchr/testify/assert"
Expand All @@ -21,10 +20,12 @@ func TestSwarmConnectWithAutoConf(t *testing.T) {
t.Parallel()

t.Run("AutoConf disabled - should work", func(t *testing.T) {
// Don't run subtests in parallel to avoid daemon startup conflicts
testSwarmConnectWithAutoConfSetting(t, false, true) // expect success
})

t.Run("AutoConf enabled - should work", func(t *testing.T) {
// Don't run subtests in parallel to avoid daemon startup conflicts
testSwarmConnectWithAutoConfSetting(t, true, true) // expect success (fix the bug!)
})
}
Expand All @@ -44,25 +45,18 @@ func testSwarmConnectWithAutoConfSetting(t *testing.T, autoConfEnabled bool, exp
"/dnsaddr/bootstrap.libp2p.io/p2p/QmbLHAnMoJPWSCR5Zhtx6BHJX9KiKNN6tpvbUcqanj75Nb",
})

// CRITICAL: Start the daemon first - this is the key requirement
// The daemon must be running and working properly
// Start the daemon
node.StartDaemon()
defer node.StopDaemon()

// Give daemon time to start up completely
time.Sleep(3 * time.Second)

// Verify daemon is responsive
result := node.RunIPFS("id")
require.Equal(t, 0, result.ExitCode(), "Daemon should be responsive before testing swarm connect")
t.Logf("Daemon is running and responsive. AutoConf enabled: %v", autoConfEnabled)

// Now test swarm connect to a bootstrap peer
// This should work because:
// 1. The daemon is running
// 2. The CLI should connect to the daemon via API
// 3. The daemon should handle the swarm connect request
result = node.RunIPFS("swarm", "connect", "/dnsaddr/bootstrap.libp2p.io")
result := node.RunIPFS("swarm", "connect", "/dnsaddr/bootstrap.libp2p.io")

// swarm connect should work regardless of AutoConf setting
assert.Equal(t, 0, result.ExitCode(),
Expand Down
48 changes: 43 additions & 5 deletions test/cli/harness/harness.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,25 @@ func EnableDebugLogging() {
// NewT constructs a harness that cleans up after the given test is done.
func NewT(t *testing.T, options ...func(h *Harness)) *Harness {
h := New(options...)
t.Cleanup(h.Cleanup)

// Single consolidated cleanup with proper panic recovery
t.Cleanup(func() {
defer func() {
// Always force-kill daemon processes on panic
if r := recover(); r != nil {
log.Debugf("panic during cleanup: %v, forcing daemon cleanup", r)
CleanupDaemonProcesses()
panic(r) // re-panic after cleanup
}
}()

// Run full cleanup which includes daemon cleanup
h.Cleanup()

// Final safety check for any remaining processes
CleanupDaemonProcesses()
})

return h
}

Expand Down Expand Up @@ -181,12 +199,32 @@ func (h *Harness) Sh(expr string) *RunResult {
}

func (h *Harness) Cleanup() {
defer func() {
if r := recover(); r != nil {
log.Debugf("panic during harness cleanup: %v, forcing daemon cleanup", r)
CleanupDaemonProcesses()
panic(r)
}
}()

log.Debugf("cleaning up cluster")
h.Nodes.StopDaemons()
// TODO: don't do this if test fails, not sure how?

// Try graceful daemon shutdown with panic protection
func() {
defer func() {
if r := recover(); r != nil {
log.Debugf("panic stopping daemons gracefully: %v", r)
}
}()
h.Nodes.StopDaemons()
}()

// Force cleanup any remaining daemon processes
CleanupDaemonProcesses()

// Clean up temp directory
log.Debugf("removing harness dir")
err := os.RemoveAll(h.Dir)
if err != nil {
if err := os.RemoveAll(h.Dir); err != nil {
log.Panicf("removing temp dir %s: %s", h.Dir, err)
}
}
Expand Down
70 changes: 68 additions & 2 deletions test/cli/harness/node.go
Original file line number Diff line number Diff line change
Expand Up @@ -265,19 +265,76 @@ func (n *Node) StartDaemonWithReq(req RunRequest, authorization string) *Node {
if alive {
log.Panicf("node %d is already running", n.ID)
}

// Retry daemon start up to 3 times for transient failures
const maxAttempts = 3
var lastErr error

for attempt := 1; attempt <= maxAttempts; attempt++ {
if attempt > 1 {
log.Debugf("retrying daemon start for node %d (attempt %d/%d)", n.ID, attempt, maxAttempts)
time.Sleep(time.Second)
}

if err := n.tryStartDaemon(req, authorization); err == nil {
// Success - verify daemon is responsive
n.waitForDaemonReady()
return n
} else {
lastErr = err
log.Debugf("node %d daemon start attempt %d failed: %v", n.ID, attempt, err)
}
}

// All attempts failed
log.Panicf("node %d failed to start daemon after %d attempts: %v", n.ID, maxAttempts, lastErr)
return n
}

// tryStartDaemon attempts to start the daemon once
func (n *Node) tryStartDaemon(req RunRequest, authorization string) (err error) {
defer func() {
if r := recover(); r != nil {
err = fmt.Errorf("panic during daemon start: %v", r)
}
}()

newReq := req
newReq.Path = n.IPFSBin
newReq.Args = append([]string{"daemon"}, req.Args...)
newReq.RunFunc = (*exec.Cmd).Start

log.Debugf("starting node %d", n.ID)
res := n.Runner.MustRun(newReq)

n.Daemon = res

// Register the daemon process for cleanup tracking
if res.Cmd != nil && res.Cmd.Process != nil {
globalProcessTracker.registerProcess(res.Cmd.Process)
}

log.Debugf("node %d started, checking API", n.ID)
n.WaitOnAPI(authorization)
return n
return nil
}

// waitForDaemonReady blocks until the daemon answers `ipfs id`, polling up
// to maxRetries times with retryDelay between attempts. It panics (failing
// the test) if the daemon never becomes responsive.
func (n *Node) waitForDaemonReady() {
	const (
		maxRetries = 30
		retryDelay = 200 * time.Millisecond
	)

	var last *RunResult
	for attempt := 0; attempt < maxRetries; attempt++ {
		if last = n.RunIPFS("id"); last.ExitCode() == 0 {
			log.Debugf("node %d daemon is fully responsive", n.ID)
			return
		}
		if attempt < maxRetries-1 {
			time.Sleep(retryDelay)
		}
	}
	log.Panicf("node %d daemon not responsive after %d retries. stderr: %s",
		n.ID, maxRetries, last.Stderr.String())
}

func (n *Node) StartDaemon(ipfsArgs ...string) *Node {
Expand Down Expand Up @@ -317,6 +374,10 @@ func (n *Node) StopDaemon() *Node {
log.Debugf("didn't stop node %d since no daemon present", n.ID)
return n
}

// Store PID for cleanup tracking
pid := n.Daemon.Cmd.Process.Pid

watch := make(chan struct{}, 1)
go func() {
_, _ = n.Daemon.Cmd.Process.Wait()
Expand All @@ -326,25 +387,30 @@ func (n *Node) StopDaemon() *Node {
// os.Interrupt does not support interrupts on Windows https://github.com/golang/go/issues/46345
if runtime.GOOS == "windows" {
if n.signalAndWait(watch, syscall.SIGKILL, 5*time.Second) {
globalProcessTracker.unregisterProcess(pid)
return n
}
log.Panicf("timed out stopping node %d with peer ID %s", n.ID, n.PeerID())
}

log.Debugf("signaling node %d with SIGTERM", n.ID)
if n.signalAndWait(watch, syscall.SIGTERM, 1*time.Second) {
globalProcessTracker.unregisterProcess(pid)
return n
}
log.Debugf("signaling node %d with SIGTERM", n.ID)
if n.signalAndWait(watch, syscall.SIGTERM, 2*time.Second) {
globalProcessTracker.unregisterProcess(pid)
return n
}
log.Debugf("signaling node %d with SIGQUIT", n.ID)
if n.signalAndWait(watch, syscall.SIGQUIT, 5*time.Second) {
globalProcessTracker.unregisterProcess(pid)
return n
}
log.Debugf("signaling node %d with SIGKILL", n.ID)
if n.signalAndWait(watch, syscall.SIGKILL, 5*time.Second) {
globalProcessTracker.unregisterProcess(pid)
return n
}
log.Panicf("timed out stopping node %d with peer ID %s", n.ID, n.PeerID())
Expand Down
1 change: 1 addition & 0 deletions test/cli/harness/nodes.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ func (n Nodes) ForEachPar(f func(*Node)) {
wg.Wait()
}

// Connect establishes connections between all nodes in the collection
func (n Nodes) Connect() Nodes {
for i, node := range n {
for j, otherNode := range n {
Expand Down
95 changes: 95 additions & 0 deletions test/cli/harness/process_tracker.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
package harness

import (
"errors"
"os"
"sync"
"syscall"
"time"
)

// processTracker keeps track of all daemon processes started during tests
// so they can be force-killed even if a test panics before its own cleanup
// runs. All methods are safe for concurrent use.
type processTracker struct {
	mu        sync.Mutex          // guards processes
	processes map[int]*os.Process // live daemon processes, keyed by PID
}

// globalProcessTracker is a package-level tracker for all spawned daemons.
// The exported helpers in this file (CleanupDaemonProcesses,
// RegisterBackgroundProcess) operate on this single instance.
var globalProcessTracker = &processTracker{
	processes: make(map[int]*os.Process),
}

// registerProcess adds a process to the tracker so it can be force-killed
// during cleanup. A nil process is silently ignored.
func (pt *processTracker) registerProcess(proc *os.Process) {
	if proc == nil {
		return
	}
	pt.mu.Lock()
	pt.processes[proc.Pid] = proc
	pt.mu.Unlock()
	log.Debugf("registered daemon process PID %d", proc.Pid)
}

// unregisterProcess drops a process from the tracker, typically after it
// has been stopped through the normal shutdown path.
func (pt *processTracker) unregisterProcess(pid int) {
	pt.mu.Lock()
	delete(pt.processes, pid)
	pt.mu.Unlock()
	log.Debugf("unregistered daemon process PID %d", pid)
}

// killAll forcefully terminates all tracked processes and clears the
// tracker. Each process first receives SIGTERM; after a single shared
// grace period, anything still alive is killed outright.
//
// The grace period is applied once for the whole set rather than per
// process (the original slept 100ms per entry), so cleanup time stays
// constant no matter how many daemons a test started.
func (pt *processTracker) killAll() {
	pt.mu.Lock()
	defer pt.mu.Unlock()

	count := len(pt.processes)
	if count == 0 {
		return
	}

	log.Debugf("cleaning up %d daemon processes", count)

	// Phase 1: ask every process to terminate gracefully.
	for pid, proc := range pt.processes {
		log.Debugf("force killing daemon process PID %d", pid)
		if err := proc.Signal(syscall.SIGTERM); err != nil && !isProcessDone(err) {
			log.Debugf("error sending SIGTERM to PID %d: %v", pid, err)
		}
	}

	// One shared grace period for all processes.
	time.Sleep(100 * time.Millisecond)

	// Phase 2: force-kill anything still running and drop every entry.
	// Kill returns os.ErrProcessDone (via isProcessDone) when the SIGTERM
	// already worked, which we treat as success.
	for pid, proc := range pt.processes {
		if err := proc.Kill(); err != nil && !isProcessDone(err) {
			log.Debugf("error killing PID %d: %v", pid, err)
		}
		delete(pt.processes, pid)
	}
}

// isProcessDone checks if an error indicates the process has already exited
func isProcessDone(err error) bool {
return errors.Is(err, os.ErrProcessDone)
}

// CleanupDaemonProcesses kills every daemon process currently tracked by
// the global tracker. It is intended as a last-resort safety net, called
// from test cleanup hooks and panic-recovery paths.
func CleanupDaemonProcesses() {
	globalProcessTracker.killAll()
}

// RegisterBackgroundProcess registers an external process with the global
// tracker so it is force-killed during cleanup. Useful for tests that
// start processes outside the harness Runner. Passing nil is a no-op
// (registerProcess ignores nil processes itself).
func RegisterBackgroundProcess(proc *os.Process) {
	globalProcessTracker.registerProcess(proc)
}
8 changes: 8 additions & 0 deletions test/cli/harness/run.go
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,14 @@ func (r *Runner) Run(req RunRequest) *RunResult {
Err: err,
}

// Track background processes automatically
// If the process is still running after RunFunc returns, it's a background process
if err == nil && cmd.Process != nil && cmd.ProcessState == nil {
// Process was started but not waited for (background process)
globalProcessTracker.registerProcess(cmd.Process)
log.Debugf("auto-tracked background process PID %d: %v", cmd.Process.Pid, cmd.Args)
}

if exitErr, ok := err.(*exec.ExitError); ok {
result.ExitErr = exitErr
}
Expand Down
3 changes: 3 additions & 0 deletions test/cli/migrations/migration_16_to_17_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -600,6 +600,9 @@ func runDaemonWithMigrationMonitoring(t *testing.T, node *harness.Node, migratio
err = cmd.Start()
require.NoError(t, err)

// Register for cleanup in case of test failure
harness.RegisterBackgroundProcess(cmd.Process)

var allOutput strings.Builder
var migrationDetected, migrationSucceeded, daemonReady bool

Expand Down
3 changes: 3 additions & 0 deletions test/cli/migrations/migration_legacy_15_to_17_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -215,6 +215,9 @@ func runDaemonWithMigrationMonitoringCustomEnv(t *testing.T, node *harness.Node,
// Start the daemon
require.NoError(t, cmd.Start())

// Register for cleanup in case of test failure
harness.RegisterBackgroundProcess(cmd.Process)

// Monitor output from both streams
var outputBuffer strings.Builder
done := make(chan bool)
Expand Down
Loading