diff --git a/test/cli/autoconf/swarm_connect_test.go b/test/cli/autoconf/swarm_connect_test.go
index 95c75d95317..a9e58b7c93e 100644
--- a/test/cli/autoconf/swarm_connect_test.go
+++ b/test/cli/autoconf/swarm_connect_test.go
@@ -2,7 +2,6 @@ package autoconf
 
 import (
 	"testing"
-	"time"
 
 	"github.com/ipfs/kubo/test/cli/harness"
 	"github.com/stretchr/testify/assert"
@@ -21,10 +20,12 @@ func TestSwarmConnectWithAutoConf(t *testing.T) {
 	t.Parallel()
 
 	t.Run("AutoConf disabled - should work", func(t *testing.T) {
+		// Don't run subtests in parallel to avoid daemon startup conflicts
 		testSwarmConnectWithAutoConfSetting(t, false, true) // expect success
 	})
 
 	t.Run("AutoConf enabled - should work", func(t *testing.T) {
+		// Don't run subtests in parallel to avoid daemon startup conflicts
 		testSwarmConnectWithAutoConfSetting(t, true, true) // expect success (fix the bug!)
 	})
 }
@@ -44,17 +45,10 @@ func testSwarmConnectWithAutoConfSetting(t *testing.T, autoConfEnabled bool, exp
 		"/dnsaddr/bootstrap.libp2p.io/p2p/QmbLHAnMoJPWSCR5Zhtx6BHJX9KiKNN6tpvbUcqanj75Nb",
 	})
 
-	// CRITICAL: Start the daemon first - this is the key requirement
-	// The daemon must be running and working properly
+	// Start the daemon
 	node.StartDaemon()
 	defer node.StopDaemon()
 
-	// Give daemon time to start up completely
-	time.Sleep(3 * time.Second)
-
-	// Verify daemon is responsive
-	result := node.RunIPFS("id")
-	require.Equal(t, 0, result.ExitCode(), "Daemon should be responsive before testing swarm connect")
 	t.Logf("Daemon is running and responsive. AutoConf enabled: %v", autoConfEnabled)
 
 	// Now test swarm connect to a bootstrap peer
@@ -62,7 +56,7 @@ func testSwarmConnectWithAutoConfSetting(t *testing.T, autoConfEnabled bool, exp
 	// 1. The daemon is running
 	// 2. The CLI should connect to the daemon via API
 	// 3. The daemon should handle the swarm connect request
-	result = node.RunIPFS("swarm", "connect", "/dnsaddr/bootstrap.libp2p.io")
+	result := node.RunIPFS("swarm", "connect", "/dnsaddr/bootstrap.libp2p.io")
 
 	// swarm connect should work regardless of AutoConf setting
 	assert.Equal(t, 0, result.ExitCode(),
diff --git a/test/cli/harness/harness.go b/test/cli/harness/harness.go
index 067608cdc38..98a466eb58e 100644
--- a/test/cli/harness/harness.go
+++ b/test/cli/harness/harness.go
@@ -35,7 +35,25 @@ func EnableDebugLogging() {
 
 // NewT constructs a harness that cleans up after the given test is done.
 func NewT(t *testing.T, options ...func(h *Harness)) *Harness {
 	h := New(options...)
-	t.Cleanup(h.Cleanup)
+
+	// Single consolidated cleanup with proper panic recovery
+	t.Cleanup(func() {
+		defer func() {
+			// Always force-kill daemon processes on panic
+			if r := recover(); r != nil {
+				log.Debugf("panic during cleanup: %v, forcing daemon cleanup", r)
+				CleanupDaemonProcesses()
+				panic(r) // re-panic after cleanup
+			}
+		}()
+
+		// Run full cleanup which includes daemon cleanup
+		h.Cleanup()
+
+		// Final safety check for any remaining processes
+		CleanupDaemonProcesses()
+	})
+
 	return h
 }
@@ -181,12 +199,32 @@ func (h *Harness) Sh(expr string) *RunResult {
 }
 
 func (h *Harness) Cleanup() {
+	defer func() {
+		if r := recover(); r != nil {
+			log.Debugf("panic during harness cleanup: %v, forcing daemon cleanup", r)
+			CleanupDaemonProcesses()
+			panic(r)
+		}
+	}()
+
 	log.Debugf("cleaning up cluster")
-	h.Nodes.StopDaemons()
-	// TODO: don't do this if test fails, not sure how?
+
+	// Try graceful daemon shutdown with panic protection
+	func() {
+		defer func() {
+			if r := recover(); r != nil {
+				log.Debugf("panic stopping daemons gracefully: %v", r)
+			}
+		}()
+		h.Nodes.StopDaemons()
+	}()
+
+	// Force cleanup any remaining daemon processes
+	CleanupDaemonProcesses()
+
+	// Clean up temp directory
 	log.Debugf("removing harness dir")
-	err := os.RemoveAll(h.Dir)
-	if err != nil {
+	if err := os.RemoveAll(h.Dir); err != nil {
 		log.Panicf("removing temp dir %s: %s", h.Dir, err)
 	}
 }
diff --git a/test/cli/harness/node.go b/test/cli/harness/node.go
index 6403a2f1af6..367035f52ff 100644
--- a/test/cli/harness/node.go
+++ b/test/cli/harness/node.go
@@ -265,6 +265,40 @@ func (n *Node) StartDaemonWithReq(req RunRequest, authorization string) *Node {
 	if alive {
 		log.Panicf("node %d is already running", n.ID)
 	}
+
+	// Retry daemon start up to 3 times for transient failures
+	const maxAttempts = 3
+	var lastErr error
+
+	for attempt := 1; attempt <= maxAttempts; attempt++ {
+		if attempt > 1 {
+			log.Debugf("retrying daemon start for node %d (attempt %d/%d)", n.ID, attempt, maxAttempts)
+			time.Sleep(time.Second)
+		}
+
+		if err := n.tryStartDaemon(req, authorization); err == nil {
+			// Success - verify daemon is responsive
+			n.waitForDaemonReady()
+			return n
+		} else {
+			lastErr = err
+			log.Debugf("node %d daemon start attempt %d failed: %v", n.ID, attempt, err)
+		}
+	}
+
+	// All attempts failed
+	log.Panicf("node %d failed to start daemon after %d attempts: %v", n.ID, maxAttempts, lastErr)
+	return n
+}
+
+// tryStartDaemon attempts to start the daemon once
+func (n *Node) tryStartDaemon(req RunRequest, authorization string) (err error) {
+	defer func() {
+		if r := recover(); r != nil {
+			err = fmt.Errorf("panic during daemon start: %v", r)
+		}
+	}()
+
 	newReq := req
 	newReq.Path = n.IPFSBin
 	newReq.Args = append([]string{"daemon"}, req.Args...)
@@ -272,12 +306,36 @@ func (n *Node) StartDaemonWithReq(req RunRequest, authorization string) *Node {
 
 	log.Debugf("starting node %d", n.ID)
 	res := n.Runner.MustRun(newReq)
 	n.Daemon = res
+	// Register the daemon process for cleanup tracking
+	if res.Cmd != nil && res.Cmd.Process != nil {
+		globalProcessTracker.registerProcess(res.Cmd.Process)
+	}
+
 
 	log.Debugf("node %d started, checking API", n.ID)
 	n.WaitOnAPI(authorization)
 
-	return n
+	return nil
+}
+
+// waitForDaemonReady waits for the daemon to be fully responsive
+func (n *Node) waitForDaemonReady() {
+	const maxRetries = 30
+	const retryDelay = 200 * time.Millisecond
+
+	for i := 0; i < maxRetries; i++ {
+		result := n.RunIPFS("id")
+		if result.ExitCode() == 0 {
+			log.Debugf("node %d daemon is fully responsive", n.ID)
+			return
+		}
+		if i == maxRetries-1 {
+			log.Panicf("node %d daemon not responsive after %d retries. stderr: %s",
+				n.ID, maxRetries, result.Stderr.String())
+		}
+		time.Sleep(retryDelay)
+	}
 }
 
 func (n *Node) StartDaemon(ipfsArgs ...string) *Node {
@@ -317,6 +375,10 @@ func (n *Node) StopDaemon() *Node {
 		log.Debugf("didn't stop node %d since no daemon present", n.ID)
 		return n
 	}
+
+	// Store PID for cleanup tracking
+	pid := n.Daemon.Cmd.Process.Pid
+
 	watch := make(chan struct{}, 1)
 	go func() {
 		_, _ = n.Daemon.Cmd.Process.Wait()
@@ -326,6 +388,7 @@ func (n *Node) StopDaemon() *Node {
 	// os.Interrupt does not support interrupts on Windows https://github.com/golang/go/issues/46345
 	if runtime.GOOS == "windows" {
 		if n.signalAndWait(watch, syscall.SIGKILL, 5*time.Second) {
+			globalProcessTracker.unregisterProcess(pid)
 			return n
 		}
 		log.Panicf("timed out stopping node %d with peer ID %s", n.ID, n.PeerID())
@@ -333,18 +396,22 @@ func (n *Node) StopDaemon() *Node {
 
 	log.Debugf("signaling node %d with SIGTERM", n.ID)
 	if n.signalAndWait(watch, syscall.SIGTERM, 1*time.Second) {
+		globalProcessTracker.unregisterProcess(pid)
 		return n
 	}
 	log.Debugf("signaling node %d with SIGTERM", n.ID)
 	if n.signalAndWait(watch, syscall.SIGTERM, 2*time.Second) {
+		globalProcessTracker.unregisterProcess(pid)
 		return n
 	}
 	log.Debugf("signaling node %d with SIGQUIT", n.ID)
 	if n.signalAndWait(watch, syscall.SIGQUIT, 5*time.Second) {
+		globalProcessTracker.unregisterProcess(pid)
 		return n
 	}
 	log.Debugf("signaling node %d with SIGKILL", n.ID)
 	if n.signalAndWait(watch, syscall.SIGKILL, 5*time.Second) {
+		globalProcessTracker.unregisterProcess(pid)
 		return n
 	}
 	log.Panicf("timed out stopping node %d with peer ID %s", n.ID, n.PeerID())
diff --git a/test/cli/harness/nodes.go b/test/cli/harness/nodes.go
index 8a5451e0374..f638d32be5f 100644
--- a/test/cli/harness/nodes.go
+++ b/test/cli/harness/nodes.go
@@ -28,6 +28,7 @@ func (n Nodes) ForEachPar(f func(*Node)) {
 	wg.Wait()
 }
 
+// Connect establishes connections between all nodes in the collection
 func (n Nodes) Connect() Nodes {
 	for i, node := range n {
 		for j, otherNode := range n {
diff --git a/test/cli/harness/process_tracker.go b/test/cli/harness/process_tracker.go
new file mode 100644
index 00000000000..dfad69bab9d
--- /dev/null
+++ b/test/cli/harness/process_tracker.go
@@ -0,0 +1,95 @@
+package harness
+
+import (
+	"errors"
+	"os"
+	"sync"
+	"syscall"
+	"time"
+)
+
+// processTracker keeps track of all daemon processes started during tests
+type processTracker struct {
+	mu        sync.Mutex
+	processes map[int]*os.Process
+}
+
+// globalProcessTracker is a package-level tracker for all spawned daemons
+var globalProcessTracker = &processTracker{
+	processes: make(map[int]*os.Process),
+}
+
+// registerProcess adds a process to the tracker
+func (pt *processTracker) registerProcess(proc *os.Process) {
+	if proc == nil {
+		return
+	}
+	pt.mu.Lock()
+	defer pt.mu.Unlock()
+	pt.processes[proc.Pid] = proc
+	log.Debugf("registered daemon process PID %d", proc.Pid)
+}
+
+// unregisterProcess removes a process from the tracker
+func (pt *processTracker) unregisterProcess(pid int) {
+	pt.mu.Lock()
+	defer pt.mu.Unlock()
+	delete(pt.processes, pid)
+	log.Debugf("unregistered daemon process PID %d", pid)
+}
+
+// killAll forcefully terminates all tracked processes
+func (pt *processTracker) killAll() {
+	pt.mu.Lock()
+	defer pt.mu.Unlock()
+
+	count := len(pt.processes)
+	if count == 0 {
+		return
+	}
+
+	log.Debugf("cleaning up %d daemon processes", count)
+
+	for pid, proc := range pt.processes {
+		log.Debugf("force killing daemon process PID %d", pid)
+
+		// Try SIGTERM first
+		if err := proc.Signal(syscall.SIGTERM); err != nil {
+			if !isProcessDone(err) {
+				log.Debugf("error sending SIGTERM to PID %d: %v", pid, err)
+			}
+		}
+
+		// Give it a moment to terminate
+		time.Sleep(100 * time.Millisecond)
+
+		// Force kill if still running
+		if err := proc.Kill(); err != nil {
+			if !isProcessDone(err) {
+				log.Debugf("error killing PID %d: %v", pid, err)
+			}
+		}
+
+		// Clean up entry
+		delete(pt.processes, pid)
+	}
+}
+
+// isProcessDone checks if an error indicates the process has already exited
+func isProcessDone(err error) bool {
+	return errors.Is(err, os.ErrProcessDone)
+}
+
+// CleanupDaemonProcesses kills all tracked daemon processes
+// This should be called in test cleanup or panic recovery
+func CleanupDaemonProcesses() {
+	globalProcessTracker.killAll()
+}
+
+// RegisterBackgroundProcess registers an external process for cleanup tracking
+// This is useful for tests that start processes outside of the harness Runner
+func RegisterBackgroundProcess(proc *os.Process) {
+	if proc != nil {
+		globalProcessTracker.registerProcess(proc)
+	}
+}
diff --git a/test/cli/harness/run.go b/test/cli/harness/run.go
index 077af6ca574..5a347609c6e 100644
--- a/test/cli/harness/run.go
+++ b/test/cli/harness/run.go
@@ -109,6 +109,14 @@ func (r *Runner) Run(req RunRequest) *RunResult {
 		Err: err,
 	}
 
+	// Track background processes automatically
+	// If the process is still running after RunFunc returns, it's a background process
+	if err == nil && cmd.Process != nil && cmd.ProcessState == nil {
+		// Process was started but not waited for (background process)
+		globalProcessTracker.registerProcess(cmd.Process)
+		log.Debugf("auto-tracked background process PID %d: %v", cmd.Process.Pid, cmd.Args)
+	}
+
 	if exitErr, ok := err.(*exec.ExitError); ok {
 		result.ExitErr = exitErr
 	}
diff --git a/test/cli/migrations/migration_16_to_17_test.go b/test/cli/migrations/migration_16_to_17_test.go
index e4d75bffdda..4dca0e95352 100644
--- a/test/cli/migrations/migration_16_to_17_test.go
+++ b/test/cli/migrations/migration_16_to_17_test.go
@@ -600,6 +600,9 @@ func runDaemonWithMigrationMonitoring(t *testing.T, node *harness.Node, migratio
 	err = cmd.Start()
 	require.NoError(t, err)
 
+	// Register for cleanup in case of test failure
+	harness.RegisterBackgroundProcess(cmd.Process)
+
 	var allOutput strings.Builder
 	var migrationDetected, migrationSucceeded, daemonReady bool
 
diff --git a/test/cli/migrations/migration_legacy_15_to_17_test.go b/test/cli/migrations/migration_legacy_15_to_17_test.go
index 1471cab1f42..8260a133142 100644
--- a/test/cli/migrations/migration_legacy_15_to_17_test.go
+++ b/test/cli/migrations/migration_legacy_15_to_17_test.go
@@ -215,6 +215,9 @@ func runDaemonWithMigrationMonitoringCustomEnv(t *testing.T, node *harness.Node,
 	// Start the daemon
 	require.NoError(t, cmd.Start())
 
+	// Register for cleanup in case of test failure
+	harness.RegisterBackgroundProcess(cmd.Process)
+
 	// Monitor output from both streams
 	var outputBuffer strings.Builder
 	done := make(chan bool)