Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 4 additions & 10 deletions test/cli/autoconf/swarm_connect_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@ package autoconf

import (
"testing"
"time"

"github.com/ipfs/kubo/test/cli/harness"
"github.com/stretchr/testify/assert"
Expand All @@ -21,10 +20,12 @@ func TestSwarmConnectWithAutoConf(t *testing.T) {
t.Parallel()

t.Run("AutoConf disabled - should work", func(t *testing.T) {
// Don't run subtests in parallel to avoid daemon startup conflicts
testSwarmConnectWithAutoConfSetting(t, false, true) // expect success
})

t.Run("AutoConf enabled - should work", func(t *testing.T) {
// Don't run subtests in parallel to avoid daemon startup conflicts
testSwarmConnectWithAutoConfSetting(t, true, true) // expect success (fix the bug!)
})
}
Expand All @@ -44,25 +45,18 @@ func testSwarmConnectWithAutoConfSetting(t *testing.T, autoConfEnabled bool, exp
"/dnsaddr/bootstrap.libp2p.io/p2p/QmbLHAnMoJPWSCR5Zhtx6BHJX9KiKNN6tpvbUcqanj75Nb",
})

// CRITICAL: Start the daemon first - this is the key requirement
// The daemon must be running and working properly
// Start the daemon
node.StartDaemon()
defer node.StopDaemon()

// Give daemon time to start up completely
time.Sleep(3 * time.Second)

// Verify daemon is responsive
result := node.RunIPFS("id")
require.Equal(t, 0, result.ExitCode(), "Daemon should be responsive before testing swarm connect")
t.Logf("Daemon is running and responsive. AutoConf enabled: %v", autoConfEnabled)

// Now test swarm connect to a bootstrap peer
// This should work because:
// 1. The daemon is running
// 2. The CLI should connect to the daemon via API
// 3. The daemon should handle the swarm connect request
result = node.RunIPFS("swarm", "connect", "/dnsaddr/bootstrap.libp2p.io")
result := node.RunIPFS("swarm", "connect", "/dnsaddr/bootstrap.libp2p.io")

// swarm connect should work regardless of AutoConf setting
assert.Equal(t, 0, result.ExitCode(),
Expand Down
48 changes: 43 additions & 5 deletions test/cli/harness/harness.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,25 @@ func EnableDebugLogging() {
// NewT constructs a harness that cleans up after the given test is done.
func NewT(t *testing.T, options ...func(h *Harness)) *Harness {
h := New(options...)
t.Cleanup(h.Cleanup)

// Single consolidated cleanup with proper panic recovery
t.Cleanup(func() {
defer func() {
// Always force-kill daemon processes on panic
if r := recover(); r != nil {
log.Debugf("panic during cleanup: %v, forcing daemon cleanup", r)
CleanupDaemonProcesses()
panic(r) // re-panic after cleanup
}
}()

// Run full cleanup which includes daemon cleanup
h.Cleanup()

// Final safety check for any remaining processes
CleanupDaemonProcesses()
})

return h
}

Expand Down Expand Up @@ -181,12 +199,32 @@ func (h *Harness) Sh(expr string) *RunResult {
}

func (h *Harness) Cleanup() {
defer func() {
if r := recover(); r != nil {
log.Debugf("panic during harness cleanup: %v, forcing daemon cleanup", r)
CleanupDaemonProcesses()
panic(r)
}
}()

log.Debugf("cleaning up cluster")
h.Nodes.StopDaemons()
// TODO: don't do this if test fails, not sure how?

// Try graceful daemon shutdown with panic protection
func() {
defer func() {
if r := recover(); r != nil {
log.Debugf("panic stopping daemons gracefully: %v", r)
}
}()
h.Nodes.StopDaemons()
}()

// Force cleanup any remaining daemon processes
CleanupDaemonProcesses()

// Clean up temp directory
log.Debugf("removing harness dir")
err := os.RemoveAll(h.Dir)
if err != nil {
if err := os.RemoveAll(h.Dir); err != nil {
log.Panicf("removing temp dir %s: %s", h.Dir, err)
}
}
Expand Down
70 changes: 68 additions & 2 deletions test/cli/harness/node.go
Original file line number Diff line number Diff line change
Expand Up @@ -265,19 +265,76 @@ func (n *Node) StartDaemonWithReq(req RunRequest, authorization string) *Node {
if alive {
log.Panicf("node %d is already running", n.ID)
}

// Retry daemon start up to 3 times for transient failures
const maxAttempts = 3
var lastErr error

for attempt := 1; attempt <= maxAttempts; attempt++ {
if attempt > 1 {
log.Debugf("retrying daemon start for node %d (attempt %d/%d)", n.ID, attempt, maxAttempts)
time.Sleep(time.Second)
}

if err := n.tryStartDaemon(req, authorization); err == nil {
// Success - verify daemon is responsive
n.waitForDaemonReady()
return n
} else {
lastErr = err
log.Debugf("node %d daemon start attempt %d failed: %v", n.ID, attempt, err)
}
}

// All attempts failed
log.Panicf("node %d failed to start daemon after %d attempts: %v", n.ID, maxAttempts, lastErr)
return n
}

// tryStartDaemon attempts to start the daemon once
func (n *Node) tryStartDaemon(req RunRequest, authorization string) (err error) {
defer func() {
if r := recover(); r != nil {
err = fmt.Errorf("panic during daemon start: %v", r)
}
}()

newReq := req
newReq.Path = n.IPFSBin
newReq.Args = append([]string{"daemon"}, req.Args...)
newReq.RunFunc = (*exec.Cmd).Start

log.Debugf("starting node %d", n.ID)
res := n.Runner.MustRun(newReq)

n.Daemon = res

// Register the daemon process for cleanup tracking
if res.Cmd != nil && res.Cmd.Process != nil {
globalProcessTracker.registerProcess(res.Cmd.Process)
}

log.Debugf("node %d started, checking API", n.ID)
n.WaitOnAPI(authorization)
return n
return nil
}

// waitForDaemonReady blocks until the daemon answers `ipfs id`, polling up
// to maxRetries times with retryDelay between attempts. It panics (failing
// the test) if the daemon never becomes responsive.
func (n *Node) waitForDaemonReady() {
	const (
		maxRetries = 30
		retryDelay = 200 * time.Millisecond
	)

	var last *RunResult
	for attempt := 0; attempt < maxRetries; attempt++ {
		if last = n.RunIPFS("id"); last.ExitCode() == 0 {
			log.Debugf("node %d daemon is fully responsive", n.ID)
			return
		}
		if attempt < maxRetries-1 {
			time.Sleep(retryDelay)
		}
	}
	log.Panicf("node %d daemon not responsive after %d retries. stderr: %s",
		n.ID, maxRetries, last.Stderr.String())
}

func (n *Node) StartDaemon(ipfsArgs ...string) *Node {
Expand Down Expand Up @@ -317,6 +374,10 @@ func (n *Node) StopDaemon() *Node {
log.Debugf("didn't stop node %d since no daemon present", n.ID)
return n
}

// Store PID for cleanup tracking
pid := n.Daemon.Cmd.Process.Pid

watch := make(chan struct{}, 1)
go func() {
_, _ = n.Daemon.Cmd.Process.Wait()
Expand All @@ -326,25 +387,30 @@ func (n *Node) StopDaemon() *Node {
// os.Interrupt does not support interrupts on Windows https://github.com/golang/go/issues/46345
if runtime.GOOS == "windows" {
if n.signalAndWait(watch, syscall.SIGKILL, 5*time.Second) {
globalProcessTracker.unregisterProcess(pid)
return n
}
log.Panicf("timed out stopping node %d with peer ID %s", n.ID, n.PeerID())
}

log.Debugf("signaling node %d with SIGTERM", n.ID)
if n.signalAndWait(watch, syscall.SIGTERM, 1*time.Second) {
globalProcessTracker.unregisterProcess(pid)
return n
}
log.Debugf("signaling node %d with SIGTERM", n.ID)
if n.signalAndWait(watch, syscall.SIGTERM, 2*time.Second) {
globalProcessTracker.unregisterProcess(pid)
return n
}
log.Debugf("signaling node %d with SIGQUIT", n.ID)
if n.signalAndWait(watch, syscall.SIGQUIT, 5*time.Second) {
globalProcessTracker.unregisterProcess(pid)
return n
}
log.Debugf("signaling node %d with SIGKILL", n.ID)
if n.signalAndWait(watch, syscall.SIGKILL, 5*time.Second) {
globalProcessTracker.unregisterProcess(pid)
return n
}
log.Panicf("timed out stopping node %d with peer ID %s", n.ID, n.PeerID())
Expand Down
1 change: 1 addition & 0 deletions test/cli/harness/nodes.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ func (n Nodes) ForEachPar(f func(*Node)) {
wg.Wait()
}

// Connect establishes connections between all nodes in the collection
func (n Nodes) Connect() Nodes {
for i, node := range n {
for j, otherNode := range n {
Expand Down
95 changes: 95 additions & 0 deletions test/cli/harness/process_tracker.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
package harness

import (
"errors"
"os"
"sync"
"syscall"
"time"
)

// processTracker keeps track of all daemon processes started during tests
// so they can be force-killed even if a test panics before its own cleanup
// runs. All methods are safe for concurrent use.
type processTracker struct {
	mu        sync.Mutex          // guards processes
	processes map[int]*os.Process // live daemon processes, keyed by PID
}

// globalProcessTracker is a package-level tracker for all spawned daemons.
// The exported helpers in this file (CleanupDaemonProcesses,
// RegisterBackgroundProcess) operate on this single instance.
var globalProcessTracker = &processTracker{
	processes: make(map[int]*os.Process),
}

// registerProcess adds a process to the tracker so it can be force-killed
// during cleanup. A nil process is silently ignored.
func (pt *processTracker) registerProcess(proc *os.Process) {
	if proc == nil {
		return
	}
	pt.mu.Lock()
	pt.processes[proc.Pid] = proc
	pt.mu.Unlock()
	log.Debugf("registered daemon process PID %d", proc.Pid)
}

// unregisterProcess drops a process from the tracker, typically after it
// has been stopped through the normal shutdown path.
func (pt *processTracker) unregisterProcess(pid int) {
	pt.mu.Lock()
	delete(pt.processes, pid)
	pt.mu.Unlock()
	log.Debugf("unregistered daemon process PID %d", pid)
}

// killAll forcefully terminates all tracked processes and clears the
// tracker. Each process first receives SIGTERM; after a single shared
// grace period, anything still alive is killed outright.
//
// The grace period is applied once for the whole set rather than per
// process (the original slept 100ms per entry), so cleanup time stays
// constant no matter how many daemons a test started.
func (pt *processTracker) killAll() {
	pt.mu.Lock()
	defer pt.mu.Unlock()

	count := len(pt.processes)
	if count == 0 {
		return
	}

	log.Debugf("cleaning up %d daemon processes", count)

	// Phase 1: ask every process to terminate gracefully.
	for pid, proc := range pt.processes {
		log.Debugf("force killing daemon process PID %d", pid)
		if err := proc.Signal(syscall.SIGTERM); err != nil && !isProcessDone(err) {
			log.Debugf("error sending SIGTERM to PID %d: %v", pid, err)
		}
	}

	// One shared grace period for all processes.
	time.Sleep(100 * time.Millisecond)

	// Phase 2: force-kill anything still running and drop every entry.
	// Kill returns os.ErrProcessDone (via isProcessDone) when the SIGTERM
	// already worked, which we treat as success.
	for pid, proc := range pt.processes {
		if err := proc.Kill(); err != nil && !isProcessDone(err) {
			log.Debugf("error killing PID %d: %v", pid, err)
		}
		delete(pt.processes, pid)
	}
}

// isProcessDone checks if an error indicates the process has already exited
func isProcessDone(err error) bool {
return errors.Is(err, os.ErrProcessDone)
}

// CleanupDaemonProcesses kills every daemon process currently tracked by
// the global tracker. It is intended as a last-resort safety net, called
// from test cleanup hooks and panic-recovery paths.
func CleanupDaemonProcesses() {
	globalProcessTracker.killAll()
}

// RegisterBackgroundProcess registers an external process with the global
// tracker so it is force-killed during cleanup. Useful for tests that
// start processes outside the harness Runner. Passing nil is a no-op
// (registerProcess ignores nil processes itself).
func RegisterBackgroundProcess(proc *os.Process) {
	globalProcessTracker.registerProcess(proc)
}
8 changes: 8 additions & 0 deletions test/cli/harness/run.go
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,14 @@ func (r *Runner) Run(req RunRequest) *RunResult {
Err: err,
}

// Track background processes automatically
// If the process is still running after RunFunc returns, it's a background process
if err == nil && cmd.Process != nil && cmd.ProcessState == nil {
// Process was started but not waited for (background process)
globalProcessTracker.registerProcess(cmd.Process)
log.Debugf("auto-tracked background process PID %d: %v", cmd.Process.Pid, cmd.Args)
}

if exitErr, ok := err.(*exec.ExitError); ok {
result.ExitErr = exitErr
}
Expand Down
3 changes: 3 additions & 0 deletions test/cli/migrations/migration_16_to_17_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -600,6 +600,9 @@ func runDaemonWithMigrationMonitoring(t *testing.T, node *harness.Node, migratio
err = cmd.Start()
require.NoError(t, err)

// Register for cleanup in case of test failure
harness.RegisterBackgroundProcess(cmd.Process)

var allOutput strings.Builder
var migrationDetected, migrationSucceeded, daemonReady bool

Expand Down
3 changes: 3 additions & 0 deletions test/cli/migrations/migration_legacy_15_to_17_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -215,6 +215,9 @@ func runDaemonWithMigrationMonitoringCustomEnv(t *testing.T, node *harness.Node,
// Start the daemon
require.NoError(t, cmd.Start())

// Register for cleanup in case of test failure
harness.RegisterBackgroundProcess(cmd.Process)

// Monitor output from both streams
var outputBuffer strings.Builder
done := make(chan bool)
Expand Down
Loading