Skip to content

Commit f5bec37

Browse files
committed
manager: fix race condition in Stop() using sync.Once
The switch from RWMutex to sync.Map in commit 20e39fc introduced a race condition in destroyContainer(). When multiple goroutines attempt to destroy the same container simultaneously, they can all call Stop() on the same containerData, causing a "close of closed channel" panic. The panic manifests as: panic: close of closed channel goroutine N [running]: github.com/google/cadvisor/manager.(*containerData).Stop(...) manager/container.go:143 +0x46 Fix this by using sync.Once to ensure the stop channel is only closed once, making Stop() safe to call multiple times. Signed-off-by: Davanum Srinivas <davanum@gmail.com>
1 parent e4d5281 commit f5bec37

2 files changed

Lines changed: 139 additions & 2 deletions

File tree

manager/container.go

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -104,7 +104,8 @@ type containerData struct {
104104
logUsage bool
105105

106106
// Tells the container to stop.
107-
stop chan struct{}
107+
stop chan struct{}
108+
stopOnce sync.Once
108109

109110
// Tells the container to immediately collect stats
110111
onDemandChan chan chan struct{}
@@ -140,7 +141,12 @@ func (cd *containerData) Stop() error {
140141
if err != nil {
141142
return err
142143
}
143-
close(cd.stop)
144+
// Use sync.Once to ensure the channel is only closed once, preventing
145+
// panic from concurrent calls to Stop() when multiple goroutines try
146+
// to destroy the same container simultaneously.
147+
cd.stopOnce.Do(func() {
148+
close(cd.stop)
149+
})
144150
cd.perfCollector.Destroy()
145151
cd.resctrlCollector.Destroy()
146152
return nil

manager/manager_test.go

Lines changed: 131 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ import (
2222
"fmt"
2323
"reflect"
2424
"strings"
25+
"sync"
2526
"testing"
2627
"time"
2728

@@ -646,3 +647,133 @@ func (m *mockEventHandler) GetEvents(request *events.Request) ([]*info.Event, er
646647

647648
func (m *mockEventHandler) StopWatch(watchID int) {
648649
}
650+
651+
// threadSafeEventHandler is a thread-safe version of mockEventHandler for concurrent tests.
652+
type threadSafeEventHandler struct {
653+
mu sync.Mutex
654+
events []*info.Event
655+
}
656+
657+
func (m *threadSafeEventHandler) AddEvent(e *info.Event) error {
658+
m.mu.Lock()
659+
defer m.mu.Unlock()
660+
m.events = append(m.events, e)
661+
return nil
662+
}
663+
664+
func (m *threadSafeEventHandler) WatchEvents(request *events.Request) (*events.EventChannel, error) {
665+
return nil, fmt.Errorf("not implemented")
666+
}
667+
668+
func (m *threadSafeEventHandler) GetEvents(request *events.Request) ([]*info.Event, error) {
669+
return nil, fmt.Errorf("not implemented")
670+
}
671+
672+
func (m *threadSafeEventHandler) StopWatch(watchID int) {
673+
}
674+
675+
// TestContainerDataStopConcurrent verifies that concurrent calls to Stop()
676+
// do not cause a panic. This is a regression test for a race condition where
677+
// multiple goroutines could call Stop() on the same containerData, causing a
678+
// "close of closed channel" panic.
679+
func TestContainerDataStopConcurrent(t *testing.T) {
680+
memoryCache := memory.New(60*time.Second, nil)
681+
682+
// Create a minimal containerData with the fields needed for Stop()
683+
cd := &containerData{
684+
info: containerInfo{
685+
ContainerReference: info.ContainerReference{
686+
Name: "/test-concurrent",
687+
},
688+
},
689+
memoryCache: memoryCache,
690+
stop: make(chan struct{}),
691+
perfCollector: &stats.NoopCollector{},
692+
resctrlCollector: &stats.NoopCollector{},
693+
}
694+
695+
// Launch multiple goroutines that all try to call Stop() simultaneously
696+
const numGoroutines = 10
697+
var wg sync.WaitGroup
698+
wg.Add(numGoroutines)
699+
700+
// Use a channel to synchronize goroutines to start at the same time
701+
start := make(chan struct{})
702+
703+
for i := 0; i < numGoroutines; i++ {
704+
go func() {
705+
defer wg.Done()
706+
<-start // Wait for signal to start
707+
// This should not panic even if called multiple times concurrently
708+
_ = cd.Stop()
709+
}()
710+
}
711+
712+
// Signal all goroutines to start simultaneously
713+
close(start)
714+
715+
// Wait for all goroutines to complete - if there's a panic, the test will fail
716+
wg.Wait()
717+
}
718+
719+
// TestDestroyContainerConcurrent verifies that concurrent calls to destroyContainer
720+
// for the same container do not cause a panic.
721+
func TestDestroyContainerConcurrent(t *testing.T) {
722+
memoryCache := memory.New(60*time.Second, nil)
723+
m := &manager{
724+
quitChannels: make([]chan error, 0, 2),
725+
memoryCache: memoryCache,
726+
}
727+
728+
mockEventHandler := &threadSafeEventHandler{}
729+
mockHandler := containertest.NewMockContainerHandler("/test-concurrent")
730+
mockHandler.On("Start").Return(nil)
731+
mockHandler.On("Cleanup").Return()
732+
mockHandler.On("Stop").Return()
733+
// GetExitCode may be called multiple times due to concurrent access
734+
mockHandler.On("GetExitCode").Return(-1, nil).Maybe()
735+
736+
// Create the container
737+
cd := &containerData{
738+
handler: mockHandler,
739+
info: containerInfo{
740+
ContainerReference: info.ContainerReference{
741+
Name: "/test-concurrent",
742+
},
743+
},
744+
memoryCache: memoryCache,
745+
stop: make(chan struct{}),
746+
perfCollector: &stats.NoopCollector{},
747+
resctrlCollector: &stats.NoopCollector{},
748+
}
749+
750+
// Add to manager's container map
751+
m.containers.Store(namespacedContainerName{Name: "/test-concurrent"}, cd)
752+
753+
// Register event handler
754+
m.eventHandler = mockEventHandler
755+
756+
// Launch multiple goroutines that all try to destroy the same container
757+
const numGoroutines = 10
758+
var wg sync.WaitGroup
759+
wg.Add(numGoroutines)
760+
761+
start := make(chan struct{})
762+
763+
for i := 0; i < numGoroutines; i++ {
764+
go func() {
765+
defer wg.Done()
766+
<-start
767+
_ = m.destroyContainer("/test-concurrent")
768+
}()
769+
}
770+
771+
close(start)
772+
wg.Wait()
773+
774+
// With sync.Once protecting only the channel close, multiple goroutines
775+
// can still process the container and add events. The key assertion is
776+
// that no panic occurred (test would fail otherwise).
777+
// At least one event should be recorded.
778+
assert.GreaterOrEqual(t, len(mockEventHandler.events), 1, "at least one destruction event should be recorded")
779+
}

0 commit comments

Comments
 (0)