@@ -19,6 +19,9 @@ import (
1919 "github.com/castai/cluster-controller/internal/waitext"
2020)
2121
22+ // gcCompletedActionAfterTimes specifies after how many GCs to remove the completed action from the store.
23+ const gcCompletedActionAfterTimes = 2
24+
2225type Config struct {
2326 PollWaitInterval time.Duration // How long to wait until the next long polling request.
2427 PollTimeout time.Duration // hard timeout. Normally server should return empty result before this timeout.
@@ -40,13 +43,14 @@ func NewService(
4043 actionHandlers actions.ActionHandlers ,
4144) * Controller {
4245 return & Controller {
43- log : log ,
44- cfg : cfg ,
45- k8sVersion : k8sVersion ,
46- castAIClient : castaiClient ,
47- startedActions : map [string ]struct {}{},
48- actionHandlers : actionHandlers ,
49- healthCheck : healthCheck ,
46+ log : log ,
47+ cfg : cfg ,
48+ k8sVersion : k8sVersion ,
49+ castAIClient : castaiClient ,
50+ startedActions : make (map [string ]struct {}),
51+ recentlyCompletedActions : make (map [string ]int8 ),
52+ actionHandlers : actionHandlers ,
53+ healthCheck : healthCheck ,
5054 }
5155}
5256
@@ -59,10 +63,12 @@ type Controller struct {
5963
6064 actionHandlers actions.ActionHandlers
6165
62- startedActionsWg sync.WaitGroup
63- startedActions map [string ]struct {}
64- startedActionsMu sync.Mutex
65- healthCheck * health.HealthzProvider
66+ startedActionsWg sync.WaitGroup
67+ actionsMu sync.Mutex
68+ startedActions map [string ]struct {} // protected by actionsMu
69+ recentlyCompletedActions map [string ]int8 // protected by actionsMu
70+
71+ healthCheck * health.HealthzProvider
6672}
6773
6874func (s * Controller ) Run (ctx context.Context ) {
@@ -122,6 +128,7 @@ func (s *Controller) doWork(ctx context.Context) error {
122128
123129 s .log .WithFields (logrus.Fields {"n" : strconv .Itoa (len (actions ))}).Infof ("received in %s" , pollDuration )
124130 s .handleActions (ctx , actions )
131+ s .gcCompletedActions ()
125132 return nil
126133}
127134
@@ -132,7 +139,10 @@ func (s *Controller) handleActions(ctx context.Context, clusterActions []*castai
132139 }
133140
134141 go func (action * castai.ClusterAction ) {
135- defer s .finishProcessing (action .ID )
142+ var ackErr error
143+ defer func () {
144+ s .finishProcessing (action .ID , ackErr )
145+ }()
136146
137147 var err error
138148
@@ -142,11 +152,12 @@ func (s *Controller) handleActions(ctx context.Context, clusterActions []*castai
142152 handleErr := s .handleAction (ctx , action )
143153 if errors .Is (handleErr , context .Canceled ) {
144154 // Action should be handled again on context canceled errors.
155+ ackErr = ctx .Err ()
145156 return
146157 }
147158
148159 handleDuration := time .Since (startTime )
149- ackErr : = s .ackAction (ctx , action , handleErr , handleDuration )
160+ ackErr = s .ackAction (ctx , action , handleErr , handleDuration )
150161 if handleErr != nil {
151162 err = handleErr
152163 }
@@ -163,29 +174,40 @@ func (s *Controller) handleActions(ctx context.Context, clusterActions []*castai
163174 }
164175}
165176
166- func (s * Controller ) finishProcessing (actionID string ) {
167- s .startedActionsMu .Lock ()
168- defer s .startedActionsMu .Unlock ()
// finishProcessing unregisters actionID from the in-flight set once its goroutine is done.
// When ackErr is nil it also records the ID in recentlyCompletedActions, which makes
// startProcessing refuse that ID until gcCompletedActions has evicted the entry.
// Both maps are modified while holding actionsMu.
177+ func (s * Controller ) finishProcessing (actionID string , ackErr error ) {
178+ s .actionsMu .Lock ()
179+ defer s .actionsMu .Unlock ()
169180
170181 s .startedActionsWg .Done () // balances the Add(1) made in startProcessing
171182 delete (s .startedActions , actionID )
183+
184+ if ackErr == nil {
185+ // Only a successfully acknowledged action is remembered as completed. Otherwise no entry is stored, so startProcessing can start it again if it is requested again.
186+ s .recentlyCompletedActions [actionID ] = gcCompletedActionAfterTimes + 1 // counter that gcCompletedActions decrements on every call
187+ }
172188}
173189
// startProcessing decides, under actionsMu, whether the action with the given ID may be started.
// It returns false when the ID is already in startedActions, when it is present in
// recentlyCompletedActions, or when len(startedActions) has reached cfg.MaxActionsInProgress.
// Otherwise it increments startedActionsWg, records the ID in startedActions and returns true.
174190func (s * Controller ) startProcessing (actionID string ) bool {
175- s .startedActionsMu .Lock ()
176- defer s .startedActionsMu .Unlock ()
191+ s .actionsMu .Lock ()
192+ defer s .actionsMu .Unlock ()
177193
178194 if _ , ok := s .startedActions [actionID ]; ok { // already in flight
179195 return false
180196 }
181197
198+ if _ , ok := s .recentlyCompletedActions [actionID ]; ok { // finished and acknowledged, entry not yet evicted by gcCompletedActions
199+ s .log .WithField (actions .ActionIDLogField , actionID ).Debug ("action has been recently completed, not starting" )
200+ return false
201+ }
202+
182203 if inProgress := len (s .startedActions ); inProgress >= s .cfg .MaxActionsInProgress { // concurrency cap reached
183204 s .log .Warnf ("too many actions in progress %d/%d" , inProgress , s .cfg .MaxActionsInProgress )
184205 return false
185206 }
186207
187208 s .startedActionsWg .Add (1 ) // released by Done() in finishProcessing
188209 s .startedActions [actionID ] = struct {}{}
210+
189211 return true
190212}
191213
@@ -243,6 +265,25 @@ func (s *Controller) ackAction(ctx context.Context, action *castai.ClusterAction
243265 })
244266}
245267
268+ // gcCompletedActions ages out the entries of recentlyCompletedActions while holding actionsMu.
269+ // Each call decrements every entry's counter by one and deletes the entry once the counter is
270+ // zero or below; while an entry exists, startProcessing refuses to start that action ID again.
271+ // finishProcessing seeds the counter with gcCompletedActionAfterTimes + 1, so an entry is
272+ // deleted on the (gcCompletedActionAfterTimes + 1)-th call made after the action was recorded.
273+ func (s * Controller ) gcCompletedActions () {
274+ s .actionsMu .Lock ()
275+ defer s .actionsMu .Unlock ()
276+
277+ for actionID , timesVisited := range s .recentlyCompletedActions { // deleting entries while ranging over a map is permitted in Go
278+ timesVisited -- // local copy; written back below only if the entry survives
279+ if timesVisited <= 0 {
280+ delete (s .recentlyCompletedActions , actionID )
281+ continue
282+ }
283+ s .recentlyCompletedActions [actionID ] = timesVisited
284+ }
285+ }
286+
246287func getHandlerError (err error ) * string {
247288 if err != nil {
248289 str := err .Error ()
0 commit comments