@@ -23,6 +23,8 @@ import (
23
23
"github.com/rancher/system-agent/pkg/prober"
24
24
"github.com/sirupsen/logrus"
25
25
"golang.org/x/sync/errgroup"
26
+ "k8s.io/apimachinery/pkg/util/wait"
27
+ "k8s.io/client-go/util/retry"
26
28
)
27
29
28
30
type Applyinator struct {
@@ -100,6 +102,13 @@ const restartPendingInterlockFile = "restart-pending"
100
102
const applyinatorActiveInterlockFile = "applyinator-active" // NOTE(review): presumably the interlock file name that marks an apply as in progress — confirm against its users.
101
103
const restartPendingTimeout = 5 * time .Minute // Wait a maximum of 5 minutes before force-applying a plan if a restart is pending.
102
104
105
// customRetry is the backoff schedule handed to retry.OnError when a one-time
// instruction is executed. Field meanings follow wait.Backoff from k8s.io/apimachinery.
+ var customRetry = wait.Backoff {
106
+ Steps : 5 , // upper bound on the number of attempts
107
+ Duration : 3 * time .Second , // base delay before the first retry
108
+ Factor : 2.0 , // the delay is multiplied by this after each attempt
109
+ Jitter : 0.1 , // adds up to 10% random extra to each delay
110
+ }
111
+
103
112
func NewApplyinator (workDir string , preserveWorkDir bool , appliedPlanDir , interlockDir string , imageUtil * image.Utility ) * Applyinator {
104
113
return & Applyinator {
105
114
mu : & sync.Mutex {},
@@ -266,20 +275,45 @@ func (a *Applyinator) Apply(ctx context.Context, input ApplyInput) (ApplyOutput,
266
275
}
267
276
268
277
oneTimeApplySucceeded := true
278
+
269
279
for index , instruction := range input .CalculatedPlan .Plan .OneTimeInstructions {
270
- logrus .Debugf ("[Applyinator] Executing instruction %d attempt %d for plan %s" , index , input .OneTimeInstructionAttempts , input .CalculatedPlan .Checksum )
280
+ logrus .Debugf ("[Applyinator] Executing instruction %d for plan %s" , index , input .CalculatedPlan .Checksum )
281
+
271
282
executionInstructionDir := filepath .Join (executionDir , input .CalculatedPlan .Checksum + "_" + strconv .Itoa (index ))
272
283
prefix := input .CalculatedPlan .Checksum + "_" + strconv .Itoa (index )
273
- executeOutput , _ , exitCode , err := a .execute (ctx , prefix , executionInstructionDir , instruction .CommonInstruction , true , input .OneTimeInstructionAttempts )
284
+
285
+ var executeOutput []byte
286
+ var exitCode , attempt int
287
+
288
+ err := retry .OnError (customRetry , func (err error ) bool {
289
+ attempt ++
290
+ logrus .Errorf ("[Applyinator] Execution failed for attempt #%d: %v" , attempt , err )
291
+ logrus .Infof ("[Applyinator] Retrying one-time instructions %d for plan %s" , index , input .CalculatedPlan .Checksum )
292
+
293
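+ // Retry on any non-nil error, regardless of its type. NOTE(review): this predicate only runs when the attempt returns a non-nil error; an attempt that exits non-zero with a nil error returns nil below and is therefore not retried.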
294
+ return true
295
+ }, func () error {
296
+ var retryErr error
297
+ executeOutput , _ , exitCode , retryErr = a .execute (ctx , prefix , executionInstructionDir , instruction .CommonInstruction , true , input .OneTimeInstructionAttempts )
298
+ if retryErr != nil || exitCode != 0 {
299
+ logrus .Debugf ("[Applyinator] Execution failed for instruction %d: %v (exit code: %d)" , index , retryErr , exitCode )
300
+ return retryErr
301
+ }
302
+ return nil
303
+ })
304
+
305
+ // Final evaluation after retries
274
306
if err != nil || exitCode != 0 {
275
307
logrus .Errorf ("error executing instruction %d: %v" , index , err )
276
308
oneTimeApplySucceeded = false
277
309
}
310
+
278
311
if instruction .Name == "" && instruction .SaveOutput {
279
312
logrus .Errorf ("instruction does not have a name set, cannot save output data" )
280
313
} else if instruction .SaveOutput {
281
314
executionOutputs [instruction .Name ] = executeOutput
282
315
}
316
+
283
317
// If we have failed to apply our one-time instructions, we need to break in order to stop subsequent instructions from executing.
284
318
if ! oneTimeApplySucceeded {
285
319
break
0 commit comments