@@ -23,6 +23,8 @@ import (
23
23
"github.com/rancher/system-agent/pkg/prober"
24
24
"github.com/sirupsen/logrus"
25
25
"golang.org/x/sync/errgroup"
26
+ "k8s.io/apimachinery/pkg/util/wait"
27
+ "k8s.io/client-go/util/retry"
26
28
)
27
29
28
30
type Applyinator struct {
@@ -266,20 +268,51 @@ func (a *Applyinator) Apply(ctx context.Context, input ApplyInput) (ApplyOutput,
266
268
}
267
269
268
270
oneTimeApplySucceeded := true
271
+ customRetry := wait.Backoff {
272
+ Steps : 5 ,
273
+ Duration : 2 * time .Second ,
274
+ Factor : 2.0 ,
275
+ Jitter : 0.1 ,
276
+ }
277
+
269
278
for index , instruction := range input .CalculatedPlan .Plan .OneTimeInstructions {
270
- logrus .Debugf ("[Applyinator] Executing instruction %d attempt %d for plan %s" , index , input .OneTimeInstructionAttempts , input .CalculatedPlan .Checksum )
279
+ logrus .Debugf ("[Applyinator] Executing instruction %d for plan %s" , index , input .CalculatedPlan .Checksum )
280
+
271
281
executionInstructionDir := filepath .Join (executionDir , input .CalculatedPlan .Checksum + "_" + strconv .Itoa (index ))
272
282
prefix := input .CalculatedPlan .Checksum + "_" + strconv .Itoa (index )
273
- executeOutput , _ , exitCode , err := a .execute (ctx , prefix , executionInstructionDir , instruction .CommonInstruction , true , input .OneTimeInstructionAttempts )
283
+
284
+ var executeOutput []byte
285
+ var exitCode , retryCount int
286
+
287
+ err := retry .OnError (customRetry , func (err error ) bool {
288
+ retryCount ++
289
+ logrus .Debugf ("[Applyinator] Execution failed: %v" , err )
290
+ logrus .Debugf ("[Applyinator] (attempt %d) Retrying instruction %d for plan %s" , retryCount , index , input .CalculatedPlan .Checksum )
291
+
292
+ // Always retry regardless of error type
293
+ return true
294
+ }, func () error {
295
+ var retryErr error
296
+ executeOutput , _ , exitCode , retryErr = a .execute (ctx , prefix , executionInstructionDir , instruction .CommonInstruction , true , input .OneTimeInstructionAttempts )
297
+ if retryErr != nil || exitCode != 0 {
298
+ logrus .Debugf ("[Applyinator] Execution failed for instruction %d: %v (exit code: %d)" , index , retryErr , exitCode )
299
+ return retryErr
300
+ }
301
+ return nil
302
+ })
303
+
304
+ // Final evaluation after retries
274
305
if err != nil || exitCode != 0 {
275
306
logrus .Errorf ("error executing instruction %d: %v" , index , err )
276
307
oneTimeApplySucceeded = false
277
308
}
309
+
278
310
if instruction .Name == "" && instruction .SaveOutput {
279
311
logrus .Errorf ("instruction does not have a name set, cannot save output data" )
280
312
} else if instruction .SaveOutput {
281
313
executionOutputs [instruction .Name ] = executeOutput
282
314
}
315
+
283
316
// If we have failed to apply our one-time instructions, we need to break in order to stop subsequent instructions from executing.
284
317
if ! oneTimeApplySucceeded {
285
318
break
0 commit comments