@@ -23,6 +23,8 @@ import (
23
23
"github.com/rancher/system-agent/pkg/prober"
24
24
"github.com/sirupsen/logrus"
25
25
"golang.org/x/sync/errgroup"
26
+ "k8s.io/apimachinery/pkg/util/wait"
27
+ "k8s.io/client-go/util/retry"
26
28
)
27
29
28
30
type Applyinator struct {
@@ -100,6 +102,13 @@ const restartPendingInterlockFile = "restart-pending"
100
102
const applyinatorActiveInterlockFile = "applyinator-active"
101
103
const restartPendingTimeout = 5 * time .Minute // Wait a maximum of 5 minutes before force-applying a plan if a restart is pending.
102
104
105
// customRetry is the backoff policy handed to retry.OnError when Apply executes
// each one-time instruction.
//
// Field meanings follow k8s.io/apimachinery's wait.Backoff:
//   - Steps:    upper bound on the number of attempts (5).
//   - Duration: base delay before the first retry (3s).
//   - Factor:   multiplier applied to the delay after each attempt (2.0).
//   - Jitter:   random fraction added on top of each delay (up to 10%).
//
// NOTE(review): with these values the waits between attempts should be roughly
// 3s, 6s, 12s and 24s (plus jitter), i.e. about 45s worst case per instruction;
// confirm against the vendored apimachinery version.
+ var customRetry = wait.Backoff {
106
+ Steps : 5 ,
107
+ Duration : 3 * time .Second ,
108
+ Factor : 2.0 ,
109
+ Jitter : 0.1 ,
110
+ }
111
+
103
112
func NewApplyinator (workDir string , preserveWorkDir bool , appliedPlanDir , interlockDir string , imageUtil * image.Utility ) * Applyinator {
104
113
return & Applyinator {
105
114
mu : & sync.Mutex {},
@@ -266,20 +275,46 @@ func (a *Applyinator) Apply(ctx context.Context, input ApplyInput) (ApplyOutput,
266
275
}
267
276
268
277
oneTimeApplySucceeded := true
278
+
269
279
for index , instruction := range input .CalculatedPlan .Plan .OneTimeInstructions {
270
- logrus .Debugf ("[Applyinator] Executing instruction %d attempt %d for plan %s" , index , input .OneTimeInstructionAttempts , input .CalculatedPlan .Checksum )
280
+ logrus .Debugf ("[Applyinator] Executing instruction %d for plan %s" , index , input .CalculatedPlan .Checksum )
281
+
271
282
executionInstructionDir := filepath .Join (executionDir , input .CalculatedPlan .Checksum + "_" + strconv .Itoa (index ))
272
283
prefix := input .CalculatedPlan .Checksum + "_" + strconv .Itoa (index )
273
- executeOutput , _ , exitCode , err := a .execute (ctx , prefix , executionInstructionDir , instruction .CommonInstruction , true , input .OneTimeInstructionAttempts )
284
+
285
+ var executeOutput []byte
286
+ var exitCode , retryCount int
287
+
288
+ err := retry .OnError (customRetry , func (err error ) bool {
289
+ retryCount ++
290
+ logrus .Errorf ("[Applyinator] Execution failed: %v" , err )
291
+ logrus .Infof ("[Applyinator] (attempt #%d) Applying one-time instructions for plan with checksum %s" , retryCount , input .CalculatedPlan .Checksum )
292
+ logrus .Infof ("[Applyinator] (attempt #%d) Retrying instruction %d for plan %s" , retryCount , index , input .CalculatedPlan .Checksum )
293
+
294
+ // Always retry regardless of error type
295
+ return true
296
+ }, func () error {
297
+ var retryErr error
298
+ executeOutput , _ , exitCode , retryErr = a .execute (ctx , prefix , executionInstructionDir , instruction .CommonInstruction , true , input .OneTimeInstructionAttempts )
299
+ if retryErr != nil || exitCode != 0 {
300
+ logrus .Debugf ("[Applyinator] Execution failed for instruction %d: %v (exit code: %d)" , index , retryErr , exitCode )
301
+ return retryErr
302
+ }
303
+ return nil
304
+ })
305
+
306
+ // Final evaluation after retries
274
307
if err != nil || exitCode != 0 {
275
308
logrus .Errorf ("error executing instruction %d: %v" , index , err )
276
309
oneTimeApplySucceeded = false
277
310
}
311
+
278
312
if instruction .Name == "" && instruction .SaveOutput {
279
313
logrus .Errorf ("instruction does not have a name set, cannot save output data" )
280
314
} else if instruction .SaveOutput {
281
315
executionOutputs [instruction .Name ] = executeOutput
282
316
}
317
+
283
318
// If we have failed to apply our one-time instructions, we need to break in order to stop subsequent instructions from executing.
284
319
if ! oneTimeApplySucceeded {
285
320
break
0 commit comments