Skip to content

Commit 1d3de06

Browse files
committed
fix: add retry logic for one-time instructions
1 parent e4876a6 commit 1d3de06

File tree

1 file changed

+35
-2
lines changed

1 file changed

+35
-2
lines changed

Diff for: pkg/applyinator/applyinator.go

+35-2
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,8 @@ import (
2323
"github.com/rancher/system-agent/pkg/prober"
2424
"github.com/sirupsen/logrus"
2525
"golang.org/x/sync/errgroup"
26+
"k8s.io/apimachinery/pkg/util/wait"
27+
"k8s.io/client-go/util/retry"
2628
)
2729

2830
type Applyinator struct {
@@ -266,20 +268,51 @@ func (a *Applyinator) Apply(ctx context.Context, input ApplyInput) (ApplyOutput,
266268
}
267269

268270
oneTimeApplySucceeded := true
271+
customRetry := wait.Backoff{
272+
Steps: 5,
273+
Duration: 2 * time.Second,
274+
Factor: 2.0,
275+
Jitter: 0.1,
276+
}
277+
269278
for index, instruction := range input.CalculatedPlan.Plan.OneTimeInstructions {
270-
logrus.Debugf("[Applyinator] Executing instruction %d attempt %d for plan %s", index, input.OneTimeInstructionAttempts, input.CalculatedPlan.Checksum)
279+
logrus.Debugf("[Applyinator] Executing instruction %d for plan %s", index, input.CalculatedPlan.Checksum)
280+
271281
executionInstructionDir := filepath.Join(executionDir, input.CalculatedPlan.Checksum+"_"+strconv.Itoa(index))
272282
prefix := input.CalculatedPlan.Checksum + "_" + strconv.Itoa(index)
273-
executeOutput, _, exitCode, err := a.execute(ctx, prefix, executionInstructionDir, instruction.CommonInstruction, true, input.OneTimeInstructionAttempts)
283+
284+
var executeOutput []byte
285+
var exitCode, retryCount int
286+
287+
err := retry.OnError(customRetry, func(err error) bool {
288+
retryCount++
289+
logrus.Debugf("[Applyinator] Execution failed: %v", err)
290+
logrus.Debugf("[Applyinator] (attempt %d) Retrying instruction %d for plan %s", retryCount, index, input.CalculatedPlan.Checksum)
291+
292+
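// Retry on any error. NOTE(review): this predicate only runs when the attempt returns a non-nil error, so a non-zero exit code with a nil error is not retried.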
293+
return true
294+
}, func() error {
295+
var retryErr error
296+
executeOutput, _, exitCode, retryErr = a.execute(ctx, prefix, executionInstructionDir, instruction.CommonInstruction, true, input.OneTimeInstructionAttempts)
297+
if retryErr != nil || exitCode != 0 {
298+
logrus.Debugf("[Applyinator] Execution failed for instruction %d: %v (exit code: %d)", index, retryErr, exitCode)
299+
return retryErr
300+
}
301+
return nil
302+
})
303+
304+
// Final evaluation after retries
274305
if err != nil || exitCode != 0 {
275306
logrus.Errorf("error executing instruction %d: %v", index, err)
276307
oneTimeApplySucceeded = false
277308
}
309+
278310
if instruction.Name == "" && instruction.SaveOutput {
279311
logrus.Errorf("instruction does not have a name set, cannot save output data")
280312
} else if instruction.SaveOutput {
281313
executionOutputs[instruction.Name] = executeOutput
282314
}
315+
283316
// If we have failed to apply our one-time instructions, we need to break in order to stop subsequent instructions from executing.
284317
if !oneTimeApplySucceeded {
285318
break

0 commit comments

Comments
 (0)