Skip to content

Commit 845d8b0

Browse files
committed
fix: add retry logic for one time instruction
1 parent e4876a6 commit 845d8b0

File tree

1 file changed

+36
-2
lines changed

1 file changed

+36
-2
lines changed

Diff for: pkg/applyinator/applyinator.go

+36-2
Original file line number | Diff line number | Diff line change
@@ -23,6 +23,8 @@ import (
2323
"github.com/rancher/system-agent/pkg/prober"
2424
"github.com/sirupsen/logrus"
2525
"golang.org/x/sync/errgroup"
26+
"k8s.io/apimachinery/pkg/util/wait"
27+
"k8s.io/client-go/util/retry"
2628
)
2729

2830
type Applyinator struct {
@@ -100,6 +102,13 @@ const restartPendingInterlockFile = "restart-pending"
100102
const applyinatorActiveInterlockFile = "applyinator-active"
101103
const restartPendingTimeout = 5 * time.Minute // Wait a maximum of 5 minutes before force-applying a plan if a restart is pending.
102104

105+
// customRetry is the backoff policy handed to retry.OnError when running
// one-time instructions in Apply. It allows up to 5 steps, starting from a
// 3 second delay that is multiplied by 2.0 after each step, with a jitter
// factor of 0.1 applied to each delay.
// NOTE(review): with these values the waits between attempts should be roughly
// 3s, 6s, 12s and 24s (plus jitter), i.e. about 45s worst case per
// instruction — confirm against the wait.Backoff semantics of the vendored
// k8s.io/apimachinery version.
var customRetry = wait.Backoff{
106+
Steps: 5,
107+
Duration: 3 * time.Second,
108+
Factor: 2.0,
109+
Jitter: 0.1,
110+
}
111+
103112
func NewApplyinator(workDir string, preserveWorkDir bool, appliedPlanDir, interlockDir string, imageUtil *image.Utility) *Applyinator {
104113
return &Applyinator{
105114
mu: &sync.Mutex{},
@@ -266,20 +275,45 @@ func (a *Applyinator) Apply(ctx context.Context, input ApplyInput) (ApplyOutput,
266275
}
267276

268277
oneTimeApplySucceeded := true
278+
269279
for index, instruction := range input.CalculatedPlan.Plan.OneTimeInstructions {
270-
logrus.Debugf("[Applyinator] Executing instruction %d attempt %d for plan %s", index, input.OneTimeInstructionAttempts, input.CalculatedPlan.Checksum)
280+
logrus.Debugf("[Applyinator] Executing instruction %d for plan %s", index, input.CalculatedPlan.Checksum)
281+
271282
executionInstructionDir := filepath.Join(executionDir, input.CalculatedPlan.Checksum+"_"+strconv.Itoa(index))
272283
prefix := input.CalculatedPlan.Checksum + "_" + strconv.Itoa(index)
273-
executeOutput, _, exitCode, err := a.execute(ctx, prefix, executionInstructionDir, instruction.CommonInstruction, true, input.OneTimeInstructionAttempts)
284+
285+
var executeOutput []byte
286+
var exitCode, attempt int
287+
288+
err := retry.OnError(customRetry, func(err error) bool {
289+
attempt++
290+
logrus.Errorf("[Applyinator] Execution failed for attempt #%d: %v", attempt, err)
291+
logrus.Infof("[Applyinator] Retrying one-time instructions %d for plan %s", index, input.CalculatedPlan.Checksum)
292+
293+
// Always retry regardless of error type
294+
return true
295+
}, func() error {
296+
var retryErr error
297+
executeOutput, _, exitCode, retryErr = a.execute(ctx, prefix, executionInstructionDir, instruction.CommonInstruction, true, input.OneTimeInstructionAttempts)
298+
if retryErr != nil || exitCode != 0 {
299+
logrus.Debugf("[Applyinator] Execution failed for instruction %d: %v (exit code: %d)", index, retryErr, exitCode)
300+
return retryErr
301+
}
302+
return nil
303+
})
304+
305+
// Final evaluation after retries
274306
if err != nil || exitCode != 0 {
275307
logrus.Errorf("error executing instruction %d: %v", index, err)
276308
oneTimeApplySucceeded = false
277309
}
310+
278311
if instruction.Name == "" && instruction.SaveOutput {
279312
logrus.Errorf("instruction does not have a name set, cannot save output data")
280313
} else if instruction.SaveOutput {
281314
executionOutputs[instruction.Name] = executeOutput
282315
}
316+
283317
// If we have failed to apply our one-time instructions, we need to break in order to stop subsequent instructions from executing.
284318
if !oneTimeApplySucceeded {
285319
break

0 commit comments

Comments
 (0)