Skip to content

Commit 15b7568

Browse files
committed
fix: add retry logic for one time instruction
1 parent e4876a6 commit 15b7568

File tree

1 file changed

+37
-2
lines changed

1 file changed

+37
-2
lines changed

pkg/applyinator/applyinator.go

+37-2
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,8 @@ import (
2323
"github.com/rancher/system-agent/pkg/prober"
2424
"github.com/sirupsen/logrus"
2525
"golang.org/x/sync/errgroup"
26+
"k8s.io/apimachinery/pkg/util/wait"
27+
"k8s.io/client-go/util/retry"
2628
)
2729

2830
type Applyinator struct {
@@ -100,6 +102,13 @@ const restartPendingInterlockFile = "restart-pending"
100102
const applyinatorActiveInterlockFile = "applyinator-active"
101103
const restartPendingTimeout = 5 * time.Minute // Wait a maximum of 5 minutes before force-applying a plan if a restart is pending.
102104

105+
// customRetry is the backoff configuration passed to retry.OnError when a
// one-time instruction is executed, so that a failed execution is attempted
// again after a growing delay instead of failing the plan immediately.
// NOTE(review): per the k8s.io/apimachinery wait.Backoff documentation these
// values should yield at most 5 attempts with waits of roughly 3s, 6s, 12s and
// 24s between them (each lengthened by up to 10% jitter) — confirm against the
// vendored version of the wait package.
var customRetry = wait.Backoff{
106+
Steps: 5, // upper bound on the number of backoff steps (presumably attempts — confirm)
107+
Duration: 3 * time.Second, // base delay before the first retry
108+
Factor: 2.0, // multiplier applied to the delay after each step
109+
Jitter: 0.1, // random fraction of the delay added on top of it
110+
}
111+
103112
func NewApplyinator(workDir string, preserveWorkDir bool, appliedPlanDir, interlockDir string, imageUtil *image.Utility) *Applyinator {
104113
return &Applyinator{
105114
mu: &sync.Mutex{},
@@ -266,20 +275,46 @@ func (a *Applyinator) Apply(ctx context.Context, input ApplyInput) (ApplyOutput,
266275
}
267276

268277
oneTimeApplySucceeded := true
278+
269279
for index, instruction := range input.CalculatedPlan.Plan.OneTimeInstructions {
270-
logrus.Debugf("[Applyinator] Executing instruction %d attempt %d for plan %s", index, input.OneTimeInstructionAttempts, input.CalculatedPlan.Checksum)
280+
logrus.Debugf("[Applyinator] Executing instruction %d for plan %s", index, input.CalculatedPlan.Checksum)
281+
271282
executionInstructionDir := filepath.Join(executionDir, input.CalculatedPlan.Checksum+"_"+strconv.Itoa(index))
272283
prefix := input.CalculatedPlan.Checksum + "_" + strconv.Itoa(index)
273-
executeOutput, _, exitCode, err := a.execute(ctx, prefix, executionInstructionDir, instruction.CommonInstruction, true, input.OneTimeInstructionAttempts)
284+
285+
var executeOutput []byte
286+
var exitCode, retryCount int
287+
288+
err := retry.OnError(customRetry, func(err error) bool {
289+
retryCount++
290+
logrus.Errorf("[Applyinator] Execution failed: %v", err)
291+
logrus.Infof("[Applyinator] (attempt #%d) Applying one-time instructions for plan with checksum %s", retryCount, input.CalculatedPlan.Checksum)
292+
logrus.Infof("[Applyinator] (attempt #%d) Retrying instruction %d for plan %s", retryCount, index, input.CalculatedPlan.Checksum)
293+
294+
// Always retry regardless of error type
295+
return true
296+
}, func() error {
297+
var retryErr error
298+
executeOutput, _, exitCode, retryErr = a.execute(ctx, prefix, executionInstructionDir, instruction.CommonInstruction, true, input.OneTimeInstructionAttempts)
299+
if retryErr != nil || exitCode != 0 {
300+
logrus.Debugf("[Applyinator] Execution failed for instruction %d: %v (exit code: %d)", index, retryErr, exitCode)
301+
return retryErr
302+
}
303+
return nil
304+
})
305+
306+
// Final evaluation after retries
274307
if err != nil || exitCode != 0 {
275308
logrus.Errorf("error executing instruction %d: %v", index, err)
276309
oneTimeApplySucceeded = false
277310
}
311+
278312
if instruction.Name == "" && instruction.SaveOutput {
279313
logrus.Errorf("instruction does not have a name set, cannot save output data")
280314
} else if instruction.SaveOutput {
281315
executionOutputs[instruction.Name] = executeOutput
282316
}
317+
283318
// If we have failed to apply our one-time instructions, we need to break in order to stop subsequent instructions from executing.
284319
if !oneTimeApplySucceeded {
285320
break

0 commit comments

Comments
 (0)