Skip to content

Commit 50c549e

Browse files
authored
fix: keep retryable failures retrying (#499)
1 parent 526e608 commit 50c549e

12 files changed

Lines changed: 27 additions & 44 deletions

File tree

internal/fixer/runner.go

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6004,10 +6004,7 @@ func isRetryableFailure(kind QueueFailureKind) bool {
60046004
}
60056005

60066006
func shouldRetryQueueFailure(kind QueueFailureKind, nextAttempts, maxAttempts int64) bool {
6007-
if !isRetryableFailure(kind) {
6008-
return false
6009-
}
6010-
return maxAttempts <= 0 || nextAttempts < maxAttempts
6007+
return isRetryableFailure(kind)
60116008
}
60126009

60136010
func normalizePRState(value string) string {

internal/fixer/runner_test.go

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5145,16 +5145,16 @@ func TestProcessClaimedQueueItemResumeValidationFailureUpdatesLoopState(t *testi
51455145
if err != nil {
51465146
t.Fatalf("Queue.GetByID() error = %v", err)
51475147
}
5148-
if queue == nil || queue.Status != "manual_intervention" || queue.LastErrorKind == nil || *queue.LastErrorKind != string(FailureRetryableTransient) || queue.FinishedAt == nil {
5149-
t.Fatalf("queue = %#v, want parked retryable_transient queue item after exhaustion", queue)
5148+
if queue == nil || queue.Status != "queued" || queue.LastErrorKind == nil || *queue.LastErrorKind != string(FailureRetryableTransient) || queue.FinishedAt != nil {
5149+
t.Fatalf("queue = %#v, want requeued retryable_transient queue item after max attempts", queue)
51505150
}
51515151

51525152
loop, err := fixture.repos.Loops.GetByID(context.Background(), "loop_fixer_resume_parse_status")
51535153
if err != nil {
51545154
t.Fatalf("Loops.GetByID() error = %v", err)
51555155
}
5156-
if loop == nil || loop.Status != "paused" || loop.NextRunAt != nil {
5157-
t.Fatalf("loop = %#v, want paused parked loop after exhaustion", loop)
5156+
if loop == nil || loop.Status != "queued" || loop.NextRunAt == nil {
5157+
t.Fatalf("loop = %#v, want queued loop after max attempts", loop)
51585158
}
51595159
if len(git.cleanupCalls) != 0 {
51605160
t.Fatalf("len(git.cleanupCalls) = %d, want no cleanup for manual intervention", len(git.cleanupCalls))

internal/planner/runner.go

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2123,10 +2123,7 @@ func isRetryableFailure(kind QueueFailureKind) bool {
21232123
}
21242124

21252125
func shouldRetryQueueFailure(kind QueueFailureKind, nextAttempts, maxAttempts int64) bool {
2126-
if !isRetryableFailure(kind) {
2127-
return false
2128-
}
2129-
return maxAttempts <= 0 || nextAttempts < maxAttempts
2126+
return isRetryableFailure(kind)
21302127
}
21312128

21322129
func cappedRetryDelayAttempt(attempts, maxAttempts int64) int64 {

internal/reviewer/runner.go

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4153,8 +4153,7 @@ func isRetryableTransientWithRemainingAttempts(queue storage.QueueItemRecord) bo
41534153
}
41544154

41554155
func queueHasRemainingAttempts(queue storage.QueueItemRecord) bool {
4156-
nextAttempts := queue.Attempts + 1
4157-
return queue.MaxAttempts > 0 && nextAttempts < queue.MaxAttempts
4156+
return true
41584157
}
41594158

41604159
func isKnownReviewerRediscoveryGuardrail(message string) bool {
@@ -6064,10 +6063,7 @@ func isRetryableFailure(kind QueueFailureKind) bool {
60646063
}
60656064

60666065
func shouldRetryQueueFailure(kind QueueFailureKind, nextAttempts, maxAttempts int64) bool {
6067-
if !isRetryableFailure(kind) {
6068-
return false
6069-
}
6070-
return maxAttempts <= 0 || nextAttempts < maxAttempts
6066+
return isRetryableFailure(kind)
60716067
}
60726068

60736069
func cloneStrings(values []string) []string {

internal/reviewer/runner_test.go

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -445,7 +445,7 @@ func TestReviewerFailedLoopRecoveryEligibilityWhitelist(t *testing.T) {
445445
{name: "retryable restart from discover", seed: failedReviewerRecoverySeed{ResumePolicy: "restart_from_discover", QueueErrorKind: string(FailureRetryableAfterResume), ErrorMessage: "PR head changed before publish"}, pr: PullRequestSummary{Number: 42, State: "OPEN"}, want: true},
446446
{name: "retryable transient attempts remaining", seed: failedReviewerRecoverySeed{ResumePolicy: "replay_step", QueueErrorKind: string(FailureRetryableTransient), ErrorMessage: "reviewer agent timed out", QueueAttempts: 3, QueueMaxAttempts: 5}, pr: PullRequestSummary{Number: 42, State: "OPEN"}, want: true},
447447
{name: "historical guardrail non retryable", seed: failedReviewerRecoverySeed{ResumePolicy: "replay_step", QueueErrorKind: string(FailureNonRetryable), ErrorMessage: "review request removed before publish"}, pr: PullRequestSummary{Number: 42, State: "OPEN"}, want: true},
448-
{name: "retryable transient exhausted on final allowed run", seed: failedReviewerRecoverySeed{ResumePolicy: "replay_step", QueueErrorKind: string(FailureRetryableTransient), ErrorMessage: "reviewer agent timed out", QueueAttempts: 4, QueueMaxAttempts: 5}, pr: PullRequestSummary{Number: 42, State: "OPEN"}, want: false},
448+
{name: "retryable transient ignores final allowed run", seed: failedReviewerRecoverySeed{ResumePolicy: "replay_step", QueueErrorKind: string(FailureRetryableTransient), ErrorMessage: "reviewer agent timed out", QueueAttempts: 4, QueueMaxAttempts: 5}, pr: PullRequestSummary{Number: 42, State: "OPEN"}, want: true},
449449
{name: "manual intervention kind", seed: failedReviewerRecoverySeed{ResumePolicy: "replay_step", QueueErrorKind: string(FailureManualIntervention), ErrorMessage: "operator needed"}, pr: PullRequestSummary{Number: 42, State: "OPEN"}, want: false},
450450
{name: "manual intervention resume policy", seed: failedReviewerRecoverySeed{ResumePolicy: "manual_intervention", QueueErrorKind: string(FailureRetryableAfterResume), ErrorMessage: "operator needed"}, pr: PullRequestSummary{Number: 42, State: "OPEN"}, want: false},
451451
{name: "closed pr", seed: failedReviewerRecoverySeed{ResumePolicy: "restart_from_discover", QueueErrorKind: string(FailureRetryableAfterResume), ErrorMessage: "PR head changed before publish"}, pr: PullRequestSummary{Number: 42, State: "CLOSED"}, want: false},
@@ -6188,7 +6188,7 @@ func TestExecuteStepRetriesTransientDiscoverShellFailure(t *testing.T) {
61886188
}
61896189
}
61906190

6191-
func TestProcessClaimedItemPersistsExhaustedTransientDiscoverShellFailureAsRetryable(t *testing.T) {
6191+
func TestProcessClaimedItemRequeuesTransientDiscoverShellFailureAfterMaxAttempts(t *testing.T) {
61926192
fixture := newRunnerFixture(t)
61936193
github := &fakeGitHubGateway{}
61946194
runner := New(Options{DB: fixture.coordinator.DB(), Repos: fixture.repos, GitHub: github, Git: &fakeGitGateway{}, AgentExecutor: &fakeAgentExecutor{}, Logger: fixture.logger, Now: fixture.now, RetryBaseDelay: time.Nanosecond, RetryMaxAttempts: 1})
@@ -6213,8 +6213,8 @@ func TestProcessClaimedItemPersistsExhaustedTransientDiscoverShellFailureAsRetry
62136213
if err != nil || queue == nil {
62146214
t.Fatalf("Queue.GetByID() = (%#v, %v), want queue", queue, err)
62156215
}
6216-
if queue.Status != "manual_intervention" || queue.LastErrorKind == nil || *queue.LastErrorKind != string(FailureRetryableTransient) || queue.LastError == nil || !strings.Contains(*queue.LastError, "unexpected EOF") || queue.FinishedAt == nil {
6217-
t.Fatalf("queue = %#v, want parked retryable transient item preserving GitHub error after exhaustion", queue)
6216+
if queue.Status != "queued" || queue.LastErrorKind == nil || *queue.LastErrorKind != string(FailureRetryableTransient) || queue.LastError == nil || !strings.Contains(*queue.LastError, "unexpected EOF") || queue.FinishedAt != nil {
6217+
t.Fatalf("queue = %#v, want requeued retryable transient item preserving GitHub error after max attempts", queue)
62186218
}
62196219
}
62206220

internal/runtime/runtime.go

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2703,8 +2703,7 @@ func runtimeRecoverableEnhancedTransient(policy config.ReviewerRetryConfig, queu
27032703
}
27042704

27052705
func runtimeQueueHasRemainingAttempts(queue storage.QueueItemRecord) bool {
2706-
nextAttempts := queue.Attempts + 1
2707-
return queue.MaxAttempts > 0 && nextAttempts < queue.MaxAttempts
2706+
return true
27082707
}
27092708

27102709
func autoRecoveredReviewerLoop(loop storage.LoopRecord, nowISO string) storage.LoopRecord {

internal/runtime/runtime_test.go

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3658,8 +3658,8 @@ func TestShouldAutoRecoverFailedReviewerLoopAllowsRetryableTransientWithAttempts
36583658
t.Fatalf("shouldAutoRecoverFailedReviewerLoop() = false, want true")
36593659
}
36603660
queue.Attempts = 4
3661-
if shouldAutoRecoverFailedReviewerLoop(loop, &run, &queue, policy) {
3662-
t.Fatalf("shouldAutoRecoverFailedReviewerLoop() = true, want false on final allowed run")
3661+
if !shouldAutoRecoverFailedReviewerLoop(loop, &run, &queue, policy) {
3662+
t.Fatalf("shouldAutoRecoverFailedReviewerLoop() = false, want true after max attempts")
36633663
}
36643664
}
36653665

@@ -3686,8 +3686,8 @@ func TestShouldAutoRecoverFailedReviewerLoopAllowsEnhancedMatchedTransientWhenCo
36863686
t.Fatal("shouldAutoRecoverFailedReviewerLoop(enabled) = false, want true")
36873687
}
36883688
queue.Attempts = 4
3689-
if shouldAutoRecoverFailedReviewerLoop(loop, &run, &queue, enabledPolicy) {
3690-
t.Fatal("shouldAutoRecoverFailedReviewerLoop(enabled final attempt) = true, want false")
3689+
if !shouldAutoRecoverFailedReviewerLoop(loop, &run, &queue, enabledPolicy) {
3690+
t.Fatal("shouldAutoRecoverFailedReviewerLoop(enabled final attempt) = false, want true")
36913691
}
36923692
}
36933693

internal/runtime/scheduler.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1773,7 +1773,7 @@ func failSnapshotQueueItem(ctx context.Context, item storage.QueueItemRecord, in
17731773
}
17741774
nowISO := formatJavaScriptISOString(now().UTC())
17751775
nextAttempts := item.Attempts + 1
1776-
if kind == "retryable_transient" && nextAttempts < item.MaxAttempts {
1776+
if kind == "retryable_transient" {
17771777
retryAt := formatJavaScriptISOString(now().UTC().Add(time.Minute * time.Duration(cappedRetryDelayAttempt(nextAttempts, item.MaxAttempts))))
17781778
return input.Repos.Queue.MarkRetry(ctx, storage.QueueMarkRetryInput{ID: item.ID, AvailableAt: retryAt, Attempts: nextAttempts, ErrorMessage: &message, ErrorKind: kind, UpdatedAt: nowISO})
17791779
}

internal/sweeper/runner.go

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -381,10 +381,7 @@ func (r *Runner) recoverClaimedQueueItem(ctx context.Context, queueItem storage.
381381
}
382382

383383
func shouldRetryQueueFailure(kind string, nextAttempts, maxAttempts int64) bool {
384-
if kind != "retryable_transient" {
385-
return false
386-
}
387-
return maxAttempts <= 0 || nextAttempts < maxAttempts
384+
return kind == "retryable_transient"
388385
}
389386

390387
func cappedRetryDelayAttempt(attempts, maxAttempts int64) int64 {

internal/sweeper/runner_test.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1502,7 +1502,7 @@ func TestProcessWarnRecoversAfterCommentPostedButLabelFails(t *testing.T) {
15021502
}
15031503
}
15041504

1505-
func TestProcessWarnFailsQueueItemAfterMaxAttempts(t *testing.T) {
1505+
func TestProcessWarnRequeuesRetryableFailureAfterMaxAttempts(t *testing.T) {
15061506
t.Parallel()
15071507

15081508
fixture := newRunnerFixture(t)
@@ -1525,8 +1525,8 @@ func TestProcessWarnFailsQueueItemAfterMaxAttempts(t *testing.T) {
15251525
if err != nil {
15261526
t.Fatalf("Queue.GetByID() error = %v", err)
15271527
}
1528-
if stored == nil || stored.Status != "manual_intervention" || stored.Attempts != 3 || stored.FinishedAt == nil || stored.LastErrorKind == nil || *stored.LastErrorKind != "retryable_transient" {
1529-
t.Fatalf("stored queue item = %#v, want parked queue item after max attempts", stored)
1528+
if stored == nil || stored.Status != "queued" || stored.Attempts != 3 || stored.FinishedAt != nil || stored.LastErrorKind == nil || *stored.LastErrorKind != "retryable_transient" {
1529+
t.Fatalf("stored queue item = %#v, want requeued queue item after max attempts", stored)
15301530
}
15311531
}
15321532

0 commit comments

Comments
 (0)