Skip to content

Commit 1431653

Browse files
authored
Restore status fields when gitjob creation fails (#4949) (#4988)
When manageGitJob fails, updateErrorStatus was persisting mutated in-memory status fields to Kubernetes even though no gitjob had been created. On the next reconcile all shouldCreateJob conditions evaluated to false, permanently losing the trigger. Three fields were affected: - status.Commit — promoted by getNextCommit before manageGitJob is called - status.UpdateGeneration — consumed by updateGenerationValuesIfNeeded before job creation - status.ObservedGeneration — consumed by the same call The fix saves the original values of all three fields before manageGitJob is called and restores them in the error path, so the next reconcile still sees the pending trigger and retries job creation. Tests are included that fail on the unfixed code and pass after the fix. Refers to: #4948 Signed-off-by: Xavi Garcia <xavi.garcia@suse.com>
1 parent 2bb4769 commit 1431653

2 files changed

Lines changed: 226 additions & 0 deletions

File tree

internal/cmd/controller/gitops/reconciler/gitjob_controller.go

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -263,11 +263,22 @@ func (r *GitJobReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr
263263
}
264264

265265
oldCommit := gitrepo.Status.Commit
266+
oldUpdateGeneration := gitrepo.Status.UpdateGeneration
267+
oldObservedGeneration := gitrepo.Status.ObservedGeneration
266268
// maybe update the commit from webhooks or polling
267269
gitrepo.Status.Commit = getNextCommit(gitrepo.Status)
268270

269271
res, err := r.manageGitJob(ctx, logger, gitrepo, oldCommit)
270272
if err != nil || res.RequeueAfter > 0 {
273+
// Restore the status fields that may have been mutated in memory
274+
// (by getNextCommit above or by updateGenerationValuesIfNeeded inside
275+
// manageGitJob) before persisting the error status. Without this, the
276+
// next reconcile would see all shouldCreateJob conditions as false and
277+
// never retry the failed job creation.
278+
// Refers to: https://github.com/rancher/fleet/issues/4948
279+
gitrepo.Status.Commit = oldCommit
280+
gitrepo.Status.UpdateGeneration = oldUpdateGeneration
281+
gitrepo.Status.ObservedGeneration = oldObservedGeneration
271282
return res, updateErrorStatus(ctx, r.Client, req.NamespacedName, gitrepo.Status, err)
272283
}
273284

internal/cmd/controller/gitops/reconciler/gitjob_test.go

Lines changed: 215 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -341,6 +341,221 @@ func TestReconcile_Error_WhenSecretDoesNotExist(t *testing.T) {
341341
}
342342
}
343343

344+
// setupMissingCommitReconciler builds a GitJobReconciler backed by a mock client
345+
// configured for the "validateExternalSecretExist fails" error path.
346+
// Returns the reconciler and a writtenRepo channel that receives the repo
347+
// passed to statusClient.Update exactly once.
348+
func setupMissingCommitReconciler(
349+
t *testing.T,
350+
gitRepo fleetv1.GitRepo,
351+
setupGitRepo func(*fleetv1.GitRepo),
352+
nJobGets int,
353+
) (*GitJobReconciler, <-chan *fleetv1.GitRepo) {
354+
t.Helper()
355+
mockCtrl := gomock.NewController(t)
356+
t.Cleanup(mockCtrl.Finish)
357+
358+
scheme := runtime.NewScheme()
359+
utilruntime.Must(batchv1.AddToScheme(scheme))
360+
utilruntime.Must(fleetv1.AddToScheme(scheme))
361+
362+
mockClient := mocks.NewMockK8sClient(mockCtrl)
363+
mockClient.EXPECT().List(gomock.Any(), gomock.Any(), gomock.Any()).AnyTimes().Return(nil)
364+
365+
// GitRepo Gets: initial fetch, removeDisplayNameLabel, updateStatus
366+
mockClient.EXPECT().
367+
Get(gomock.Any(), gomock.Any(), &gitRepoPointerMatcher{}, gomock.Any()).
368+
Times(3).
369+
DoAndReturn(func(ctx context.Context, req types.NamespacedName, gr *fleetv1.GitRepo, opts ...any) error {
370+
gr.Name = gitRepo.Name
371+
gr.Namespace = gitRepo.Namespace
372+
controllerutil.AddFinalizer(gr, finalize.GitRepoFinalizer)
373+
setupGitRepo(gr)
374+
return nil
375+
})
376+
377+
// Job Gets: always NotFound (deletePreviousJob + manageGitJob check)
378+
mockClient.EXPECT().
379+
Get(gomock.Any(), gomock.Any(), gomock.AssignableToTypeOf(&batchv1.Job{}), gomock.Any()).
380+
Times(nJobGets).
381+
DoAndReturn(func(ctx context.Context, req types.NamespacedName, job *batchv1.Job, opts ...any) error {
382+
return apierrors.NewNotFound(schema.GroupResource{Resource: "jobs"}, req.Name)
383+
})
384+
385+
// Secret Gets: both return NotFound
386+
// - hasReferencedSecretChanged → (false, nil) because no annotation
387+
// - validateExternalSecretExist → returns error
388+
mockClient.EXPECT().
389+
Get(gomock.Any(), gomock.Any(), gomock.AssignableToTypeOf(&corev1.Secret{}), gomock.Any()).
390+
Times(2).
391+
DoAndReturn(func(ctx context.Context, req types.NamespacedName, secret *corev1.Secret, opts ...any) error {
392+
return apierrors.NewNotFound(schema.GroupResource{Resource: "secrets"}, req.Name)
393+
})
394+
395+
recorderMock := mocks.NewMockEventRecorder(mockCtrl)
396+
recorderMock.EXPECT().Eventf(
397+
&gitRepoMatcher{gitRepo},
398+
nil,
399+
corev1.EventTypeWarning,
400+
"FailedValidatingSecret",
401+
"ValidateSecret",
402+
"%v",
403+
gomock.Any(),
404+
).Times(1)
405+
406+
writtenRepo := make(chan *fleetv1.GitRepo, 1)
407+
statusClient := mocks.NewMockStatusWriter(mockCtrl)
408+
mockClient.EXPECT().Status().Times(1).Return(statusClient)
409+
statusClient.EXPECT().Update(gomock.Any(), gomock.Any(), gomock.Any()).
410+
DoAndReturn(func(ctx context.Context, repo *fleetv1.GitRepo, opts ...any) error {
411+
writtenRepo <- repo.DeepCopy()
412+
return nil
413+
})
414+
415+
r := &GitJobReconciler{
416+
Client: mockClient,
417+
Scheme: scheme,
418+
Image: "test",
419+
Clock: RealClock{},
420+
Recorder: recorderMock,
421+
}
422+
return r, writtenRepo
423+
}
424+
425+
// Test_CommitNotPromotedOnGitJobError proves that when manageGitJob fails,
426+
// the new webhook commit that was promoted into gitrepo.Status.Commit in-memory
427+
// is not persisted to k8s by updateErrorStatus.
428+
//
429+
// Refers to: https://github.com/rancher/fleet/issues/4948
430+
func Test_CommitNotPromotedOnGitJobError(t *testing.T) {
431+
const (
432+
oldCommit = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
433+
newCommit = "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"
434+
)
435+
436+
gitRepo := fleetv1.GitRepo{
437+
ObjectMeta: metav1.ObjectMeta{Name: "gitrepo", Namespace: "default"},
438+
}
439+
namespacedName := types.NamespacedName{Name: gitRepo.Name, Namespace: gitRepo.Namespace}
440+
441+
// commit changes (2 job Gets: deletePreviousJob + manageGitJob check)
442+
r, writtenRepo := setupMissingCommitReconciler(t, gitRepo,
443+
func(gr *fleetv1.GitRepo) {
444+
gr.Spec.Repo = "repo"
445+
gr.Spec.HelmSecretNameForPaths = "helm-secret"
446+
gr.Status.Commit = oldCommit
447+
gr.Status.WebhookCommit = newCommit // triggers shouldCreateJob via commit != oldCommit
448+
},
449+
2,
450+
)
451+
452+
_, err := r.Reconcile(context.TODO(), ctrl.Request{NamespacedName: namespacedName})
453+
if err == nil {
454+
t.Fatal("expected reconcile error, got nil")
455+
}
456+
457+
written := <-writtenRepo
458+
459+
// The commit must NOT have been advanced to newCommit.
460+
// On the buggy code this assertion fails because status.Commit == newCommit.
461+
if written.Status.Commit != oldCommit {
462+
t.Errorf("status.Commit was promoted despite gitjob failure: got %q, want %q",
463+
written.Status.Commit, oldCommit)
464+
}
465+
}
466+
467+
// Test_ForceSyncGenerationNotConsumedOnGitJobError proves that when manageGitJob
468+
// fails, the spec.forceSyncGeneration bump that was consumed into
469+
// gitrepo.Status.UpdateGeneration in-memory is not persisted to k8s by
470+
// updateErrorStatus.
471+
//
472+
// Refers to: https://github.com/rancher/fleet/issues/4948
473+
func Test_ForceSyncGenerationNotConsumedOnGitJobError(t *testing.T) {
474+
const (
475+
existingCommit = "cccccccccccccccccccccccccccccccccccccccc"
476+
oldUpdateGeneration = int64(3)
477+
forceSyncGeneration = int64(5)
478+
)
479+
480+
gitRepo := fleetv1.GitRepo{
481+
ObjectMeta: metav1.ObjectMeta{Name: "gitrepo", Namespace: "default"},
482+
}
483+
namespacedName := types.NamespacedName{Name: gitRepo.Name, Namespace: gitRepo.Namespace}
484+
485+
// commit is stable (1 job Get: manageGitJob check only; deletePreviousJob is a no-op)
486+
r, writtenRepo := setupMissingCommitReconciler(t, gitRepo,
487+
func(gr *fleetv1.GitRepo) {
488+
gr.Spec.Repo = "repo"
489+
gr.Spec.HelmSecretNameForPaths = "helm-secret"
490+
gr.Spec.ForceSyncGeneration = forceSyncGeneration
491+
gr.Status.Commit = existingCommit
492+
gr.Status.UpdateGeneration = oldUpdateGeneration
493+
},
494+
1,
495+
)
496+
497+
_, err := r.Reconcile(context.TODO(), ctrl.Request{NamespacedName: namespacedName})
498+
if err == nil {
499+
t.Fatal("expected reconcile error, got nil")
500+
}
501+
502+
written := <-writtenRepo
503+
504+
// UpdateGeneration must NOT have been advanced to forceSyncGeneration.
505+
if written.Status.UpdateGeneration != oldUpdateGeneration {
506+
t.Errorf("status.UpdateGeneration was consumed despite gitjob failure: got %d, want %d",
507+
written.Status.UpdateGeneration, oldUpdateGeneration)
508+
}
509+
}
510+
511+
// Test_ObservedGenerationNotConsumedOnGitJobError proves that when manageGitJob
512+
// fails, the spec generation change that was consumed into
513+
// gitrepo.Status.ObservedGeneration in-memory is not persisted to k8s by
514+
// updateErrorStatus.
515+
//
516+
// Refers to: https://github.com/rancher/fleet/issues/4948
517+
func Test_ObservedGenerationNotConsumedOnGitJobError(t *testing.T) {
518+
const (
519+
existingCommit = "dddddddddddddddddddddddddddddddddddddddd"
520+
currentGeneration = int64(5)
521+
oldObservedGeneration = int64(3) // must be > 0 for generationChanged() to trigger
522+
)
523+
524+
gitRepo := fleetv1.GitRepo{
525+
ObjectMeta: metav1.ObjectMeta{
526+
Name: "gitrepo",
527+
Namespace: "default",
528+
Generation: currentGeneration,
529+
},
530+
}
531+
namespacedName := types.NamespacedName{Name: gitRepo.Name, Namespace: gitRepo.Namespace}
532+
533+
// commit is stable (1 job Get)
534+
r, writtenRepo := setupMissingCommitReconciler(t, gitRepo,
535+
func(gr *fleetv1.GitRepo) {
536+
gr.Spec.Repo = "repo"
537+
gr.Spec.HelmSecretNameForPaths = "helm-secret"
538+
gr.Generation = currentGeneration
539+
gr.Status.Commit = existingCommit
540+
gr.Status.ObservedGeneration = oldObservedGeneration
541+
},
542+
1,
543+
)
544+
545+
_, err := r.Reconcile(context.TODO(), ctrl.Request{NamespacedName: namespacedName})
546+
if err == nil {
547+
t.Fatal("expected reconcile error, got nil")
548+
}
549+
550+
written := <-writtenRepo
551+
552+
// ObservedGeneration must NOT have been advanced to currentGeneration.
553+
if written.Status.ObservedGeneration != oldObservedGeneration {
554+
t.Errorf("status.ObservedGeneration was consumed despite gitjob failure: got %d, want %d",
555+
written.Status.ObservedGeneration, oldObservedGeneration)
556+
}
557+
}
558+
344559
func TestNewJob(t *testing.T) {
345560
securityContext := &corev1.SecurityContext{
346561
AllowPrivilegeEscalation: &[]bool{false}[0],

0 commit comments

Comments
 (0)