diff --git a/integrationtests/controller/bundle/bundle_secret_deadlock_test.go b/integrationtests/controller/bundle/bundle_secret_deadlock_test.go new file mode 100644 index 0000000000..73e26a40f4 --- /dev/null +++ b/integrationtests/controller/bundle/bundle_secret_deadlock_test.go @@ -0,0 +1,233 @@ +package bundle + +import ( + "time" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + + "github.com/rancher/fleet/integrationtests/utils" + "github.com/rancher/fleet/internal/helmvalues" + fleet "github.com/rancher/fleet/pkg/apis/fleet.cattle.io/v1alpha1" + + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +var _ = Describe("BD secret / ValuesHash inconsistency deadlock", func() { + var ( + bundleNS string + clusterNS string + cluster *fleet.Cluster + bundle *fleet.Bundle + testID string + ) + + BeforeEach(func() { + var err error + testID, err = utils.NewNamespaceName() + Expect(err).ToNot(HaveOccurred()) + if len(testID) > 8 { + testID = testID[:8] + } + + bundleNS = "bdl-bns-" + testID + clusterNS = "bdl-cns-" + testID + + Expect(k8sClient.Create(ctx, &corev1.Namespace{ + ObjectMeta: metav1.ObjectMeta{Name: bundleNS}, + })).ToNot(HaveOccurred()) + Expect(k8sClient.Create(ctx, &corev1.Namespace{ + ObjectMeta: metav1.ObjectMeta{Name: clusterNS}, + })).ToNot(HaveOccurred()) + + cluster, err = utils.CreateCluster(ctx, k8sClient, "test-cluster-"+testID, bundleNS, + map[string]string{"bdl-env": testID}, clusterNS) + Expect(err).ToNot(HaveOccurred()) + }) + + AfterEach(func() { + if bundle != nil { + Expect(client.IgnoreNotFound(k8sClient.Delete(ctx, bundle))).To(Succeed()) + // Wait for BDs to be cleaned up before deleting namespaces. + Eventually(func(g Gomega) { + bdList := &fleet.BundleDeploymentList{} + g.Expect(k8sClient.List(ctx, bdList, client.InNamespace(clusterNS))).To(Succeed()) + g.Expect(bdList.Items).To(BeEmpty()) + }, 30*time.Second, time.Second).Should(Succeed()) + } + Expect(client.IgnoreNotFound(k8sClient.Delete(ctx, cluster))).To(Succeed()) + Expect(client.IgnoreNotFound(k8sClient.Delete(ctx, &corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: bundleNS}}))).To(Succeed()) + Expect(client.IgnoreNotFound(k8sClient.Delete(ctx, &corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: clusterNS}}))).To(Succeed()) + }) + + // createBundleWithValues builds a bundle that targets the test cluster with the given Helm values. + createBundleWithValues := func(name string, values map[string]interface{}) *fleet.Bundle { + b := &fleet.Bundle{ + ObjectMeta: metav1.ObjectMeta{Name: name, Namespace: bundleNS}, + Spec: fleet.BundleSpec{ + Targets: []fleet.BundleTarget{ + { + ClusterSelector: &metav1.LabelSelector{ + MatchLabels: map[string]string{"bdl-env": testID}, + }, + BundleDeploymentOptions: fleet.BundleDeploymentOptions{ + Helm: &fleet.HelmOptions{ + Values: &fleet.GenericMap{Data: values}, + }, + }, + }, + }, + Resources: []fleet.BundleResource{{ + Name: "test.yaml", + Content: "apiVersion: v1\nkind: ConfigMap\nmetadata:\n name: test\ndata:\n key: value", + }}, + }, + } + Expect(k8sClient.Create(ctx, b)).ToNot(HaveOccurred()) + return b + } + + // waitForBD waits until the BundleDeployment for bundleName has a non-empty ValuesHash, + // meaning the controller completed at least one successful reconcile and wrote the secret. + waitForBD := func(bundleName string) *fleet.BundleDeployment { + var bd *fleet.BundleDeployment + Eventually(func(g Gomega) { + bdList := &fleet.BundleDeploymentList{} + g.Expect(k8sClient.List(ctx, bdList, client.InNamespace(clusterNS))).To(Succeed()) + for i := range bdList.Items { + item := &bdList.Items[i] + if item.Labels[fleet.BundleLabel] == bundleName { + g.Expect(item.Spec.ValuesHash).ToNot(BeEmpty(), + "BD should have a ValuesHash set by manageOptionsSecret") + bd = item + return + } + } + g.Expect(false).To(BeTrue(), "BundleDeployment not found for bundle "+bundleName) + }).Should(Succeed()) + return bd + } + + // corruptBDSecret overwrites the BD secret with data that hashes differently from the BD's + // current ValuesHash, simulating the state left when manageOptionsSecret succeeded but + // createBundleDeployment failed in a previous reconcile cycle. + corruptBDSecret := func(bd *fleet.BundleDeployment) { + staleValues := []byte(`replicas: 999`) + Eventually(func(g Gomega) { + secret := &corev1.Secret{} + g.Expect(k8sClient.Get(ctx, types.NamespacedName{ + Name: bd.Name, Namespace: bd.Namespace, + }, secret)).To(Succeed()) + + // Sanity check: hashes must be consistent before we corrupt. + h := helmvalues.HashOptions(secret.Data[helmvalues.ValuesKey], secret.Data[helmvalues.StagedValuesKey]) + g.Expect(h).To(Equal(bd.Spec.ValuesHash), "pre-condition: secret and BD must be consistent") + + secret.Data[helmvalues.ValuesKey] = staleValues + g.Expect(k8sClient.Update(ctx, secret)).To(Succeed()) + }).Should(Succeed()) + } + + // getBundleReadyMessage returns the message from the bundle's Ready condition, or "" if absent. + getBundleReadyMessage := func(b *fleet.Bundle) string { + latest := &fleet.Bundle{} + if err := k8sClient.Get(ctx, client.ObjectKeyFromObject(b), latest); err != nil { + return "" + } + for _, c := range latest.Status.Conditions { + if c.Type == "Ready" { + return c.Message + } + } + return "" + } + + // When the BD secret and BD.Spec.ValuesHash are inconsistent, the bundle + // controller should self-heal: it must re-synchronise the secret and clear + // the targeting error without requiring a manual patch. + It("self-heals after the BD secret content diverges from ValuesHash", func() { + By("creating a bundle with Helm values so manageOptionsSecret writes a secret") + bundleName := "bdl-selfheal-" + testID + bundle = createBundleWithValues(bundleName, map[string]interface{}{"data": 8}) + + By("waiting for the BD and its options secret to be created with a consistent hash") + bd := waitForBD(bundleName) + + By("corrupting the BD secret (simulates: manageOptionsSecret ran but createBundleDeployment failed)") + corruptBDSecret(bd) + + // Verify the mismatch is real before letting the controller see it. + secret := &corev1.Secret{} + Expect(k8sClient.Get(ctx, types.NamespacedName{Name: bd.Name, Namespace: bd.Namespace}, secret)).To(Succeed()) + actualHash := helmvalues.HashOptions(secret.Data[helmvalues.ValuesKey], secret.Data[helmvalues.StagedValuesKey]) + Expect(actualHash).ToNot(Equal(bd.Spec.ValuesHash), "pre-condition: hashes must differ after corruption") + + By("triggering a bundle reconcile so the controller detects the mismatch") + Eventually(func(g Gomega) { + latest := &fleet.Bundle{} + g.Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(bundle), latest)).To(Succeed()) + if latest.Annotations == nil { + latest.Annotations = map[string]string{} + } + latest.Annotations["test/trigger"] = "1" + g.Expect(k8sClient.Update(ctx, latest)).To(Succeed()) + }).Should(Succeed()) + + By("confirming the hash-mismatch targeting error appears on the bundle") + Eventually(func(g Gomega) { + msg := getBundleReadyMessage(bundle) + g.Expect(msg).To(ContainSubstring("hash mismatch between secret and bundledeployment"), + "the hash-mismatch error should appear after the controller reconciles with the corrupted secret") + }).Should(Succeed()) + + // The error should eventually clear without any further + // external intervention. + By("expecting the bundle to eventually clear the targeting error (self-heal)") + Eventually(func(g Gomega) { + msg := getBundleReadyMessage(bundle) + g.Expect(msg).ToNot(ContainSubstring("hash mismatch between secret and bundledeployment"), + "the hash-mismatch targeting error should clear once the controller self-heals") + }).Should(Succeed()) + }) + + // After a secret/ValuesHash inconsistency, a subsequent bundle update (e.g. a + // new GitRepo commit with new values) should be applied to the BD. + It("applies new bundle values after recovering from a secret/ValuesHash inconsistency", func() { + By("creating a bundle with initial Helm values") + bundleName := "bdl-newvals-" + testID + bundle = createBundleWithValues(bundleName, map[string]interface{}{"data": 8}) + + By("waiting for the BD and its options secret to be created consistently") + bd := waitForBD(bundleName) + originalHash := bd.Spec.ValuesHash + + By("corrupting the BD secret to create the inconsistency") + corruptBDSecret(bd) + + By("updating the bundle with new Helm values (simulates a new GitRepo commit)") + Eventually(func(g Gomega) { + latest := &fleet.Bundle{} + g.Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(bundle), latest)).To(Succeed()) + latest.Spec.Targets[0].BundleDeploymentOptions.Helm.Values = &fleet.GenericMap{ + Data: map[string]interface{}{"data": 25}, + } + g.Expect(k8sClient.Update(ctx, latest)).To(Succeed()) + }).Should(Succeed()) + + // After the controller reconciles the new values, the BD's + // ValuesHash should change from the original value to a new one reflecting the + // updated values. + By("expecting the BD's ValuesHash to change to reflect the new values") + Eventually(func(g Gomega) { + latestBD := &fleet.BundleDeployment{} + g.Expect(k8sClient.Get(ctx, types.NamespacedName{ + Name: bd.Name, Namespace: bd.Namespace, + }, latestBD)).To(Succeed()) + g.Expect(latestBD.Spec.ValuesHash).ToNot(Equal(originalHash), + "BD ValuesHash should be updated once the controller applies the new values") + }).Should(Succeed()) + }) +}) diff --git a/internal/cmd/controller/errorutil/errorutil.go b/internal/cmd/controller/errorutil/errorutil.go index 395ac54bec..73f831a72e 100644 --- a/internal/cmd/controller/errorutil/errorutil.go +++ b/internal/cmd/controller/errorutil/errorutil.go @@ -8,6 +8,12 @@ import ( var ErrRetryable = errors.New("requeue event") +// ErrHashMismatch is returned when the BD options secret content does not match the BD's +// ValuesHash. It signals that the secret was updated but the BD spec was not (or vice-versa), +// leaving them inconsistent. The bundle controller self-heals by clearing ValuesHash and +// requeuing so the next reconcile can re-sync the secret. +var ErrHashMismatch = errors.New("hash mismatch between secret and bundledeployment") + func IgnoreConflict(err error) error { if apierrors.IsConflict(err) { return nil diff --git a/internal/cmd/controller/reconciler/bundle_controller.go b/internal/cmd/controller/reconciler/bundle_controller.go index b495cc9691..87951e5e8e 100644 --- a/internal/cmd/controller/reconciler/bundle_controller.go +++ b/internal/cmd/controller/reconciler/bundle_controller.go @@ -265,12 +265,28 @@ func (r *BundleReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr matchedTargets, secretsMissing, err := r.Builder.Targets(ctx, bundle, manifestID) if err != nil { + wrappedErr := fmt.Errorf("targeting error: %w", err) + if errors.Is(err, fleetutil.ErrHashMismatch) { + // The BD options secret and BD.Spec.ValuesHash are inconsistent. This + // happens when manageOptionsSecret wrote the secret but + // createBundleDeployment failed in a previous reconcile cycle. + // Make the error visible on the bundle status, then repair by clearing + // ValuesHash on affected BDs so the next reconcile can re-sync the secret. + SetCondition(string(fleet.Ready), &bundle.Status, wrappedErr) + if statusErr := r.updateStatus(ctx, bundleOrig, bundle); statusErr != nil { + return ctrl.Result{}, statusErr + } + if repairErr := r.repairHashMismatch(ctx, bundle); repairErr != nil { + logger.Error(repairErr, "failed to repair BD secret/ValuesHash inconsistency") + } + return ctrl.Result{RequeueAfter: durations.DefaultRequeueAfter}, nil + } return ctrl.Result{}, r.updateErrorStatus( ctx, bundleOrig, bundle, - fmt.Errorf("targeting error: %w", err), + wrappedErr, ) } @@ -591,6 +607,48 @@ func (r *BundleReconciler) manageOptionsSecret( return hash, secret, nil } +// repairHashMismatch finds all BundleDeployments for the given bundle whose options secret +// content does not match their ValuesHash, and clears their ValuesHash so the next reconcile +// can re-sync the secret via manageOptionsSecret. +func (r *BundleReconciler) repairHashMismatch(ctx context.Context, bundle *fleet.Bundle) error { + bdList := &fleet.BundleDeploymentList{} + if err := r.List(ctx, bdList, client.MatchingLabels{ + fleet.BundleLabel: bundle.Name, + fleet.BundleNamespaceLabel: bundle.Namespace, + }); err != nil { + return err + } + + for i := range bdList.Items { + bd := &bdList.Items[i] + if bd.Spec.ValuesHash == "" { + continue + } + + secret := &corev1.Secret{} + if err := r.Get(ctx, client.ObjectKey{Namespace: bd.Namespace, Name: bd.Name}, secret); err != nil { + if apierrors.IsNotFound(err) { + continue + } + return err + } + + h := helmvalues.HashOptions(secret.Data[helmvalues.ValuesKey], secret.Data[helmvalues.StagedValuesKey]) + if h == bd.Spec.ValuesHash { + continue + } + + // Clear ValuesHash so the next reconcile re-syncs the secret via manageOptionsSecret. + patch := client.MergeFrom(bd.DeepCopy()) + bd.Spec.ValuesHash = "" + if err := r.Patch(ctx, bd, patch); err != nil { + return fmt.Errorf("failed to clear ValuesHash on bundledeployment %s/%s: %w", bd.Namespace, bd.Name, err) + } + } + + return nil +} + // ensureOwnerReferences sets bd as the owner of s, and returns any error occurring in the process. func (r *BundleReconciler) ensureOwnerReferences(ctx context.Context, bd *fleet.BundleDeployment, s *corev1.Secret) error { if s == nil { diff --git a/internal/cmd/controller/target/builder.go b/internal/cmd/controller/target/builder.go index 2be607c75b..a852c622ae 100644 --- a/internal/cmd/controller/target/builder.go +++ b/internal/cmd/controller/target/builder.go @@ -15,6 +15,7 @@ import ( apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + errutil "github.com/rancher/fleet/internal/cmd/controller/errorutil" "github.com/rancher/fleet/internal/cmd/controller/labelselectors" "github.com/rancher/fleet/internal/cmd/controller/options" "github.com/rancher/fleet/internal/cmd/controller/target/matcher" @@ -170,7 +171,7 @@ func (m *Manager) Targets(ctx context.Context, bundle *fleet.Bundle, manifestID h := helmvalues.HashOptions(secret.Data[helmvalues.ValuesKey], secret.Data[helmvalues.StagedValuesKey]) if h != bd.Spec.ValuesHash { - return nil, false, fmt.Errorf("retrying, hash mismatch between secret and bundledeployment: actual %s != expected %s", h, bd.Spec.ValuesHash) + return nil, false, fmt.Errorf("%w: actual %s != expected %s", errutil.ErrHashMismatch, h, bd.Spec.ValuesHash) } if err := helmvalues.SetOptions(bd, secret.Data); err != nil {