Skip to content

Commit 8f3b41c

Browse files
committed
self-heal BD secret when having a hash mismatch
If the bundle controller updates the BD secret but fails to write the BD spec, the two are left inconsistent and Targets() returns a TerminalError on every subsequent reconcile, deadlocking the BD. Detect the mismatch in Targets() and allow the reconcile to proceed so manageOptionsSecret can re-establish consistency, instead of returning a terminal error with no recovery path. Refers to: #4962 Signed-off-by: Xavi Garcia <xavi.garcia@suse.com>
1 parent e3d2443 commit 8f3b41c

4 files changed

Lines changed: 300 additions & 2 deletions

File tree

Lines changed: 233 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,233 @@
1+
package bundle
2+
3+
import (
4+
"time"
5+
6+
. "github.com/onsi/ginkgo/v2"
7+
. "github.com/onsi/gomega"
8+
9+
"github.com/rancher/fleet/integrationtests/utils"
10+
"github.com/rancher/fleet/internal/helmvalues"
11+
fleet "github.com/rancher/fleet/pkg/apis/fleet.cattle.io/v1alpha1"
12+
13+
corev1 "k8s.io/api/core/v1"
14+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
15+
"k8s.io/apimachinery/pkg/types"
16+
"sigs.k8s.io/controller-runtime/pkg/client"
17+
)
18+
19+
var _ = Describe("BD secret / ValuesHash inconsistency deadlock", func() {
20+
var (
21+
bundleNS string
22+
clusterNS string
23+
cluster *fleet.Cluster
24+
bundle *fleet.Bundle
25+
testID string
26+
)
27+
28+
BeforeEach(func() {
29+
var err error
30+
testID, err = utils.NewNamespaceName()
31+
Expect(err).ToNot(HaveOccurred())
32+
if len(testID) > 8 {
33+
testID = testID[:8]
34+
}
35+
36+
bundleNS = "bdl-bns-" + testID
37+
clusterNS = "bdl-cns-" + testID
38+
39+
Expect(k8sClient.Create(ctx, &corev1.Namespace{
40+
ObjectMeta: metav1.ObjectMeta{Name: bundleNS},
41+
})).ToNot(HaveOccurred())
42+
Expect(k8sClient.Create(ctx, &corev1.Namespace{
43+
ObjectMeta: metav1.ObjectMeta{Name: clusterNS},
44+
})).ToNot(HaveOccurred())
45+
46+
cluster, err = utils.CreateCluster(ctx, k8sClient, "test-cluster-"+testID, bundleNS,
47+
map[string]string{"bdl-env": testID}, clusterNS)
48+
Expect(err).ToNot(HaveOccurred())
49+
})
50+
51+
AfterEach(func() {
52+
if bundle != nil {
53+
Expect(client.IgnoreNotFound(k8sClient.Delete(ctx, bundle))).To(Succeed())
54+
// Wait for BDs to be cleaned up before deleting namespaces.
55+
Eventually(func(g Gomega) {
56+
bdList := &fleet.BundleDeploymentList{}
57+
g.Expect(k8sClient.List(ctx, bdList, client.InNamespace(clusterNS))).To(Succeed())
58+
g.Expect(bdList.Items).To(BeEmpty())
59+
}, 30*time.Second, time.Second).Should(Succeed())
60+
}
61+
Expect(client.IgnoreNotFound(k8sClient.Delete(ctx, cluster))).To(Succeed())
62+
Expect(client.IgnoreNotFound(k8sClient.Delete(ctx, &corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: bundleNS}}))).To(Succeed())
63+
Expect(client.IgnoreNotFound(k8sClient.Delete(ctx, &corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: clusterNS}}))).To(Succeed())
64+
})
65+
66+
// createBundleWithValues builds a bundle that targets the test cluster with the given Helm values.
67+
createBundleWithValues := func(name string, values map[string]interface{}) *fleet.Bundle {
68+
b := &fleet.Bundle{
69+
ObjectMeta: metav1.ObjectMeta{Name: name, Namespace: bundleNS},
70+
Spec: fleet.BundleSpec{
71+
Targets: []fleet.BundleTarget{
72+
{
73+
ClusterSelector: &metav1.LabelSelector{
74+
MatchLabels: map[string]string{"bdl-env": testID},
75+
},
76+
BundleDeploymentOptions: fleet.BundleDeploymentOptions{
77+
Helm: &fleet.HelmOptions{
78+
Values: &fleet.GenericMap{Data: values},
79+
},
80+
},
81+
},
82+
},
83+
Resources: []fleet.BundleResource{{
84+
Name: "test.yaml",
85+
Content: "apiVersion: v1\nkind: ConfigMap\nmetadata:\n name: test\ndata:\n key: value",
86+
}},
87+
},
88+
}
89+
Expect(k8sClient.Create(ctx, b)).ToNot(HaveOccurred())
90+
return b
91+
}
92+
93+
// waitForBD waits until the BundleDeployment for bundleName has a non-empty ValuesHash,
94+
// meaning the controller completed at least one successful reconcile and wrote the secret.
95+
waitForBD := func(bundleName string) *fleet.BundleDeployment {
96+
var bd *fleet.BundleDeployment
97+
Eventually(func(g Gomega) {
98+
bdList := &fleet.BundleDeploymentList{}
99+
g.Expect(k8sClient.List(ctx, bdList, client.InNamespace(clusterNS))).To(Succeed())
100+
for i := range bdList.Items {
101+
item := &bdList.Items[i]
102+
if item.Labels[fleet.BundleLabel] == bundleName {
103+
g.Expect(item.Spec.ValuesHash).ToNot(BeEmpty(),
104+
"BD should have a ValuesHash set by manageOptionsSecret")
105+
bd = item
106+
return
107+
}
108+
}
109+
g.Expect(false).To(BeTrue(), "BundleDeployment not found for bundle "+bundleName)
110+
}).Should(Succeed())
111+
return bd
112+
}
113+
114+
// corruptBDSecret overwrites the BD secret with data that hashes differently from the BD's
115+
// current ValuesHash, simulating the state left when manageOptionsSecret succeeded but
116+
// createBundleDeployment failed in a previous reconcile cycle.
117+
corruptBDSecret := func(bd *fleet.BundleDeployment) {
118+
staleValues := []byte(`replicas: 999`)
119+
Eventually(func(g Gomega) {
120+
secret := &corev1.Secret{}
121+
g.Expect(k8sClient.Get(ctx, types.NamespacedName{
122+
Name: bd.Name, Namespace: bd.Namespace,
123+
}, secret)).To(Succeed())
124+
125+
// Sanity check: hashes must be consistent before we corrupt.
126+
h := helmvalues.HashOptions(secret.Data[helmvalues.ValuesKey], secret.Data[helmvalues.StagedValuesKey])
127+
g.Expect(h).To(Equal(bd.Spec.ValuesHash), "pre-condition: secret and BD must be consistent")
128+
129+
secret.Data[helmvalues.ValuesKey] = staleValues
130+
g.Expect(k8sClient.Update(ctx, secret)).To(Succeed())
131+
}).Should(Succeed())
132+
}
133+
134+
// getBundleReadyMessage returns the message from the bundle's Ready condition, or "" if absent.
135+
getBundleReadyMessage := func(b *fleet.Bundle) string {
136+
latest := &fleet.Bundle{}
137+
if err := k8sClient.Get(ctx, client.ObjectKeyFromObject(b), latest); err != nil {
138+
return ""
139+
}
140+
for _, c := range latest.Status.Conditions {
141+
if c.Type == "Ready" {
142+
return c.Message
143+
}
144+
}
145+
return ""
146+
}
147+
148+
// When the BD secret and BD.Spec.ValuesHash are inconsistent, the bundle
149+
// controller should self-heal: it must re-synchronise the secret and clear
150+
// the targeting error without requiring a manual patch.
151+
It("self-heals after the BD secret content diverges from ValuesHash", func() {
152+
By("creating a bundle with Helm values so manageOptionsSecret writes a secret")
153+
bundleName := "bdl-selfheal-" + testID
154+
bundle = createBundleWithValues(bundleName, map[string]interface{}{"data": 8})
155+
156+
By("waiting for the BD and its options secret to be created with a consistent hash")
157+
bd := waitForBD(bundleName)
158+
159+
By("corrupting the BD secret (simulates: manageOptionsSecret ran but createBundleDeployment failed)")
160+
corruptBDSecret(bd)
161+
162+
// Verify the mismatch is real before letting the controller see it.
163+
secret := &corev1.Secret{}
164+
Expect(k8sClient.Get(ctx, types.NamespacedName{Name: bd.Name, Namespace: bd.Namespace}, secret)).To(Succeed())
165+
actualHash := helmvalues.HashOptions(secret.Data[helmvalues.ValuesKey], secret.Data[helmvalues.StagedValuesKey])
166+
Expect(actualHash).ToNot(Equal(bd.Spec.ValuesHash), "pre-condition: hashes must differ after corruption")
167+
168+
By("triggering a bundle reconcile so the controller detects the mismatch")
169+
Eventually(func(g Gomega) {
170+
latest := &fleet.Bundle{}
171+
g.Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(bundle), latest)).To(Succeed())
172+
if latest.Annotations == nil {
173+
latest.Annotations = map[string]string{}
174+
}
175+
latest.Annotations["test/trigger"] = "1"
176+
g.Expect(k8sClient.Update(ctx, latest)).To(Succeed())
177+
}).Should(Succeed())
178+
179+
By("confirming the hash-mismatch targeting error appears on the bundle")
180+
Eventually(func(g Gomega) {
181+
msg := getBundleReadyMessage(bundle)
182+
g.Expect(msg).To(ContainSubstring("hash mismatch between secret and bundledeployment"),
183+
"the hash-mismatch error should appear after the controller reconciles with the corrupted secret")
184+
}).Should(Succeed())
185+
186+
// The error should eventually clear without any further
187+
// external intervention.
188+
By("expecting the bundle to eventually clear the targeting error (self-heal)")
189+
Eventually(func(g Gomega) {
190+
msg := getBundleReadyMessage(bundle)
191+
g.Expect(msg).ToNot(ContainSubstring("hash mismatch between secret and bundledeployment"),
192+
"the hash-mismatch targeting error should clear once the controller self-heals")
193+
}).Should(Succeed())
194+
})
195+
196+
// After a secret/ValuesHash inconsistency, a subsequent bundle update (e.g. a
197+
// new GitRepo commit with new values) should be applied to the BD.
198+
It("applies new bundle values after recovering from a secret/ValuesHash inconsistency", func() {
199+
By("creating a bundle with initial Helm values")
200+
bundleName := "bdl-newvals-" + testID
201+
bundle = createBundleWithValues(bundleName, map[string]interface{}{"data": 8})
202+
203+
By("waiting for the BD and its options secret to be created consistently")
204+
bd := waitForBD(bundleName)
205+
originalHash := bd.Spec.ValuesHash
206+
207+
By("corrupting the BD secret to create the inconsistency")
208+
corruptBDSecret(bd)
209+
210+
By("updating the bundle with new Helm values (simulates a new GitRepo commit)")
211+
Eventually(func(g Gomega) {
212+
latest := &fleet.Bundle{}
213+
g.Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(bundle), latest)).To(Succeed())
214+
latest.Spec.Targets[0].BundleDeploymentOptions.Helm.Values = &fleet.GenericMap{
215+
Data: map[string]interface{}{"data": 25},
216+
}
217+
g.Expect(k8sClient.Update(ctx, latest)).To(Succeed())
218+
}).Should(Succeed())
219+
220+
// After the controller reconciles the new values, the BD's
221+
// ValuesHash should change from the original value to a new one reflecting the
222+
// updated values.
223+
By("expecting the BD's ValuesHash to change to reflect the new values")
224+
Eventually(func(g Gomega) {
225+
latestBD := &fleet.BundleDeployment{}
226+
g.Expect(k8sClient.Get(ctx, types.NamespacedName{
227+
Name: bd.Name, Namespace: bd.Namespace,
228+
}, latestBD)).To(Succeed())
229+
g.Expect(latestBD.Spec.ValuesHash).ToNot(Equal(originalHash),
230+
"BD ValuesHash should be updated once the controller applies the new values")
231+
}).Should(Succeed())
232+
})
233+
})

internal/cmd/controller/errorutil/errorutil.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,12 @@ import (
88

99
var ErrRetryable = errors.New("requeue event")
1010

11+
// ErrHashMismatch is returned when the BD options secret content does not match the BD's
12+
// ValuesHash. It signals that the secret was updated but the BD spec was not (or vice-versa),
13+
// leaving them inconsistent. The bundle controller self-heals by clearing ValuesHash and
14+
// requeuing so the next reconcile can re-sync the secret.
15+
var ErrHashMismatch = errors.New("hash mismatch between secret and bundledeployment")
16+
1117
func IgnoreConflict(err error) error {
1218
if apierrors.IsConflict(err) {
1319
return nil

internal/cmd/controller/reconciler/bundle_controller.go

Lines changed: 59 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -265,12 +265,28 @@ func (r *BundleReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr
265265

266266
matchedTargets, secretsMissing, err := r.Builder.Targets(ctx, bundle, manifestID)
267267
if err != nil {
268+
wrappedErr := fmt.Errorf("targeting error: %w", err)
269+
if errors.Is(err, fleetutil.ErrHashMismatch) {
270+
// The BD options secret and BD.Spec.ValuesHash are inconsistent. This
271+
// happens when manageOptionsSecret wrote the secret but
272+
// createBundleDeployment failed in a previous reconcile cycle.
273+
// Make the error visible on the bundle status, then repair by clearing
274+
// ValuesHash on affected BDs so the next reconcile can re-sync the secret.
275+
SetCondition(string(fleet.Ready), &bundle.Status, wrappedErr)
276+
if statusErr := r.updateStatus(ctx, bundleOrig, bundle); statusErr != nil {
277+
return ctrl.Result{}, statusErr
278+
}
279+
if repairErr := r.repairHashMismatch(ctx, bundle); repairErr != nil {
280+
logger.Error(repairErr, "failed to repair BD secret/ValuesHash inconsistency")
281+
}
282+
return ctrl.Result{RequeueAfter: durations.DefaultRequeueAfter}, nil
283+
}
268284
return ctrl.Result{},
269285
r.updateErrorStatus(
270286
ctx,
271287
bundleOrig,
272288
bundle,
273-
fmt.Errorf("targeting error: %w", err),
289+
wrappedErr,
274290
)
275291
}
276292

@@ -591,6 +607,48 @@ func (r *BundleReconciler) manageOptionsSecret(
591607
return hash, secret, nil
592608
}
593609

610+
// repairHashMismatch finds all BundleDeployments for the given bundle whose options secret
611+
// content does not match their ValuesHash, and clears their ValuesHash so the next reconcile
612+
// can re-sync the secret via manageOptionsSecret.
613+
func (r *BundleReconciler) repairHashMismatch(ctx context.Context, bundle *fleet.Bundle) error {
614+
bdList := &fleet.BundleDeploymentList{}
615+
if err := r.List(ctx, bdList, client.MatchingLabels{
616+
fleet.BundleLabel: bundle.Name,
617+
fleet.BundleNamespaceLabel: bundle.Namespace,
618+
}); err != nil {
619+
return err
620+
}
621+
622+
for i := range bdList.Items {
623+
bd := &bdList.Items[i]
624+
if bd.Spec.ValuesHash == "" {
625+
continue
626+
}
627+
628+
secret := &corev1.Secret{}
629+
if err := r.Client.Get(ctx, client.ObjectKey{Namespace: bd.Namespace, Name: bd.Name}, secret); err != nil {
630+
if apierrors.IsNotFound(err) {
631+
continue
632+
}
633+
return err
634+
}
635+
636+
h := helmvalues.HashOptions(secret.Data[helmvalues.ValuesKey], secret.Data[helmvalues.StagedValuesKey])
637+
if h == bd.Spec.ValuesHash {
638+
continue
639+
}
640+
641+
// Clear ValuesHash so the next reconcile re-syncs the secret via manageOptionsSecret.
642+
patch := client.MergeFrom(bd.DeepCopy())
643+
bd.Spec.ValuesHash = ""
644+
if err := r.Patch(ctx, bd, patch); err != nil {
645+
return fmt.Errorf("failed to clear ValuesHash on bundledeployment %s/%s: %w", bd.Namespace, bd.Name, err)
646+
}
647+
}
648+
649+
return nil
650+
}
651+
594652
// ensureOwnerReferences sets bd as the owner of s, and returns any error occurring in the process.
595653
func (r *BundleReconciler) ensureOwnerReferences(ctx context.Context, bd *fleet.BundleDeployment, s *corev1.Secret) error {
596654
if s == nil {

internal/cmd/controller/target/builder.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ import (
1515
apierrors "k8s.io/apimachinery/pkg/api/errors"
1616
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
1717

18+
errutil "github.com/rancher/fleet/internal/cmd/controller/errorutil"
1819
"github.com/rancher/fleet/internal/cmd/controller/labelselectors"
1920
"github.com/rancher/fleet/internal/cmd/controller/options"
2021
"github.com/rancher/fleet/internal/cmd/controller/target/matcher"
@@ -170,7 +171,7 @@ func (m *Manager) Targets(ctx context.Context, bundle *fleet.Bundle, manifestID
170171

171172
h := helmvalues.HashOptions(secret.Data[helmvalues.ValuesKey], secret.Data[helmvalues.StagedValuesKey])
172173
if h != bd.Spec.ValuesHash {
173-
return nil, false, fmt.Errorf("retrying, hash mismatch between secret and bundledeployment: actual %s != expected %s", h, bd.Spec.ValuesHash)
174+
return nil, false, fmt.Errorf("%w: actual %s != expected %s", errutil.ErrHashMismatch, h, bd.Spec.ValuesHash)
174175
}
175176

176177
if err := helmvalues.SetOptions(bd, secret.Data); err != nil {

0 commit comments

Comments
 (0)