Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
233 changes: 233 additions & 0 deletions integrationtests/controller/bundle/bundle_secret_deadlock_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,233 @@
package bundle

import (
"time"

. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"

"github.com/rancher/fleet/integrationtests/utils"
"github.com/rancher/fleet/internal/helmvalues"
fleet "github.com/rancher/fleet/pkg/apis/fleet.cattle.io/v1alpha1"

corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
"sigs.k8s.io/controller-runtime/pkg/client"
)

var _ = Describe("BD secret / ValuesHash inconsistency deadlock", func() {
var (
bundleNS string
clusterNS string
cluster *fleet.Cluster
bundle *fleet.Bundle
testID string
)

BeforeEach(func() {
var err error
testID, err = utils.NewNamespaceName()
Expect(err).ToNot(HaveOccurred())
if len(testID) > 8 {
testID = testID[:8]
}

bundleNS = "bdl-bns-" + testID
clusterNS = "bdl-cns-" + testID

Expect(k8sClient.Create(ctx, &corev1.Namespace{
ObjectMeta: metav1.ObjectMeta{Name: bundleNS},
})).ToNot(HaveOccurred())
Expect(k8sClient.Create(ctx, &corev1.Namespace{
ObjectMeta: metav1.ObjectMeta{Name: clusterNS},
})).ToNot(HaveOccurred())

cluster, err = utils.CreateCluster(ctx, k8sClient, "test-cluster-"+testID, bundleNS,
map[string]string{"bdl-env": testID}, clusterNS)
Expect(err).ToNot(HaveOccurred())
})

AfterEach(func() {
if bundle != nil {
Expect(client.IgnoreNotFound(k8sClient.Delete(ctx, bundle))).To(Succeed())
// Wait for BDs to be cleaned up before deleting namespaces.
Eventually(func(g Gomega) {
bdList := &fleet.BundleDeploymentList{}
g.Expect(k8sClient.List(ctx, bdList, client.InNamespace(clusterNS))).To(Succeed())
g.Expect(bdList.Items).To(BeEmpty())
}, 30*time.Second, time.Second).Should(Succeed())
}
Expect(client.IgnoreNotFound(k8sClient.Delete(ctx, cluster))).To(Succeed())
Expect(client.IgnoreNotFound(k8sClient.Delete(ctx, &corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: bundleNS}}))).To(Succeed())
Expect(client.IgnoreNotFound(k8sClient.Delete(ctx, &corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: clusterNS}}))).To(Succeed())
})

// createBundleWithValues builds a bundle that targets the test cluster with the given Helm values.
createBundleWithValues := func(name string, values map[string]interface{}) *fleet.Bundle {
b := &fleet.Bundle{
ObjectMeta: metav1.ObjectMeta{Name: name, Namespace: bundleNS},
Spec: fleet.BundleSpec{
Targets: []fleet.BundleTarget{
{
ClusterSelector: &metav1.LabelSelector{
MatchLabels: map[string]string{"bdl-env": testID},
},
BundleDeploymentOptions: fleet.BundleDeploymentOptions{
Helm: &fleet.HelmOptions{
Values: &fleet.GenericMap{Data: values},
},
},
},
},
Resources: []fleet.BundleResource{{
Name: "test.yaml",
Content: "apiVersion: v1\nkind: ConfigMap\nmetadata:\n name: test\ndata:\n key: value",
}},
},
}
Expect(k8sClient.Create(ctx, b)).ToNot(HaveOccurred())
return b
}

// waitForBD waits until the BundleDeployment for bundleName has a non-empty ValuesHash,
// meaning the controller completed at least one successful reconcile and wrote the secret.
waitForBD := func(bundleName string) *fleet.BundleDeployment {
var bd *fleet.BundleDeployment
Eventually(func(g Gomega) {
bdList := &fleet.BundleDeploymentList{}
g.Expect(k8sClient.List(ctx, bdList, client.InNamespace(clusterNS))).To(Succeed())
for i := range bdList.Items {
item := &bdList.Items[i]
if item.Labels[fleet.BundleLabel] == bundleName {
g.Expect(item.Spec.ValuesHash).ToNot(BeEmpty(),
"BD should have a ValuesHash set by manageOptionsSecret")
bd = item
return
}
}
g.Expect(false).To(BeTrue(), "BundleDeployment not found for bundle "+bundleName)
}).Should(Succeed())
return bd
}

// corruptBDSecret overwrites the BD secret with data that hashes differently from the BD's
// current ValuesHash, simulating the state left when manageOptionsSecret succeeded but
// createBundleDeployment failed in a previous reconcile cycle.
corruptBDSecret := func(bd *fleet.BundleDeployment) {
staleValues := []byte(`replicas: 999`)
Eventually(func(g Gomega) {
secret := &corev1.Secret{}
g.Expect(k8sClient.Get(ctx, types.NamespacedName{
Name: bd.Name, Namespace: bd.Namespace,
}, secret)).To(Succeed())

// Sanity check: hashes must be consistent before we corrupt.
h := helmvalues.HashOptions(secret.Data[helmvalues.ValuesKey], secret.Data[helmvalues.StagedValuesKey])
g.Expect(h).To(Equal(bd.Spec.ValuesHash), "pre-condition: secret and BD must be consistent")

secret.Data[helmvalues.ValuesKey] = staleValues
g.Expect(k8sClient.Update(ctx, secret)).To(Succeed())
}).Should(Succeed())
}

// getBundleReadyMessage returns the message from the bundle's Ready condition, or "" if absent.
getBundleReadyMessage := func(b *fleet.Bundle) string {
latest := &fleet.Bundle{}
if err := k8sClient.Get(ctx, client.ObjectKeyFromObject(b), latest); err != nil {
return ""
}
for _, c := range latest.Status.Conditions {
if c.Type == "Ready" {
return c.Message
}
}
return ""
}

// When the BD secret and BD.Spec.ValuesHash are inconsistent, the bundle
// controller should self-heal: it must re-synchronise the secret and clear
// the targeting error without requiring a manual patch.
It("self-heals after the BD secret content diverges from ValuesHash", func() {
By("creating a bundle with Helm values so manageOptionsSecret writes a secret")
bundleName := "bdl-selfheal-" + testID
bundle = createBundleWithValues(bundleName, map[string]interface{}{"data": 8})

By("waiting for the BD and its options secret to be created with a consistent hash")
bd := waitForBD(bundleName)

By("corrupting the BD secret (simulates: manageOptionsSecret ran but createBundleDeployment failed)")
corruptBDSecret(bd)

// Verify the mismatch is real before letting the controller see it.
secret := &corev1.Secret{}
Expect(k8sClient.Get(ctx, types.NamespacedName{Name: bd.Name, Namespace: bd.Namespace}, secret)).To(Succeed())
actualHash := helmvalues.HashOptions(secret.Data[helmvalues.ValuesKey], secret.Data[helmvalues.StagedValuesKey])
Expect(actualHash).ToNot(Equal(bd.Spec.ValuesHash), "pre-condition: hashes must differ after corruption")

By("triggering a bundle reconcile so the controller detects the mismatch")
Eventually(func(g Gomega) {
latest := &fleet.Bundle{}
g.Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(bundle), latest)).To(Succeed())
if latest.Annotations == nil {
latest.Annotations = map[string]string{}
}
latest.Annotations["test/trigger"] = "1"
g.Expect(k8sClient.Update(ctx, latest)).To(Succeed())
}).Should(Succeed())

By("confirming the hash-mismatch targeting error appears on the bundle")
Eventually(func(g Gomega) {
msg := getBundleReadyMessage(bundle)
g.Expect(msg).To(ContainSubstring("hash mismatch between secret and bundledeployment"),
"the hash-mismatch error should appear after the controller reconciles with the corrupted secret")
}).Should(Succeed())

// The error should eventually clear without any further
// external intervention.
By("expecting the bundle to eventually clear the targeting error (self-heal)")
Eventually(func(g Gomega) {
msg := getBundleReadyMessage(bundle)
g.Expect(msg).ToNot(ContainSubstring("hash mismatch between secret and bundledeployment"),
"the hash-mismatch targeting error should clear once the controller self-heals")
}).Should(Succeed())
})

// After a secret/ValuesHash inconsistency, a subsequent bundle update (e.g. a
// new GitRepo commit with new values) should be applied to the BD.
It("applies new bundle values after recovering from a secret/ValuesHash inconsistency", func() {
By("creating a bundle with initial Helm values")
bundleName := "bdl-newvals-" + testID
bundle = createBundleWithValues(bundleName, map[string]interface{}{"data": 8})

By("waiting for the BD and its options secret to be created consistently")
bd := waitForBD(bundleName)
originalHash := bd.Spec.ValuesHash

By("corrupting the BD secret to create the inconsistency")
corruptBDSecret(bd)

By("updating the bundle with new Helm values (simulates a new GitRepo commit)")
Eventually(func(g Gomega) {
latest := &fleet.Bundle{}
g.Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(bundle), latest)).To(Succeed())
latest.Spec.Targets[0].BundleDeploymentOptions.Helm.Values = &fleet.GenericMap{
Data: map[string]interface{}{"data": 25},
}
g.Expect(k8sClient.Update(ctx, latest)).To(Succeed())
}).Should(Succeed())

// After the controller reconciles the new values, the BD's
// ValuesHash should change from the original value to a new one reflecting the
// updated values.
By("expecting the BD's ValuesHash to change to reflect the new values")
Eventually(func(g Gomega) {
latestBD := &fleet.BundleDeployment{}
g.Expect(k8sClient.Get(ctx, types.NamespacedName{
Name: bd.Name, Namespace: bd.Namespace,
}, latestBD)).To(Succeed())
g.Expect(latestBD.Spec.ValuesHash).ToNot(Equal(originalHash),
"BD ValuesHash should be updated once the controller applies the new values")
}).Should(Succeed())
})
})
6 changes: 6 additions & 0 deletions internal/cmd/controller/errorutil/errorutil.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,12 @@ import (

var ErrRetryable = errors.New("requeue event")

// ErrHashMismatch is returned when the BD options secret content does not match the BD's
// ValuesHash. It signals that the secret was updated but the BD spec was not (or vice-versa),
// leaving them inconsistent. The bundle controller self-heals by clearing ValuesHash and
// requeuing so the next reconcile can re-sync the secret.
var ErrHashMismatch = errors.New("hash mismatch between secret and bundledeployment")

func IgnoreConflict(err error) error {
if apierrors.IsConflict(err) {
return nil
Expand Down
60 changes: 59 additions & 1 deletion internal/cmd/controller/reconciler/bundle_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -265,12 +265,28 @@ func (r *BundleReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr

matchedTargets, secretsMissing, err := r.Builder.Targets(ctx, bundle, manifestID)
if err != nil {
wrappedErr := fmt.Errorf("targeting error: %w", err)
if errors.Is(err, fleetutil.ErrHashMismatch) {
// The BD options secret and BD.Spec.ValuesHash are inconsistent. This
// happens when manageOptionsSecret wrote the secret but
// createBundleDeployment failed in a previous reconcile cycle.
// Make the error visible on the bundle status, then repair by clearing
// ValuesHash on affected BDs so the next reconcile can re-sync the secret.
SetCondition(string(fleet.Ready), &bundle.Status, wrappedErr)
if statusErr := r.updateStatus(ctx, bundleOrig, bundle); statusErr != nil {
return ctrl.Result{}, statusErr
}
if repairErr := r.repairHashMismatch(ctx, bundle); repairErr != nil {
logger.Error(repairErr, "failed to repair BD secret/ValuesHash inconsistency")
}
return ctrl.Result{RequeueAfter: durations.DefaultRequeueAfter}, nil
}
return ctrl.Result{},
r.updateErrorStatus(
ctx,
bundleOrig,
bundle,
fmt.Errorf("targeting error: %w", err),
wrappedErr,
)
}

Expand Down Expand Up @@ -591,6 +607,48 @@ func (r *BundleReconciler) manageOptionsSecret(
return hash, secret, nil
}

// repairHashMismatch finds all BundleDeployments for the given bundle whose options secret
// content does not match their ValuesHash, and clears their ValuesHash so the next reconcile
// can re-sync the secret via manageOptionsSecret.
func (r *BundleReconciler) repairHashMismatch(ctx context.Context, bundle *fleet.Bundle) error {
bdList := &fleet.BundleDeploymentList{}
if err := r.List(ctx, bdList, client.MatchingLabels{
fleet.BundleLabel: bundle.Name,
fleet.BundleNamespaceLabel: bundle.Namespace,
}); err != nil {
return err
}

for i := range bdList.Items {
bd := &bdList.Items[i]
if bd.Spec.ValuesHash == "" {
continue
}

secret := &corev1.Secret{}
if err := r.Get(ctx, client.ObjectKey{Namespace: bd.Namespace, Name: bd.Name}, secret); err != nil {
if apierrors.IsNotFound(err) {
continue
}
Comment thread
0xavi0 marked this conversation as resolved.
return err
}

h := helmvalues.HashOptions(secret.Data[helmvalues.ValuesKey], secret.Data[helmvalues.StagedValuesKey])
if h == bd.Spec.ValuesHash {
continue
}

// Clear ValuesHash so the next reconcile re-syncs the secret via manageOptionsSecret.
patch := client.MergeFrom(bd.DeepCopy())
bd.Spec.ValuesHash = ""
if err := r.Patch(ctx, bd, patch); err != nil {
return fmt.Errorf("failed to clear ValuesHash on bundledeployment %s/%s: %w", bd.Namespace, bd.Name, err)
Comment on lines +641 to +645
Copy link

Copilot AI Apr 13, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Clearing bd.Spec.ValuesHash here can cause the downstream agent to stop loading the options secret (agent only loads the secret when ValuesHash != ""), and because the controller clears options from the BD spec before persisting, this can lead to deployments running with empty/default Helm values until the next reconcile. A safer repair is to patch ValuesHash to the actual hash computed from the current secret (or otherwise ensure the agent continues to gate on/consume the secret) so options are not silently dropped during the self-heal window; if you change the approach, also update the surrounding comment/doc accordingly.

Copilot uses AI. Check for mistakes.
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this is a valid comment.

I'm checking if we really need to persist that ValuesHash = ""

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've changed the code to also set WaitingForValues=true which blocks the agent from deploying wrong values while the secret self-heals

}
}

return nil
}

// ensureOwnerReferences sets bd as the owner of s, and returns any error occurring in the process.
func (r *BundleReconciler) ensureOwnerReferences(ctx context.Context, bd *fleet.BundleDeployment, s *corev1.Secret) error {
if s == nil {
Expand Down
3 changes: 2 additions & 1 deletion internal/cmd/controller/target/builder.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ import (
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

errutil "github.com/rancher/fleet/internal/cmd/controller/errorutil"
"github.com/rancher/fleet/internal/cmd/controller/labelselectors"
"github.com/rancher/fleet/internal/cmd/controller/options"
"github.com/rancher/fleet/internal/cmd/controller/target/matcher"
Expand Down Expand Up @@ -170,7 +171,7 @@ func (m *Manager) Targets(ctx context.Context, bundle *fleet.Bundle, manifestID

h := helmvalues.HashOptions(secret.Data[helmvalues.ValuesKey], secret.Data[helmvalues.StagedValuesKey])
if h != bd.Spec.ValuesHash {
return nil, false, fmt.Errorf("retrying, hash mismatch between secret and bundledeployment: actual %s != expected %s", h, bd.Spec.ValuesHash)
return nil, false, fmt.Errorf("%w: actual %s != expected %s", errutil.ErrHashMismatch, h, bd.Spec.ValuesHash)
}

if err := helmvalues.SetOptions(bd, secret.Data); err != nil {
Expand Down