From 32fe08bb190b061b2b08e1c47088b61c56834ed0 Mon Sep 17 00:00:00 2001 From: Tim Hardeck Date: Tue, 10 Mar 2026 10:29:00 +0100 Subject: [PATCH 1/2] Fix stale bundle error when cluster is offline after bad commit When a GitRepo contains a YAML parse error and the cluster agent is offline, the bundle's Ready condition retains the error message even after a fix commit is pushed. Three interdependent changes are needed. deployer.go: Add MalformedYAMLError to the deployErrToStatus regex. Helm v4 changed the error format from "YAML parse error" to "MalformedYAMLError"; without this match the error is routed to the Deployed condition instead of Installed, bypassing the staleness guard. summary.go: In MessageFromDeployment, skip the Installed condition message when AppliedDeploymentID differs from Spec.DeploymentID, so a stale error from a superseded apply attempt is not surfaced. target.go: Add effectiveDeployment so state and message compare against t.DeploymentID (the ID the controller is about to write) rather than the stale Spec.DeploymentID still held in the cached BundleDeployment. The bundle controller calls SetReadyConditions before updating BD specs, so the summary.go guard would otherwise never trigger while the agent is offline. --- .../offline-bundle-stuck/Chart.yaml | 6 + .../offline-bundle-stuck/fleet.yaml | 2 + .../templates/configmap.yaml | 6 + e2e/single-cluster/status_test.go | 185 ++++++++++++++++++ internal/cmd/agent/deployer/deployer.go | 3 +- internal/cmd/controller/summary/summary.go | 8 +- .../cmd/controller/summary/summary_test.go | 54 +++++ internal/cmd/controller/target/target.go | 17 +- 8 files changed, 277 insertions(+), 4 deletions(-) create mode 100644 e2e/assets/single-cluster/offline-bundle-stuck/Chart.yaml create mode 100644 e2e/assets/single-cluster/offline-bundle-stuck/fleet.yaml create mode 100644 e2e/assets/single-cluster/offline-bundle-stuck/templates/configmap.yaml diff --git a/e2e/assets/single-cluster/offline-bundle-stuck/Chart.yaml b/e2e/assets/single-cluster/offline-bundle-stuck/Chart.yaml new file mode 100644 index 0000000000..f1d84db10d --- /dev/null +++ b/e2e/assets/single-cluster/offline-bundle-stuck/Chart.yaml @@ -0,0 +1,6 @@ +apiVersion: v2 +name: offline-bundle-stuck +description: A minimal chart used to reproduce issue 594 (stale error after offline cluster) +type: application +version: 0.1.0 +appVersion: "1.0.0" diff --git a/e2e/assets/single-cluster/offline-bundle-stuck/fleet.yaml b/e2e/assets/single-cluster/offline-bundle-stuck/fleet.yaml new file mode 100644 index 0000000000..f0b05b1979 --- /dev/null +++ b/e2e/assets/single-cluster/offline-bundle-stuck/fleet.yaml @@ -0,0 +1,2 @@ +helm: + releaseName: offline-bundle-stuck diff --git a/e2e/assets/single-cluster/offline-bundle-stuck/templates/configmap.yaml b/e2e/assets/single-cluster/offline-bundle-stuck/templates/configmap.yaml new file mode 100644 index 0000000000..a365f44c2c --- /dev/null +++ b/e2e/assets/single-cluster/offline-bundle-stuck/templates/configmap.yaml @@ -0,0 +1,6 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: offline-bundle-stuck-cm +data: + key: value diff --git a/e2e/single-cluster/status_test.go b/e2e/single-cluster/status_test.go index 975475159b..adefc8d40c 100644 --- a/e2e/single-cluster/status_test.go +++ b/e2e/single-cluster/status_test.go @@ -9,6 +9,7 @@ import ( "path" "strings" + gogit "github.com/go-git/go-git/v5" . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" @@ -262,6 +263,190 @@ var _ = Describe("Checks that template errors are shown in bundles and gitrepos" }) }) +// Checks that once a cluster goes offline after a failed deployment, the bundle +// status does not permanently show the stale error after a fix commit is applied +// (issue https://github.com/rancher/fleet/issues/594). +var _ = Describe("Bundle status does not retain stale error for offline cluster after fix", Ordered, Label("infra-setup"), func() { + const ( + localAgentNS = "cattle-fleet-local-system" + fleetAgentDeploy = "fleet-agent" + ) + + var ( + tmpDir string + cloneDir string + k kubectl.Command + kAgent kubectl.Command + gh *githelper.Git + inClusterRepoURL string + gitrepoName string + clone *gogit.Repository + targetNamespace string + r = rand.New(rand.NewSource(GinkgoRandomSeed())) + ) + + BeforeEach(func() { + k = env.Kubectl.Namespace(env.Namespace) + kAgent = env.Kubectl.Namespace(localAgentNS) + }) + + JustBeforeEach(func() { + host := githelper.BuildGitHostname() + addr, err := githelper.GetExternalRepoAddr(env, port, "repo") + Expect(err).ToNot(HaveOccurred()) + gh = githelper.NewHTTP(addr) + + inClusterRepoURL = gh.GetInClusterURL(host, port, "repo") + + tmpDir, _ = os.MkdirTemp("", "fleet-") + cloneDir = path.Join(tmpDir, "repo") + + gitrepoName = testenv.RandomFilename("offline-stuck", r) + targetNamespace = testenv.NewNamespaceName("offline-stuck", r) + + clone, err = gh.Create(cloneDir, testenv.AssetPath("single-cluster/offline-bundle-stuck"), "examples") + Expect(err).ToNot(HaveOccurred()) + + err = testenv.ApplyTemplate(k, testenv.AssetPath("status/gitrepo.yaml"), struct { + Name string + Repo string + Branch string + TargetNamespace string + }{ + gitrepoName, + inClusterRepoURL, + gh.Branch, + targetNamespace, + }) + Expect(err).ToNot(HaveOccurred()) + }) + + AfterAll(func() { + // Ensure fleet-agent is always restored, even when the test fails midway. + _, _ = kAgent.Run("scale", "deployment", fleetAgentDeploy, "--replicas=1", "--timeout=60s") + + _ = os.RemoveAll(tmpDir) + _, _ = k.Delete("gitrepo", gitrepoName) + + Eventually(func(g Gomega) { + out, _ := k.Get( + "bundledeployments", + "-A", + "-l", + fmt.Sprintf("fleet.cattle.io/repo-name=%s", gitrepoName), + ) + g.Expect(out).To(ContainSubstring("No resources found")) + }).Should(Succeed()) + + _, _ = k.Delete("ns", targetNamespace, "--wait=false") + }) + + It("clears the stale error from an offline cluster once a fix commit is present", func() { + bundleName := gitrepoName + "-examples" + + By("waiting for the initial deployment to be Ready") + Eventually(func(g Gomega) { + status := getBundleStatus(g, k, bundleName) + g.Expect(status.Summary.Ready).To(Equal(1)) + }).Should(Succeed()) + + By("pushing a commit that introduces a YAML parse error") + badContent := `apiVersion: v1 +kind: ConfigMap +metadata: + name: offline-bundle-stuck-cm +data: + broken: {unclosed +` + err := os.WriteFile(path.Join(cloneDir, "examples", "templates", "configmap.yaml"), []byte(badContent), 0644) + Expect(err).ToNot(HaveOccurred()) + _, err = gh.Update(clone) + Expect(err).ToNot(HaveOccurred()) + + By("waiting for the bundle to reflect the YAML parse error") + Eventually(func(g Gomega) { + status := getBundleStatus(g, k, bundleName) + g.Expect(status.Summary.Ready).To(Equal(0)) + found := false + for _, cond := range status.Conditions { + if cond.Type == string(fleet.Ready) && strings.Contains(cond.Message, "did not find expected") { + found = true + break + } + } + g.Expect(found).To(BeTrue(), "expected YAML parse error in bundle conditions, got: %v", status.Conditions) + }, testenv.MediumTimeout, testenv.PollingInterval).Should(Succeed()) + + By("scaling down the fleet-agent to simulate an offline cluster") + out, err := kAgent.Run("scale", "deployment", fleetAgentDeploy, "--replicas=0", "--timeout=60s") + Expect(err).ToNot(HaveOccurred(), out) + + // Wait until the agent pod is gone so it cannot apply any further commits. + Eventually(func(g Gomega) { + out, _ := kAgent.Get("pods", "-l", "app=fleet-agent") + g.Expect(out).To(ContainSubstring("No resources found")) + }).Should(Succeed()) + + By("pushing an intermediate commit that does not fix the YAML error") + intermediateContent := `apiVersion: v1 +kind: ConfigMap +metadata: + name: offline-bundle-stuck-cm + labels: + version: "2" +data: + broken: {unclosed +` + err = os.WriteFile(path.Join(cloneDir, "examples", "templates", "configmap.yaml"), []byte(intermediateContent), 0644) + Expect(err).ToNot(HaveOccurred()) + _, err = gh.Update(clone) + Expect(err).ToNot(HaveOccurred()) + + By("pushing a fix commit (valid YAML)") + fixContent := `apiVersion: v1 +kind: ConfigMap +metadata: + name: offline-bundle-stuck-cm +data: + key: fixed +` + err = os.WriteFile(path.Join(cloneDir, "examples", "templates", "configmap.yaml"), []byte(fixContent), 0644) + Expect(err).ToNot(HaveOccurred()) + _, err = gh.Update(clone) + Expect(err).ToNot(HaveOccurred()) + + By("verifying the error message does not persist after the fix commit is pushed") + // After the controller picks up the fix commit it updates the BD spec. + // Even though the offline agent cannot apply the fix yet, the bundle + // should no longer surface the stale error from the previous apply attempt. + Eventually(func(g Gomega) { + status := getBundleStatus(g, k, bundleName) + found := false + for _, cond := range status.Conditions { + if cond.Type == string(fleet.Ready) { + found = true + g.Expect(cond.Message).NotTo( + ContainSubstring("did not find expected"), + "bundle Ready condition still shows stale YAML error after fix commit was pushed", + ) + break + } + } + g.Expect(found).To(BeTrue(), "expected Ready condition to be present, got: %v", status.Conditions) + }, testenv.LongTimeout, testenv.PollingInterval).Should(Succeed()) + + By("scaling the fleet-agent back up") + out, err = kAgent.Run("scale", "deployment", fleetAgentDeploy, "--replicas=1", "--timeout=60s") + Expect(err).ToNot(HaveOccurred(), out) + + By("waiting for the bundle to become Ready after the agent recovers") + Eventually(func(g Gomega) { + status := getBundleStatus(g, k, bundleName) + g.Expect(status.Summary.Ready).To(Equal(1)) + }, testenv.LongTimeout, testenv.PollingInterval).Should(Succeed()) + }) +}) + // getBundleStatus retrieves the status of the bundle with the provided name. func getBundleStatus(g Gomega, k kubectl.Command, name string) fleet.BundleStatus { gr, err := k.Get("bundle", name, "-o=json") diff --git a/internal/cmd/agent/deployer/deployer.go b/internal/cmd/agent/deployer/deployer.go index 6a358aac62..f2898998cd 100644 --- a/internal/cmd/agent/deployer/deployer.go +++ b/internal/cmd/agent/deployer/deployer.go @@ -285,7 +285,8 @@ func deployErrToStatus(err error, status fleet.BundleDeploymentStatus) (bool, fl "(chart requires kubeVersion)|" + // kubeVersion mismatch "(annotation validation error)|" + // annotations fail to pass validation "(failed, and has been rolled back due to atomic being set)|" + // atomic is set and a rollback occurs - "(YAML parse error)|" + // YAML is broken in source files + "(YAML parse error)|" + // YAML is broken in source files (Helm v3) + "(MalformedYAMLError)|" + // YAML is broken in source files (Helm v4) "(Forbidden: updates to [0-9A-Za-z]+ spec for fields other than [0-9A-Za-z ']+ are forbidden)|" + // trying to update fields that cannot be updated "(Forbidden: spec is immutable after creation)|" + // trying to modify immutable spec "(chart requires kubeVersion: [0-9A-Za-z\\.\\-<>=]+ which is incompatible with Kubernetes)", // trying to deploy to incompatible Kubernetes diff --git a/internal/cmd/controller/summary/summary.go b/internal/cmd/controller/summary/summary.go index 6b58b583ad..a8d3034849 100644 --- a/internal/cmd/controller/summary/summary.go +++ b/internal/cmd/controller/summary/summary.go @@ -137,7 +137,13 @@ func MessageFromDeployment(deployment *fleet.BundleDeployment) string { } message := MessageFromCondition("Deployed", deployment.Status.Conditions) if message == "" { - message = MessageFromCondition("Installed", deployment.Status.Conditions) + // Only surface the Installed error when it belongs to the current + // deployment. If the deployment IDs differ, the Installed condition + // was set by a previous (now superseded) apply attempt and the + // message is stale. + if deployment.Status.AppliedDeploymentID == deployment.Spec.DeploymentID { + message = MessageFromCondition("Installed", deployment.Status.Conditions) + } } if message == "" { message = MessageFromCondition("Monitored", deployment.Status.Conditions) diff --git a/internal/cmd/controller/summary/summary_test.go b/internal/cmd/controller/summary/summary_test.go index 0cecdaff99..34046235eb 100644 --- a/internal/cmd/controller/summary/summary_test.go +++ b/internal/cmd/controller/summary/summary_test.go @@ -1,6 +1,7 @@ package summary_test import ( + "errors" "fmt" "testing" @@ -113,3 +114,56 @@ func TestSetReadyConditions_ReasonClearedWhenBecomingReady(t *testing.T) { c.GetReason(bundleStatus)) } } + +func setCondition(bd *fleet.BundleDeployment, condType, message string) { + condition.Cond(condType).SetError(bd, "", errors.New(message)) +} + +func TestMessageFromDeployment(t *testing.T) { + t.Run("nil deployment returns empty string", func(t *testing.T) { + if msg := summary.MessageFromDeployment(nil); msg != "" { + t.Errorf("expected empty string, got %q", msg) + } + }) + + t.Run("Deployed condition takes priority over Installed", func(t *testing.T) { + bd := &fleet.BundleDeployment{} + bd.Spec.DeploymentID = "id1" + bd.Status.AppliedDeploymentID = "id1" + setCondition(bd, "Deployed", "deploy error") + setCondition(bd, "Installed", "install error") + if msg := summary.MessageFromDeployment(bd); msg != "deploy error" { + t.Errorf("expected deploy error to take priority, got %q", msg) + } + }) + + t.Run("Installed shown when deployment IDs match", func(t *testing.T) { + bd := &fleet.BundleDeployment{} + bd.Spec.DeploymentID = "id1" + bd.Status.AppliedDeploymentID = "id1" + setCondition(bd, "Installed", "install error") + if msg := summary.MessageFromDeployment(bd); msg != "install error" { + t.Errorf("expected install error, got %q", msg) + } + }) + + t.Run("Installed suppressed when deployment IDs differ", func(t *testing.T) { + bd := &fleet.BundleDeployment{} + bd.Spec.DeploymentID = "id2" + bd.Status.AppliedDeploymentID = "id1" + setCondition(bd, "Installed", "stale install error") + if msg := summary.MessageFromDeployment(bd); msg != "" { + t.Errorf("expected stale Installed message to be suppressed, got %q", msg) + } + }) + + t.Run("Monitored used as fallback when Deployed and Installed are absent", func(t *testing.T) { + bd := &fleet.BundleDeployment{} + bd.Spec.DeploymentID = "id1" + bd.Status.AppliedDeploymentID = "id1" + setCondition(bd, "Monitored", "monitor error") + if msg := summary.MessageFromDeployment(bd); msg != "monitor error" { + t.Errorf("expected monitor error as fallback, got %q", msg) + } + }) +} diff --git a/internal/cmd/controller/target/target.go b/internal/cmd/controller/target/target.go index 7dc252c742..7f18d50b52 100644 --- a/internal/cmd/controller/target/target.go +++ b/internal/cmd/controller/target/target.go @@ -136,13 +136,26 @@ func (t *Target) state() fleet.BundleState { case nil: return fleet.Pending default: - return summary.GetDeploymentState(t.Deployment) + return summary.GetDeploymentState(t.effectiveDeployment()) } } // message returns a relevant message from the target (pure function) func (t *Target) message() string { - return summary.MessageFromDeployment(t.Deployment) + return summary.MessageFromDeployment(t.effectiveDeployment()) +} + +// effectiveDeployment returns t.Deployment with Spec.DeploymentID overridden +// to t.DeploymentID, or t.Deployment unchanged when the IDs already match. +// Status is computed (via state and message) before the reconciler writes the +// new BD spec, so the cached Spec.DeploymentID lags at that point. +func (t *Target) effectiveDeployment() *fleet.BundleDeployment { + if t.Deployment == nil || t.DeploymentID == t.Deployment.Spec.DeploymentID { + return t.Deployment + } + bd := *t.Deployment + bd.Spec.DeploymentID = t.DeploymentID + return &bd } // initialiseOptionsMaps initialises options and staged options maps of namespace labels and annotations on bd, if From 5ddf5c85a793a24167097cadbd2fb89af50fe222 Mon Sep 17 00:00:00 2001 From: Tim Hardeck Date: Tue, 10 Mar 2026 11:13:50 +0100 Subject: [PATCH 2/2] Fix integration test after effectiveDeployment change After labels change the controller uses effectiveDeployment to compute WaitApplied=1 (the new deployment ID hasn't been applied yet). The test was checking WaitApplied==0 without simulating the agent re-applying the updated bundle deployment. Split the assertion into three steps: wait for BD spec change, simulate agent applying the new deployment, then assert the bundle shows WaitApplied=0. --- .../controller/bundle/status_test.go | 20 +++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/integrationtests/controller/bundle/status_test.go b/integrationtests/controller/bundle/status_test.go index 551216a973..7f70db8ac4 100644 --- a/integrationtests/controller/bundle/status_test.go +++ b/integrationtests/controller/bundle/status_test.go @@ -187,12 +187,24 @@ var _ = Describe("Bundle Status Fields", func() { return k8sClient.Update(ctx, cluster) }).ShouldNot(HaveOccurred()) - // Change in cluster state results in a bundle deployment update - Eventually(func(g Gomega) { + // Change in cluster state results in a bundle deployment spec update + Eventually(func() bool { err = k8sClient.Get(ctx, types.NamespacedName{Namespace: namespace, Name: "name"}, bd) - g.Expect(err).NotTo(HaveOccurred()) - g.Expect(bd.Spec.DeploymentID).ToNot(Equal(deplIDBefore)) + Expect(err).NotTo(HaveOccurred()) + return bd.Spec.DeploymentID != deplIDBefore + }).Should(BeTrue()) + // Simulate the agent applying the updated bundle deployment + Eventually(func() error { + err = k8sClient.Get(ctx, types.NamespacedName{Namespace: namespace, Name: "name"}, bd) + if err != nil { + return err + } + bd.Status.AppliedDeploymentID = bd.Spec.DeploymentID + return k8sClient.Status().Update(ctx, bd) + }).ShouldNot(HaveOccurred()) + + Eventually(func(g Gomega) { err = k8sClient.Get(ctx, types.NamespacedName{Namespace: namespace, Name: "name"}, bundle) g.Expect(err).NotTo(HaveOccurred()) g.Expect(bundle.Status.Summary.WaitApplied).To(Equal(0))