diff --git a/e2e/assets/single-cluster/offline-bundle-stuck/Chart.yaml b/e2e/assets/single-cluster/offline-bundle-stuck/Chart.yaml new file mode 100644 index 0000000000..f1d84db10d --- /dev/null +++ b/e2e/assets/single-cluster/offline-bundle-stuck/Chart.yaml @@ -0,0 +1,6 @@ +apiVersion: v2 +name: offline-bundle-stuck +description: A minimal chart used to reproduce issue 594 (stale error after offline cluster) +type: application +version: 0.1.0 +appVersion: "1.0.0" diff --git a/e2e/assets/single-cluster/offline-bundle-stuck/fleet.yaml b/e2e/assets/single-cluster/offline-bundle-stuck/fleet.yaml new file mode 100644 index 0000000000..f0b05b1979 --- /dev/null +++ b/e2e/assets/single-cluster/offline-bundle-stuck/fleet.yaml @@ -0,0 +1,2 @@ +helm: + releaseName: offline-bundle-stuck diff --git a/e2e/assets/single-cluster/offline-bundle-stuck/templates/configmap.yaml b/e2e/assets/single-cluster/offline-bundle-stuck/templates/configmap.yaml new file mode 100644 index 0000000000..a365f44c2c --- /dev/null +++ b/e2e/assets/single-cluster/offline-bundle-stuck/templates/configmap.yaml @@ -0,0 +1,6 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: offline-bundle-stuck-cm +data: + key: value diff --git a/e2e/single-cluster/status_test.go b/e2e/single-cluster/status_test.go index 975475159b..adefc8d40c 100644 --- a/e2e/single-cluster/status_test.go +++ b/e2e/single-cluster/status_test.go @@ -9,6 +9,7 @@ import ( "path" "strings" + gogit "github.com/go-git/go-git/v5" . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" @@ -262,6 +263,190 @@ var _ = Describe("Checks that template errors are shown in bundles and gitrepos" }) }) +// Checks that once a cluster goes offline after a failed deployment, the bundle +// status does not permanently show the stale error after a fix commit is applied +// (issue https://github.com/rancher/fleet/issues/594). +var _ = Describe("Bundle status does not retain stale error for offline cluster after fix", Ordered, Label("infra-setup"), func() { + const ( + localAgentNS = "cattle-fleet-local-system" + fleetAgentDeploy = "fleet-agent" + ) + + var ( + tmpDir string + cloneDir string + k kubectl.Command + kAgent kubectl.Command + gh *githelper.Git + inClusterRepoURL string + gitrepoName string + clone *gogit.Repository + targetNamespace string + r = rand.New(rand.NewSource(GinkgoRandomSeed())) + ) + + BeforeEach(func() { + k = env.Kubectl.Namespace(env.Namespace) + kAgent = env.Kubectl.Namespace(localAgentNS) + }) + + JustBeforeEach(func() { + host := githelper.BuildGitHostname() + addr, err := githelper.GetExternalRepoAddr(env, port, "repo") + Expect(err).ToNot(HaveOccurred()) + gh = githelper.NewHTTP(addr) + + inClusterRepoURL = gh.GetInClusterURL(host, port, "repo") + + tmpDir, _ = os.MkdirTemp("", "fleet-") + cloneDir = path.Join(tmpDir, "repo") + + gitrepoName = testenv.RandomFilename("offline-stuck", r) + targetNamespace = testenv.NewNamespaceName("offline-stuck", r) + + clone, err = gh.Create(cloneDir, testenv.AssetPath("single-cluster/offline-bundle-stuck"), "examples") + Expect(err).ToNot(HaveOccurred()) + + err = testenv.ApplyTemplate(k, testenv.AssetPath("status/gitrepo.yaml"), struct { + Name string + Repo string + Branch string + TargetNamespace string + }{ + gitrepoName, + inClusterRepoURL, + gh.Branch, + targetNamespace, + }) + Expect(err).ToNot(HaveOccurred()) + }) + + AfterAll(func() { + // Ensure fleet-agent is always restored, even when the test fails midway. + _, _ = kAgent.Run("scale", "deployment", fleetAgentDeploy, "--replicas=1", "--timeout=60s") + + _ = os.RemoveAll(tmpDir) + _, _ = k.Delete("gitrepo", gitrepoName) + + Eventually(func(g Gomega) { + out, _ := k.Get( + "bundledeployments", + "-A", + "-l", + fmt.Sprintf("fleet.cattle.io/repo-name=%s", gitrepoName), + ) + g.Expect(out).To(ContainSubstring("No resources found")) + }).Should(Succeed()) + + _, _ = k.Delete("ns", targetNamespace, "--wait=false") + }) + + It("clears the stale error from an offline cluster once a fix commit is present", func() { + bundleName := gitrepoName + "-examples" + + By("waiting for the initial deployment to be Ready") + Eventually(func(g Gomega) { + status := getBundleStatus(g, k, bundleName) + g.Expect(status.Summary.Ready).To(Equal(1)) + }).Should(Succeed()) + + By("pushing a commit that introduces a YAML parse error") + badContent := `apiVersion: v1 +kind: ConfigMap +metadata: + name: offline-bundle-stuck-cm +data: + broken: {unclosed +` + err := os.WriteFile(path.Join(cloneDir, "examples", "templates", "configmap.yaml"), []byte(badContent), 0644) + Expect(err).ToNot(HaveOccurred()) + _, err = gh.Update(clone) + Expect(err).ToNot(HaveOccurred()) + + By("waiting for the bundle to reflect the YAML parse error") + Eventually(func(g Gomega) { + status := getBundleStatus(g, k, bundleName) + g.Expect(status.Summary.Ready).To(Equal(0)) + found := false + for _, cond := range status.Conditions { + if cond.Type == string(fleet.Ready) && strings.Contains(cond.Message, "did not find expected") { + found = true + break + } + } + g.Expect(found).To(BeTrue(), "expected YAML parse error in bundle conditions, got: %v", status.Conditions) + }, testenv.MediumTimeout, testenv.PollingInterval).Should(Succeed()) + + By("scaling down the fleet-agent to simulate an offline cluster") + out, err := kAgent.Run("scale", "deployment", fleetAgentDeploy, "--replicas=0", "--timeout=60s") + Expect(err).ToNot(HaveOccurred(), out) + + // Wait until the agent pod is gone so it cannot apply any further commits. + Eventually(func(g Gomega) { + out, _ := kAgent.Get("pods", "-l", "app=fleet-agent") + g.Expect(out).To(ContainSubstring("No resources found")) + }).Should(Succeed()) + + By("pushing an intermediate commit that does not fix the YAML error") + intermediateContent := `apiVersion: v1 +kind: ConfigMap +metadata: + name: offline-bundle-stuck-cm + labels: + version: "2" +data: + broken: {unclosed +` + err = os.WriteFile(path.Join(cloneDir, "examples", "templates", "configmap.yaml"), []byte(intermediateContent), 0644) + Expect(err).ToNot(HaveOccurred()) + _, err = gh.Update(clone) + Expect(err).ToNot(HaveOccurred()) + + By("pushing a fix commit (valid YAML)") + fixContent := `apiVersion: v1 +kind: ConfigMap +metadata: + name: offline-bundle-stuck-cm +data: + key: fixed +` + err = os.WriteFile(path.Join(cloneDir, "examples", "templates", "configmap.yaml"), []byte(fixContent), 0644) + Expect(err).ToNot(HaveOccurred()) + _, err = gh.Update(clone) + Expect(err).ToNot(HaveOccurred()) + + By("verifying the error message does not persist after the fix commit is pushed") + // After the controller picks up the fix commit it updates the BD spec. + // Even though the offline agent cannot apply the fix yet, the bundle + // should no longer surface the stale error from the previous apply attempt. + Eventually(func(g Gomega) { + status := getBundleStatus(g, k, bundleName) + found := false + for _, cond := range status.Conditions { + if cond.Type == string(fleet.Ready) { + found = true + g.Expect(cond.Message).NotTo( + ContainSubstring("did not find expected"), + "bundle Ready condition still shows stale YAML error after fix commit was pushed", + ) + break + } + } + g.Expect(found).To(BeTrue(), "expected Ready condition to be present, got: %v", status.Conditions) + }, testenv.LongTimeout, testenv.PollingInterval).Should(Succeed()) + + By("scaling the fleet-agent back up") + out, err = kAgent.Run("scale", "deployment", fleetAgentDeploy, "--replicas=1", "--timeout=60s") + Expect(err).ToNot(HaveOccurred(), out) + + By("waiting for the bundle to become Ready after the agent recovers") + Eventually(func(g Gomega) { + status := getBundleStatus(g, k, bundleName) + g.Expect(status.Summary.Ready).To(Equal(1)) + }, testenv.LongTimeout, testenv.PollingInterval).Should(Succeed()) + }) +}) + // getBundleStatus retrieves the status of the bundle with the provided name. func getBundleStatus(g Gomega, k kubectl.Command, name string) fleet.BundleStatus { gr, err := k.Get("bundle", name, "-o=json") diff --git a/integrationtests/controller/bundle/status_test.go b/integrationtests/controller/bundle/status_test.go index 3a0abfcee8..d872f2b154 100644 --- a/integrationtests/controller/bundle/status_test.go +++ b/integrationtests/controller/bundle/status_test.go @@ -186,12 +186,25 @@ var _ = Describe("Bundle Status Fields", func() { return k8sClient.Update(ctx, cluster) }).ShouldNot(HaveOccurred()) - By("validating that the bundle is re-deployed") - Eventually(func(g Gomega) { + By("validating that the bundle deployment spec is updated") + Eventually(func() bool { err = k8sClient.Get(ctx, types.NamespacedName{Namespace: namespace, Name: "name"}, bd) - g.Expect(err).NotTo(HaveOccurred()) - g.Expect(bd.Spec.DeploymentID).ToNot(Equal(deplIDBefore)) + Expect(err).NotTo(HaveOccurred()) + return bd.Spec.DeploymentID != deplIDBefore + }).Should(BeTrue()) + By("simulating the agent applying the updated bundle deployment") + Eventually(func() error { + err = k8sClient.Get(ctx, types.NamespacedName{Namespace: namespace, Name: "name"}, bd) + if err != nil { + return err + } + bd.Status.AppliedDeploymentID = bd.Spec.DeploymentID + return k8sClient.Status().Update(ctx, bd) + }).ShouldNot(HaveOccurred()) + + By("verifying the bundle is ready again") + Eventually(func(g Gomega) { err = k8sClient.Get(ctx, types.NamespacedName{Namespace: namespace, Name: "name"}, bundle) g.Expect(err).NotTo(HaveOccurred()) g.Expect(bundle.Status.Summary.WaitApplied).To(Equal(0)) diff --git a/internal/cmd/agent/deployer/deployer.go b/internal/cmd/agent/deployer/deployer.go index 6a358aac62..f2898998cd 100644 --- a/internal/cmd/agent/deployer/deployer.go +++ b/internal/cmd/agent/deployer/deployer.go @@ -285,7 +285,8 @@ func deployErrToStatus(err error, status fleet.BundleDeploymentStatus) (bool, fl "(chart requires kubeVersion)|" + // kubeVersion mismatch "(annotation validation error)|" + // annotations fail to pass validation "(failed, and has been rolled back due to atomic being set)|" + // atomic is set and a rollback occurs - "(YAML parse error)|" + // YAML is broken in source files + "(YAML parse error)|" + // YAML is broken in source files (Helm v3) + "(MalformedYAMLError)|" + // YAML is broken in source files (Helm v4) "(Forbidden: updates to [0-9A-Za-z]+ spec for fields other than [0-9A-Za-z ']+ are forbidden)|" + // trying to update fields that cannot be updated "(Forbidden: spec is immutable after creation)|" + // trying to modify immutable spec "(chart requires kubeVersion: [0-9A-Za-z\\.\\-<>=]+ which is incompatible with Kubernetes)", // trying to deploy to incompatible Kubernetes diff --git a/internal/cmd/controller/summary/summary.go b/internal/cmd/controller/summary/summary.go index 6b58b583ad..a8d3034849 100644 --- a/internal/cmd/controller/summary/summary.go +++ b/internal/cmd/controller/summary/summary.go @@ -137,7 +137,13 @@ func MessageFromDeployment(deployment *fleet.BundleDeployment) string { } message := MessageFromCondition("Deployed", deployment.Status.Conditions) if message == "" { - message = MessageFromCondition("Installed", deployment.Status.Conditions) + // Only surface the Installed error when it belongs to the current + // deployment. If the deployment IDs differ, the Installed condition + // was set by a previous (now superseded) apply attempt and the + // message is stale. + if deployment.Status.AppliedDeploymentID == deployment.Spec.DeploymentID { + message = MessageFromCondition("Installed", deployment.Status.Conditions) + } } if message == "" { message = MessageFromCondition("Monitored", deployment.Status.Conditions) diff --git a/internal/cmd/controller/summary/summary_test.go b/internal/cmd/controller/summary/summary_test.go index 0cecdaff99..34046235eb 100644 --- a/internal/cmd/controller/summary/summary_test.go +++ b/internal/cmd/controller/summary/summary_test.go @@ -1,6 +1,7 @@ package summary_test import ( + "errors" "fmt" "testing" @@ -113,3 +114,56 @@ func TestSetReadyConditions_ReasonClearedWhenBecomingReady(t *testing.T) { c.GetReason(bundleStatus)) } } + +func setCondition(bd *fleet.BundleDeployment, condType, message string) { + condition.Cond(condType).SetError(bd, "", errors.New(message)) +} + +func TestMessageFromDeployment(t *testing.T) { + t.Run("nil deployment returns empty string", func(t *testing.T) { + if msg := summary.MessageFromDeployment(nil); msg != "" { + t.Errorf("expected empty string, got %q", msg) + } + }) + + t.Run("Deployed condition takes priority over Installed", func(t *testing.T) { + bd := &fleet.BundleDeployment{} + bd.Spec.DeploymentID = "id1" + bd.Status.AppliedDeploymentID = "id1" + setCondition(bd, "Deployed", "deploy error") + setCondition(bd, "Installed", "install error") + if msg := summary.MessageFromDeployment(bd); msg != "deploy error" { + t.Errorf("expected deploy error to take priority, got %q", msg) + } + }) + + t.Run("Installed shown when deployment IDs match", func(t *testing.T) { + bd := &fleet.BundleDeployment{} + bd.Spec.DeploymentID = "id1" + bd.Status.AppliedDeploymentID = "id1" + setCondition(bd, "Installed", "install error") + if msg := summary.MessageFromDeployment(bd); msg != "install error" { + t.Errorf("expected install error, got %q", msg) + } + }) + + t.Run("Installed suppressed when deployment IDs differ", func(t *testing.T) { + bd := &fleet.BundleDeployment{} + bd.Spec.DeploymentID = "id2" + bd.Status.AppliedDeploymentID = "id1" + setCondition(bd, "Installed", "stale install error") + if msg := summary.MessageFromDeployment(bd); msg != "" { + t.Errorf("expected stale Installed message to be suppressed, got %q", msg) + } + }) + + t.Run("Monitored used as fallback when Deployed and Installed are absent", func(t *testing.T) { + bd := &fleet.BundleDeployment{} + bd.Spec.DeploymentID = "id1" + bd.Status.AppliedDeploymentID = "id1" + setCondition(bd, "Monitored", "monitor error") + if msg := summary.MessageFromDeployment(bd); msg != "monitor error" { + t.Errorf("expected monitor error as fallback, got %q", msg) + } + }) +} diff --git a/internal/cmd/controller/target/target.go b/internal/cmd/controller/target/target.go index 7dc252c742..7f18d50b52 100644 --- a/internal/cmd/controller/target/target.go +++ b/internal/cmd/controller/target/target.go @@ -136,13 +136,26 @@ func (t *Target) state() fleet.BundleState { case nil: return fleet.Pending default: - return summary.GetDeploymentState(t.Deployment) + return summary.GetDeploymentState(t.effectiveDeployment()) } } // message returns a relevant message from the target (pure function) func (t *Target) message() string { - return summary.MessageFromDeployment(t.Deployment) + return summary.MessageFromDeployment(t.effectiveDeployment()) +} + +// effectiveDeployment returns t.Deployment with Spec.DeploymentID overridden +// to t.DeploymentID, or t.Deployment unchanged when the IDs already match. +// Status is computed (via state and message) before the reconciler writes the +// new BD spec, so the cached Spec.DeploymentID lags at that point. +func (t *Target) effectiveDeployment() *fleet.BundleDeployment { + if t.Deployment == nil || t.DeploymentID == t.Deployment.Spec.DeploymentID { + return t.Deployment + } + bd := *t.Deployment + bd.Spec.DeploymentID = t.DeploymentID + return &bd } // initialiseOptionsMaps initialises options and staged options maps of namespace labels and annotations on bd, if