Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions e2e/assets/single-cluster/offline-bundle-stuck/Chart.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
apiVersion: v2
name: offline-bundle-stuck
description: A minimal chart used to reproduce issue 594 (stale error after offline cluster)
type: application
version: 0.1.0
appVersion: "1.0.0"
2 changes: 2 additions & 0 deletions e2e/assets/single-cluster/offline-bundle-stuck/fleet.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
helm:
releaseName: offline-bundle-stuck
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
apiVersion: v1
kind: ConfigMap
metadata:
name: offline-bundle-stuck-cm
data:
key: value
185 changes: 185 additions & 0 deletions e2e/single-cluster/status_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import (
"path"
"strings"

gogit "github.com/go-git/go-git/v5"
. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"

Expand Down Expand Up @@ -262,6 +263,190 @@ var _ = Describe("Checks that template errors are shown in bundles and gitrepos"
})
})

// Checks that once a cluster goes offline after a failed deployment, the bundle
// status does not permanently show the stale error after a fix commit is applied
// (issue https://github.com/rancher/fleet/issues/594).
var _ = Describe("Bundle status does not retain stale error for offline cluster after fix", Ordered, Label("infra-setup"), func() {
const (
localAgentNS = "cattle-fleet-local-system"
fleetAgentDeploy = "fleet-agent"
)

var (
tmpDir string
cloneDir string
k kubectl.Command
kAgent kubectl.Command
gh *githelper.Git
inClusterRepoURL string
gitrepoName string
clone *gogit.Repository
targetNamespace string
r = rand.New(rand.NewSource(GinkgoRandomSeed()))
)

BeforeEach(func() {
k = env.Kubectl.Namespace(env.Namespace)
kAgent = env.Kubectl.Namespace(localAgentNS)
})

JustBeforeEach(func() {
host := githelper.BuildGitHostname()
addr, err := githelper.GetExternalRepoAddr(env, port, "repo")
Expect(err).ToNot(HaveOccurred())
gh = githelper.NewHTTP(addr)

inClusterRepoURL = gh.GetInClusterURL(host, port, "repo")

tmpDir, _ = os.MkdirTemp("", "fleet-")
cloneDir = path.Join(tmpDir, "repo")

gitrepoName = testenv.RandomFilename("offline-stuck", r)
targetNamespace = testenv.NewNamespaceName("offline-stuck", r)

clone, err = gh.Create(cloneDir, testenv.AssetPath("single-cluster/offline-bundle-stuck"), "examples")
Expect(err).ToNot(HaveOccurred())

err = testenv.ApplyTemplate(k, testenv.AssetPath("status/gitrepo.yaml"), struct {
Name string
Repo string
Branch string
TargetNamespace string
}{
gitrepoName,
inClusterRepoURL,
gh.Branch,
targetNamespace,
})
Expect(err).ToNot(HaveOccurred())
})

AfterAll(func() {
// Ensure fleet-agent is always restored, even when the test fails midway.
_, _ = kAgent.Run("scale", "deployment", fleetAgentDeploy, "--replicas=1", "--timeout=60s")

_ = os.RemoveAll(tmpDir)
_, _ = k.Delete("gitrepo", gitrepoName)

Eventually(func(g Gomega) {
out, _ := k.Get(
"bundledeployments",
"-A",
"-l",
fmt.Sprintf("fleet.cattle.io/repo-name=%s", gitrepoName),
)
g.Expect(out).To(ContainSubstring("No resources found"))
}).Should(Succeed())

_, _ = k.Delete("ns", targetNamespace, "--wait=false")
})

It("clears the stale error from an offline cluster once a fix commit is present", func() {
bundleName := gitrepoName + "-examples"

By("waiting for the initial deployment to be Ready")
Eventually(func(g Gomega) {
status := getBundleStatus(g, k, bundleName)
g.Expect(status.Summary.Ready).To(Equal(1))
}).Should(Succeed())

By("pushing a commit that introduces a YAML parse error")
badContent := `apiVersion: v1
kind: ConfigMap
metadata:
name: offline-bundle-stuck-cm
data:
broken: {unclosed
`
err := os.WriteFile(path.Join(cloneDir, "examples", "templates", "configmap.yaml"), []byte(badContent), 0644)
Expect(err).ToNot(HaveOccurred())
_, err = gh.Update(clone)
Expect(err).ToNot(HaveOccurred())

By("waiting for the bundle to reflect the YAML parse error")
Eventually(func(g Gomega) {
status := getBundleStatus(g, k, bundleName)
g.Expect(status.Summary.Ready).To(Equal(0))
found := false
for _, cond := range status.Conditions {
if cond.Type == string(fleet.Ready) && strings.Contains(cond.Message, "did not find expected") {
found = true
break
}
}
g.Expect(found).To(BeTrue(), "expected YAML parse error in bundle conditions, got: %v", status.Conditions)
}, testenv.MediumTimeout, testenv.PollingInterval).Should(Succeed())

By("scaling down the fleet-agent to simulate an offline cluster")
out, err := kAgent.Run("scale", "deployment", fleetAgentDeploy, "--replicas=0", "--timeout=60s")
Expect(err).ToNot(HaveOccurred(), out)

// Wait until the agent pod is gone so it cannot apply any further commits.
Eventually(func(g Gomega) {
out, _ := kAgent.Get("pods", "-l", "app=fleet-agent")
g.Expect(out).To(ContainSubstring("No resources found"))
}).Should(Succeed())

By("pushing an intermediate commit that does not fix the YAML error")
intermediateContent := `apiVersion: v1
kind: ConfigMap
metadata:
name: offline-bundle-stuck-cm
labels:
version: "2"
data:
broken: {unclosed
`
err = os.WriteFile(path.Join(cloneDir, "examples", "templates", "configmap.yaml"), []byte(intermediateContent), 0644)
Expect(err).ToNot(HaveOccurred())
_, err = gh.Update(clone)
Expect(err).ToNot(HaveOccurred())

By("pushing a fix commit (valid YAML)")
fixContent := `apiVersion: v1
kind: ConfigMap
metadata:
name: offline-bundle-stuck-cm
data:
key: fixed
`
err = os.WriteFile(path.Join(cloneDir, "examples", "templates", "configmap.yaml"), []byte(fixContent), 0644)
Expect(err).ToNot(HaveOccurred())
_, err = gh.Update(clone)
Expect(err).ToNot(HaveOccurred())

By("verifying the error message does not persist after the fix commit is pushed")
// After the controller picks up the fix commit it updates the BD spec.
// Even though the offline agent cannot apply the fix yet, the bundle
// should no longer surface the stale error from the previous apply attempt.
Eventually(func(g Gomega) {
status := getBundleStatus(g, k, bundleName)
found := false
for _, cond := range status.Conditions {
if cond.Type == string(fleet.Ready) {
found = true
g.Expect(cond.Message).NotTo(
ContainSubstring("did not find expected"),
"bundle Ready condition still shows stale YAML error after fix commit was pushed",
)
break
}
}
Comment thread
thardeck marked this conversation as resolved.
g.Expect(found).To(BeTrue(), "expected Ready condition to be present, got: %v", status.Conditions)
}, testenv.LongTimeout, testenv.PollingInterval).Should(Succeed())

By("scaling the fleet-agent back up")
out, err = kAgent.Run("scale", "deployment", fleetAgentDeploy, "--replicas=1", "--timeout=60s")
Expect(err).ToNot(HaveOccurred(), out)

By("waiting for the bundle to become Ready after the agent recovers")
Eventually(func(g Gomega) {
status := getBundleStatus(g, k, bundleName)
g.Expect(status.Summary.Ready).To(Equal(1))
}, testenv.LongTimeout, testenv.PollingInterval).Should(Succeed())
})
})

// getBundleStatus retrieves the status of the bundle with the provided name.
func getBundleStatus(g Gomega, k kubectl.Command, name string) fleet.BundleStatus {
gr, err := k.Get("bundle", name, "-o=json")
Expand Down
21 changes: 17 additions & 4 deletions integrationtests/controller/bundle/status_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -186,12 +186,25 @@ var _ = Describe("Bundle Status Fields", func() {
return k8sClient.Update(ctx, cluster)
}).ShouldNot(HaveOccurred())

By("validating that the bundle is re-deployed")
Eventually(func(g Gomega) {
By("validating that the bundle deployment spec is updated")
Eventually(func() bool {
err = k8sClient.Get(ctx, types.NamespacedName{Namespace: namespace, Name: "name"}, bd)
g.Expect(err).NotTo(HaveOccurred())
g.Expect(bd.Spec.DeploymentID).ToNot(Equal(deplIDBefore))
Expect(err).NotTo(HaveOccurred())
return bd.Spec.DeploymentID != deplIDBefore
}).Should(BeTrue())

By("simulating the agent applying the updated bundle deployment")
Eventually(func() error {
err = k8sClient.Get(ctx, types.NamespacedName{Namespace: namespace, Name: "name"}, bd)
if err != nil {
return err
}
bd.Status.AppliedDeploymentID = bd.Spec.DeploymentID
return k8sClient.Status().Update(ctx, bd)
}).ShouldNot(HaveOccurred())

By("verifying the bundle is ready again")
Eventually(func(g Gomega) {
err = k8sClient.Get(ctx, types.NamespacedName{Namespace: namespace, Name: "name"}, bundle)
g.Expect(err).NotTo(HaveOccurred())
g.Expect(bundle.Status.Summary.WaitApplied).To(Equal(0))
Expand Down
3 changes: 2 additions & 1 deletion internal/cmd/agent/deployer/deployer.go
Original file line number Diff line number Diff line change
Expand Up @@ -285,7 +285,8 @@ func deployErrToStatus(err error, status fleet.BundleDeploymentStatus) (bool, fl
"(chart requires kubeVersion)|" + // kubeVersion mismatch
"(annotation validation error)|" + // annotations fail to pass validation
"(failed, and has been rolled back due to atomic being set)|" + // atomic is set and a rollback occurs
"(YAML parse error)|" + // YAML is broken in source files
"(YAML parse error)|" + // YAML is broken in source files (Helm v3)
"(MalformedYAMLError)|" + // YAML is broken in source files (Helm v4)
"(Forbidden: updates to [0-9A-Za-z]+ spec for fields other than [0-9A-Za-z ']+ are forbidden)|" + // trying to update fields that cannot be updated
"(Forbidden: spec is immutable after creation)|" + // trying to modify immutable spec
"(chart requires kubeVersion: [0-9A-Za-z\\.\\-<>=]+ which is incompatible with Kubernetes)", // trying to deploy to incompatible Kubernetes
Expand Down
8 changes: 7 additions & 1 deletion internal/cmd/controller/summary/summary.go
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,13 @@ func MessageFromDeployment(deployment *fleet.BundleDeployment) string {
}
message := MessageFromCondition("Deployed", deployment.Status.Conditions)
if message == "" {
message = MessageFromCondition("Installed", deployment.Status.Conditions)
// Only surface the Installed error when it belongs to the current
// deployment. If the deployment IDs differ, the Installed condition
// was set by a previous (now superseded) apply attempt and the
// message is stale.
if deployment.Status.AppliedDeploymentID == deployment.Spec.DeploymentID {
message = MessageFromCondition("Installed", deployment.Status.Conditions)
}
Comment thread
thardeck marked this conversation as resolved.
}
if message == "" {
message = MessageFromCondition("Monitored", deployment.Status.Conditions)
Expand Down
54 changes: 54 additions & 0 deletions internal/cmd/controller/summary/summary_test.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package summary_test

import (
"errors"
"fmt"
"testing"

Expand Down Expand Up @@ -113,3 +114,56 @@ func TestSetReadyConditions_ReasonClearedWhenBecomingReady(t *testing.T) {
c.GetReason(bundleStatus))
}
}

func setCondition(bd *fleet.BundleDeployment, condType, message string) {
condition.Cond(condType).SetError(bd, "", errors.New(message))
}

func TestMessageFromDeployment(t *testing.T) {
t.Run("nil deployment returns empty string", func(t *testing.T) {
if msg := summary.MessageFromDeployment(nil); msg != "" {
t.Errorf("expected empty string, got %q", msg)
}
})

t.Run("Deployed condition takes priority over Installed", func(t *testing.T) {
bd := &fleet.BundleDeployment{}
bd.Spec.DeploymentID = "id1"
bd.Status.AppliedDeploymentID = "id1"
setCondition(bd, "Deployed", "deploy error")
setCondition(bd, "Installed", "install error")
if msg := summary.MessageFromDeployment(bd); msg != "deploy error" {
t.Errorf("expected deploy error to take priority, got %q", msg)
}
})

t.Run("Installed shown when deployment IDs match", func(t *testing.T) {
bd := &fleet.BundleDeployment{}
bd.Spec.DeploymentID = "id1"
bd.Status.AppliedDeploymentID = "id1"
setCondition(bd, "Installed", "install error")
if msg := summary.MessageFromDeployment(bd); msg != "install error" {
t.Errorf("expected install error, got %q", msg)
}
})

t.Run("Installed suppressed when deployment IDs differ", func(t *testing.T) {
bd := &fleet.BundleDeployment{}
bd.Spec.DeploymentID = "id2"
bd.Status.AppliedDeploymentID = "id1"
setCondition(bd, "Installed", "stale install error")
if msg := summary.MessageFromDeployment(bd); msg != "" {
t.Errorf("expected stale Installed message to be suppressed, got %q", msg)
}
})

t.Run("Monitored used as fallback when Deployed and Installed are absent", func(t *testing.T) {
bd := &fleet.BundleDeployment{}
bd.Spec.DeploymentID = "id1"
bd.Status.AppliedDeploymentID = "id1"
setCondition(bd, "Monitored", "monitor error")
if msg := summary.MessageFromDeployment(bd); msg != "monitor error" {
t.Errorf("expected monitor error as fallback, got %q", msg)
}
})
}
17 changes: 15 additions & 2 deletions internal/cmd/controller/target/target.go
Original file line number Diff line number Diff line change
Expand Up @@ -136,13 +136,26 @@ func (t *Target) state() fleet.BundleState {
case nil:
return fleet.Pending
default:
return summary.GetDeploymentState(t.Deployment)
return summary.GetDeploymentState(t.effectiveDeployment())
}
}

// message returns a relevant message from the target (pure function)
func (t *Target) message() string {
return summary.MessageFromDeployment(t.Deployment)
return summary.MessageFromDeployment(t.effectiveDeployment())
}

// effectiveDeployment returns t.Deployment with Spec.DeploymentID overridden
// to t.DeploymentID, or t.Deployment unchanged when the IDs already match.
// Status is computed (via state and message) before the reconciler writes the
// new BD spec, so the cached Spec.DeploymentID lags at that point.
func (t *Target) effectiveDeployment() *fleet.BundleDeployment {
if t.Deployment == nil || t.DeploymentID == t.Deployment.Spec.DeploymentID {
return t.Deployment
}
bd := *t.Deployment
Comment thread
thardeck marked this conversation as resolved.
bd.Spec.DeploymentID = t.DeploymentID
return &bd
}

// initialiseOptionsMaps initialises options and staged options maps of namespace labels and annotations on bd, if
Expand Down