Skip to content

Commit ff94719

Browse files
authored
Fix stale bundle error when cluster is offline after bad commit (#4780)
* Fix stale bundle error when cluster is offline after bad commit When a GitRepo contains a YAML parse error and the cluster agent is offline, the bundle's Ready condition retains the error message even after a fix commit is pushed. Three interdependent changes are needed. deployer.go: Add MalformedYAMLError to the deployErrToStatus regex. Helm v4 changed the error format from "YAML parse error" to "MalformedYAMLError"; without this match the error is routed to the Deployed condition instead of Installed, bypassing the staleness guard. summary.go: In MessageFromDeployment, skip the Installed condition message when AppliedDeploymentID differs from Spec.DeploymentID, so a stale error from a superseded apply attempt is not surfaced. target.go: Add effectiveDeployment so state and message compare against t.DeploymentID (the ID the controller is about to write) rather than the stale Spec.DeploymentID still held in the cached BundleDeployment. The bundle controller calls SetReadyConditions before updating BD specs, so the summary.go guard would otherwise never trigger while the agent is offline. * Fix integration test after effectiveDeployment change After labels change the controller uses effectiveDeployment to compute WaitApplied=1 (the new deployment ID hasn't been applied yet). The test was checking WaitApplied==0 without simulating the agent re-applying the updated bundle deployment. Split the assertion into three steps: wait for BD spec change, simulate agent applying the new deployment, then assert the bundle shows WaitApplied=0.
1 parent 6bbc1ff commit ff94719

9 files changed

Lines changed: 294 additions & 8 deletions

File tree

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
apiVersion: v2
2+
name: offline-bundle-stuck
3+
description: A minimal chart used to reproduce issue 594 (stale error after offline cluster)
4+
type: application
5+
version: 0.1.0
6+
appVersion: "1.0.0"
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
helm:
2+
releaseName: offline-bundle-stuck
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
apiVersion: v1
2+
kind: ConfigMap
3+
metadata:
4+
name: offline-bundle-stuck-cm
5+
data:
6+
key: value

e2e/single-cluster/status_test.go

Lines changed: 185 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ import (
99
"path"
1010
"strings"
1111

12+
gogit "github.com/go-git/go-git/v5"
1213
. "github.com/onsi/ginkgo/v2"
1314
. "github.com/onsi/gomega"
1415

@@ -262,6 +263,190 @@ var _ = Describe("Checks that template errors are shown in bundles and gitrepos"
262263
})
263264
})
264265

266+
// Checks that once a cluster goes offline after a failed deployment, the bundle
267+
// status does not permanently show the stale error after a fix commit is applied
268+
// (issue https://github.com/rancher/fleet/issues/594).
269+
var _ = Describe("Bundle status does not retain stale error for offline cluster after fix", Ordered, Label("infra-setup"), func() {
270+
const (
271+
localAgentNS = "cattle-fleet-local-system"
272+
fleetAgentDeploy = "fleet-agent"
273+
)
274+
275+
var (
276+
tmpDir string
277+
cloneDir string
278+
k kubectl.Command
279+
kAgent kubectl.Command
280+
gh *githelper.Git
281+
inClusterRepoURL string
282+
gitrepoName string
283+
clone *gogit.Repository
284+
targetNamespace string
285+
r = rand.New(rand.NewSource(GinkgoRandomSeed()))
286+
)
287+
288+
BeforeEach(func() {
289+
k = env.Kubectl.Namespace(env.Namespace)
290+
kAgent = env.Kubectl.Namespace(localAgentNS)
291+
})
292+
293+
JustBeforeEach(func() {
294+
host := githelper.BuildGitHostname()
295+
addr, err := githelper.GetExternalRepoAddr(env, port, "repo")
296+
Expect(err).ToNot(HaveOccurred())
297+
gh = githelper.NewHTTP(addr)
298+
299+
inClusterRepoURL = gh.GetInClusterURL(host, port, "repo")
300+
301+
tmpDir, _ = os.MkdirTemp("", "fleet-")
302+
cloneDir = path.Join(tmpDir, "repo")
303+
304+
gitrepoName = testenv.RandomFilename("offline-stuck", r)
305+
targetNamespace = testenv.NewNamespaceName("offline-stuck", r)
306+
307+
clone, err = gh.Create(cloneDir, testenv.AssetPath("single-cluster/offline-bundle-stuck"), "examples")
308+
Expect(err).ToNot(HaveOccurred())
309+
310+
err = testenv.ApplyTemplate(k, testenv.AssetPath("status/gitrepo.yaml"), struct {
311+
Name string
312+
Repo string
313+
Branch string
314+
TargetNamespace string
315+
}{
316+
gitrepoName,
317+
inClusterRepoURL,
318+
gh.Branch,
319+
targetNamespace,
320+
})
321+
Expect(err).ToNot(HaveOccurred())
322+
})
323+
324+
AfterAll(func() {
325+
// Ensure fleet-agent is always restored, even when the test fails midway.
326+
_, _ = kAgent.Run("scale", "deployment", fleetAgentDeploy, "--replicas=1", "--timeout=60s")
327+
328+
_ = os.RemoveAll(tmpDir)
329+
_, _ = k.Delete("gitrepo", gitrepoName)
330+
331+
Eventually(func(g Gomega) {
332+
out, _ := k.Get(
333+
"bundledeployments",
334+
"-A",
335+
"-l",
336+
fmt.Sprintf("fleet.cattle.io/repo-name=%s", gitrepoName),
337+
)
338+
g.Expect(out).To(ContainSubstring("No resources found"))
339+
}).Should(Succeed())
340+
341+
_, _ = k.Delete("ns", targetNamespace, "--wait=false")
342+
})
343+
344+
It("clears the stale error from an offline cluster once a fix commit is present", func() {
345+
bundleName := gitrepoName + "-examples"
346+
347+
By("waiting for the initial deployment to be Ready")
348+
Eventually(func(g Gomega) {
349+
status := getBundleStatus(g, k, bundleName)
350+
g.Expect(status.Summary.Ready).To(Equal(1))
351+
}).Should(Succeed())
352+
353+
By("pushing a commit that introduces a YAML parse error")
354+
badContent := `apiVersion: v1
355+
kind: ConfigMap
356+
metadata:
357+
name: offline-bundle-stuck-cm
358+
data:
359+
broken: {unclosed
360+
`
361+
err := os.WriteFile(path.Join(cloneDir, "examples", "templates", "configmap.yaml"), []byte(badContent), 0644)
362+
Expect(err).ToNot(HaveOccurred())
363+
_, err = gh.Update(clone)
364+
Expect(err).ToNot(HaveOccurred())
365+
366+
By("waiting for the bundle to reflect the YAML parse error")
367+
Eventually(func(g Gomega) {
368+
status := getBundleStatus(g, k, bundleName)
369+
g.Expect(status.Summary.Ready).To(Equal(0))
370+
found := false
371+
for _, cond := range status.Conditions {
372+
if cond.Type == string(fleet.Ready) && strings.Contains(cond.Message, "did not find expected") {
373+
found = true
374+
break
375+
}
376+
}
377+
g.Expect(found).To(BeTrue(), "expected YAML parse error in bundle conditions, got: %v", status.Conditions)
378+
}, testenv.MediumTimeout, testenv.PollingInterval).Should(Succeed())
379+
380+
By("scaling down the fleet-agent to simulate an offline cluster")
381+
out, err := kAgent.Run("scale", "deployment", fleetAgentDeploy, "--replicas=0", "--timeout=60s")
382+
Expect(err).ToNot(HaveOccurred(), out)
383+
384+
// Wait until the agent pod is gone so it cannot apply any further commits.
385+
Eventually(func(g Gomega) {
386+
out, _ := kAgent.Get("pods", "-l", "app=fleet-agent")
387+
g.Expect(out).To(ContainSubstring("No resources found"))
388+
}).Should(Succeed())
389+
390+
By("pushing an intermediate commit that does not fix the YAML error")
391+
intermediateContent := `apiVersion: v1
392+
kind: ConfigMap
393+
metadata:
394+
name: offline-bundle-stuck-cm
395+
labels:
396+
version: "2"
397+
data:
398+
broken: {unclosed
399+
`
400+
err = os.WriteFile(path.Join(cloneDir, "examples", "templates", "configmap.yaml"), []byte(intermediateContent), 0644)
401+
Expect(err).ToNot(HaveOccurred())
402+
_, err = gh.Update(clone)
403+
Expect(err).ToNot(HaveOccurred())
404+
405+
By("pushing a fix commit (valid YAML)")
406+
fixContent := `apiVersion: v1
407+
kind: ConfigMap
408+
metadata:
409+
name: offline-bundle-stuck-cm
410+
data:
411+
key: fixed
412+
`
413+
err = os.WriteFile(path.Join(cloneDir, "examples", "templates", "configmap.yaml"), []byte(fixContent), 0644)
414+
Expect(err).ToNot(HaveOccurred())
415+
_, err = gh.Update(clone)
416+
Expect(err).ToNot(HaveOccurred())
417+
418+
By("verifying the error message does not persist after the fix commit is pushed")
419+
// After the controller picks up the fix commit it updates the BD spec.
420+
// Even though the offline agent cannot apply the fix yet, the bundle
421+
// should no longer surface the stale error from the previous apply attempt.
422+
Eventually(func(g Gomega) {
423+
status := getBundleStatus(g, k, bundleName)
424+
found := false
425+
for _, cond := range status.Conditions {
426+
if cond.Type == string(fleet.Ready) {
427+
found = true
428+
g.Expect(cond.Message).NotTo(
429+
ContainSubstring("did not find expected"),
430+
"bundle Ready condition still shows stale YAML error after fix commit was pushed",
431+
)
432+
break
433+
}
434+
}
435+
g.Expect(found).To(BeTrue(), "expected Ready condition to be present, got: %v", status.Conditions)
436+
}, testenv.LongTimeout, testenv.PollingInterval).Should(Succeed())
437+
438+
By("scaling the fleet-agent back up")
439+
out, err = kAgent.Run("scale", "deployment", fleetAgentDeploy, "--replicas=1", "--timeout=60s")
440+
Expect(err).ToNot(HaveOccurred(), out)
441+
442+
By("waiting for the bundle to become Ready after the agent recovers")
443+
Eventually(func(g Gomega) {
444+
status := getBundleStatus(g, k, bundleName)
445+
g.Expect(status.Summary.Ready).To(Equal(1))
446+
}, testenv.LongTimeout, testenv.PollingInterval).Should(Succeed())
447+
})
448+
})
449+
265450
// getBundleStatus retrieves the status of the bundle with the provided name.
266451
func getBundleStatus(g Gomega, k kubectl.Command, name string) fleet.BundleStatus {
267452
gr, err := k.Get("bundle", name, "-o=json")

integrationtests/controller/bundle/status_test.go

Lines changed: 17 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -186,12 +186,25 @@ var _ = Describe("Bundle Status Fields", func() {
186186
return k8sClient.Update(ctx, cluster)
187187
}).ShouldNot(HaveOccurred())
188188

189-
By("validating that the bundle is re-deployed")
190-
Eventually(func(g Gomega) {
189+
By("validating that the bundle deployment spec is updated")
190+
Eventually(func() bool {
191191
err = k8sClient.Get(ctx, types.NamespacedName{Namespace: namespace, Name: "name"}, bd)
192-
g.Expect(err).NotTo(HaveOccurred())
193-
g.Expect(bd.Spec.DeploymentID).ToNot(Equal(deplIDBefore))
192+
Expect(err).NotTo(HaveOccurred())
193+
return bd.Spec.DeploymentID != deplIDBefore
194+
}).Should(BeTrue())
194195

196+
By("simulating the agent applying the updated bundle deployment")
197+
Eventually(func() error {
198+
err = k8sClient.Get(ctx, types.NamespacedName{Namespace: namespace, Name: "name"}, bd)
199+
if err != nil {
200+
return err
201+
}
202+
bd.Status.AppliedDeploymentID = bd.Spec.DeploymentID
203+
return k8sClient.Status().Update(ctx, bd)
204+
}).ShouldNot(HaveOccurred())
205+
206+
By("verifying the bundle is ready again")
207+
Eventually(func(g Gomega) {
195208
err = k8sClient.Get(ctx, types.NamespacedName{Namespace: namespace, Name: "name"}, bundle)
196209
g.Expect(err).NotTo(HaveOccurred())
197210
g.Expect(bundle.Status.Summary.WaitApplied).To(Equal(0))

internal/cmd/agent/deployer/deployer.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -285,7 +285,8 @@ func deployErrToStatus(err error, status fleet.BundleDeploymentStatus) (bool, fl
285285
"(chart requires kubeVersion)|" + // kubeVersion mismatch
286286
"(annotation validation error)|" + // annotations fail to pass validation
287287
"(failed, and has been rolled back due to atomic being set)|" + // atomic is set and a rollback occurs
288-
"(YAML parse error)|" + // YAML is broken in source files
288+
"(YAML parse error)|" + // YAML is broken in source files (Helm v3)
289+
"(MalformedYAMLError)|" + // YAML is broken in source files (Helm v4)
289290
"(Forbidden: updates to [0-9A-Za-z]+ spec for fields other than [0-9A-Za-z ']+ are forbidden)|" + // trying to update fields that cannot be updated
290291
"(Forbidden: spec is immutable after creation)|" + // trying to modify immutable spec
291292
"(chart requires kubeVersion: [0-9A-Za-z\\.\\-<>=]+ which is incompatible with Kubernetes)", // trying to deploy to incompatible Kubernetes

internal/cmd/controller/summary/summary.go

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -137,7 +137,13 @@ func MessageFromDeployment(deployment *fleet.BundleDeployment) string {
137137
}
138138
message := MessageFromCondition("Deployed", deployment.Status.Conditions)
139139
if message == "" {
140-
message = MessageFromCondition("Installed", deployment.Status.Conditions)
140+
// Only surface the Installed error when it belongs to the current
141+
// deployment. If the deployment IDs differ, the Installed condition
142+
// was set by a previous (now superseded) apply attempt and the
143+
// message is stale.
144+
if deployment.Status.AppliedDeploymentID == deployment.Spec.DeploymentID {
145+
message = MessageFromCondition("Installed", deployment.Status.Conditions)
146+
}
141147
}
142148
if message == "" {
143149
message = MessageFromCondition("Monitored", deployment.Status.Conditions)

internal/cmd/controller/summary/summary_test.go

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
package summary_test
22

33
import (
4+
"errors"
45
"fmt"
56
"testing"
67

@@ -113,3 +114,56 @@ func TestSetReadyConditions_ReasonClearedWhenBecomingReady(t *testing.T) {
113114
c.GetReason(bundleStatus))
114115
}
115116
}
117+
118+
func setCondition(bd *fleet.BundleDeployment, condType, message string) {
119+
condition.Cond(condType).SetError(bd, "", errors.New(message))
120+
}
121+
122+
func TestMessageFromDeployment(t *testing.T) {
123+
t.Run("nil deployment returns empty string", func(t *testing.T) {
124+
if msg := summary.MessageFromDeployment(nil); msg != "" {
125+
t.Errorf("expected empty string, got %q", msg)
126+
}
127+
})
128+
129+
t.Run("Deployed condition takes priority over Installed", func(t *testing.T) {
130+
bd := &fleet.BundleDeployment{}
131+
bd.Spec.DeploymentID = "id1"
132+
bd.Status.AppliedDeploymentID = "id1"
133+
setCondition(bd, "Deployed", "deploy error")
134+
setCondition(bd, "Installed", "install error")
135+
if msg := summary.MessageFromDeployment(bd); msg != "deploy error" {
136+
t.Errorf("expected deploy error to take priority, got %q", msg)
137+
}
138+
})
139+
140+
t.Run("Installed shown when deployment IDs match", func(t *testing.T) {
141+
bd := &fleet.BundleDeployment{}
142+
bd.Spec.DeploymentID = "id1"
143+
bd.Status.AppliedDeploymentID = "id1"
144+
setCondition(bd, "Installed", "install error")
145+
if msg := summary.MessageFromDeployment(bd); msg != "install error" {
146+
t.Errorf("expected install error, got %q", msg)
147+
}
148+
})
149+
150+
t.Run("Installed suppressed when deployment IDs differ", func(t *testing.T) {
151+
bd := &fleet.BundleDeployment{}
152+
bd.Spec.DeploymentID = "id2"
153+
bd.Status.AppliedDeploymentID = "id1"
154+
setCondition(bd, "Installed", "stale install error")
155+
if msg := summary.MessageFromDeployment(bd); msg != "" {
156+
t.Errorf("expected stale Installed message to be suppressed, got %q", msg)
157+
}
158+
})
159+
160+
t.Run("Monitored used as fallback when Deployed and Installed are absent", func(t *testing.T) {
161+
bd := &fleet.BundleDeployment{}
162+
bd.Spec.DeploymentID = "id1"
163+
bd.Status.AppliedDeploymentID = "id1"
164+
setCondition(bd, "Monitored", "monitor error")
165+
if msg := summary.MessageFromDeployment(bd); msg != "monitor error" {
166+
t.Errorf("expected monitor error as fallback, got %q", msg)
167+
}
168+
})
169+
}

internal/cmd/controller/target/target.go

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -136,13 +136,26 @@ func (t *Target) state() fleet.BundleState {
136136
case nil:
137137
return fleet.Pending
138138
default:
139-
return summary.GetDeploymentState(t.Deployment)
139+
return summary.GetDeploymentState(t.effectiveDeployment())
140140
}
141141
}
142142

143143
// message returns a relevant message from the target (pure function)
144144
func (t *Target) message() string {
145-
return summary.MessageFromDeployment(t.Deployment)
145+
return summary.MessageFromDeployment(t.effectiveDeployment())
146+
}
147+
148+
// effectiveDeployment returns t.Deployment with Spec.DeploymentID overridden
149+
// to t.DeploymentID, or t.Deployment unchanged when the IDs already match.
150+
// Status is computed (via state and message) before the reconciler writes the
151+
// new BD spec, so the cached Spec.DeploymentID lags at that point.
152+
func (t *Target) effectiveDeployment() *fleet.BundleDeployment {
153+
if t.Deployment == nil || t.DeploymentID == t.Deployment.Spec.DeploymentID {
154+
return t.Deployment
155+
}
156+
bd := *t.Deployment
157+
bd.Spec.DeploymentID = t.DeploymentID
158+
return &bd
146159
}
147160

148161
// initialiseOptionsMaps initialises options and staged options maps of namespace labels and annotations on bd, if

0 commit comments

Comments
 (0)