Skip to content

Commit daea219

Browse files
committed
Verify cluster in API server before deleting namespace
The cleanup controller used only the informer cache to decide whether a fleet-managed namespace should be removed. During startup or under load, the cache may not yet contain a newly created Cluster, causing the namespace to be wrongly deleted and breaking bundle deployment. Confirm a cache not-found result with a live API call before deleting the namespace.
1 parent cd90424 commit daea219

3 files changed

Lines changed: 199 additions & 13 deletions

File tree

internal/cmd/controller/cleanup/controllers/cleanup/controller.go

Lines changed: 31 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -16,13 +16,15 @@ import (
1616
rbacv1 "k8s.io/api/rbac/v1"
1717
apierrors "k8s.io/apimachinery/pkg/api/errors"
1818
"k8s.io/apimachinery/pkg/api/meta"
19+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
1920
"k8s.io/apimachinery/pkg/runtime"
2021
)
2122

2223
type handler struct {
23-
apply apply.Apply
24-
clusters fleetcontrollers.ClusterCache
25-
namespaces corecontrollers.NamespaceClient
24+
apply apply.Apply
25+
clusters fleetcontrollers.ClusterCache
26+
clustersClient fleetcontrollers.ClusterClient
27+
namespaces corecontrollers.NamespaceClient
2628
}
2729

2830
func Register(ctx context.Context, apply apply.Apply,
@@ -33,11 +35,13 @@ func Register(ctx context.Context, apply apply.Apply,
3335
clusterRole rbaccontrollers.ClusterRoleController,
3436
clusterRoleBinding rbaccontrollers.ClusterRoleBindingController,
3537
namespaces corecontrollers.NamespaceController,
36-
clusterCache fleetcontrollers.ClusterCache) {
38+
clusterCache fleetcontrollers.ClusterCache,
39+
clusterClient fleetcontrollers.ClusterClient) {
3740
h := &handler{
38-
apply: apply,
39-
clusters: clusterCache,
40-
namespaces: namespaces,
41+
apply: apply,
42+
clusters: clusterCache,
43+
clustersClient: clusterClient,
44+
namespaces: namespaces,
4145
}
4246

4347
clusterRole.OnChange(ctx, "managed-cleanup", func(_ string, obj *rbacv1.ClusterRole) (*rbacv1.ClusterRole, error) {
@@ -90,15 +94,29 @@ func (h *handler) cleanupNamespace(key string, obj *corev1.Namespace) (*corev1.N
9094
return obj, nil
9195
}
9296

93-
// check if the cluster for this cluster namespace still exists, otherwise clean up the namespace
94-
_, err := h.clusters.Get(obj.Annotations[fleet.ClusterNamespaceAnnotation], obj.Annotations[fleet.ClusterAnnotation])
95-
if apierrors.IsNotFound(err) {
96-
logrus.Infof("Cleaning up fleet-managed namespace %q, cluster not found", obj.Name)
97+
clusterNS := obj.Annotations[fleet.ClusterNamespaceAnnotation]
98+
clusterName := obj.Annotations[fleet.ClusterAnnotation]
9799

98-
err = h.namespaces.Delete(key, nil)
100+
// check if the cluster for this cluster namespace still exists, otherwise clean up the namespace.
101+
// First consult the informer cache; if the cache reports not-found, confirm with a live API call
102+
// to avoid a race where the Cluster was just created and hasn't been reflected in the cache yet.
103+
_, err := h.clusters.Get(clusterNS, clusterName)
104+
if !apierrors.IsNotFound(err) {
99105
return obj, err
100106
}
101-
return obj, err
107+
108+
// Cache said not-found — verify against the API server before deleting.
109+
_, err = h.clustersClient.Get(clusterNS, clusterName, metav1.GetOptions{})
110+
if err == nil {
111+
// Cluster exists in the API server; the cache is stale. Do not delete the namespace.
112+
return obj, nil
113+
}
114+
if !apierrors.IsNotFound(err) {
115+
return obj, err
116+
}
117+
118+
logrus.Infof("Cleaning up fleet-managed namespace %q, cluster not found", obj.Name)
119+
return obj, h.namespaces.Delete(key, nil)
102120
}
103121

104122
func (h *handler) cleanup(obj runtime.Object) error {
Lines changed: 167 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,167 @@
1+
package cleanup
2+
3+
import (
4+
"testing"
5+
6+
"github.com/rancher/wrangler/v3/pkg/generic/fake"
7+
"go.uber.org/mock/gomock"
8+
9+
fleet "github.com/rancher/fleet/pkg/apis/fleet.cattle.io/v1alpha1"
10+
11+
corev1 "k8s.io/api/core/v1"
12+
apierrors "k8s.io/apimachinery/pkg/api/errors"
13+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
14+
"k8s.io/apimachinery/pkg/runtime/schema"
15+
16+
. "github.com/onsi/ginkgo/v2"
17+
. "github.com/onsi/gomega"
18+
)
19+
20+
func TestCleanup(t *testing.T) {
21+
RegisterFailHandler(Fail)
22+
RunSpecs(t, "Cleanup Controller Suite")
23+
}
24+
25+
var _ = Describe("cleanupNamespace", func() {
26+
var (
27+
mockCtrl *gomock.Controller
28+
clusterCache *fake.MockCacheInterface[*fleet.Cluster]
29+
clusterClient *fake.MockClientInterface[*fleet.Cluster, *fleet.ClusterList]
30+
namespaceClient *fake.MockNonNamespacedClientInterface[*corev1.Namespace, *corev1.NamespaceList]
31+
h *handler
32+
ns *corev1.Namespace
33+
notFound = apierrors.NewNotFound(schema.GroupResource{Group: "fleet.cattle.io", Resource: "clusters"}, "test-cluster")
34+
)
35+
36+
BeforeEach(func() {
37+
mockCtrl = gomock.NewController(GinkgoT())
38+
clusterCache = fake.NewMockCacheInterface[*fleet.Cluster](mockCtrl)
39+
clusterClient = fake.NewMockClientInterface[*fleet.Cluster, *fleet.ClusterList](mockCtrl)
40+
namespaceClient = fake.NewMockNonNamespacedClientInterface[*corev1.Namespace, *corev1.NamespaceList](mockCtrl)
41+
42+
h = &handler{
43+
clusters: clusterCache,
44+
clustersClient: clusterClient,
45+
namespaces: namespaceClient,
46+
}
47+
48+
ns = &corev1.Namespace{
49+
ObjectMeta: metav1.ObjectMeta{
50+
Name: "cluster-fleet-default-test-cluster-abc123",
51+
Labels: map[string]string{
52+
fleet.ManagedLabel: "true",
53+
},
54+
Annotations: map[string]string{
55+
fleet.ClusterAnnotation: "test-cluster",
56+
fleet.ClusterNamespaceAnnotation: "fleet-default",
57+
},
58+
},
59+
}
60+
})
61+
62+
AfterEach(func() {
63+
mockCtrl.Finish()
64+
})
65+
66+
Context("when namespace has no managed label", func() {
67+
It("does not delete the namespace", func() {
68+
ns.Labels = map[string]string{}
69+
result, err := h.cleanupNamespace(ns.Name, ns)
70+
Expect(err).NotTo(HaveOccurred())
71+
Expect(result).To(Equal(ns))
72+
})
73+
})
74+
75+
Context("when namespace is nil", func() {
76+
It("returns nil without error", func() {
77+
result, err := h.cleanupNamespace("some-key", nil)
78+
Expect(err).NotTo(HaveOccurred())
79+
Expect(result).To(BeNil())
80+
})
81+
})
82+
83+
Context("when cluster exists in cache", func() {
84+
It("does not delete the namespace", func() {
85+
clusterCache.EXPECT().
86+
Get("fleet-default", "test-cluster").
87+
Return(&fleet.Cluster{}, nil)
88+
89+
result, err := h.cleanupNamespace(ns.Name, ns)
90+
Expect(err).NotTo(HaveOccurred())
91+
Expect(result).To(Equal(ns))
92+
})
93+
})
94+
95+
Context("when cluster is missing from cache but exists in API server", func() {
96+
// This tests the fix for the race condition described in
97+
// https://github.com/rancher/fleet/issues/3830: the cleanup
98+
// controller's informer cache may not yet reflect a newly created
99+
// Cluster, so a not-found cache result is confirmed via a live
100+
// API call before the namespace is deleted.
101+
It("does not delete the namespace", func() {
102+
// Cache says cluster not found (stale)
103+
clusterCache.EXPECT().
104+
Get("fleet-default", "test-cluster").
105+
Return(nil, notFound)
106+
107+
// API server confirms cluster exists
108+
clusterClient.EXPECT().
109+
Get("fleet-default", "test-cluster", gomock.Any()).
110+
Return(&fleet.Cluster{}, nil)
111+
112+
result, err := h.cleanupNamespace(ns.Name, ns)
113+
Expect(err).NotTo(HaveOccurred())
114+
Expect(result).To(Equal(ns))
115+
})
116+
})
117+
118+
Context("when cluster is missing from both cache and API server", func() {
119+
It("deletes the namespace", func() {
120+
clusterCache.EXPECT().
121+
Get("fleet-default", "test-cluster").
122+
Return(nil, notFound)
123+
124+
clusterClient.EXPECT().
125+
Get("fleet-default", "test-cluster", gomock.Any()).
126+
Return(nil, notFound)
127+
128+
namespaceClient.EXPECT().
129+
Delete(ns.Name, gomock.Any()).
130+
Return(nil)
131+
132+
result, err := h.cleanupNamespace(ns.Name, ns)
133+
Expect(err).NotTo(HaveOccurred())
134+
Expect(result).To(Equal(ns))
135+
})
136+
})
137+
138+
Context("when cache lookup returns a non-NotFound error", func() {
139+
It("returns the error without deleting", func() {
140+
apiErr := apierrors.NewServiceUnavailable("API server unavailable")
141+
clusterCache.EXPECT().
142+
Get("fleet-default", "test-cluster").
143+
Return(nil, apiErr)
144+
145+
result, err := h.cleanupNamespace(ns.Name, ns)
146+
Expect(err).To(Equal(apiErr))
147+
Expect(result).To(Equal(ns))
148+
})
149+
})
150+
151+
Context("when API server lookup returns a non-NotFound error", func() {
152+
It("returns the error without deleting", func() {
153+
apiErr := apierrors.NewServiceUnavailable("API server unavailable")
154+
clusterCache.EXPECT().
155+
Get("fleet-default", "test-cluster").
156+
Return(nil, notFound)
157+
158+
clusterClient.EXPECT().
159+
Get("fleet-default", "test-cluster", gomock.Any()).
160+
Return(nil, apiErr)
161+
162+
result, err := h.cleanupNamespace(ns.Name, ns)
163+
Expect(err).To(Equal(apiErr))
164+
Expect(result).To(Equal(ns))
165+
})
166+
})
167+
})

internal/cmd/controller/cleanup/controllers/controllers.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,7 @@ func Register(ctx context.Context, appCtx *AppContext) error {
6767
appCtx.RBAC.ClusterRoleBinding(),
6868
appCtx.Core.Namespace(),
6969
appCtx.Cluster().Cache(),
70+
appCtx.Cluster(),
7071
)
7172

7273
if err := appCtx.Start(ctx); err != nil {

0 commit comments

Comments
 (0)