Skip to content

Commit 8162ec0

Browse files
committed
Verify cluster in API server before deleting namespace
The cleanup controller used only the informer cache to decide whether a fleet-managed namespace should be removed. During startup or under load, the cache may not yet contain a newly created Cluster, causing the namespace to be wrongly deleted and breaking bundle deployment. Confirm a cache not-found result with a live API call before deleting the namespace.
1 parent cd90424 commit 8162ec0

3 files changed

Lines changed: 195 additions & 13 deletions

File tree

internal/cmd/controller/cleanup/controllers/cleanup/controller.go

Lines changed: 31 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -16,13 +16,15 @@ import (
1616
rbacv1 "k8s.io/api/rbac/v1"
1717
apierrors "k8s.io/apimachinery/pkg/api/errors"
1818
"k8s.io/apimachinery/pkg/api/meta"
19+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
1920
"k8s.io/apimachinery/pkg/runtime"
2021
)
2122

2223
type handler struct {
23-
apply apply.Apply
24-
clusters fleetcontrollers.ClusterCache
25-
namespaces corecontrollers.NamespaceClient
24+
apply apply.Apply
25+
clusters fleetcontrollers.ClusterCache
26+
clustersClient fleetcontrollers.ClusterClient
27+
namespaces corecontrollers.NamespaceClient
2628
}
2729

2830
func Register(ctx context.Context, apply apply.Apply,
@@ -33,11 +35,13 @@ func Register(ctx context.Context, apply apply.Apply,
3335
clusterRole rbaccontrollers.ClusterRoleController,
3436
clusterRoleBinding rbaccontrollers.ClusterRoleBindingController,
3537
namespaces corecontrollers.NamespaceController,
36-
clusterCache fleetcontrollers.ClusterCache) {
38+
clusterCache fleetcontrollers.ClusterCache,
39+
clusterClient fleetcontrollers.ClusterClient) {
3740
h := &handler{
38-
apply: apply,
39-
clusters: clusterCache,
40-
namespaces: namespaces,
41+
apply: apply,
42+
clusters: clusterCache,
43+
clustersClient: clusterClient,
44+
namespaces: namespaces,
4145
}
4246

4347
clusterRole.OnChange(ctx, "managed-cleanup", func(_ string, obj *rbacv1.ClusterRole) (*rbacv1.ClusterRole, error) {
@@ -90,15 +94,29 @@ func (h *handler) cleanupNamespace(key string, obj *corev1.Namespace) (*corev1.N
9094
return obj, nil
9195
}
9296

93-
// check if the cluster for this cluster namespace still exists, otherwise clean up the namespace
94-
_, err := h.clusters.Get(obj.Annotations[fleet.ClusterNamespaceAnnotation], obj.Annotations[fleet.ClusterAnnotation])
95-
if apierrors.IsNotFound(err) {
96-
logrus.Infof("Cleaning up fleet-managed namespace %q, cluster not found", obj.Name)
97+
clusterNS := obj.Annotations[fleet.ClusterNamespaceAnnotation]
98+
clusterName := obj.Annotations[fleet.ClusterAnnotation]
9799

98-
err = h.namespaces.Delete(key, nil)
100+
// check if the cluster for this cluster namespace still exists, otherwise clean up the namespace.
101+
// First consult the informer cache; if the cache reports not-found, confirm with a live API call
102+
// to avoid a race where the Cluster was just created and hasn't been reflected in the cache yet.
103+
_, err := h.clusters.Get(clusterNS, clusterName)
104+
if !apierrors.IsNotFound(err) {
99105
return obj, err
100106
}
101-
return obj, err
107+
108+
// Cache said not-found — verify against the API server before deleting.
109+
_, err = h.clustersClient.Get(clusterNS, clusterName, metav1.GetOptions{})
110+
if err == nil {
111+
// Cluster exists in the API server; the cache is stale. Do not delete the namespace.
112+
return obj, nil
113+
}
114+
if !apierrors.IsNotFound(err) {
115+
return obj, err
116+
}
117+
118+
logrus.Infof("Cleaning up fleet-managed namespace %q, cluster not found", obj.Name)
119+
return obj, h.namespaces.Delete(key, nil)
102120
}
103121

104122
func (h *handler) cleanup(obj runtime.Object) error {
Lines changed: 163 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,163 @@
1+
package cleanup
2+
3+
import (
4+
"testing"
5+
6+
"github.com/rancher/wrangler/v3/pkg/generic/fake"
7+
"go.uber.org/mock/gomock"
8+
9+
fleet "github.com/rancher/fleet/pkg/apis/fleet.cattle.io/v1alpha1"
10+
11+
corev1 "k8s.io/api/core/v1"
12+
apierrors "k8s.io/apimachinery/pkg/api/errors"
13+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
14+
"k8s.io/apimachinery/pkg/runtime/schema"
15+
16+
. "github.com/onsi/ginkgo/v2"
17+
. "github.com/onsi/gomega"
18+
)
19+
20+
func TestCleanup(t *testing.T) {
21+
RegisterFailHandler(Fail)
22+
RunSpecs(t, "Cleanup Controller Suite")
23+
}
24+
25+
var _ = Describe("cleanupNamespace", func() {
26+
var (
27+
mockCtrl *gomock.Controller
28+
clusterCache *fake.MockCacheInterface[*fleet.Cluster]
29+
clusterClient *fake.MockClientInterface[*fleet.Cluster, *fleet.ClusterList]
30+
namespaceClient *fake.MockNonNamespacedClientInterface[*corev1.Namespace, *corev1.NamespaceList]
31+
h *handler
32+
ns *corev1.Namespace
33+
notFound = apierrors.NewNotFound(schema.GroupResource{Group: "fleet.cattle.io", Resource: "clusters"}, "test-cluster")
34+
)
35+
36+
BeforeEach(func() {
37+
mockCtrl = gomock.NewController(GinkgoT())
38+
clusterCache = fake.NewMockCacheInterface[*fleet.Cluster](mockCtrl)
39+
clusterClient = fake.NewMockClientInterface[*fleet.Cluster, *fleet.ClusterList](mockCtrl)
40+
namespaceClient = fake.NewMockNonNamespacedClientInterface[*corev1.Namespace, *corev1.NamespaceList](mockCtrl)
41+
42+
h = &handler{
43+
clusters: clusterCache,
44+
clustersClient: clusterClient,
45+
namespaces: namespaceClient,
46+
}
47+
48+
ns = &corev1.Namespace{
49+
ObjectMeta: metav1.ObjectMeta{
50+
Name: "cluster-fleet-default-test-cluster-abc123",
51+
Labels: map[string]string{
52+
fleet.ManagedLabel: "true",
53+
},
54+
Annotations: map[string]string{
55+
fleet.ClusterAnnotation: "test-cluster",
56+
fleet.ClusterNamespaceAnnotation: "fleet-default",
57+
},
58+
},
59+
}
60+
})
61+
62+
Context("when namespace has no managed label", func() {
63+
It("does not delete the namespace", func() {
64+
ns.Labels = map[string]string{}
65+
result, err := h.cleanupNamespace(ns.Name, ns)
66+
Expect(err).NotTo(HaveOccurred())
67+
Expect(result).To(Equal(ns))
68+
})
69+
})
70+
71+
Context("when namespace is nil", func() {
72+
It("returns nil without error", func() {
73+
result, err := h.cleanupNamespace("some-key", nil)
74+
Expect(err).NotTo(HaveOccurred())
75+
Expect(result).To(BeNil())
76+
})
77+
})
78+
79+
Context("when cluster exists in cache", func() {
80+
It("does not delete the namespace", func() {
81+
clusterCache.EXPECT().
82+
Get("fleet-default", "test-cluster").
83+
Return(&fleet.Cluster{}, nil)
84+
85+
result, err := h.cleanupNamespace(ns.Name, ns)
86+
Expect(err).NotTo(HaveOccurred())
87+
Expect(result).To(Equal(ns))
88+
})
89+
})
90+
91+
Context("when cluster is missing from cache but exists in API server", func() {
92+
// This tests the fix for the race condition described in
93+
// https://github.com/rancher/fleet/issues/3830: the cleanup
94+
// controller's informer cache may not yet reflect a newly created
95+
// Cluster, so a not-found cache result is confirmed via a live
96+
// API call before the namespace is deleted.
97+
It("does not delete the namespace", func() {
98+
// Cache says cluster not found (stale)
99+
clusterCache.EXPECT().
100+
Get("fleet-default", "test-cluster").
101+
Return(nil, notFound)
102+
103+
// API server confirms cluster exists
104+
clusterClient.EXPECT().
105+
Get("fleet-default", "test-cluster", gomock.Any()).
106+
Return(&fleet.Cluster{}, nil)
107+
108+
result, err := h.cleanupNamespace(ns.Name, ns)
109+
Expect(err).NotTo(HaveOccurred())
110+
Expect(result).To(Equal(ns))
111+
})
112+
})
113+
114+
Context("when cluster is missing from both cache and API server", func() {
115+
It("deletes the namespace", func() {
116+
clusterCache.EXPECT().
117+
Get("fleet-default", "test-cluster").
118+
Return(nil, notFound)
119+
120+
clusterClient.EXPECT().
121+
Get("fleet-default", "test-cluster", gomock.Any()).
122+
Return(nil, notFound)
123+
124+
namespaceClient.EXPECT().
125+
Delete(ns.Name, gomock.Any()).
126+
Return(nil)
127+
128+
result, err := h.cleanupNamespace(ns.Name, ns)
129+
Expect(err).NotTo(HaveOccurred())
130+
Expect(result).To(Equal(ns))
131+
})
132+
})
133+
134+
Context("when cache lookup returns a non-NotFound error", func() {
135+
It("returns the error without deleting", func() {
136+
apiErr := apierrors.NewServiceUnavailable("API server unavailable")
137+
clusterCache.EXPECT().
138+
Get("fleet-default", "test-cluster").
139+
Return(nil, apiErr)
140+
141+
result, err := h.cleanupNamespace(ns.Name, ns)
142+
Expect(err).To(Equal(apiErr))
143+
Expect(result).To(Equal(ns))
144+
})
145+
})
146+
147+
Context("when API server lookup returns a non-NotFound error", func() {
148+
It("returns the error without deleting", func() {
149+
apiErr := apierrors.NewServiceUnavailable("API server unavailable")
150+
clusterCache.EXPECT().
151+
Get("fleet-default", "test-cluster").
152+
Return(nil, notFound)
153+
154+
clusterClient.EXPECT().
155+
Get("fleet-default", "test-cluster", gomock.Any()).
156+
Return(nil, apiErr)
157+
158+
result, err := h.cleanupNamespace(ns.Name, ns)
159+
Expect(err).To(Equal(apiErr))
160+
Expect(result).To(Equal(ns))
161+
})
162+
})
163+
})

internal/cmd/controller/cleanup/controllers/controllers.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,7 @@ func Register(ctx context.Context, appCtx *AppContext) error {
6767
appCtx.RBAC.ClusterRoleBinding(),
6868
appCtx.Core.Namespace(),
6969
appCtx.Cluster().Cache(),
70+
appCtx.Cluster(),
7071
)
7172

7273
if err := appCtx.Start(ctx); err != nil {

0 commit comments

Comments
 (0)