diff --git a/internal/cmd/controller/cleanup/controllers/cleanup/controller.go b/internal/cmd/controller/cleanup/controllers/cleanup/controller.go index d809d02027..b2cbc86c68 100644 --- a/internal/cmd/controller/cleanup/controllers/cleanup/controller.go +++ b/internal/cmd/controller/cleanup/controllers/cleanup/controller.go @@ -16,13 +16,15 @@ import ( rbacv1 "k8s.io/api/rbac/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/api/meta" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" ) type handler struct { - apply apply.Apply - clusters fleetcontrollers.ClusterCache - namespaces corecontrollers.NamespaceClient + apply apply.Apply + clusters fleetcontrollers.ClusterCache + clustersClient fleetcontrollers.ClusterClient + namespaces corecontrollers.NamespaceClient } func Register(ctx context.Context, apply apply.Apply, @@ -33,11 +35,13 @@ func Register(ctx context.Context, apply apply.Apply, clusterRole rbaccontrollers.ClusterRoleController, clusterRoleBinding rbaccontrollers.ClusterRoleBindingController, namespaces corecontrollers.NamespaceController, - clusterCache fleetcontrollers.ClusterCache) { + clusterCache fleetcontrollers.ClusterCache, + clusterClient fleetcontrollers.ClusterClient) { h := &handler{ - apply: apply, - clusters: clusterCache, - namespaces: namespaces, + apply: apply, + clusters: clusterCache, + clustersClient: clusterClient, + namespaces: namespaces, } clusterRole.OnChange(ctx, "managed-cleanup", func(_ string, obj *rbacv1.ClusterRole) (*rbacv1.ClusterRole, error) { @@ -90,15 +94,29 @@ func (h *handler) cleanupNamespace(key string, obj *corev1.Namespace) (*corev1.N return obj, nil } - // check if the cluster for this cluster namespace still exists, otherwise clean up the namespace - _, err := h.clusters.Get(obj.Annotations[fleet.ClusterNamespaceAnnotation], obj.Annotations[fleet.ClusterAnnotation]) - if apierrors.IsNotFound(err) { - logrus.Infof("Cleaning up fleet-managed namespace %q, cluster not found", obj.Name) + clusterNS := obj.Annotations[fleet.ClusterNamespaceAnnotation] + clusterName := obj.Annotations[fleet.ClusterAnnotation] - err = h.namespaces.Delete(key, nil) + // check if the cluster for this cluster namespace still exists, otherwise clean up the namespace. + // First consult the informer cache; if the cache reports not-found, confirm with a live API call + // to avoid a race where the Cluster was just created and hasn't been reflected in the cache yet. + _, err := h.clusters.Get(clusterNS, clusterName) + if !apierrors.IsNotFound(err) { return obj, err } - return obj, err + + // Cache said not-found — verify against the API server before deleting. + _, err = h.clustersClient.Get(clusterNS, clusterName, metav1.GetOptions{}) + if err == nil { + // Cluster exists in the API server; the cache is stale. Do not delete the namespace. + return obj, nil + } + if !apierrors.IsNotFound(err) { + return obj, err + } + + logrus.Infof("Cleaning up fleet-managed namespace %q, cluster not found", obj.Name) + return obj, h.namespaces.Delete(key, nil) } func (h *handler) cleanup(obj runtime.Object) error { diff --git a/internal/cmd/controller/cleanup/controllers/cleanup/controller_test.go b/internal/cmd/controller/cleanup/controllers/cleanup/controller_test.go new file mode 100644 index 0000000000..7c5646ecbb --- /dev/null +++ b/internal/cmd/controller/cleanup/controllers/cleanup/controller_test.go @@ -0,0 +1,163 @@ +package cleanup + +import ( + "testing" + + "github.com/rancher/wrangler/v3/pkg/generic/fake" + "go.uber.org/mock/gomock" + + fleet "github.com/rancher/fleet/pkg/apis/fleet.cattle.io/v1alpha1" + + corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime/schema" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" +) + +func TestCleanup(t *testing.T) { + RegisterFailHandler(Fail) + RunSpecs(t, "Cleanup Controller Suite") +} + +var _ = Describe("cleanupNamespace", func() { + var ( + mockCtrl *gomock.Controller + clusterCache *fake.MockCacheInterface[*fleet.Cluster] + clusterClient *fake.MockClientInterface[*fleet.Cluster, *fleet.ClusterList] + namespaceClient *fake.MockNonNamespacedClientInterface[*corev1.Namespace, *corev1.NamespaceList] + h *handler + ns *corev1.Namespace + notFound = apierrors.NewNotFound(schema.GroupResource{Group: "fleet.cattle.io", Resource: "clusters"}, "test-cluster") + ) + + BeforeEach(func() { + mockCtrl = gomock.NewController(GinkgoT()) + clusterCache = fake.NewMockCacheInterface[*fleet.Cluster](mockCtrl) + clusterClient = fake.NewMockClientInterface[*fleet.Cluster, *fleet.ClusterList](mockCtrl) + namespaceClient = fake.NewMockNonNamespacedClientInterface[*corev1.Namespace, *corev1.NamespaceList](mockCtrl) + + h = &handler{ + clusters: clusterCache, + clustersClient: clusterClient, + namespaces: namespaceClient, + } + + ns = &corev1.Namespace{ + ObjectMeta: metav1.ObjectMeta{ + Name: "cluster-fleet-default-test-cluster-abc123", + Labels: map[string]string{ + fleet.ManagedLabel: "true", + }, + Annotations: map[string]string{ + fleet.ClusterAnnotation: "test-cluster", + fleet.ClusterNamespaceAnnotation: "fleet-default", + }, + }, + } + }) + + Context("when namespace has no managed label", func() { + It("does not delete the namespace", func() { + ns.Labels = map[string]string{} + result, err := h.cleanupNamespace(ns.Name, ns) + Expect(err).NotTo(HaveOccurred()) + Expect(result).To(Equal(ns)) + }) + }) + + Context("when namespace is nil", func() { + It("returns nil without error", func() { + result, err := h.cleanupNamespace("some-key", nil) + Expect(err).NotTo(HaveOccurred()) + Expect(result).To(BeNil()) + }) + }) + + Context("when cluster exists in cache", func() { + It("does not delete the namespace", func() { + clusterCache.EXPECT(). + Get("fleet-default", "test-cluster"). + Return(&fleet.Cluster{}, nil) + + result, err := h.cleanupNamespace(ns.Name, ns) + Expect(err).NotTo(HaveOccurred()) + Expect(result).To(Equal(ns)) + }) + }) + + Context("when cluster is missing from cache but exists in API server", func() { + // This tests the fix for the race condition described in + // https://github.com/rancher/fleet/issues/3830: the cleanup + // controller's informer cache may not yet reflect a newly created + // Cluster, so a not-found cache result is confirmed via a live + // API call before the namespace is deleted. + It("does not delete the namespace", func() { + // Cache says cluster not found (stale) + clusterCache.EXPECT(). + Get("fleet-default", "test-cluster"). + Return(nil, notFound) + + // API server confirms cluster exists + clusterClient.EXPECT(). + Get("fleet-default", "test-cluster", gomock.Any()). + Return(&fleet.Cluster{}, nil) + + result, err := h.cleanupNamespace(ns.Name, ns) + Expect(err).NotTo(HaveOccurred()) + Expect(result).To(Equal(ns)) + }) + }) + + Context("when cluster is missing from both cache and API server", func() { + It("deletes the namespace", func() { + clusterCache.EXPECT(). + Get("fleet-default", "test-cluster"). + Return(nil, notFound) + + clusterClient.EXPECT(). + Get("fleet-default", "test-cluster", gomock.Any()). + Return(nil, notFound) + + namespaceClient.EXPECT(). + Delete(ns.Name, gomock.Any()). + Return(nil) + + result, err := h.cleanupNamespace(ns.Name, ns) + Expect(err).NotTo(HaveOccurred()) + Expect(result).To(Equal(ns)) + }) + }) + + Context("when cache lookup returns a non-NotFound error", func() { + It("returns the error without deleting", func() { + apiErr := apierrors.NewServiceUnavailable("API server unavailable") + clusterCache.EXPECT(). + Get("fleet-default", "test-cluster"). + Return(nil, apiErr) + + result, err := h.cleanupNamespace(ns.Name, ns) + Expect(err).To(Equal(apiErr)) + Expect(result).To(Equal(ns)) + }) + }) + + Context("when API server lookup returns a non-NotFound error", func() { + It("returns the error without deleting", func() { + apiErr := apierrors.NewServiceUnavailable("API server unavailable") + clusterCache.EXPECT(). + Get("fleet-default", "test-cluster"). + Return(nil, notFound) + + clusterClient.EXPECT(). + Get("fleet-default", "test-cluster", gomock.Any()). + Return(nil, apiErr) + + result, err := h.cleanupNamespace(ns.Name, ns) + Expect(err).To(Equal(apiErr)) + Expect(result).To(Equal(ns)) + }) + }) +}) diff --git a/internal/cmd/controller/cleanup/controllers/controllers.go b/internal/cmd/controller/cleanup/controllers/controllers.go index 6114e8aed7..8173fb9c41 100644 --- a/internal/cmd/controller/cleanup/controllers/controllers.go +++ b/internal/cmd/controller/cleanup/controllers/controllers.go @@ -67,6 +67,7 @@ func Register(ctx context.Context, appCtx *AppContext) error { appCtx.RBAC.ClusterRoleBinding(), appCtx.Core.Namespace(), appCtx.Cluster().Cache(), + appCtx.Cluster(), ) if err := appCtx.Start(ctx); err != nil {