Skip to content

Commit 3f1e7ed

Browse files
committed
operator: Clean up data PVCs on cleanupMetadata, regardless of cleanupData option
Signed-off-by: Aaron Wilson <aawilson@nvidia.com>
1 parent 343e4a4 commit 3f1e7ed

File tree

8 files changed

+108
-76
lines changed

8 files changed

+108
-76
lines changed

operator/CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,10 @@ We structure this changelog in accordance with [Keep a Changelog](https://keepac
1515
- Fixed syncPodTemplate not syncing env var removals when the desired env slice was a prefix of the current, causing rollout loops on config changes such as clearing `authNSecretName`.
1616
- Invalidate the AIStore API client on authentication failure, not just on failure to acquire initial token.
1717
- Mount path size is now optional to better support hostPath data mounts.
18+
- On cluster decommission, data PVCs are now deleted whenever `cleanupMetadata` is enabled, regardless of `cleanupData`.
19+
- Clarify the purpose of `cleanupData` as an API option to AIS for cleaning data on disk.
20+
- Use node selectors for host path cleanup jobs rather than querying existing pods by label.
21+
- Validate that the CRD only allows `cleanupData` when `cleanupMetadata` is enabled.
1822

1923
## v2.15.0
2024

operator/api/v1beta1/aistore_types.go

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -301,6 +301,7 @@ type CertIssuerRef struct {
301301
// AIStoreSpec defines the desired state of AIStore
302302
// +kubebuilder:validation:XValidation:rule="(has(self.targetSpec.size) && has(self.proxySpec.size)) || has(self.size)",message="Invalid cluster size, it is either not specified or value is not valid"
303303
// +kubebuilder:validation:XValidation:rule="[has(self.tls), has(self.tlsCertificate), has(self.tlsCertManagerIssuerName), has(self.tlsSecretName)].filter(x, x).size() <= 1",message="specify only one TLS option: tls, tlsCertificate, tlsCertManagerIssuerName, or tlsSecretName"
304+
// +kubebuilder:validation:XValidation:rule="!has(self.cleanupData) || !self.cleanupData || (has(self.cleanupMetadata) && self.cleanupMetadata)",message="cleanupData requires cleanupMetadata to be enabled"
304305
type AIStoreSpec struct {
305306
// Size of the cluster i.e. number of proxies and number of targets.
306307
// This can be changed by specifying size in either `proxySpec` or `targetSpec`.
@@ -363,15 +364,21 @@ type AIStoreSpec struct {
363364
ShutdownCluster *bool `json:"shutdownCluster,omitempty"`
364365

365366
// CleanupMetadata determines whether to clean up cluster and bucket metadata when the cluster is decommissioned.
366-
// When enabled, the cluster will fully decommission, removing metadata and optionally deleting user data.
367+
// When enabled, the cluster will fully decommission, removing metadata and optionally deleting user data according to the CleanupData option.
367368
// When disabled, the operator will call the AIS shutdown API to preserve metadata before deleting other k8s resources.
368-
// The metadata stored in the state PVCs will be preserved to be usable in a future AIS deployment.
369+
// State cleanup:
370+
// - All state PVCs will be deleted and cleaned up according to the storage class
371+
// - Any state host paths will be cleaned with operator-managed jobs
372+
// Data cleanup:
373+
// - Data PVCs will be deleted, but data on disk will remain unless CleanupData is enabled
374+
// - The reclamation behavior for PVs bound to the PVCs depends on the PV's reclaim policy, or, if the PV was dynamically provisioned, on the reclaim policy of the associated StorageClass.
369375
// +optional
370376
CleanupMetadata *bool `json:"cleanupMetadata,omitempty"`
371377

372-
// CleanupData determines whether to clean up PVCs and user data (including buckets and objects) when the cluster is decommissioned.
373-
// The reclamation of PVs linked to the PVCs depends on the PV reclaim policy or the default policy of the associated StorageClass.
374-
// This field is relevant only if you are deleting the CR (leading to decommissioning of the cluster).
378+
// CleanupData determines whether to clean up user data (including buckets and objects) when the cluster is decommissioned.
379+
// This field is relevant only if CleanupMetadata is true when deleting the AIStore CR (leading to decommissioning of the cluster).
380+
// If this option is enabled, AIStore itself will delete user data on cluster decommission.
381+
// If not enabled, user data cleanup will be left to the PV reclaim policy.
375382
// +optional
376383
CleanupData *bool `json:"cleanupData,omitempty"`
377384

operator/api/v1beta1/aistore_webhook.go

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -132,13 +132,29 @@ func (aisw *AIStoreWebhook) validateSpec(ctx context.Context, ais *AIStore) (adm
132132
)
133133
}
134134

135+
func (ais *AIStore) validateCleanupConfig() (admission.Warnings, error) {
136+
if !ais.ShouldCleanupMetadata() {
137+
return nil, nil
138+
}
139+
if ais.Spec.StateStorageClass != nil {
140+
return nil, nil
141+
}
142+
if len(ais.Spec.TargetSpec.NodeSelector) == 0 || len(ais.Spec.ProxySpec.NodeSelector) == 0 {
143+
return admission.Warnings{
144+
"cleanupMetadata is enabled with hostpath state and empty nodeSelector; host cleanup jobs will run on ALL nodes in the cluster",
145+
}, nil
146+
}
147+
return nil, nil
148+
}
149+
135150
func (ais *AIStore) ValidateSpec(_ context.Context, extraValidations ...func() (admission.Warnings, error)) (admission.Warnings, error) {
136151
var allWarnings admission.Warnings
137152
base := []func() (admission.Warnings, error){
138153
ais.validateSize,
139154
ais.validateStateStorage,
140155
ais.validateAutoScaling,
141156
ais.validateServiceSpec,
157+
ais.validateCleanupConfig,
142158
}
143159

144160
validations := make(

operator/config/base/crd/ais.nvidia.com_aistores.yaml

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1374,16 +1374,22 @@ spec:
13741374
type: string
13751375
cleanupData:
13761376
description: |-
1377-
CleanupData determines whether to clean up PVCs and user data (including buckets and objects) when the cluster is decommissioned.
1378-
The reclamation of PVs linked to the PVCs depends on the PV reclaim policy or the default policy of the associated StorageClass.
1379-
This field is relevant only if you are deleting the CR (leading to decommissioning of the cluster).
1377+
CleanupData determines whether to clean up user data (including buckets and objects) when the cluster is decommissioned.
1378+
This field is relevant only if CleanupMetadata is true when deleting the AIStore CR (leading to decommissioning of the cluster).
1379+
If this option is enabled, AIStore itself will delete user data on cluster decommission.
1380+
If not enabled, user data cleanup will be left to the PV reclaim policy.
13801381
type: boolean
13811382
cleanupMetadata:
13821383
description: |-
13831384
CleanupMetadata determines whether to clean up cluster and bucket metadata when the cluster is decommissioned.
1384-
When enabled, the cluster will fully decommission, removing metadata and optionally deleting user data.
1385+
When enabled, the cluster will fully decommission, removing metadata and optionally deleting user data according to the CleanupData option.
13851386
When disabled, the operator will call the AIS shutdown API to preserve metadata before deleting other k8s resources.
1386-
The metadata stored in the state PVCs will be preserved to be usable in a future AIS deployment.
1387+
State cleanup:
1388+
- All state PVCs will be deleted and cleaned up according to the storage class
1389+
- Any state host paths will be cleaned with operator-managed jobs
1390+
Data cleanup:
1391+
- Data PVCs will be deleted, but data on disk will remain unless CleanupData is enabled
1392+
- The reclamation behavior for PVs bound to the PVCs depends on the PV's reclaim policy, or, if the PV was dynamically provisioned, on the reclaim policy of the associated StorageClass.
13871393
type: boolean
13881394
clusterDomain:
13891395
description: 'Defines the cluster domain name for DNS. Default: cluster.local.'
@@ -5902,6 +5908,9 @@ spec:
59025908
or tlsSecretName'
59035909
rule: '[has(self.tls), has(self.tlsCertificate), has(self.tlsCertManagerIssuerName),
59045910
has(self.tlsSecretName)].filter(x, x).size() <= 1'
5911+
- message: cleanupData requires cleanupMetadata to be enabled
5912+
rule: '!has(self.cleanupData) || !self.cleanupData || (has(self.cleanupMetadata)
5913+
&& self.cleanupMetadata)'
59055914
status:
59065915
description: AIStoreStatus defines the observed state of AIStore
59075916
properties:

operator/helm/ais-operator/templates/aistore-crd.yaml

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1376,16 +1376,22 @@ spec:
13761376
type: string
13771377
cleanupData:
13781378
description: |-
1379-
CleanupData determines whether to clean up PVCs and user data (including buckets and objects) when the cluster is decommissioned.
1380-
The reclamation of PVs linked to the PVCs depends on the PV reclaim policy or the default policy of the associated StorageClass.
1381-
This field is relevant only if you are deleting the CR (leading to decommissioning of the cluster).
1379+
CleanupData determines whether to clean up user data (including buckets and objects) when the cluster is decommissioned.
1380+
This field is relevant only if CleanupMetadata is true when deleting the AIStore CR (leading to decommissioning of the cluster).
1381+
If this option is enabled, AIStore itself will delete user data on cluster decommission.
1382+
If not enabled, user data cleanup will be left to the PV reclaim policy.
13821383
type: boolean
13831384
cleanupMetadata:
13841385
description: |-
13851386
CleanupMetadata determines whether to clean up cluster and bucket metadata when the cluster is decommissioned.
1386-
When enabled, the cluster will fully decommission, removing metadata and optionally deleting user data.
1387+
When enabled, the cluster will fully decommission, removing metadata and optionally deleting user data according to the CleanupData option.
13871388
When disabled, the operator will call the AIS shutdown API to preserve metadata before deleting other k8s resources.
1388-
The metadata stored in the state PVCs will be preserved to be usable in a future AIS deployment.
1389+
State cleanup:
1390+
- All state PVCs will be deleted and cleaned up according to the storage class
1391+
- Any state host paths will be cleaned with operator-managed jobs
1392+
Data cleanup:
1393+
- Data PVCs will be deleted, but data on disk will remain unless CleanupData is enabled
1394+
- The reclamation behavior for PVs bound to the PVCs depends on the PV's reclaim policy, or, if the PV was dynamically provisioned, on the reclaim policy of the associated StorageClass.
13891395
type: boolean
13901396
clusterDomain:
13911397
description: 'Defines the cluster domain name for DNS. Default: cluster.local.'
@@ -5890,6 +5896,9 @@ spec:
58905896
or tlsSecretName'
58915897
rule: '[has(self.tls), has(self.tlsCertificate), has(self.tlsCertManagerIssuerName),
58925898
has(self.tlsSecretName)].filter(x, x).size() <= 1'
5899+
- message: cleanupData requires cleanupMetadata to be enabled
5900+
rule: '!has(self.cleanupData) || !self.cleanupData || (has(self.cleanupMetadata)
5901+
&& self.cleanupMetadata)'
58935902
status:
58945903
description: AIStoreStatus defines the observed state of AIStore
58955904
properties:

operator/pkg/client/api.go

Lines changed: 16 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,6 @@ import (
1010
"reflect"
1111

1212
aisv1 "github.com/ais-operator/api/v1beta1"
13-
"github.com/ais-operator/pkg/resources/proxy"
14-
"github.com/ais-operator/pkg/resources/target"
1513
appsv1 "k8s.io/api/apps/v1"
1614
batchv1 "k8s.io/api/batch/v1"
1715
corev1 "k8s.io/api/core/v1"
@@ -137,22 +135,6 @@ func (c *K8sClient) GetPod(ctx context.Context, name types.NamespacedName) (*cor
137135

138136
func (c *K8sClient) Status() client.StatusWriter { return c.client.Status() }
139137

140-
// listPodsAndUpdateNodeNames lists pods based on the provided label selector and updates the uniqueNodeNames map with the node names of those pods
141-
func (c *K8sClient) listPodsAndUpdateNodeNames(ctx context.Context, ais *aisv1.AIStore, labelSelector map[string]string, uniqueNodeNames sets.Set[string]) error {
142-
pods, err := c.ListPods(ctx, ais, labelSelector)
143-
if err != nil {
144-
return err
145-
}
146-
for i := range pods.Items {
147-
pod := &pods.Items[i]
148-
// check if the pod is running on a node (not failed or pending)
149-
if pod.Spec.NodeName != "" {
150-
uniqueNodeNames.Insert(pod.Spec.NodeName)
151-
}
152-
}
153-
return nil
154-
}
155-
156138
// ListNodesMatchingSelector returns a NodeList matching the given node selector
157139
func (c *K8sClient) ListNodesMatchingSelector(ctx context.Context, nodeSelector map[string]string) (*corev1.NodeList, error) {
158140
nodeList := &corev1.NodeList{}
@@ -161,16 +143,23 @@ func (c *K8sClient) ListNodesMatchingSelector(ctx context.Context, nodeSelector
161143
return nodeList, err
162144
}
163145

164-
// ListNodesRunningAIS returns a map of unique node names where AIS pods are running
165-
func (c *K8sClient) ListNodesRunningAIS(ctx context.Context, ais *aisv1.AIStore) ([]string, error) {
166-
uniqueNodeNames := sets.New[string]()
167-
if err := c.listPodsAndUpdateNodeNames(ctx, ais, proxy.RequiredPodLabels(ais), uniqueNodeNames); err != nil {
168-
return nil, err
169-
}
170-
if err := c.listPodsAndUpdateNodeNames(ctx, ais, target.RequiredPodLabels(ais), uniqueNodeNames); err != nil {
171-
return nil, err
146+
// ListNodesMatchingAISSelectors returns the union of node names matching
147+
// the proxy or target NodeSelector specs. A nil/empty selector matches all nodes.
148+
func (c *K8sClient) ListNodesMatchingAISSelectors(ctx context.Context, ais *aisv1.AIStore) ([]string, error) {
149+
nodeNames := sets.New[string]()
150+
for _, selector := range []map[string]string{
151+
ais.Spec.TargetSpec.NodeSelector,
152+
ais.Spec.ProxySpec.NodeSelector,
153+
} {
154+
nodes, err := c.ListNodesMatchingSelector(ctx, selector)
155+
if err != nil {
156+
return nil, err
157+
}
158+
for i := range nodes.Items {
159+
nodeNames.Insert(nodes.Items[i].Name)
160+
}
172161
}
173-
return uniqueNodeNames.UnsortedList(), nil
162+
return nodeNames.UnsortedList(), nil
174163
}
175164

176165
//////////////////////////////////////

operator/pkg/controllers/cluster_cleanup.go

Lines changed: 28 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -16,14 +16,11 @@ import (
1616
"github.com/ais-operator/pkg/resources/statsd"
1717
"github.com/ais-operator/pkg/resources/target"
1818
batchv1 "k8s.io/api/batch/v1"
19+
apierrors "k8s.io/apimachinery/pkg/api/errors"
1920
logf "sigs.k8s.io/controller-runtime/pkg/log"
2021
)
2122

2223
func (r *AIStoreReconciler) cleanup(ctx context.Context, ais *aisv1.AIStore) (updated bool, err error) {
23-
nodeNames, err := r.k8sClient.ListNodesRunningAIS(ctx, ais)
24-
if err != nil {
25-
r.log.Error(err, "Failed to list nodes running AIS")
26-
}
2724
updated, err = cmn.AnyFunc(
2825
func() (bool, error) { return r.cleanupTarget(ctx, ais) },
2926
func() (bool, error) { return r.cleanupProxy(ctx, ais) },
@@ -33,28 +30,26 @@ func (r *AIStoreReconciler) cleanup(ctx context.Context, ais *aisv1.AIStore) (up
3330
func() (bool, error) { return r.cleanupPVC(ctx, ais) },
3431
func() (bool, error) { return r.cleanupTLS(ctx, ais) },
3532
)
36-
if updated && ais.ShouldCleanupMetadata() {
37-
err = r.createCleanupJobs(ctx, ais, nodeNames)
38-
if err != nil {
39-
return
40-
}
41-
err = r.updateStatusWithState(ctx, ais, aisv1.HostCleanup)
42-
if err != nil {
43-
return
44-
}
33+
if err != nil {
34+
return
35+
}
36+
// If not using storage class, trigger jobs to clean up host mounts
37+
// Note this is not part of the above list to avoid host cleanup jobs blocking any K8s resource cleanup
38+
if ais.ShouldCleanupMetadata() && ais.Spec.StateStorageClass == nil {
39+
err = r.cleanupHostStateMounts(ctx, ais)
4540
}
4641
return
4742
}
4843

4944
func (r *AIStoreReconciler) createCleanupJobs(ctx context.Context, ais *aisv1.AIStore, nodes []string) error {
50-
if ais.Spec.StateStorageClass != nil {
51-
return nil
52-
}
5345
logger := logf.FromContext(ctx)
5446
logger.Info("Creating manual cleanup jobs", "nodes", nodes)
5547
for _, nodeName := range nodes {
5648
jobDef := cmn.NewCleanupJob(ais, nodeName)
5749
if err := r.k8sClient.Create(ctx, jobDef); err != nil {
50+
if apierrors.IsAlreadyExists(err) {
51+
continue
52+
}
5853
logger.Error(err, "Failed to create cleanup job", "name", jobDef.Name, "node", nodeName)
5954
return err
6055
}
@@ -111,17 +106,27 @@ func (r *AIStoreReconciler) deleteFinishedJobs(ctx context.Context, jobs *batchv
111106
return remaining, nil
112107
}
113108

109+
func (r *AIStoreReconciler) cleanupHostStateMounts(ctx context.Context, ais *aisv1.AIStore) error {
110+
nodeNames, err := r.k8sClient.ListNodesMatchingAISSelectors(ctx, ais)
111+
if err != nil {
112+
return err
113+
}
114+
err = r.createCleanupJobs(ctx, ais, nodeNames)
115+
if err != nil {
116+
return err
117+
}
118+
return r.updateStatusWithState(ctx, ais, aisv1.HostCleanup)
119+
}
120+
114121
func (r *AIStoreReconciler) cleanupPVC(ctx context.Context, ais *aisv1.AIStore) (bool, error) {
122+
// Note: Data PVCs must be deleted in-sync with state PVCs
123+
// Otherwise we risk new PVs being created on nodes with conflicting pre-existing state,
124+
// including cluster maps with ordinal-based addresses, e.g. ais-target-0.ais.svc.cluster.local
115125
if !ais.ShouldCleanupMetadata() {
116126
return false, nil
117127
}
118-
if ais.Spec.CleanupData != nil && *ais.Spec.CleanupData {
119-
return r.deleteAllPVCs(ctx, ais)
120-
}
121-
if ais.Spec.StateStorageClass != nil {
122-
return r.deleteStatePVCs(ctx, ais)
123-
}
124-
return false, nil
128+
// Includes all PVCs (both data and state)
129+
return r.deleteAllPVCs(ctx, ais)
125130
}
126131

127132
func (r *AIStoreReconciler) deleteAllPVCs(ctx context.Context, ais *aisv1.AIStore) (bool, error) {
@@ -134,17 +139,6 @@ func (r *AIStoreReconciler) deleteAllPVCs(ctx context.Context, ais *aisv1.AIStor
134139
return r.k8sClient.DeletePVCs(ctx, ais.Namespace, proxy.RequiredPodLabels(ais), nil)
135140
}
136141

137-
// Cleans up only dynamically created volumes by adding a filter by the defined state storage class
138-
func (r *AIStoreReconciler) deleteStatePVCs(ctx context.Context, ais *aisv1.AIStore) (bool, error) {
139-
r.log.Info("Cleaning up dynamic target PVCs")
140-
updated, err := r.k8sClient.DeletePVCs(ctx, ais.Namespace, target.RequiredPodLabels(ais), ais.Spec.StateStorageClass)
141-
if err != nil {
142-
return updated, err
143-
}
144-
r.log.Info("Cleaning up dynamic proxy PVCs")
145-
return r.k8sClient.DeletePVCs(ctx, ais.Namespace, proxy.RequiredPodLabels(ais), ais.Spec.StateStorageClass)
146-
}
147-
148142
func (r *AIStoreReconciler) cleanupRBAC(ctx context.Context, ais *aisv1.AIStore) (anyUpdated bool, err error) {
149143
return cmn.AnyFunc(
150144
func() (bool, error) {

operator/tests/e2e/cluster_test.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -509,6 +509,7 @@ var _ = Describe("Run Controller", func() {
509509

510510
It("Re-deploying with CleanupMetadata disabled should recover cluster", func(ctx context.Context) {
511511
cluArgs.CleanupMetadata = false
512+
cluArgs.CleanupData = false
512513
By("Deploy with cleanupMetadata false")
513514
cc := newClientCluster(ctx, AISTestCfg, WorkerCfg.K8sClient, cluArgs)
514515
defer func() {
@@ -537,6 +538,7 @@ var _ = Describe("Run Controller", func() {
537538
cc.destroyClusterOnly()
538539
// Cleanup metadata to remove PVCs so we can destroyAndCleanup PVs at the end
539540
cluArgs.CleanupMetadata = true
541+
cluArgs.CleanupData = true
540542
// Same cluster should recover all the same data and metadata
541543
By("Redeploy with cleanupMetadata true")
542544
cc.recreate(ctx, cluArgs)
@@ -546,12 +548,14 @@ var _ = Describe("Run Controller", func() {
546548

547549
It("Should detect port change when cluster is redeployed with different port", func(ctx context.Context) {
548550
cluArgs.CleanupMetadata = false
551+
cluArgs.CleanupData = false
549552
By("Deploy initial cluster with default ports")
550553
cc := newClientCluster(ctx, AISTestCfg, WorkerCfg.K8sClient, cluArgs)
551554
defer func() {
552555
cc.printLogs(ctx)
553556
// Ensure final cleanup has CleanupMetadata enabled
554557
cluArgs.CleanupMetadata = true
558+
cluArgs.CleanupData = true
555559
cc.destroyAndCleanup()
556560
}()
557561
cc.create(ctx)

0 commit comments

Comments
 (0)