Skip to content

Commit 021a934

Browse files
mprycclaude
andauthored
Fix OADP-7562: handle wildcard in scoped excluded cluster resources (#333)
Signed-off-by: Michal Pryc <mpryc@redhat.com> Co-authored-by: Claude <noreply@anthropic.com>
1 parent 17f1ee7 commit 021a934

7 files changed

Lines changed: 224 additions & 56 deletions

internal/common/constant/constant.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,9 @@ const JSONTagString = "json"
9393
// CommaString defines a constant for the comma string
9494
const CommaString = ","
9595

96+
// WildcardString defines the wildcard character used in Velero resource filters
97+
const WildcardString = "*"
98+
9699
// MaximumNacObjectNameLength represents Generated Non Admin Object Name and
97100
// must be below 63 characters, because it's used within object Label Value
98101
const MaximumNacObjectNameLength = validation.DNS1123LabelMaxLength

internal/common/function/function_test.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -378,6 +378,11 @@ func TestValidateBackupSpecEnforcedFields(t *testing.T) {
378378
ParallelFilesUpload: 32, //nolint:revive // just test
379379
},
380380
},
381+
{
382+
name: "VolumeGroupSnapshotLabelKey",
383+
enforcedValue: "enforced-label-key",
384+
overrideValue: "user-label-key",
385+
},
381386
}
382387
for _, test := range tests {
383388
t.Run(test.name, func(t *testing.T) {

internal/controller/nonadminbackup_controller.go

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ import (
2121
"context"
2222
"errors"
2323
"reflect"
24+
"slices"
2425

2526
"github.com/go-logr/logr"
2627
velerov1 "github.com/vmware-tanzu/velero/pkg/apis/velero/v1"
@@ -705,11 +706,18 @@ func (r *NonAdminBackupReconciler) createVeleroBackupAndSyncWithNonAdminBackup(c
705706

706707
if haveNewResourceFilterParameters {
707708
// Use the new-style exclusion list
708-
backupSpec.ExcludedNamespaceScopedResources = append(backupSpec.ExcludedNamespaceScopedResources,
709-
alwaysExcludedNamespacedResources...)
710-
backupSpec.ExcludedClusterScopedResources = append(backupSpec.ExcludedClusterScopedResources,
711-
alwaysExcludedClusterResources...)
712-
} else {
709+
// Skip appending when wildcard '*' is already present, as it already
710+
// excludes everything and mixing '*' with specific items causes Velero
711+
// FailedValidation.
712+
if !slices.Contains(backupSpec.ExcludedNamespaceScopedResources, constant.WildcardString) {
713+
backupSpec.ExcludedNamespaceScopedResources = append(backupSpec.ExcludedNamespaceScopedResources,
714+
alwaysExcludedNamespacedResources...)
715+
}
716+
if !slices.Contains(backupSpec.ExcludedClusterScopedResources, constant.WildcardString) {
717+
backupSpec.ExcludedClusterScopedResources = append(backupSpec.ExcludedClusterScopedResources,
718+
alwaysExcludedClusterResources...)
719+
}
720+
} else if !slices.Contains(backupSpec.ExcludedResources, constant.WildcardString) {
713721
// Fallback to the old-style exclusion list
714722
backupSpec.ExcludedResources = append(backupSpec.ExcludedResources,
715723
alwaysExcludedNamespacedResources...)

internal/controller/nonadminbackup_controller_test.go

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1546,6 +1546,86 @@ var _ = ginkgo.Describe("Test full reconcile loop of NonAdminBackup Controller",
15461546
},
15471547
addSyncLabel: true,
15481548
}),
1549+
ginkgo.Entry("Should not append default excluded resources when wildcard is used in excludedClusterScopedResources", nonAdminBackupFullReconcileScenario{
1550+
spec: nacv1alpha1.NonAdminBackupSpec{
1551+
BackupSpec: &velerov1.BackupSpec{
1552+
ExcludedClusterScopedResources: []string{"*"},
1553+
IncludedNamespaceScopedResources: []string{"pvc"},
1554+
},
1555+
},
1556+
status: nacv1alpha1.NonAdminBackupStatus{
1557+
Phase: nacv1alpha1.NonAdminPhaseCreated,
1558+
VeleroBackup: &nacv1alpha1.VeleroBackup{
1559+
Namespace: oadpNamespace,
1560+
Status: nil,
1561+
Spec: &velerov1.BackupSpec{
1562+
ExcludedClusterScopedResources: []string{"*"},
1563+
IncludedNamespaceScopedResources: []string{"pvc"},
1564+
ExcludedNamespaceScopedResources: []string{
1565+
nacv1alpha1.NonAdminBackups,
1566+
nacv1alpha1.NonAdminRestores,
1567+
nacv1alpha1.NonAdminBackupStorageLocations,
1568+
},
1569+
},
1570+
},
1571+
Conditions: []metav1.Condition{
1572+
{
1573+
Type: "Accepted",
1574+
Status: metav1.ConditionTrue,
1575+
Reason: "BackupAccepted",
1576+
Message: "backup accepted",
1577+
},
1578+
{
1579+
Type: "Queued",
1580+
Status: metav1.ConditionTrue,
1581+
Reason: "BackupScheduled",
1582+
Message: "Created Velero Backup object",
1583+
},
1584+
},
1585+
},
1586+
}),
1587+
ginkgo.Entry("Should not append default excluded resources when wildcard is used in excludedNamespaceScopedResources", nonAdminBackupFullReconcileScenario{
1588+
spec: nacv1alpha1.NonAdminBackupSpec{
1589+
BackupSpec: &velerov1.BackupSpec{
1590+
ExcludedNamespaceScopedResources: []string{"*"},
1591+
IncludedClusterScopedResources: []string{"persistentvolumes"},
1592+
},
1593+
},
1594+
status: nacv1alpha1.NonAdminBackupStatus{
1595+
Phase: nacv1alpha1.NonAdminPhaseCreated,
1596+
VeleroBackup: &nacv1alpha1.VeleroBackup{
1597+
Namespace: oadpNamespace,
1598+
Status: nil,
1599+
Spec: &velerov1.BackupSpec{
1600+
ExcludedNamespaceScopedResources: []string{"*"},
1601+
IncludedClusterScopedResources: []string{"persistentvolumes"},
1602+
ExcludedClusterScopedResources: []string{
1603+
"securitycontextconstraints",
1604+
"clusterroles",
1605+
"clusterrolebindings",
1606+
"priorityclasses",
1607+
"customresourcedefinitions",
1608+
"virtualmachineclusterinstancetypes",
1609+
"virtualmachineclusterpreferences",
1610+
},
1611+
},
1612+
},
1613+
Conditions: []metav1.Condition{
1614+
{
1615+
Type: "Accepted",
1616+
Status: metav1.ConditionTrue,
1617+
Reason: "BackupAccepted",
1618+
Message: "backup accepted",
1619+
},
1620+
{
1621+
Type: "Queued",
1622+
Status: metav1.ConditionTrue,
1623+
Reason: "BackupScheduled",
1624+
Message: "Created Velero Backup object",
1625+
},
1626+
},
1627+
},
1628+
}),
15491629
)
15501630

15511631
ginkgo.DescribeTable("Reconcile triggered by NonAdminBackup sync event",

internal/controller/nonadminbackupstoragelocation_controller.go

Lines changed: 52 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -12,42 +12,43 @@ distributed under the License is distributed on an "AS IS" BASIS,
1212
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1313
See the License for the specific language governing permissions and
1414
limitations under the License.
15-
16-
RESOURCE CONFLICT RESOLUTION ENHANCEMENTS:
17-
This file has been enhanced to resolve resource conflict issues that occurred when
18-
multiple controllers or processes attempted to update the same NonAdminBackupStorageLocationRequest
19-
objects simultaneously. The following changes were made:
20-
21-
1. RETRY LOGIC FRAMEWORK (updateStatusWithRetry function):
22-
- Uses standard Kubernetes client-go retry.RetryOnConflict with DefaultRetry settings
23-
- Handles "object has been modified" errors gracefully
24-
- Fetches fresh object copies to avoid stale ResourceVersion conflicts
25-
- Leverages proven Kubernetes retry patterns (5 attempts, 10ms+jitter)
26-
27-
2. NIL SAFETY CHECKS (ensureNonAdminRequest function):
28-
- Prevents panic when SourceNonAdminBSL is nil during initialization
29-
- Converts terminal errors to requeue conditions for uninitialized status
30-
- Allows proper status initialization timing in high-concurrency environments
31-
32-
3. OPTIMIZED STATUS UPDATES (createNonAdminRequest function):
33-
- Uses fast-path direct updates for new objects
34-
- Falls back to retry logic only when conflicts are detected
35-
- Preserves computed status values while ensuring conflict resilience
36-
37-
4. TEST ENVIRONMENT ADAPTATIONS:
38-
- Increased timeouts to accommodate retry logic execution time
39-
- Reduced polling frequency to handle Kubernetes client rate limiting
40-
- Added delays to prevent overwhelming API server during test runs
41-
42-
These enhancements ensure that OADP non-admin backup operations complete successfully
43-
even under high concurrency or when multiple reconciliation events occur simultaneously.
4415
*/
4516

17+
// RESOURCE CONFLICT RESOLUTION ENHANCEMENTS:
18+
// This file has been enhanced to resolve resource conflict issues that occurred when
19+
// multiple controllers or processes attempted to update the same NonAdminBackupStorageLocationRequest
20+
// objects simultaneously. The following changes were made:
21+
//
22+
// 1. RETRY LOGIC FRAMEWORK (updateStatusWithRetry function):
23+
// - Uses standard Kubernetes client-go retry.RetryOnConflict with DefaultRetry settings
24+
// - Handles "object has been modified" errors gracefully
25+
// - Fetches fresh object copies to avoid stale ResourceVersion conflicts
26+
// - Leverages proven Kubernetes retry patterns (5 attempts, 10ms+jitter)
27+
//
28+
// 2. NIL SAFETY CHECKS (ensureNonAdminRequest function):
29+
// - Prevents panic when SourceNonAdminBSL is nil during initialization
30+
// - Converts terminal errors to requeue conditions for uninitialized status
31+
// - Allows proper status initialization timing in high-concurrency environments
32+
//
33+
// 3. OPTIMIZED STATUS UPDATES (createNonAdminRequest function):
34+
// - Uses fast-path direct updates for new objects
35+
// - Falls back to retry logic only when conflicts are detected
36+
// - Preserves computed status values while ensuring conflict resilience
37+
//
38+
// 4. TEST ENVIRONMENT ADAPTATIONS:
39+
// - Increased timeouts to accommodate retry logic execution time
40+
// - Reduced polling frequency to handle Kubernetes client rate limiting
41+
// - Added delays to prevent overwhelming API server during test runs
42+
//
43+
// These enhancements ensure that OADP non-admin backup operations complete successfully
44+
// even under high concurrency or when multiple reconciliation events occur simultaneously.
45+
4646
package controller
4747

4848
import (
4949
"context"
5050
"errors"
51+
"fmt"
5152
"reflect"
5253
"time"
5354

@@ -125,7 +126,10 @@ func (r *NonAdminBackupStorageLocationReconciler) updateStatusWithRetry(ctx cont
125126
// Get the latest version of the object from the API server to ensure we have
126127
// the most recent ResourceVersion and avoid stale object conflicts
127128
key := client.ObjectKeyFromObject(obj)
128-
fresh := obj.DeepCopyObject().(client.Object)
129+
fresh, ok := obj.DeepCopyObject().(client.Object)
130+
if !ok {
131+
return errors.New("failed to convert deep copy to client.Object")
132+
}
129133
if err := r.Get(ctx, key, fresh); err != nil {
130134
return err // RetryOnConflict will handle conflict vs non-conflict errors
131135
}
@@ -643,7 +647,11 @@ func (r *NonAdminBackupStorageLocationReconciler) createNonAdminRequest(ctx cont
643647
// - Event-driven reconciliation causing concurrent status updates
644648
logger.V(1).Info("NonAdminBackupStorageLocationRequest already exists")
645649
if updateErr := r.updateStatusWithRetry(ctx, logger, nabslRequest, func(obj client.Object) bool {
646-
req := obj.(*nacv1alpha1.NonAdminBackupStorageLocationRequest)
650+
req, ok := obj.(*nacv1alpha1.NonAdminBackupStorageLocationRequest)
651+
if !ok {
652+
logger.Error(fmt.Errorf("expected *NonAdminBackupStorageLocationRequest, got %T", obj), "Unexpected type assertion failure")
653+
return false
654+
}
647655
return updatePhaseIfNeeded(&req.Status.Phase, req.Spec.ApprovalDecision)
648656
}); updateErr != nil {
649657
logger.Error(updateErr, failedUpdateStatusError)
@@ -704,26 +712,22 @@ func (r *NonAdminBackupStorageLocationReconciler) createNonAdminRequest(ctx cont
704712
// - Correctness: Proper status initialization even under load
705713
if updated := updateNonAdminRequestStatus(&nonAdminBslRequest.Status, nabsl, approvalDecision); updated {
706714
if updateErr := r.Status().Update(ctx, &nonAdminBslRequest); updateErr != nil {
707-
if apierrors.IsConflict(updateErr) {
708-
// CONFLICT DETECTED: Another process modified the request between create and status update
709-
// This can happen when:
710-
// - Admin approves/rejects the request immediately after creation
711-
// - Multiple reconcile loops are triggered by related events
712-
// - High concurrency in the test environment
713-
logger.V(1).Info("Conflict on initial status update, retrying with fresh object...")
714-
if retryErr := r.updateStatusWithRetry(ctx, logger, &nonAdminBslRequest, func(obj client.Object) bool {
715-
req := obj.(*nacv1alpha1.NonAdminBackupStorageLocationRequest)
716-
return updateNonAdminRequestStatus(&req.Status, nabsl, approvalDecision)
717-
}); retryErr != nil {
718-
logger.Error(retryErr, failedUpdateStatusError)
719-
return false, retryErr
720-
}
721-
} else {
722-
// NON-CONFLICT ERROR: Validation, permission, or other API server issue
723-
// Don't retry these as they're likely to persist
715+
if !apierrors.IsConflict(updateErr) {
724716
logger.Error(updateErr, failedUpdateStatusError)
725717
return false, updateErr
726718
}
719+
// CONFLICT DETECTED: Another process modified the request between create and status update
720+
logger.V(1).Info("Conflict on initial status update, retrying with fresh object...")
721+
if retryErr := r.updateStatusWithRetry(ctx, logger, &nonAdminBslRequest, func(obj client.Object) bool {
722+
req, ok := obj.(*nacv1alpha1.NonAdminBackupStorageLocationRequest)
723+
if !ok {
724+
return false
725+
}
726+
return updateNonAdminRequestStatus(&req.Status, nabsl, approvalDecision)
727+
}); retryErr != nil {
728+
logger.Error(retryErr, failedUpdateStatusError)
729+
return false, retryErr
730+
}
727731
}
728732
}
729733

internal/controller/nonadminrestore_controller.go

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ import (
2020
"context"
2121
"errors"
2222
"reflect"
23+
"slices"
2324

2425
"github.com/go-logr/logr"
2526
velerov1 "github.com/vmware-tanzu/velero/pkg/apis/velero/v1"
@@ -348,8 +349,10 @@ func (r *NonAdminRestoreReconciler) createVeleroRestore(ctx context.Context, log
348349
}
349350
}
350351

351-
restoreSpec.ExcludedResources = append(restoreSpec.ExcludedResources,
352-
"volumesnapshotclasses")
352+
if !slices.Contains(restoreSpec.ExcludedResources, constant.WildcardString) {
353+
restoreSpec.ExcludedResources = append(restoreSpec.ExcludedResources,
354+
"volumesnapshotclasses")
355+
}
353356

354357
veleroRestore = &velerov1.Restore{
355358
ObjectMeta: metav1.ObjectMeta{

internal/controller/nonadminrestore_controller_test.go

Lines changed: 66 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ import (
2020
"context"
2121
"fmt"
2222
"reflect"
23+
"slices"
2324
"strings"
2425
"time"
2526

@@ -366,7 +367,11 @@ var _ = ginkgo.Describe("Test full reconcile loop of NonAdminRestore Controller"
366367
ginkgo.By("Validating Velero Restore Spec")
367368
expectedSpec := scenario.enforcedRestoreSpec.DeepCopy()
368369
expectedSpec.IncludedNamespaces = []string{nonAdminRestoreNamespace}
369-
expectedSpec.ExcludedResources = []string{"volumesnapshotclasses"}
370+
if !slices.Contains(scenario.spec.RestoreSpec.ExcludedResources, "*") {
371+
expectedSpec.ExcludedResources = append(expectedSpec.ExcludedResources, "volumesnapshotclasses")
372+
} else {
373+
expectedSpec.ExcludedResources = scenario.spec.RestoreSpec.ExcludedResources
374+
}
370375
gomega.Expect(reflect.DeepEqual(veleroRestore.Spec, *expectedSpec)).To(gomega.BeTrue())
371376
}
372377

@@ -605,5 +610,65 @@ var _ = ginkgo.Describe("Test full reconcile loop of NonAdminRestore Controller"
605610
QueueInfo: nil,
606611
},
607612
}),
613+
ginkgo.Entry("Should not append default excluded resources when wildcard is used in excludedResources", nonAdminRestoreFullReconcileScenario{
614+
spec: nacv1alpha1.NonAdminRestoreSpec{
615+
RestoreSpec: &velerov1.RestoreSpec{
616+
BackupName: "test",
617+
ExcludedResources: []string{"*"},
618+
},
619+
},
620+
backupStatus: nacv1alpha1.NonAdminBackupStatus{
621+
Phase: nacv1alpha1.NonAdminPhaseCreated,
622+
VeleroBackup: &nacv1alpha1.VeleroBackup{
623+
Status: &velerov1.BackupStatus{
624+
Phase: velerov1.BackupPhaseCompleted,
625+
CompletionTimestamp: &metav1.Time{Time: time.Now()},
626+
},
627+
},
628+
Conditions: []metav1.Condition{
629+
{
630+
Type: "Accepted",
631+
Status: metav1.ConditionTrue,
632+
Reason: "BackupAccepted",
633+
Message: "backup accepted",
634+
LastTransitionTime: metav1.NewTime(time.Now()),
635+
},
636+
{
637+
Type: "Queued",
638+
Status: metav1.ConditionTrue,
639+
Reason: "BackupScheduled",
640+
Message: "Created Velero Backup object",
641+
LastTransitionTime: metav1.NewTime(time.Now()),
642+
},
643+
},
644+
QueueInfo: &nacv1alpha1.QueueInfo{
645+
EstimatedQueuePosition: 0,
646+
},
647+
},
648+
status: nacv1alpha1.NonAdminRestoreStatus{
649+
Phase: nacv1alpha1.NonAdminPhaseCreated,
650+
VeleroRestore: &nacv1alpha1.VeleroRestore{
651+
Status: nil,
652+
},
653+
Conditions: []metav1.Condition{
654+
{
655+
Type: "Accepted",
656+
Status: metav1.ConditionTrue,
657+
Reason: "RestoreAccepted",
658+
Message: "restore accepted",
659+
},
660+
{
661+
Type: "Queued",
662+
Status: metav1.ConditionTrue,
663+
Reason: "RestoreScheduled",
664+
Message: "Created Velero Restore object",
665+
},
666+
},
667+
QueueInfo: &nacv1alpha1.QueueInfo{
668+
EstimatedQueuePosition: 1,
669+
},
670+
},
671+
enforcedRestoreSpec: &velerov1.RestoreSpec{},
672+
}),
608673
)
609674
})

0 commit comments

Comments
 (0)