Skip to content

Commit 9bcf1c5

Browse files
committed
Auto-exclude Velero namespace from backups instead of hard error
Velero does not support backing up its own namespace. Previously this would fail during execution with no clear signal at request time. This change auto-excludes the Velero namespace during backup preparation: - When included via wildcard or empty includes (all namespaces): adds the Velero namespace to ExcludedNamespaces with a warning log. - When explicitly listed alongside other namespaces: removes it from IncludedNamespaces, adds to ExcludedNamespaces, and logs a warning. - When it is the only included namespace: returns a validation error to prevent a silent empty backup. - Avoids duplicate entries in ExcludedNamespaces if the namespace was already excluded by prior logic (e.g. exclude-from-backup label). Signed-off-by: Achin Mishra <urs.achin.007@gmail.com>
2 parents 5568ccf + 8a6ac7a commit 9bcf1c5

5 files changed

Lines changed: 147 additions & 31 deletions

File tree

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Enhance backup deletion logic to handle tarball download failures

pkg/controller/backup_controller.go

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -600,7 +600,17 @@ func (b *backupReconciler) prepareBackupRequest(ctx context.Context, backup *vel
600600
fmt.Sprintf("Velero namespace %q is the only included namespace, but Velero does not support backing up its own namespace", veleroNs))
601601
} else {
602602
logger.Warnf("Velero namespace %q is being excluded from this backup because Velero does not support backing up its own namespace.", veleroNs)
603-
request.Spec.ExcludedNamespaces = append(request.Spec.ExcludedNamespaces, veleroNs)
603+
// Only append if not already present (e.g. from the exclude-from-backup label logic).
604+
alreadyExcluded := false
605+
for _, ns := range request.Spec.ExcludedNamespaces {
606+
if ns == veleroNs {
607+
alreadyExcluded = true
608+
break
609+
}
610+
}
611+
if !alreadyExcluded {
612+
request.Spec.ExcludedNamespaces = append(request.Spec.ExcludedNamespaces, veleroNs)
613+
}
604614
}
605615
}
606616

pkg/controller/backup_controller_test.go

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -274,11 +274,11 @@ func TestProcessBackupVeleroNamespaceValidation(t *testing.T) {
274274
defaultBackupLocation := builder.ForBackupStorageLocation("velero", "loc-1").Phase(velerov1api.BackupStorageLocationPhaseAvailable).Result()
275275

276276
tests := []struct {
277-
name string
278-
backup *velerov1api.Backup
279-
expectedIncludedNamespaces []string
280-
expectedExcludedNamespaces []string
281-
expectValidationError bool
277+
name string
278+
backup *velerov1api.Backup
279+
expectedIncludedNamespaces []string
280+
expectedExcludedNamespaces []string
281+
expectValidationError bool
282282
}{
283283
{
284284
name: "velero namespace explicitly as only IncludedNamespace returns validation error",
@@ -323,10 +323,10 @@ func TestProcessBackupVeleroNamespaceValidation(t *testing.T) {
323323
expectValidationError: true,
324324
},
325325
{
326-
name: "velero plus other namespaces in includes removes velero and backs up the others",
327-
backup: builder.ForBackup(velerov1api.DefaultNamespace, "backup-1").Phase(velerov1api.BackupPhaseReadyToStart).IncludedNamespaces("velero", "default", "kube-system").Result(),
328-
expectedIncludedNamespaces: []string{"default", "kube-system"},
329-
expectedExcludedNamespaces: []string{"velero"},
326+
name: "velero plus other namespaces in includes removes velero and backs up the others",
327+
backup: builder.ForBackup(velerov1api.DefaultNamespace, "backup-1").Phase(velerov1api.BackupPhaseReadyToStart).IncludedNamespaces("velero", "default", "kube-system").Result(),
328+
expectedIncludedNamespaces: []string{"default", "kube-system"},
329+
expectedExcludedNamespaces: []string{"velero"},
330330
},
331331
}
332332

pkg/controller/backup_deletion_controller.go

Lines changed: 38 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ import (
2020
"context"
2121
"encoding/json"
2222
"fmt"
23+
"strings"
2324
"time"
2425

2526
jsonpatch "github.com/evanphx/json-patch/v5"
@@ -267,8 +268,17 @@ func (r *backupDeletionReconciler) Reconcile(ctx context.Context, req ctrl.Reque
267268

268269
if err != nil {
269270
log.WithError(err).Errorf("Unable to download tarball for backup %s, skipping associated DeleteItemAction plugins", backup.Name)
271+
// for backups which failed before tarball object could be uploaded we do offline cleanup
270272
log.Info("Cleaning up CSI volumesnapshots")
271273
r.deleteCSIVolumeSnapshotsIfAny(ctx, backup, log)
274+
275+
// If the tarball simply does not exist (HTTP 404 / not found), the download
276+
// failure is permanent and not retryable, so we let deletion proceed.
277+
// For transient errors (throttling, auth failures, network issues), record
278+
// the error to fail the deletion so it can be retried later.
279+
if !isTarballNotFoundError(err) {
280+
errs = append(errs, errors.Wrapf(err, "error downloading backup tarball, CSI snapshot cleanup was skipped").Error())
281+
}
272282
} else {
273283
defer closeAndRemoveFile(backupFile, r.logger)
274284
deleteCtx := &delete.Context{
@@ -351,11 +361,13 @@ func (r *backupDeletionReconciler) Reconcile(ctx context.Context, req ctrl.Reque
351361
}
352362
}
353363

354-
if backupStore != nil {
364+
if backupStore != nil && len(errs) == 0 {
355365
log.Info("Removing backup from backup storage")
356366
if err := backupStore.DeleteBackup(backup.Name); err != nil {
357367
errs = append(errs, err.Error())
358368
}
369+
} else if len(errs) > 0 {
370+
log.Info("Skipping removal of backup from backup storage due to previous errors")
359371
}
360372

361373
log.Info("Removing restores")
@@ -691,3 +703,28 @@ func batchDeleteSnapshots(ctx context.Context, repoEnsurer *repository.Ensurer,
691703

692704
return errs
693705
}
706+
707+
// isTarballNotFoundError reports whether err indicates that the backup tarball
708+
// does not exist in object storage (e.g. HTTP 404 / not-found). Such errors are
709+
// permanent and not retryable, so callers should let deletion proceed (skipping
710+
// DeleteItemAction plugins) rather than failing the entire deletion.
711+
//
712+
// Transient errors (throttling, auth failures, network timeouts) return false so
713+
// the deletion is failed and can be retried once the storage is reachable again.
714+
func isTarballNotFoundError(err error) bool {
715+
if err == nil {
716+
return false
717+
}
718+
// Lower-case once for all comparisons.
719+
msg := strings.ToLower(err.Error())
720+
// Common "not found" indicators across cloud providers:
721+
// - "not found" / "does not exist": generic, in-memory object store
722+
// - "nosuchkey": AWS S3
723+
// - "blobnotfound": Azure Blob Storage
724+
// - "objectnotexist": GCS
725+
return strings.Contains(msg, "not found") ||
726+
strings.Contains(msg, "does not exist") ||
727+
strings.Contains(msg, "nosuchkey") ||
728+
strings.Contains(msg, "blobnotfound") ||
729+
strings.Contains(msg, "objectnotexist")
730+
}

pkg/controller/backup_deletion_controller_test.go

Lines changed: 88 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -25,8 +25,6 @@ import (
2525
"reflect"
2626
"time"
2727

28-
snapshotv1api "github.com/kubernetes-csi/external-snapshotter/client/v8/apis/volumesnapshot/v1"
29-
3028
"context"
3129

3230
"github.com/sirupsen/logrus"
@@ -606,7 +604,7 @@ func TestBackupDeletionControllerReconcile(t *testing.T) {
606604
// Make sure snapshot was deleted
607605
assert.Equal(t, 0, td.volumeSnapshotter.SnapshotsTaken.Len())
608606
})
609-
t.Run("backup is still deleted if downloading tarball fails for DeleteItemAction plugins", func(t *testing.T) {
607+
t.Run("backup deletion fails with error when downloading tarball fails for DeleteItemAction plugins", func(t *testing.T) {
610608
backup := builder.ForBackup(velerov1api.DefaultNamespace, "foo").Result()
611609
backup.UID = "uid"
612610
backup.Spec.StorageLocation = "primary"
@@ -672,38 +670,108 @@ func TestBackupDeletionControllerReconcile(t *testing.T) {
672670

673671
td.backupStore.On("GetBackupVolumeSnapshots", input.Spec.BackupName).Return(snapshots, nil)
674672
td.backupStore.On("GetBackupContents", input.Spec.BackupName).Return(nil, fmt.Errorf("error downloading tarball"))
675-
td.backupStore.On("DeleteBackup", input.Spec.BackupName).Return(nil)
676673

677674
_, err := td.controller.Reconcile(t.Context(), td.req)
678675
require.NoError(t, err)
679676

680677
td.backupStore.AssertCalled(t, "GetBackupContents", input.Spec.BackupName)
681-
td.backupStore.AssertCalled(t, "DeleteBackup", input.Spec.BackupName)
678+
// DeleteBackup (removing backup data from object storage) must NOT be called
679+
// when there are errors, so that the deletion can be retried later.
680+
td.backupStore.AssertNotCalled(t, "DeleteBackup", input.Spec.BackupName)
682681

683-
// the dbr should be deleted
682+
// the dbr should still exist and be marked Processed with errors
684683
res := &velerov1api.DeleteBackupRequest{}
685684
err = td.fakeClient.Get(ctx, td.req.NamespacedName, res)
686-
assert.True(t, apierrors.IsNotFound(err), "Expected not found error, but actual value of error: %v", err)
687-
if err == nil {
688-
t.Logf("status of the dbr: %s, errors in dbr: %v", res.Status.Phase, res.Status.Errors)
689-
}
685+
require.NoError(t, err, "Expected DBR to still exist after tarball download failure")
686+
assert.Equal(t, velerov1api.DeleteBackupRequestPhaseProcessed, res.Status.Phase)
687+
require.Len(t, res.Status.Errors, 1)
688+
assert.Contains(t, res.Status.Errors[0], "error downloading backup tarball, CSI snapshot cleanup was skipped")
690689

691-
// backup CR should be deleted
690+
// backup CR should NOT be deleted
692691
err = td.fakeClient.Get(t.Context(), types.NamespacedName{
693692
Namespace: velerov1api.DefaultNamespace,
694693
Name: backup.Name,
695694
}, &velerov1api.Backup{})
696-
assert.True(t, apierrors.IsNotFound(err), "Expected not found error, but actual value of error: %v", err)
695+
require.NoError(t, err, "Expected backup CR to still exist after tarball download failure")
696+
})
697+
t.Run("backup is still deleted if downloading tarball returns a not-found error", func(t *testing.T) {
698+
backup := builder.ForBackup(velerov1api.DefaultNamespace, "foo").Result()
699+
backup.UID = "uid"
700+
backup.Spec.StorageLocation = "primary"
697701

698-
// leaked CSI snapshot should be deleted
699-
err = td.fakeClient.Get(t.Context(), types.NamespacedName{
700-
Namespace: "user-ns",
701-
Name: "vs-1",
702-
}, &snapshotv1api.VolumeSnapshot{})
703-
assert.True(t, apierrors.IsNotFound(err), "Expected not found error for the leaked CSI snapshot, but actual value of error: %v", err)
702+
input := defaultTestDbr()
703+
input.Labels = nil
704704

705-
// Make sure snapshot was deleted
706-
assert.Equal(t, 0, td.volumeSnapshotter.SnapshotsTaken.Len())
705+
location := &velerov1api.BackupStorageLocation{
706+
ObjectMeta: metav1.ObjectMeta{
707+
Namespace: backup.Namespace,
708+
Name: backup.Spec.StorageLocation,
709+
},
710+
Spec: velerov1api.BackupStorageLocationSpec{
711+
Provider: "objStoreProvider",
712+
StorageType: velerov1api.StorageType{
713+
ObjectStorage: &velerov1api.ObjectStorageLocation{
714+
Bucket: "bucket",
715+
},
716+
},
717+
},
718+
Status: velerov1api.BackupStorageLocationStatus{
719+
Phase: velerov1api.BackupStorageLocationPhaseAvailable,
720+
},
721+
}
722+
723+
snapshotLocation := &velerov1api.VolumeSnapshotLocation{
724+
ObjectMeta: metav1.ObjectMeta{
725+
Namespace: backup.Namespace,
726+
Name: "vsl-1",
727+
},
728+
Spec: velerov1api.VolumeSnapshotLocationSpec{
729+
Provider: "provider-1",
730+
},
731+
}
732+
733+
td := setupBackupDeletionControllerTest(t, defaultTestDbr(), backup, location, snapshotLocation)
734+
td.volumeSnapshotter.SnapshotsTaken.Insert("snap-1")
735+
736+
snapshots := []*volume.Snapshot{
737+
{
738+
Spec: volume.SnapshotSpec{
739+
Location: "vsl-1",
740+
},
741+
Status: volume.SnapshotStatus{
742+
ProviderSnapshotID: "snap-1",
743+
},
744+
},
745+
}
746+
747+
pluginManager := &pluginmocks.Manager{}
748+
pluginManager.On("GetVolumeSnapshotter", "provider-1").Return(td.volumeSnapshotter, nil)
749+
pluginManager.On("GetDeleteItemActions").Return([]velero.DeleteItemAction{new(mocks.DeleteItemAction)}, nil)
750+
pluginManager.On("CleanupClients")
751+
td.controller.newPluginManager = func(logrus.FieldLogger) clientmgmt.Manager { return pluginManager }
752+
753+
td.backupStore.On("GetBackupVolumeSnapshots", input.Spec.BackupName).Return(snapshots, nil)
754+
// Simulate a 404/not-found error (tarball has already been removed from storage)
755+
td.backupStore.On("GetBackupContents", input.Spec.BackupName).Return(nil, fmt.Errorf("key not found"))
756+
td.backupStore.On("DeleteBackup", input.Spec.BackupName).Return(nil)
757+
758+
_, err := td.controller.Reconcile(t.Context(), td.req)
759+
require.NoError(t, err)
760+
761+
td.backupStore.AssertCalled(t, "GetBackupContents", input.Spec.BackupName)
762+
td.backupStore.AssertCalled(t, "DeleteBackup", input.Spec.BackupName)
763+
764+
// the dbr should be deleted (not-found is treated as permanent, deletion proceeds)
765+
res := &velerov1api.DeleteBackupRequest{}
766+
err = td.fakeClient.Get(ctx, td.req.NamespacedName, res)
767+
assert.True(t, apierrors.IsNotFound(err), "Expected DBR to be deleted after not-found tarball error, but actual error: %v", err)
768+
769+
// backup CR should be deleted because there are no errors in errs
770+
err = td.fakeClient.Get(t.Context(), types.NamespacedName{
771+
Namespace: velerov1api.DefaultNamespace,
772+
Name: backup.Name,
773+
}, &velerov1api.Backup{})
774+
assert.True(t, apierrors.IsNotFound(err), "Expected backup CR to be deleted after not-found tarball error, but actual error: %v", err)
707775
})
708776
t.Run("Expired request will be deleted if the status is processed", func(t *testing.T) {
709777
expired := time.Date(2018, 4, 3, 12, 0, 0, 0, time.UTC)

0 commit comments

Comments
 (0)