Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changelogs/unreleased/9693-priyansh17
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Enhance backup deletion logic to handle tarball download failures
39 changes: 38 additions & 1 deletion pkg/controller/backup_deletion_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (
"context"
"encoding/json"
"fmt"
"strings"
"time"

jsonpatch "github.com/evanphx/json-patch/v5"
Expand Down Expand Up @@ -267,8 +268,17 @@ func (r *backupDeletionReconciler) Reconcile(ctx context.Context, req ctrl.Reque

if err != nil {
log.WithError(err).Errorf("Unable to download tarball for backup %s, skipping associated DeleteItemAction plugins", backup.Name)
// for backups which failed before tarball object could be uploaded we do offline cleanup
log.Info("Cleaning up CSI volumesnapshots")
r.deleteCSIVolumeSnapshotsIfAny(ctx, backup, log)

// If the tarball simply does not exist (HTTP 404 / not found), the download
// failure is permanent and not retryable, so we let deletion proceed.
// For transient errors (throttling, auth failures, network issues), record
// the error to fail the deletion so it can be retried later.
if !isTarballNotFoundError(err) {
errs = append(errs, errors.Wrapf(err, "error downloading backup tarball, CSI snapshot cleanup was skipped").Error())
Comment thread
priyansh17 marked this conversation as resolved.
}
} else {
defer closeAndRemoveFile(backupFile, r.logger)
deleteCtx := &delete.Context{
Expand Down Expand Up @@ -351,11 +361,13 @@ func (r *backupDeletionReconciler) Reconcile(ctx context.Context, req ctrl.Reque
}
}

if backupStore != nil {
if backupStore != nil && len(errs) == 0 {
Comment thread
priyansh17 marked this conversation as resolved.
log.Info("Removing backup from backup storage")
if err := backupStore.DeleteBackup(backup.Name); err != nil {
errs = append(errs, err.Error())
}
} else if len(errs) > 0 {
log.Info("Skipping removal of backup from backup storage due to previous errors")
}

log.Info("Removing restores")
Expand Down Expand Up @@ -691,3 +703,28 @@ func batchDeleteSnapshots(ctx context.Context, repoEnsurer *repository.Ensurer,

return errs
}

// isTarballNotFoundError reports whether err indicates that the backup tarball
// does not exist in object storage (e.g. HTTP 404 / not-found). Such errors are
// permanent and not retryable, so callers should let deletion proceed (skipping
// DeleteItemAction plugins) rather than failing the entire deletion.
//
// Transient errors (throttling, auth failures, network timeouts) return false so
// the deletion is failed and can be retried once the storage is reachable again.
func isTarballNotFoundError(err error) bool {
if err == nil {
return false
}
// Lower-case once for all comparisons.
msg := strings.ToLower(err.Error())
// Common "not found" indicators across cloud providers:
// - "not found" / "does not exist": generic, in-memory object store
// - "nosuchkey": AWS S3
// - "blobnotfound": Azure Blob Storage
// - "objectnotexist": GCS
return strings.Contains(msg, "not found") ||
strings.Contains(msg, "does not exist") ||
strings.Contains(msg, "nosuchkey") ||
strings.Contains(msg, "blobnotfound") ||
strings.Contains(msg, "objectnotexist")
}
108 changes: 88 additions & 20 deletions pkg/controller/backup_deletion_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,6 @@ import (
"reflect"
"time"

snapshotv1api "github.com/kubernetes-csi/external-snapshotter/client/v8/apis/volumesnapshot/v1"

"context"

"github.com/sirupsen/logrus"
Expand Down Expand Up @@ -606,7 +604,7 @@ func TestBackupDeletionControllerReconcile(t *testing.T) {
// Make sure snapshot was deleted
assert.Equal(t, 0, td.volumeSnapshotter.SnapshotsTaken.Len())
})
t.Run("backup is still deleted if downloading tarball fails for DeleteItemAction plugins", func(t *testing.T) {
t.Run("backup deletion fails with error when downloading tarball fails for DeleteItemAction plugins", func(t *testing.T) {
backup := builder.ForBackup(velerov1api.DefaultNamespace, "foo").Result()
backup.UID = "uid"
backup.Spec.StorageLocation = "primary"
Expand Down Expand Up @@ -672,38 +670,108 @@ func TestBackupDeletionControllerReconcile(t *testing.T) {

td.backupStore.On("GetBackupVolumeSnapshots", input.Spec.BackupName).Return(snapshots, nil)
td.backupStore.On("GetBackupContents", input.Spec.BackupName).Return(nil, fmt.Errorf("error downloading tarball"))
td.backupStore.On("DeleteBackup", input.Spec.BackupName).Return(nil)

_, err := td.controller.Reconcile(t.Context(), td.req)
require.NoError(t, err)

td.backupStore.AssertCalled(t, "GetBackupContents", input.Spec.BackupName)
td.backupStore.AssertCalled(t, "DeleteBackup", input.Spec.BackupName)
// DeleteBackup (removing backup data from object storage) must NOT be called
// when there are errors, so that the deletion can be retried later.
td.backupStore.AssertNotCalled(t, "DeleteBackup", input.Spec.BackupName)

// the dbr should be deleted
// the dbr should still exist and be marked Processed with errors
res := &velerov1api.DeleteBackupRequest{}
err = td.fakeClient.Get(ctx, td.req.NamespacedName, res)
assert.True(t, apierrors.IsNotFound(err), "Expected not found error, but actual value of error: %v", err)
if err == nil {
t.Logf("status of the dbr: %s, errors in dbr: %v", res.Status.Phase, res.Status.Errors)
}
require.NoError(t, err, "Expected DBR to still exist after tarball download failure")
assert.Equal(t, velerov1api.DeleteBackupRequestPhaseProcessed, res.Status.Phase)
require.Len(t, res.Status.Errors, 1)
assert.Contains(t, res.Status.Errors[0], "error downloading backup tarball, CSI snapshot cleanup was skipped")

// backup CR should be deleted
// backup CR should NOT be deleted
err = td.fakeClient.Get(t.Context(), types.NamespacedName{
Namespace: velerov1api.DefaultNamespace,
Name: backup.Name,
}, &velerov1api.Backup{})
assert.True(t, apierrors.IsNotFound(err), "Expected not found error, but actual value of error: %v", err)
require.NoError(t, err, "Expected backup CR to still exist after tarball download failure")
})
t.Run("backup is still deleted if downloading tarball returns a not-found error", func(t *testing.T) {
backup := builder.ForBackup(velerov1api.DefaultNamespace, "foo").Result()
backup.UID = "uid"
backup.Spec.StorageLocation = "primary"

// leaked CSI snapshot should be deleted
err = td.fakeClient.Get(t.Context(), types.NamespacedName{
Namespace: "user-ns",
Name: "vs-1",
}, &snapshotv1api.VolumeSnapshot{})
assert.True(t, apierrors.IsNotFound(err), "Expected not found error for the leaked CSI snapshot, but actual value of error: %v", err)
input := defaultTestDbr()
input.Labels = nil

// Make sure snapshot was deleted
assert.Equal(t, 0, td.volumeSnapshotter.SnapshotsTaken.Len())
location := &velerov1api.BackupStorageLocation{
ObjectMeta: metav1.ObjectMeta{
Namespace: backup.Namespace,
Name: backup.Spec.StorageLocation,
},
Spec: velerov1api.BackupStorageLocationSpec{
Provider: "objStoreProvider",
StorageType: velerov1api.StorageType{
ObjectStorage: &velerov1api.ObjectStorageLocation{
Bucket: "bucket",
},
},
},
Status: velerov1api.BackupStorageLocationStatus{
Phase: velerov1api.BackupStorageLocationPhaseAvailable,
},
}

snapshotLocation := &velerov1api.VolumeSnapshotLocation{
ObjectMeta: metav1.ObjectMeta{
Namespace: backup.Namespace,
Name: "vsl-1",
},
Spec: velerov1api.VolumeSnapshotLocationSpec{
Provider: "provider-1",
},
}

td := setupBackupDeletionControllerTest(t, defaultTestDbr(), backup, location, snapshotLocation)
td.volumeSnapshotter.SnapshotsTaken.Insert("snap-1")

snapshots := []*volume.Snapshot{
{
Spec: volume.SnapshotSpec{
Location: "vsl-1",
},
Status: volume.SnapshotStatus{
ProviderSnapshotID: "snap-1",
},
},
}

pluginManager := &pluginmocks.Manager{}
pluginManager.On("GetVolumeSnapshotter", "provider-1").Return(td.volumeSnapshotter, nil)
pluginManager.On("GetDeleteItemActions").Return([]velero.DeleteItemAction{new(mocks.DeleteItemAction)}, nil)
pluginManager.On("CleanupClients")
td.controller.newPluginManager = func(logrus.FieldLogger) clientmgmt.Manager { return pluginManager }

td.backupStore.On("GetBackupVolumeSnapshots", input.Spec.BackupName).Return(snapshots, nil)
// Simulate a 404/not-found error (tarball has already been removed from storage)
td.backupStore.On("GetBackupContents", input.Spec.BackupName).Return(nil, fmt.Errorf("key not found"))
td.backupStore.On("DeleteBackup", input.Spec.BackupName).Return(nil)

_, err := td.controller.Reconcile(t.Context(), td.req)
require.NoError(t, err)

td.backupStore.AssertCalled(t, "GetBackupContents", input.Spec.BackupName)
td.backupStore.AssertCalled(t, "DeleteBackup", input.Spec.BackupName)

// the dbr should be deleted (not-found is treated as permanent, deletion proceeds)
res := &velerov1api.DeleteBackupRequest{}
err = td.fakeClient.Get(ctx, td.req.NamespacedName, res)
assert.True(t, apierrors.IsNotFound(err), "Expected DBR to be deleted after not-found tarball error, but actual error: %v", err)

// backup CR should be deleted because there are no errors in errs
err = td.fakeClient.Get(t.Context(), types.NamespacedName{
Namespace: velerov1api.DefaultNamespace,
Name: backup.Name,
}, &velerov1api.Backup{})
assert.True(t, apierrors.IsNotFound(err), "Expected backup CR to be deleted after not-found tarball error, but actual error: %v", err)
})
t.Run("Expired request will be deleted if the status is processed", func(t *testing.T) {
expired := time.Date(2018, 4, 3, 12, 0, 0, 0, time.UTC)
Expand Down
Loading