Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changelogs/unreleased/9693-priyansh17
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Enhance backup deletion logic to handle tarball download failures
1 change: 1 addition & 0 deletions changelogs/unreleased/9700-priyansh17
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Fix issue #9699, add a 2-second gap between temporary CSI VolumeSnapshotContent create and delete operations
8 changes: 8 additions & 0 deletions internal/delete/actions/csi/volumesnapshotcontent_action.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ package csi

import (
"context"
"time"

"github.com/google/uuid"
snapshotv1api "github.com/kubernetes-csi/external-snapshotter/client/v7/apis/volumesnapshot/v1"
Expand All @@ -40,6 +41,10 @@ type volumeSnapshotContentDeleteItemAction struct {
crClient crclient.Client
}

const tempVSCCreateDeleteGap = 2 * time.Second

var sleepBetweenTempVSCCreateAndDelete = time.Sleep

// AppliesTo returns information indicating
// VolumeSnapshotContentRestoreItemAction action should be invoked
// while restoring VolumeSnapshotContent.snapshot.storage.k8s.io resources
Expand Down Expand Up @@ -115,6 +120,9 @@ func (p *volumeSnapshotContentDeleteItemAction) Execute(
}
p.log.Infof("Created temp VolumeSnapshotContent %s with DeletionPolicy=Delete to trigger cloud snapshot cleanup", snapCont.Name)

// Add a small delay before delete to avoid create/delete race conditions in CSI controllers.
sleepBetweenTempVSCCreateAndDelete(tempVSCCreateDeleteGap)

// Delete the temp VSC immediately to trigger cloud snapshot removal.
// The CSI driver will handle the actual cloud snapshot deletion.
if err := p.crClient.Delete(
Expand Down
49 changes: 49 additions & 0 deletions internal/delete/actions/csi/volumesnapshotcontent_action_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (
"context"
"fmt"
"testing"
"time"

snapshotv1api "github.com/kubernetes-csi/external-snapshotter/client/v7/apis/volumesnapshot/v1"
"github.com/sirupsen/logrus"
Expand All @@ -46,6 +47,21 @@ type fakeClientWithErrors struct {
deleteError error
}

type fakeClientWithCallTracking struct {
crclient.Client
events *[]string
}

func (c *fakeClientWithCallTracking) Create(ctx context.Context, obj crclient.Object, opts ...crclient.CreateOption) error {
*c.events = append(*c.events, "create")
return c.Client.Create(ctx, obj, opts...)
}

func (c *fakeClientWithCallTracking) Delete(ctx context.Context, obj crclient.Object, opts ...crclient.DeleteOption) error {
*c.events = append(*c.events, "delete")
return c.Client.Delete(ctx, obj, opts...)
}

func (c *fakeClientWithErrors) Get(ctx context.Context, key crclient.ObjectKey, obj crclient.Object, opts ...crclient.GetOption) error {
if c.getError != nil {
return c.getError
Expand Down Expand Up @@ -325,6 +341,39 @@ func TestTryDeleteOriginalVSC(t *testing.T) {
})
}

func TestVSCExecute_CreateSleepDeleteOrder(t *testing.T) {
snapshotHandleStr := "test"
vsc := builder.ForVolumeSnapshotContent("bar").
ObjectMeta(builder.WithLabelsMap(map[string]string{velerov1api.BackupNameLabel: "backup"})).
Status(&snapshotv1api.VolumeSnapshotContentStatus{SnapshotHandle: &snapshotHandleStr}).
Result()

vscMap, err := runtime.DefaultUnstructuredConverter.ToUnstructured(vsc)
require.NoError(t, err)

events := make([]string, 0, 3)
realClient := velerotest.NewFakeControllerRuntimeClient(t)
trackingClient := &fakeClientWithCallTracking{Client: realClient, events: &events}

originalSleep := sleepBetweenTempVSCCreateAndDelete
t.Cleanup(func() {
sleepBetweenTempVSCCreateAndDelete = originalSleep
})

sleepBetweenTempVSCCreateAndDelete = func(d time.Duration) {
require.Equal(t, tempVSCCreateDeleteGap, d)
events = append(events, "sleep")
}

p := volumeSnapshotContentDeleteItemAction{log: logrus.StandardLogger(), crClient: trackingClient}
err = p.Execute(&velero.DeleteItemActionExecuteInput{
Item: &unstructured.Unstructured{Object: vscMap},
Backup: builder.ForBackup("velero", "backup").Result(),
})
require.NoError(t, err)
require.Equal(t, []string{"create", "sleep", "delete"}, events)
}

func boolPtr(b bool) *bool {
return &b
}
Expand Down
39 changes: 38 additions & 1 deletion pkg/controller/backup_deletion_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (
"context"
"encoding/json"
"fmt"
"strings"
"time"

jsonpatch "github.com/evanphx/json-patch/v5"
Expand Down Expand Up @@ -267,8 +268,17 @@ func (r *backupDeletionReconciler) Reconcile(ctx context.Context, req ctrl.Reque

if err != nil {
log.WithError(err).Errorf("Unable to download tarball for backup %s, skipping associated DeleteItemAction plugins", backup.Name)
// for backups which failed before tarball object could be uploaded we do offline cleanup
log.Info("Cleaning up CSI volumesnapshots")
r.deleteCSIVolumeSnapshotsIfAny(ctx, backup, log)

// If the tarball simply does not exist (HTTP 404 / not found), the download
// failure is permanent and not retryable, so we let deletion proceed.
// For transient errors (throttling, auth failures, network issues), record
// the error to fail the deletion so it can be retried later.
if !isTarballNotFoundError(err) {
errs = append(errs, errors.Wrapf(err, "error downloading backup tarball, CSI snapshot cleanup was skipped").Error())
}
} else {
defer closeAndRemoveFile(backupFile, r.logger)
deleteCtx := &delete.Context{
Expand Down Expand Up @@ -351,11 +361,13 @@ func (r *backupDeletionReconciler) Reconcile(ctx context.Context, req ctrl.Reque
}
}

if backupStore != nil {
if backupStore != nil && len(errs) == 0 {
log.Info("Removing backup from backup storage")
if err := backupStore.DeleteBackup(backup.Name); err != nil {
errs = append(errs, err.Error())
}
} else if len(errs) > 0 {
log.Info("Skipping removal of backup from backup storage due to previous errors")
}

log.Info("Removing restores")
Expand Down Expand Up @@ -691,3 +703,28 @@ func batchDeleteSnapshots(ctx context.Context, repoEnsurer *repository.Ensurer,

return errs
}

// isTarballNotFoundError reports whether err indicates that the backup tarball
// does not exist in object storage (e.g. HTTP 404 / not-found). Such errors are
// permanent and not retryable, so callers should let deletion proceed (skipping
// DeleteItemAction plugins) rather than failing the entire deletion.
//
// Transient errors (throttling, auth failures, network timeouts) return false so
// the deletion is failed and can be retried once the storage is reachable again.
func isTarballNotFoundError(err error) bool {
if err == nil {
return false
}
// Lower-case once for all comparisons.
msg := strings.ToLower(err.Error())
// Common "not found" indicators across cloud providers:
// - "not found" / "does not exist": generic, in-memory object store
// - "nosuchkey": AWS S3
// - "blobnotfound": Azure Blob Storage
// - "objectnotexist": GCS
return strings.Contains(msg, "not found") ||
strings.Contains(msg, "does not exist") ||
strings.Contains(msg, "nosuchkey") ||
strings.Contains(msg, "blobnotfound") ||
strings.Contains(msg, "objectnotexist")
}
110 changes: 89 additions & 21 deletions pkg/controller/backup_deletion_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,6 @@
"reflect"
"time"

snapshotv1api "github.com/kubernetes-csi/external-snapshotter/client/v7/apis/volumesnapshot/v1"

"context"

"github.com/sirupsen/logrus"
Expand Down Expand Up @@ -606,7 +604,7 @@
// Make sure snapshot was deleted
assert.Equal(t, 0, td.volumeSnapshotter.SnapshotsTaken.Len())
})
t.Run("backup is still deleted if downloading tarball fails for DeleteItemAction plugins", func(t *testing.T) {
t.Run("backup deletion fails with error when downloading tarball fails for DeleteItemAction plugins", func(t *testing.T) {
backup := builder.ForBackup(velerov1api.DefaultNamespace, "foo").Result()
backup.UID = "uid"
backup.Spec.StorageLocation = "primary"
Expand Down Expand Up @@ -672,6 +670,89 @@

td.backupStore.On("GetBackupVolumeSnapshots", input.Spec.BackupName).Return(snapshots, nil)
td.backupStore.On("GetBackupContents", input.Spec.BackupName).Return(nil, fmt.Errorf("error downloading tarball"))

_, err := td.controller.Reconcile(t.Context(), td.req)

Check failure on line 674 in pkg/controller/backup_deletion_controller_test.go

View workflow job for this annotation

GitHub Actions / Run Linter Check

t.Context undefined (type *"testing".T has no field or method Context, but does have unexported field context)

Check failure on line 674 in pkg/controller/backup_deletion_controller_test.go

View workflow job for this annotation

GitHub Actions / Run Linter Check

t.Context undefined (type *"testing".T has no field or method Context, but does have unexported field context)

Check failure on line 674 in pkg/controller/backup_deletion_controller_test.go

View workflow job for this annotation

GitHub Actions / Run CI

t.Context undefined (type *"testing".T has no field or method Context, but does have unexported field context)

Check failure on line 674 in pkg/controller/backup_deletion_controller_test.go

View workflow job for this annotation

GitHub Actions / Run CI

t.Context undefined (type *"testing".T has no field or method Context, but does have unexported field context)
require.NoError(t, err)

td.backupStore.AssertCalled(t, "GetBackupContents", input.Spec.BackupName)
// DeleteBackup (removing backup data from object storage) must NOT be called
// when there are errors, so that the deletion can be retried later.
td.backupStore.AssertNotCalled(t, "DeleteBackup", input.Spec.BackupName)

// the dbr should still exist and be marked Processed with errors
res := &velerov1api.DeleteBackupRequest{}
err = td.fakeClient.Get(ctx, td.req.NamespacedName, res)
require.NoError(t, err, "Expected DBR to still exist after tarball download failure")
assert.Equal(t, velerov1api.DeleteBackupRequestPhaseProcessed, res.Status.Phase)
require.Len(t, res.Status.Errors, 1)
assert.Contains(t, res.Status.Errors[0], "error downloading backup tarball, CSI snapshot cleanup was skipped")

// backup CR should NOT be deleted
err = td.fakeClient.Get(t.Context(), types.NamespacedName{

Check failure on line 691 in pkg/controller/backup_deletion_controller_test.go

View workflow job for this annotation

GitHub Actions / Run Linter Check

t.Context undefined (type *"testing".T has no field or method Context, but does have unexported field context)

Check failure on line 691 in pkg/controller/backup_deletion_controller_test.go

View workflow job for this annotation

GitHub Actions / Run Linter Check

t.Context undefined (type *"testing".T has no field or method Context, but does have unexported field context)

Check failure on line 691 in pkg/controller/backup_deletion_controller_test.go

View workflow job for this annotation

GitHub Actions / Run CI

t.Context undefined (type *"testing".T has no field or method Context, but does have unexported field context)

Check failure on line 691 in pkg/controller/backup_deletion_controller_test.go

View workflow job for this annotation

GitHub Actions / Run CI

t.Context undefined (type *"testing".T has no field or method Context, but does have unexported field context)
Namespace: velerov1api.DefaultNamespace,
Name: backup.Name,
}, &velerov1api.Backup{})
require.NoError(t, err, "Expected backup CR to still exist after tarball download failure")
})
t.Run("backup is still deleted if downloading tarball returns a not-found error", func(t *testing.T) {
backup := builder.ForBackup(velerov1api.DefaultNamespace, "foo").Result()
backup.UID = "uid"
backup.Spec.StorageLocation = "primary"

input := defaultTestDbr()
input.Labels = nil

location := &velerov1api.BackupStorageLocation{
ObjectMeta: metav1.ObjectMeta{
Namespace: backup.Namespace,
Name: backup.Spec.StorageLocation,
},
Spec: velerov1api.BackupStorageLocationSpec{
Provider: "objStoreProvider",
StorageType: velerov1api.StorageType{
ObjectStorage: &velerov1api.ObjectStorageLocation{
Bucket: "bucket",
},
},
},
Status: velerov1api.BackupStorageLocationStatus{
Phase: velerov1api.BackupStorageLocationPhaseAvailable,
},
}

snapshotLocation := &velerov1api.VolumeSnapshotLocation{
ObjectMeta: metav1.ObjectMeta{
Namespace: backup.Namespace,
Name: "vsl-1",
},
Spec: velerov1api.VolumeSnapshotLocationSpec{
Provider: "provider-1",
},
}

td := setupBackupDeletionControllerTest(t, defaultTestDbr(), backup, location, snapshotLocation)
td.volumeSnapshotter.SnapshotsTaken.Insert("snap-1")

snapshots := []*volume.Snapshot{
{
Spec: volume.SnapshotSpec{
Location: "vsl-1",
},
Status: volume.SnapshotStatus{
ProviderSnapshotID: "snap-1",
},
},
}

pluginManager := &pluginmocks.Manager{}
pluginManager.On("GetVolumeSnapshotter", "provider-1").Return(td.volumeSnapshotter, nil)
pluginManager.On("GetDeleteItemActions").Return([]velero.DeleteItemAction{new(mocks.DeleteItemAction)}, nil)
pluginManager.On("CleanupClients")
td.controller.newPluginManager = func(logrus.FieldLogger) clientmgmt.Manager { return pluginManager }

td.backupStore.On("GetBackupVolumeSnapshots", input.Spec.BackupName).Return(snapshots, nil)
// Simulate a 404/not-found error (tarball has already been removed from storage)
td.backupStore.On("GetBackupContents", input.Spec.BackupName).Return(nil, fmt.Errorf("key not found"))
td.backupStore.On("DeleteBackup", input.Spec.BackupName).Return(nil)

_, err := td.controller.Reconcile(context.TODO(), td.req)
Expand All @@ -680,30 +761,17 @@
td.backupStore.AssertCalled(t, "GetBackupContents", input.Spec.BackupName)
td.backupStore.AssertCalled(t, "DeleteBackup", input.Spec.BackupName)

// the dbr should be deleted
// the dbr should be deleted (not-found is treated as permanent, deletion proceeds)
res := &velerov1api.DeleteBackupRequest{}
err = td.fakeClient.Get(ctx, td.req.NamespacedName, res)
assert.True(t, apierrors.IsNotFound(err), "Expected not found error, but actual value of error: %v", err)
if err == nil {
t.Logf("status of the dbr: %s, errors in dbr: %v", res.Status.Phase, res.Status.Errors)
}
assert.True(t, apierrors.IsNotFound(err), "Expected DBR to be deleted after not-found tarball error, but actual error: %v", err)

// backup CR should be deleted
err = td.fakeClient.Get(context.TODO(), types.NamespacedName{
// backup CR should be deleted because there are no errors in errs
err = td.fakeClient.Get(t.Context(), types.NamespacedName{

Check failure on line 770 in pkg/controller/backup_deletion_controller_test.go

View workflow job for this annotation

GitHub Actions / Run Linter Check

t.Context undefined (type *"testing".T has no field or method Context, but does have unexported field context) (typecheck)

Check failure on line 770 in pkg/controller/backup_deletion_controller_test.go

View workflow job for this annotation

GitHub Actions / Run Linter Check

t.Context undefined (type *"testing".T has no field or method Context, but does have unexported field context) (typecheck)

Check failure on line 770 in pkg/controller/backup_deletion_controller_test.go

View workflow job for this annotation

GitHub Actions / Run CI

t.Context undefined (type *"testing".T has no field or method Context, but does have unexported field context)

Check failure on line 770 in pkg/controller/backup_deletion_controller_test.go

View workflow job for this annotation

GitHub Actions / Run CI

t.Context undefined (type *"testing".T has no field or method Context, but does have unexported field context)
Namespace: velerov1api.DefaultNamespace,
Name: backup.Name,
}, &velerov1api.Backup{})
assert.True(t, apierrors.IsNotFound(err), "Expected not found error, but actual value of error: %v", err)

// leaked CSI snapshot should be deleted
err = td.fakeClient.Get(context.TODO(), types.NamespacedName{
Namespace: "user-ns",
Name: "vs-1",
}, &snapshotv1api.VolumeSnapshot{})
assert.True(t, apierrors.IsNotFound(err), "Expected not found error for the leaked CSI snapshot, but actual value of error: %v", err)

// Make sure snapshot was deleted
assert.Equal(t, 0, td.volumeSnapshotter.SnapshotsTaken.Len())
assert.True(t, apierrors.IsNotFound(err), "Expected backup CR to be deleted after not-found tarball error, but actual error: %v", err)
})
t.Run("Expired request will be deleted if the status is processed", func(t *testing.T) {
expired := time.Date(2018, 4, 3, 12, 0, 0, 0, time.UTC)
Expand Down
Loading