Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions changelogs/unreleased/9726-priyansh17
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Backporting PR #9700 and #9693
Fix issue #9699, add a 2-second gap between temporary CSI VolumeSnapshotContent create and delete operations
Enhance backup deletion logic to handle tarball download failures
8 changes: 8 additions & 0 deletions internal/delete/actions/csi/volumesnapshotcontent_action.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ package csi

import (
"context"
"time"

"github.com/google/uuid"
snapshotv1api "github.com/kubernetes-csi/external-snapshotter/client/v7/apis/volumesnapshot/v1"
Expand All @@ -40,6 +41,10 @@ type volumeSnapshotContentDeleteItemAction struct {
crClient crclient.Client
}

const tempVSCCreateDeleteGap = 2 * time.Second

var sleepBetweenTempVSCCreateAndDelete = time.Sleep

// AppliesTo returns information indicating
// VolumeSnapshotContentRestoreItemAction action should be invoked
// while restoring VolumeSnapshotContent.snapshot.storage.k8s.io resources
Expand Down Expand Up @@ -115,6 +120,9 @@ func (p *volumeSnapshotContentDeleteItemAction) Execute(
}
p.log.Infof("Created temp VolumeSnapshotContent %s with DeletionPolicy=Delete to trigger cloud snapshot cleanup", snapCont.Name)

// Add a small delay before delete to avoid create/delete race conditions in CSI controllers.
sleepBetweenTempVSCCreateAndDelete(tempVSCCreateDeleteGap)

// Delete the temp VSC immediately to trigger cloud snapshot removal.
// The CSI driver will handle the actual cloud snapshot deletion.
if err := p.crClient.Delete(
Expand Down
49 changes: 49 additions & 0 deletions internal/delete/actions/csi/volumesnapshotcontent_action_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (
"context"
"fmt"
"testing"
"time"

snapshotv1api "github.com/kubernetes-csi/external-snapshotter/client/v7/apis/volumesnapshot/v1"
"github.com/sirupsen/logrus"
Expand All @@ -46,6 +47,21 @@ type fakeClientWithErrors struct {
deleteError error
}

type fakeClientWithCallTracking struct {
crclient.Client
events *[]string
}

func (c *fakeClientWithCallTracking) Create(ctx context.Context, obj crclient.Object, opts ...crclient.CreateOption) error {
*c.events = append(*c.events, "create")
return c.Client.Create(ctx, obj, opts...)
}

func (c *fakeClientWithCallTracking) Delete(ctx context.Context, obj crclient.Object, opts ...crclient.DeleteOption) error {
*c.events = append(*c.events, "delete")
return c.Client.Delete(ctx, obj, opts...)
}

func (c *fakeClientWithErrors) Get(ctx context.Context, key crclient.ObjectKey, obj crclient.Object, opts ...crclient.GetOption) error {
if c.getError != nil {
return c.getError
Expand Down Expand Up @@ -325,6 +341,39 @@ func TestTryDeleteOriginalVSC(t *testing.T) {
})
}

func TestVSCExecute_CreateSleepDeleteOrder(t *testing.T) {
snapshotHandleStr := "test"
vsc := builder.ForVolumeSnapshotContent("bar").
ObjectMeta(builder.WithLabelsMap(map[string]string{velerov1api.BackupNameLabel: "backup"})).
Status(&snapshotv1api.VolumeSnapshotContentStatus{SnapshotHandle: &snapshotHandleStr}).
Result()

vscMap, err := runtime.DefaultUnstructuredConverter.ToUnstructured(vsc)
require.NoError(t, err)

events := make([]string, 0, 3)
realClient := velerotest.NewFakeControllerRuntimeClient(t)
trackingClient := &fakeClientWithCallTracking{Client: realClient, events: &events}

originalSleep := sleepBetweenTempVSCCreateAndDelete
t.Cleanup(func() {
sleepBetweenTempVSCCreateAndDelete = originalSleep
})

sleepBetweenTempVSCCreateAndDelete = func(d time.Duration) {
require.Equal(t, tempVSCCreateDeleteGap, d)
events = append(events, "sleep")
}

p := volumeSnapshotContentDeleteItemAction{log: logrus.StandardLogger(), crClient: trackingClient}
err = p.Execute(&velero.DeleteItemActionExecuteInput{
Item: &unstructured.Unstructured{Object: vscMap},
Backup: builder.ForBackup("velero", "backup").Result(),
})
require.NoError(t, err)
require.Equal(t, []string{"create", "sleep", "delete"}, events)
}

func boolPtr(b bool) *bool {
return &b
}
Expand Down
39 changes: 38 additions & 1 deletion pkg/controller/backup_deletion_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (
"context"
"encoding/json"
"fmt"
"strings"
"time"

jsonpatch "github.com/evanphx/json-patch/v5"
Expand Down Expand Up @@ -267,8 +268,17 @@ func (r *backupDeletionReconciler) Reconcile(ctx context.Context, req ctrl.Reque

if err != nil {
log.WithError(err).Errorf("Unable to download tarball for backup %s, skipping associated DeleteItemAction plugins", backup.Name)
// for backups which failed before tarball object could be uploaded we do offline cleanup
log.Info("Cleaning up CSI volumesnapshots")
r.deleteCSIVolumeSnapshotsIfAny(ctx, backup, log)

// If the tarball simply does not exist (HTTP 404 / not found), the download
// failure is permanent and not retryable, so we let deletion proceed.
// For transient errors (throttling, auth failures, network issues), record
// the error to fail the deletion so it can be retried later.
if !isTarballNotFoundError(err) {
errs = append(errs, errors.Wrapf(err, "error downloading backup tarball, CSI snapshot cleanup was skipped").Error())
}
} else {
defer closeAndRemoveFile(backupFile, r.logger)
deleteCtx := &delete.Context{
Expand Down Expand Up @@ -351,11 +361,13 @@ func (r *backupDeletionReconciler) Reconcile(ctx context.Context, req ctrl.Reque
}
}

if backupStore != nil {
if backupStore != nil && len(errs) == 0 {
log.Info("Removing backup from backup storage")
if err := backupStore.DeleteBackup(backup.Name); err != nil {
errs = append(errs, err.Error())
}
} else if len(errs) > 0 {
log.Info("Skipping removal of backup from backup storage due to previous errors")
}

log.Info("Removing restores")
Expand Down Expand Up @@ -691,3 +703,28 @@ func batchDeleteSnapshots(ctx context.Context, repoEnsurer *repository.Ensurer,

return errs
}

// isTarballNotFoundError reports whether err indicates that the backup tarball
// does not exist in object storage (e.g. HTTP 404 / not-found). Such errors are
// permanent and not retryable, so callers should let deletion proceed (skipping
// DeleteItemAction plugins) rather than failing the entire deletion.
//
// Transient errors (throttling, auth failures, network timeouts) return false so
// the deletion is failed and can be retried once the storage is reachable again.
func isTarballNotFoundError(err error) bool {
if err == nil {
return false
}
// Lower-case once for all comparisons.
msg := strings.ToLower(err.Error())
// Common "not found" indicators across cloud providers:
// - "not found" / "does not exist": generic, in-memory object store
// - "nosuchkey": AWS S3
// - "blobnotfound": Azure Blob Storage
// - "objectnotexist": GCS
return strings.Contains(msg, "not found") ||
strings.Contains(msg, "does not exist") ||
strings.Contains(msg, "nosuchkey") ||
strings.Contains(msg, "blobnotfound") ||
strings.Contains(msg, "objectnotexist")
}
108 changes: 88 additions & 20 deletions pkg/controller/backup_deletion_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,6 @@ import (
"reflect"
"time"

snapshotv1api "github.com/kubernetes-csi/external-snapshotter/client/v7/apis/volumesnapshot/v1"

"context"

"github.com/sirupsen/logrus"
Expand Down Expand Up @@ -606,7 +604,7 @@ func TestBackupDeletionControllerReconcile(t *testing.T) {
// Make sure snapshot was deleted
assert.Equal(t, 0, td.volumeSnapshotter.SnapshotsTaken.Len())
})
t.Run("backup is still deleted if downloading tarball fails for DeleteItemAction plugins", func(t *testing.T) {
t.Run("backup deletion fails with error when downloading tarball fails for DeleteItemAction plugins", func(t *testing.T) {
backup := builder.ForBackup(velerov1api.DefaultNamespace, "foo").Result()
backup.UID = "uid"
backup.Spec.StorageLocation = "primary"
Expand Down Expand Up @@ -672,38 +670,108 @@ func TestBackupDeletionControllerReconcile(t *testing.T) {

td.backupStore.On("GetBackupVolumeSnapshots", input.Spec.BackupName).Return(snapshots, nil)
td.backupStore.On("GetBackupContents", input.Spec.BackupName).Return(nil, fmt.Errorf("error downloading tarball"))
td.backupStore.On("DeleteBackup", input.Spec.BackupName).Return(nil)

_, err := td.controller.Reconcile(context.TODO(), td.req)
require.NoError(t, err)

td.backupStore.AssertCalled(t, "GetBackupContents", input.Spec.BackupName)
td.backupStore.AssertCalled(t, "DeleteBackup", input.Spec.BackupName)
// DeleteBackup (removing backup data from object storage) must NOT be called
// when there are errors, so that the deletion can be retried later.
td.backupStore.AssertNotCalled(t, "DeleteBackup", input.Spec.BackupName)

// the dbr should be deleted
// the dbr should still exist and be marked Processed with errors
res := &velerov1api.DeleteBackupRequest{}
err = td.fakeClient.Get(ctx, td.req.NamespacedName, res)
assert.True(t, apierrors.IsNotFound(err), "Expected not found error, but actual value of error: %v", err)
if err == nil {
t.Logf("status of the dbr: %s, errors in dbr: %v", res.Status.Phase, res.Status.Errors)
}
require.NoError(t, err, "Expected DBR to still exist after tarball download failure")
assert.Equal(t, velerov1api.DeleteBackupRequestPhaseProcessed, res.Status.Phase)
require.Len(t, res.Status.Errors, 1)
assert.Contains(t, res.Status.Errors[0], "error downloading backup tarball, CSI snapshot cleanup was skipped")

// backup CR should be deleted
// backup CR should NOT be deleted
err = td.fakeClient.Get(context.TODO(), types.NamespacedName{
Namespace: velerov1api.DefaultNamespace,
Name: backup.Name,
}, &velerov1api.Backup{})
assert.True(t, apierrors.IsNotFound(err), "Expected not found error, but actual value of error: %v", err)
require.NoError(t, err, "Expected backup CR to still exist after tarball download failure")
})
t.Run("backup is still deleted if downloading tarball returns a not-found error", func(t *testing.T) {
backup := builder.ForBackup(velerov1api.DefaultNamespace, "foo").Result()
backup.UID = "uid"
backup.Spec.StorageLocation = "primary"

// leaked CSI snapshot should be deleted
err = td.fakeClient.Get(context.TODO(), types.NamespacedName{
Namespace: "user-ns",
Name: "vs-1",
}, &snapshotv1api.VolumeSnapshot{})
assert.True(t, apierrors.IsNotFound(err), "Expected not found error for the leaked CSI snapshot, but actual value of error: %v", err)
input := defaultTestDbr()
input.Labels = nil

// Make sure snapshot was deleted
assert.Equal(t, 0, td.volumeSnapshotter.SnapshotsTaken.Len())
location := &velerov1api.BackupStorageLocation{
ObjectMeta: metav1.ObjectMeta{
Namespace: backup.Namespace,
Name: backup.Spec.StorageLocation,
},
Spec: velerov1api.BackupStorageLocationSpec{
Provider: "objStoreProvider",
StorageType: velerov1api.StorageType{
ObjectStorage: &velerov1api.ObjectStorageLocation{
Bucket: "bucket",
},
},
},
Status: velerov1api.BackupStorageLocationStatus{
Phase: velerov1api.BackupStorageLocationPhaseAvailable,
},
}

snapshotLocation := &velerov1api.VolumeSnapshotLocation{
ObjectMeta: metav1.ObjectMeta{
Namespace: backup.Namespace,
Name: "vsl-1",
},
Spec: velerov1api.VolumeSnapshotLocationSpec{
Provider: "provider-1",
},
}

td := setupBackupDeletionControllerTest(t, defaultTestDbr(), backup, location, snapshotLocation)
td.volumeSnapshotter.SnapshotsTaken.Insert("snap-1")

snapshots := []*volume.Snapshot{
{
Spec: volume.SnapshotSpec{
Location: "vsl-1",
},
Status: volume.SnapshotStatus{
ProviderSnapshotID: "snap-1",
},
},
}

pluginManager := &pluginmocks.Manager{}
pluginManager.On("GetVolumeSnapshotter", "provider-1").Return(td.volumeSnapshotter, nil)
pluginManager.On("GetDeleteItemActions").Return([]velero.DeleteItemAction{new(mocks.DeleteItemAction)}, nil)
pluginManager.On("CleanupClients")
td.controller.newPluginManager = func(logrus.FieldLogger) clientmgmt.Manager { return pluginManager }

td.backupStore.On("GetBackupVolumeSnapshots", input.Spec.BackupName).Return(snapshots, nil)
// Simulate a 404/not-found error (tarball has already been removed from storage)
td.backupStore.On("GetBackupContents", input.Spec.BackupName).Return(nil, fmt.Errorf("key not found"))
td.backupStore.On("DeleteBackup", input.Spec.BackupName).Return(nil)

_, err := td.controller.Reconcile(context.TODO(), td.req)
require.NoError(t, err)

td.backupStore.AssertCalled(t, "GetBackupContents", input.Spec.BackupName)
td.backupStore.AssertCalled(t, "DeleteBackup", input.Spec.BackupName)

// the dbr should be deleted (not-found is treated as permanent, deletion proceeds)
res := &velerov1api.DeleteBackupRequest{}
err = td.fakeClient.Get(ctx, td.req.NamespacedName, res)
assert.True(t, apierrors.IsNotFound(err), "Expected DBR to be deleted after not-found tarball error, but actual error: %v", err)

// backup CR should be deleted because there are no errors in errs
err = td.fakeClient.Get(context.TODO(), types.NamespacedName{
Namespace: velerov1api.DefaultNamespace,
Name: backup.Name,
}, &velerov1api.Backup{})
assert.True(t, apierrors.IsNotFound(err), "Expected backup CR to be deleted after not-found tarball error, but actual error: %v", err)
})
t.Run("Expired request will be deleted if the status is processed", func(t *testing.T) {
expired := time.Date(2018, 4, 3, 12, 0, 0, 0, time.UTC)
Expand Down
Loading