Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions changelogs/unreleased/9730-priyansh17
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Backporting PR #9700 and #9693
Fix issue #9699, add a 2-second gap between temporary CSI VolumeSnapshotContent create and delete operations
Enhance backup deletion logic to handle tarball download failures
8 changes: 5 additions & 3 deletions hack/build-image/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,11 @@ ENV GO111MODULE=on
ENV GOPROXY=${GOPROXY}

# kubebuilder test bundle is separated from kubebuilder. Need to setup it for CI test.
RUN curl -sSLo envtest-bins.tar.gz https://go.kubebuilder.io/test-tools/1.22.1/linux/$(go env GOARCH) && \
mkdir /usr/local/kubebuilder && \
tar -C /usr/local/kubebuilder --strip-components=1 -zvxf envtest-bins.tar.gz
# Using setup-envtest to download envtest binaries
RUN go install sigs.k8s.io/controller-runtime/tools/setup-envtest@v0.0.0-20250512082854-54b916c903db && \
mkdir -p /usr/local/kubebuilder/bin && \
ENVTEST_ASSETS_DIR=$(setup-envtest use 1.33.0 --bin-dir /usr/local/kubebuilder/bin -p path) && \
cp -r ${ENVTEST_ASSETS_DIR}/* /usr/local/kubebuilder/bin/

RUN wget --quiet https://github.com/kubernetes-sigs/kubebuilder/releases/download/v3.2.0/kubebuilder_linux_$(go env GOARCH) && \
mv kubebuilder_linux_$(go env GOARCH) /usr/local/kubebuilder/bin/kubebuilder && \
Expand Down
34 changes: 8 additions & 26 deletions internal/delete/actions/csi/volumesnapshotcontent_action.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,10 +27,8 @@ import (
corev1api "k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/util/wait"
crclient "sigs.k8s.io/controller-runtime/pkg/client"

velerov1api "github.com/vmware-tanzu/velero/pkg/apis/velero/v1"
"github.com/vmware-tanzu/velero/pkg/client"
plugincommon "github.com/vmware-tanzu/velero/pkg/plugin/framework/common"
"github.com/vmware-tanzu/velero/pkg/plugin/velero"
Expand All @@ -44,6 +42,10 @@ type volumeSnapshotContentDeleteItemAction struct {
crClient crclient.Client
}

const tempVSCCreateDeleteGap = 2 * time.Second

var sleepBetweenTempVSCCreateAndDelete = time.Sleep

// AppliesTo returns information indicating
// VolumeSnapshotContentRestoreItemAction action should be invoked
// while restoring VolumeSnapshotContent.snapshot.storage.k8s.io resources
Expand Down Expand Up @@ -107,31 +109,11 @@ func (p *volumeSnapshotContentDeleteItemAction) Execute(
return errors.Wrapf(err, "fail to create VolumeSnapshotContent %s", snapCont.Name)
}

// Read resource timeout from backup annotation, if not set, use default value.
timeout, err := time.ParseDuration(
input.Backup.Annotations[velerov1api.ResourceTimeoutAnnotation])
if err != nil {
p.log.Warnf("fail to parse resource timeout annotation %s: %s",
input.Backup.Annotations[velerov1api.ResourceTimeoutAnnotation], err.Error())
timeout = 10 * time.Minute
}
p.log.Debugf("resource timeout is set to %s", timeout.String())

interval := 5 * time.Second

// Wait until VSC created and ReadyToUse is true.
if err := wait.PollUntilContextTimeout(
context.Background(),
interval,
timeout,
true,
func(ctx context.Context) (bool, error) {
return checkVSCReadiness(ctx, &snapCont, p.crClient)
},
); err != nil {
return errors.Wrapf(err, "fail to wait VolumeSnapshotContent %s becomes ready.", snapCont.Name)
}
// Add a small delay before delete to avoid create/delete race conditions in CSI controllers.
sleepBetweenTempVSCCreateAndDelete(tempVSCCreateDeleteGap)

// Delete the temp VSC immediately to trigger cloud snapshot removal.
// The CSI driver will handle the actual cloud snapshot deletion.
if err := p.crClient.Delete(
context.TODO(),
&snapCont,
Expand Down
78 changes: 78 additions & 0 deletions internal/delete/actions/csi/volumesnapshotcontent_action_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (
"context"
"fmt"
"testing"
"time"

snapshotv1api "github.com/kubernetes-csi/external-snapshotter/client/v8/apis/volumesnapshot/v1"
"github.com/pkg/errors"
Expand All @@ -37,6 +38,50 @@ import (
velerotest "github.com/vmware-tanzu/velero/pkg/test"
)

// fakeClientWithErrors wraps a real client and injects errors for specific operations.
type fakeClientWithErrors struct {
crclient.Client
getError error
patchError error
deleteError error
}

type fakeClientWithCallTracking struct {
crclient.Client
events *[]string
}

func (c *fakeClientWithCallTracking) Create(ctx context.Context, obj crclient.Object, opts ...crclient.CreateOption) error {
*c.events = append(*c.events, "create")
return c.Client.Create(ctx, obj, opts...)
}

func (c *fakeClientWithCallTracking) Delete(ctx context.Context, obj crclient.Object, opts ...crclient.DeleteOption) error {
*c.events = append(*c.events, "delete")
return c.Client.Delete(ctx, obj, opts...)
}

func (c *fakeClientWithErrors) Get(ctx context.Context, key crclient.ObjectKey, obj crclient.Object, opts ...crclient.GetOption) error {
if c.getError != nil {
return c.getError
}
return c.Client.Get(ctx, key, obj, opts...)
}

func (c *fakeClientWithErrors) Patch(ctx context.Context, obj crclient.Object, patch crclient.Patch, opts ...crclient.PatchOption) error {
if c.patchError != nil {
return c.patchError
}
return c.Client.Patch(ctx, obj, patch, opts...)
}

func (c *fakeClientWithErrors) Delete(ctx context.Context, obj crclient.Object, opts ...crclient.DeleteOption) error {
if c.deleteError != nil {
return c.deleteError
}
return c.Client.Delete(ctx, obj, opts...)
}

func TestVSCExecute(t *testing.T) {
snapshotHandleStr := "test"
tests := []struct {
Expand Down Expand Up @@ -207,3 +252,36 @@ func TestCheckVSCReadiness(t *testing.T) {
})
}
}

func TestVSCExecute_CreateSleepDeleteOrder(t *testing.T) {
snapshotHandleStr := "test"
vsc := builder.ForVolumeSnapshotContent("bar").
ObjectMeta(builder.WithLabelsMap(map[string]string{velerov1api.BackupNameLabel: "backup"})).
Status(&snapshotv1api.VolumeSnapshotContentStatus{SnapshotHandle: &snapshotHandleStr}).
Result()

vscMap, err := runtime.DefaultUnstructuredConverter.ToUnstructured(vsc)
require.NoError(t, err)

events := make([]string, 0, 3)
realClient := velerotest.NewFakeControllerRuntimeClient(t)
trackingClient := &fakeClientWithCallTracking{Client: realClient, events: &events}

originalSleep := sleepBetweenTempVSCCreateAndDelete
t.Cleanup(func() {
sleepBetweenTempVSCCreateAndDelete = originalSleep
})

sleepBetweenTempVSCCreateAndDelete = func(d time.Duration) {
require.Equal(t, tempVSCCreateDeleteGap, d)
events = append(events, "sleep")
}

p := volumeSnapshotContentDeleteItemAction{log: logrus.StandardLogger(), crClient: trackingClient}
err = p.Execute(&velero.DeleteItemActionExecuteInput{
Item: &unstructured.Unstructured{Object: vscMap},
Backup: builder.ForBackup("velero", "backup").Result(),
})
require.NoError(t, err)
require.Equal(t, []string{"create", "sleep", "delete"}, events)
}
39 changes: 38 additions & 1 deletion pkg/controller/backup_deletion_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (
"context"
"encoding/json"
"fmt"
"strings"
"time"

jsonpatch "github.com/evanphx/json-patch/v5"
Expand Down Expand Up @@ -267,8 +268,17 @@ func (r *backupDeletionReconciler) Reconcile(ctx context.Context, req ctrl.Reque

if err != nil {
log.WithError(err).Errorf("Unable to download tarball for backup %s, skipping associated DeleteItemAction plugins", backup.Name)
// for backups which failed before tarball object could be uploaded we do offline cleanup
log.Info("Cleaning up CSI volumesnapshots")
r.deleteCSIVolumeSnapshotsIfAny(ctx, backup, log)

// If the tarball simply does not exist (HTTP 404 / not found), the download
// failure is permanent and not retryable, so we let deletion proceed.
// For transient errors (throttling, auth failures, network issues), record
// the error to fail the deletion so it can be retried later.
if !isTarballNotFoundError(err) {
errs = append(errs, errors.Wrapf(err, "error downloading backup tarball, CSI snapshot cleanup was skipped").Error())
}
} else {
defer closeAndRemoveFile(backupFile, r.logger)
deleteCtx := &delete.Context{
Expand Down Expand Up @@ -351,11 +361,13 @@ func (r *backupDeletionReconciler) Reconcile(ctx context.Context, req ctrl.Reque
}
}

if backupStore != nil {
if backupStore != nil && len(errs) == 0 {
log.Info("Removing backup from backup storage")
if err := backupStore.DeleteBackup(backup.Name); err != nil {
errs = append(errs, err.Error())
}
} else if len(errs) > 0 {
log.Info("Skipping removal of backup from backup storage due to previous errors")
}

log.Info("Removing restores")
Expand Down Expand Up @@ -691,3 +703,28 @@ func batchDeleteSnapshots(ctx context.Context, repoEnsurer *repository.Ensurer,

return errs
}

// isTarballNotFoundError reports whether err indicates that the backup tarball
// does not exist in object storage (e.g. HTTP 404 / not-found). Such errors are
// permanent and not retryable, so callers should let deletion proceed (skipping
// DeleteItemAction plugins) rather than failing the entire deletion.
//
// Transient errors (throttling, auth failures, network timeouts) return false so
// the deletion is failed and can be retried once the storage is reachable again.
func isTarballNotFoundError(err error) bool {
if err == nil {
return false
}
// Lower-case once for all comparisons.
msg := strings.ToLower(err.Error())
// Common "not found" indicators across cloud providers:
// - "not found" / "does not exist": generic, in-memory object store
// - "nosuchkey": AWS S3
// - "blobnotfound": Azure Blob Storage
// - "objectnotexist": GCS
return strings.Contains(msg, "not found") ||
strings.Contains(msg, "does not exist") ||
strings.Contains(msg, "nosuchkey") ||
strings.Contains(msg, "blobnotfound") ||
strings.Contains(msg, "objectnotexist")
}
108 changes: 88 additions & 20 deletions pkg/controller/backup_deletion_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,6 @@ import (
"reflect"
"time"

snapshotv1api "github.com/kubernetes-csi/external-snapshotter/client/v8/apis/volumesnapshot/v1"

"context"

"github.com/sirupsen/logrus"
Expand Down Expand Up @@ -606,7 +604,7 @@ func TestBackupDeletionControllerReconcile(t *testing.T) {
// Make sure snapshot was deleted
assert.Equal(t, 0, td.volumeSnapshotter.SnapshotsTaken.Len())
})
t.Run("backup is still deleted if downloading tarball fails for DeleteItemAction plugins", func(t *testing.T) {
t.Run("backup deletion fails with error when downloading tarball fails for DeleteItemAction plugins", func(t *testing.T) {
backup := builder.ForBackup(velerov1api.DefaultNamespace, "foo").Result()
backup.UID = "uid"
backup.Spec.StorageLocation = "primary"
Expand Down Expand Up @@ -672,38 +670,108 @@ func TestBackupDeletionControllerReconcile(t *testing.T) {

td.backupStore.On("GetBackupVolumeSnapshots", input.Spec.BackupName).Return(snapshots, nil)
td.backupStore.On("GetBackupContents", input.Spec.BackupName).Return(nil, fmt.Errorf("error downloading tarball"))
td.backupStore.On("DeleteBackup", input.Spec.BackupName).Return(nil)

_, err := td.controller.Reconcile(t.Context(), td.req)
require.NoError(t, err)

td.backupStore.AssertCalled(t, "GetBackupContents", input.Spec.BackupName)
td.backupStore.AssertCalled(t, "DeleteBackup", input.Spec.BackupName)
// DeleteBackup (removing backup data from object storage) must NOT be called
// when there are errors, so that the deletion can be retried later.
td.backupStore.AssertNotCalled(t, "DeleteBackup", input.Spec.BackupName)

// the dbr should be deleted
// the dbr should still exist and be marked Processed with errors
res := &velerov1api.DeleteBackupRequest{}
err = td.fakeClient.Get(ctx, td.req.NamespacedName, res)
assert.True(t, apierrors.IsNotFound(err), "Expected not found error, but actual value of error: %v", err)
if err == nil {
t.Logf("status of the dbr: %s, errors in dbr: %v", res.Status.Phase, res.Status.Errors)
}
require.NoError(t, err, "Expected DBR to still exist after tarball download failure")
assert.Equal(t, velerov1api.DeleteBackupRequestPhaseProcessed, res.Status.Phase)
require.Len(t, res.Status.Errors, 1)
assert.Contains(t, res.Status.Errors[0], "error downloading backup tarball, CSI snapshot cleanup was skipped")

// backup CR should be deleted
// backup CR should NOT be deleted
err = td.fakeClient.Get(t.Context(), types.NamespacedName{
Namespace: velerov1api.DefaultNamespace,
Name: backup.Name,
}, &velerov1api.Backup{})
assert.True(t, apierrors.IsNotFound(err), "Expected not found error, but actual value of error: %v", err)
require.NoError(t, err, "Expected backup CR to still exist after tarball download failure")
})
t.Run("backup is still deleted if downloading tarball returns a not-found error", func(t *testing.T) {
backup := builder.ForBackup(velerov1api.DefaultNamespace, "foo").Result()
backup.UID = "uid"
backup.Spec.StorageLocation = "primary"

// leaked CSI snapshot should be deleted
err = td.fakeClient.Get(t.Context(), types.NamespacedName{
Namespace: "user-ns",
Name: "vs-1",
}, &snapshotv1api.VolumeSnapshot{})
assert.True(t, apierrors.IsNotFound(err), "Expected not found error for the leaked CSI snapshot, but actual value of error: %v", err)
input := defaultTestDbr()
input.Labels = nil

// Make sure snapshot was deleted
assert.Equal(t, 0, td.volumeSnapshotter.SnapshotsTaken.Len())
location := &velerov1api.BackupStorageLocation{
ObjectMeta: metav1.ObjectMeta{
Namespace: backup.Namespace,
Name: backup.Spec.StorageLocation,
},
Spec: velerov1api.BackupStorageLocationSpec{
Provider: "objStoreProvider",
StorageType: velerov1api.StorageType{
ObjectStorage: &velerov1api.ObjectStorageLocation{
Bucket: "bucket",
},
},
},
Status: velerov1api.BackupStorageLocationStatus{
Phase: velerov1api.BackupStorageLocationPhaseAvailable,
},
}

snapshotLocation := &velerov1api.VolumeSnapshotLocation{
ObjectMeta: metav1.ObjectMeta{
Namespace: backup.Namespace,
Name: "vsl-1",
},
Spec: velerov1api.VolumeSnapshotLocationSpec{
Provider: "provider-1",
},
}

td := setupBackupDeletionControllerTest(t, defaultTestDbr(), backup, location, snapshotLocation)
td.volumeSnapshotter.SnapshotsTaken.Insert("snap-1")

snapshots := []*volume.Snapshot{
{
Spec: volume.SnapshotSpec{
Location: "vsl-1",
},
Status: volume.SnapshotStatus{
ProviderSnapshotID: "snap-1",
},
},
}

pluginManager := &pluginmocks.Manager{}
pluginManager.On("GetVolumeSnapshotter", "provider-1").Return(td.volumeSnapshotter, nil)
pluginManager.On("GetDeleteItemActions").Return([]velero.DeleteItemAction{new(mocks.DeleteItemAction)}, nil)
pluginManager.On("CleanupClients")
td.controller.newPluginManager = func(logrus.FieldLogger) clientmgmt.Manager { return pluginManager }

td.backupStore.On("GetBackupVolumeSnapshots", input.Spec.BackupName).Return(snapshots, nil)
// Simulate a 404/not-found error (tarball has already been removed from storage)
td.backupStore.On("GetBackupContents", input.Spec.BackupName).Return(nil, fmt.Errorf("key not found"))
td.backupStore.On("DeleteBackup", input.Spec.BackupName).Return(nil)

_, err := td.controller.Reconcile(t.Context(), td.req)
require.NoError(t, err)

td.backupStore.AssertCalled(t, "GetBackupContents", input.Spec.BackupName)
td.backupStore.AssertCalled(t, "DeleteBackup", input.Spec.BackupName)

// the dbr should be deleted (not-found is treated as permanent, deletion proceeds)
res := &velerov1api.DeleteBackupRequest{}
err = td.fakeClient.Get(ctx, td.req.NamespacedName, res)
assert.True(t, apierrors.IsNotFound(err), "Expected DBR to be deleted after not-found tarball error, but actual error: %v", err)

// backup CR should be deleted because there are no errors in errs
err = td.fakeClient.Get(t.Context(), types.NamespacedName{
Namespace: velerov1api.DefaultNamespace,
Name: backup.Name,
}, &velerov1api.Backup{})
assert.True(t, apierrors.IsNotFound(err), "Expected backup CR to be deleted after not-found tarball error, but actual error: %v", err)
})
t.Run("Expired request will be deleted if the status is processed", func(t *testing.T) {
expired := time.Date(2018, 4, 3, 12, 0, 0, 0, time.UTC)
Expand Down
2 changes: 1 addition & 1 deletion pkg/util/podvolume/pod_volume_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -156,7 +156,7 @@ func TestGetVolumesByPod(t *testing.T) {
Volumes: []corev1api.Volume{
// PVB Volumes
{Name: "pvbPV1"}, {Name: "pvbPV2"}, {Name: "pvbPV3"},
/// Excluded from PVB because colume mounting default service account token
/// Excluded from PVB because column mounting default service account token
{Name: "default-token-5xq45"},
},
},
Expand Down
Loading