Skip to content

Commit 6cba8e6

Browse files
committed
Add delay to avoid race conditions during VolumeSnapshotContent deletion (velero-io#9700)
* Add delay to avoid race conditions during VolumeSnapshotContent deletion Signed-off-by: Priyansh Choudhary <im1706@gmail.com> * updated changelog Signed-off-by: Priyansh Choudhary <im1706@gmail.com> * Updated Changelog Signed-off-by: Priyansh Choudhary <im1706@gmail.com>
1 parent bdc00cd commit 6cba8e6

3 files changed

Lines changed: 87 additions & 30 deletions

File tree

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Fix issue #9699, add a 2-second gap between temporary CSI VolumeSnapshotContent create and delete operations

internal/delete/actions/csi/volumesnapshotcontent_action.go

Lines changed: 8 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -27,10 +27,8 @@ import (
2727
corev1api "k8s.io/api/core/v1"
2828
apierrors "k8s.io/apimachinery/pkg/api/errors"
2929
"k8s.io/apimachinery/pkg/runtime"
30-
"k8s.io/apimachinery/pkg/util/wait"
3130
crclient "sigs.k8s.io/controller-runtime/pkg/client"
3231

33-
velerov1api "github.com/vmware-tanzu/velero/pkg/apis/velero/v1"
3432
"github.com/vmware-tanzu/velero/pkg/client"
3533
plugincommon "github.com/vmware-tanzu/velero/pkg/plugin/framework/common"
3634
"github.com/vmware-tanzu/velero/pkg/plugin/velero"
@@ -44,6 +42,10 @@ type volumeSnapshotContentDeleteItemAction struct {
4442
crClient crclient.Client
4543
}
4644

45+
const tempVSCCreateDeleteGap = 2 * time.Second
46+
47+
var sleepBetweenTempVSCCreateAndDelete = time.Sleep
48+
4749
// AppliesTo returns information indicating
4850
// VolumeSnapshotContentRestoreItemAction action should be invoked
4951
// while restoring VolumeSnapshotContent.snapshot.storage.k8s.io resources
@@ -115,35 +117,11 @@ func (p *volumeSnapshotContentDeleteItemAction) Execute(
115117
return errors.Wrapf(err, "fail to create VolumeSnapshotContent %s", snapCont.Name)
116118
}
117119

118-
// Read resource timeout from backup annotation, if not set, use default value.
119-
timeout, err := time.ParseDuration(
120-
input.Backup.Annotations[velerov1api.ResourceTimeoutAnnotation])
121-
if err != nil {
122-
p.log.Warnf("fail to parse resource timeout annotation %s: %s",
123-
input.Backup.Annotations[velerov1api.ResourceTimeoutAnnotation], err.Error())
124-
timeout = 10 * time.Minute
125-
}
126-
p.log.Debugf("resource timeout is set to %s", timeout.String())
127-
128-
interval := 5 * time.Second
129-
130-
// Wait until VSC created and ReadyToUse is true.
131-
if err := wait.PollUntilContextTimeout(
132-
context.Background(),
133-
interval,
134-
timeout,
135-
true,
136-
func(ctx context.Context) (bool, error) {
137-
return checkVSCReadiness(ctx, &snapCont, p.crClient)
138-
},
139-
); err != nil {
140-
// Clean up the VSC we created since it can't become ready
141-
if deleteErr := p.crClient.Delete(context.TODO(), &snapCont); deleteErr != nil && !apierrors.IsNotFound(deleteErr) {
142-
p.log.WithError(deleteErr).Errorf("Failed to clean up VolumeSnapshotContent %s", snapCont.Name)
143-
}
144-
return errors.Wrapf(err, "fail to wait VolumeSnapshotContent %s becomes ready.", snapCont.Name)
145-
}
120+
// Add a small delay before delete to avoid create/delete race conditions in CSI controllers.
121+
sleepBetweenTempVSCCreateAndDelete(tempVSCCreateDeleteGap)
146122

123+
// Delete the temp VSC immediately to trigger cloud snapshot removal.
124+
// The CSI driver will handle the actual cloud snapshot deletion.
147125
if err := p.crClient.Delete(
148126
context.TODO(),
149127
&snapCont,

internal/delete/actions/csi/volumesnapshotcontent_action_test.go

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ import (
2020
"context"
2121
"fmt"
2222
"testing"
23+
"time"
2324

2425
snapshotv1api "github.com/kubernetes-csi/external-snapshotter/client/v8/apis/volumesnapshot/v1"
2526
"github.com/pkg/errors"
@@ -37,6 +38,50 @@ import (
3738
velerotest "github.com/vmware-tanzu/velero/pkg/test"
3839
)
3940

41+
// fakeClientWithErrors wraps a real client and injects errors for specific operations.
42+
type fakeClientWithErrors struct {
43+
crclient.Client
44+
getError error
45+
patchError error
46+
deleteError error
47+
}
48+
49+
type fakeClientWithCallTracking struct {
50+
crclient.Client
51+
events *[]string
52+
}
53+
54+
func (c *fakeClientWithCallTracking) Create(ctx context.Context, obj crclient.Object, opts ...crclient.CreateOption) error {
55+
*c.events = append(*c.events, "create")
56+
return c.Client.Create(ctx, obj, opts...)
57+
}
58+
59+
func (c *fakeClientWithCallTracking) Delete(ctx context.Context, obj crclient.Object, opts ...crclient.DeleteOption) error {
60+
*c.events = append(*c.events, "delete")
61+
return c.Client.Delete(ctx, obj, opts...)
62+
}
63+
64+
func (c *fakeClientWithErrors) Get(ctx context.Context, key crclient.ObjectKey, obj crclient.Object, opts ...crclient.GetOption) error {
65+
if c.getError != nil {
66+
return c.getError
67+
}
68+
return c.Client.Get(ctx, key, obj, opts...)
69+
}
70+
71+
func (c *fakeClientWithErrors) Patch(ctx context.Context, obj crclient.Object, patch crclient.Patch, opts ...crclient.PatchOption) error {
72+
if c.patchError != nil {
73+
return c.patchError
74+
}
75+
return c.Client.Patch(ctx, obj, patch, opts...)
76+
}
77+
78+
func (c *fakeClientWithErrors) Delete(ctx context.Context, obj crclient.Object, opts ...crclient.DeleteOption) error {
79+
if c.deleteError != nil {
80+
return c.deleteError
81+
}
82+
return c.Client.Delete(ctx, obj, opts...)
83+
}
84+
4085
func TestVSCExecute(t *testing.T) {
4186
snapshotHandleStr := "test"
4287
tests := []struct {
@@ -239,6 +284,39 @@ func TestCheckVSCReadiness(t *testing.T) {
239284
}
240285
}
241286

287+
func TestVSCExecute_CreateSleepDeleteOrder(t *testing.T) {
288+
snapshotHandleStr := "test"
289+
vsc := builder.ForVolumeSnapshotContent("bar").
290+
ObjectMeta(builder.WithLabelsMap(map[string]string{velerov1api.BackupNameLabel: "backup"})).
291+
Status(&snapshotv1api.VolumeSnapshotContentStatus{SnapshotHandle: &snapshotHandleStr}).
292+
Result()
293+
294+
vscMap, err := runtime.DefaultUnstructuredConverter.ToUnstructured(vsc)
295+
require.NoError(t, err)
296+
297+
events := make([]string, 0, 3)
298+
realClient := velerotest.NewFakeControllerRuntimeClient(t)
299+
trackingClient := &fakeClientWithCallTracking{Client: realClient, events: &events}
300+
301+
originalSleep := sleepBetweenTempVSCCreateAndDelete
302+
t.Cleanup(func() {
303+
sleepBetweenTempVSCCreateAndDelete = originalSleep
304+
})
305+
306+
sleepBetweenTempVSCCreateAndDelete = func(d time.Duration) {
307+
require.Equal(t, tempVSCCreateDeleteGap, d)
308+
events = append(events, "sleep")
309+
}
310+
311+
p := volumeSnapshotContentDeleteItemAction{log: logrus.StandardLogger(), crClient: trackingClient}
312+
err = p.Execute(&velero.DeleteItemActionExecuteInput{
313+
Item: &unstructured.Unstructured{Object: vscMap},
314+
Backup: builder.ForBackup("velero", "backup").Result(),
315+
})
316+
require.NoError(t, err)
317+
require.Equal(t, []string{"create", "sleep", "delete"}, events)
318+
}
319+
242320
func boolPtr(b bool) *bool {
243321
return &b
244322
}

0 commit comments

Comments
 (0)