Skip to content

Commit 97435cb

Browse files
committed
Add delay to avoid race conditions during VolumeSnapshotContent deletion (#9700)
* Add delay to avoid race conditions during VolumeSnapshotContent deletion Signed-off-by: Priyansh Choudhary <im1706@gmail.com> * updated changelog Signed-off-by: Priyansh Choudhary <im1706@gmail.com> * Updated Changelog Signed-off-by: Priyansh Choudhary <im1706@gmail.com>
1 parent d57ecd3 commit 97435cb

3 files changed

Lines changed: 156 additions & 26 deletions

File tree

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Fix issue #9699, add a 2-second gap between temporary CSI VolumeSnapshotContent create and delete operations

internal/delete/actions/csi/volumesnapshotcontent_action.go

Lines changed: 8 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -27,10 +27,8 @@ import (
2727
corev1api "k8s.io/api/core/v1"
2828
apierrors "k8s.io/apimachinery/pkg/api/errors"
2929
"k8s.io/apimachinery/pkg/runtime"
30-
"k8s.io/apimachinery/pkg/util/wait"
3130
crclient "sigs.k8s.io/controller-runtime/pkg/client"
3231

33-
velerov1api "github.com/vmware-tanzu/velero/pkg/apis/velero/v1"
3432
"github.com/vmware-tanzu/velero/pkg/client"
3533
plugincommon "github.com/vmware-tanzu/velero/pkg/plugin/framework/common"
3634
"github.com/vmware-tanzu/velero/pkg/plugin/velero"
@@ -44,6 +42,10 @@ type volumeSnapshotContentDeleteItemAction struct {
4442
crClient crclient.Client
4543
}
4644

45+
const tempVSCCreateDeleteGap = 2 * time.Second
46+
47+
var sleepBetweenTempVSCCreateAndDelete = time.Sleep
48+
4749
// AppliesTo returns information indicating
4850
// VolumeSnapshotContentRestoreItemAction action should be invoked
4951
// while restoring VolumeSnapshotContent.snapshot.storage.k8s.io resources
@@ -107,31 +109,11 @@ func (p *volumeSnapshotContentDeleteItemAction) Execute(
107109
return errors.Wrapf(err, "fail to create VolumeSnapshotContent %s", snapCont.Name)
108110
}
109111

110-
// Read resource timeout from backup annotation, if not set, use default value.
111-
timeout, err := time.ParseDuration(
112-
input.Backup.Annotations[velerov1api.ResourceTimeoutAnnotation])
113-
if err != nil {
114-
p.log.Warnf("fail to parse resource timeout annotation %s: %s",
115-
input.Backup.Annotations[velerov1api.ResourceTimeoutAnnotation], err.Error())
116-
timeout = 10 * time.Minute
117-
}
118-
p.log.Debugf("resource timeout is set to %s", timeout.String())
119-
120-
interval := 5 * time.Second
121-
122-
// Wait until VSC created and ReadyToUse is true.
123-
if err := wait.PollUntilContextTimeout(
124-
context.Background(),
125-
interval,
126-
timeout,
127-
true,
128-
func(ctx context.Context) (bool, error) {
129-
return checkVSCReadiness(ctx, &snapCont, p.crClient)
130-
},
131-
); err != nil {
132-
return errors.Wrapf(err, "fail to wait VolumeSnapshotContent %s becomes ready.", snapCont.Name)
133-
}
112+
// Add a small delay before delete to avoid create/delete race conditions in CSI controllers.
113+
sleepBetweenTempVSCCreateAndDelete(tempVSCCreateDeleteGap)
134114

115+
// Delete the temp VSC immediately to trigger cloud snapshot removal.
116+
// The CSI driver will handle the actual cloud snapshot deletion.
135117
if err := p.crClient.Delete(
136118
context.TODO(),
137119
&snapCont,

internal/delete/actions/csi/volumesnapshotcontent_action_test.go

Lines changed: 147 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ import (
2020
"context"
2121
"fmt"
2222
"testing"
23+
"time"
2324

2425
snapshotv1api "github.com/kubernetes-csi/external-snapshotter/client/v8/apis/volumesnapshot/v1"
2526
"github.com/pkg/errors"
@@ -37,6 +38,50 @@ import (
3738
velerotest "github.com/vmware-tanzu/velero/pkg/test"
3839
)
3940

41+
// fakeClientWithErrors wraps a real client and injects errors for specific operations.
42+
type fakeClientWithErrors struct {
43+
crclient.Client
44+
getError error
45+
patchError error
46+
deleteError error
47+
}
48+
49+
type fakeClientWithCallTracking struct {
50+
crclient.Client
51+
events *[]string
52+
}
53+
54+
func (c *fakeClientWithCallTracking) Create(ctx context.Context, obj crclient.Object, opts ...crclient.CreateOption) error {
55+
*c.events = append(*c.events, "create")
56+
return c.Client.Create(ctx, obj, opts...)
57+
}
58+
59+
func (c *fakeClientWithCallTracking) Delete(ctx context.Context, obj crclient.Object, opts ...crclient.DeleteOption) error {
60+
*c.events = append(*c.events, "delete")
61+
return c.Client.Delete(ctx, obj, opts...)
62+
}
63+
64+
func (c *fakeClientWithErrors) Get(ctx context.Context, key crclient.ObjectKey, obj crclient.Object, opts ...crclient.GetOption) error {
65+
if c.getError != nil {
66+
return c.getError
67+
}
68+
return c.Client.Get(ctx, key, obj, opts...)
69+
}
70+
71+
func (c *fakeClientWithErrors) Patch(ctx context.Context, obj crclient.Object, patch crclient.Patch, opts ...crclient.PatchOption) error {
72+
if c.patchError != nil {
73+
return c.patchError
74+
}
75+
return c.Client.Patch(ctx, obj, patch, opts...)
76+
}
77+
78+
func (c *fakeClientWithErrors) Delete(ctx context.Context, obj crclient.Object, opts ...crclient.DeleteOption) error {
79+
if c.deleteError != nil {
80+
return c.deleteError
81+
}
82+
return c.Client.Delete(ctx, obj, opts...)
83+
}
84+
4085
func TestVSCExecute(t *testing.T) {
4186
snapshotHandleStr := "test"
4287
tests := []struct {
@@ -206,4 +251,106 @@ func TestCheckVSCReadiness(t *testing.T) {
206251
}
207252
})
208253
}
254+
255+
// Error injection tests for tryDeleteOriginalVSC
256+
t.Run("Get returns non-NotFound error, returns false", func(t *testing.T) {
257+
errClient := &fakeClientWithErrors{
258+
Client: velerotest.NewFakeControllerRuntimeClient(t),
259+
getError: fmt.Errorf("connection refused"),
260+
}
261+
p := &volumeSnapshotContentDeleteItemAction{
262+
log: logrus.StandardLogger(),
263+
crClient: errClient,
264+
}
265+
require.False(t, p.tryDeleteOriginalVSC(t.Context(), "some-vsc"))
266+
})
267+
268+
t.Run("Patch fails, returns false", func(t *testing.T) {
269+
realClient := velerotest.NewFakeControllerRuntimeClient(t)
270+
vsc := &snapshotv1api.VolumeSnapshotContent{
271+
ObjectMeta: metav1.ObjectMeta{Name: "patch-fail-vsc"},
272+
Spec: snapshotv1api.VolumeSnapshotContentSpec{
273+
DeletionPolicy: snapshotv1api.VolumeSnapshotContentRetain,
274+
Driver: "disk.csi.azure.com",
275+
Source: snapshotv1api.VolumeSnapshotContentSource{SnapshotHandle: stringPtr("snap-789")},
276+
VolumeSnapshotRef: corev1api.ObjectReference{Name: "vs-3", Namespace: "default"},
277+
},
278+
}
279+
require.NoError(t, realClient.Create(t.Context(), vsc))
280+
281+
errClient := &fakeClientWithErrors{
282+
Client: realClient,
283+
patchError: fmt.Errorf("patch forbidden"),
284+
}
285+
p := &volumeSnapshotContentDeleteItemAction{
286+
log: logrus.StandardLogger(),
287+
crClient: errClient,
288+
}
289+
require.False(t, p.tryDeleteOriginalVSC(t.Context(), "patch-fail-vsc"))
290+
})
291+
292+
t.Run("Delete fails, returns false", func(t *testing.T) {
293+
realClient := velerotest.NewFakeControllerRuntimeClient(t)
294+
vsc := &snapshotv1api.VolumeSnapshotContent{
295+
ObjectMeta: metav1.ObjectMeta{Name: "delete-fail-vsc"},
296+
Spec: snapshotv1api.VolumeSnapshotContentSpec{
297+
DeletionPolicy: snapshotv1api.VolumeSnapshotContentDelete,
298+
Driver: "disk.csi.azure.com",
299+
Source: snapshotv1api.VolumeSnapshotContentSource{SnapshotHandle: stringPtr("snap-999")},
300+
VolumeSnapshotRef: corev1api.ObjectReference{Name: "vs-4", Namespace: "default"},
301+
},
302+
}
303+
require.NoError(t, realClient.Create(t.Context(), vsc))
304+
305+
errClient := &fakeClientWithErrors{
306+
Client: realClient,
307+
deleteError: fmt.Errorf("delete forbidden"),
308+
}
309+
p := &volumeSnapshotContentDeleteItemAction{
310+
log: logrus.StandardLogger(),
311+
crClient: errClient,
312+
}
313+
require.False(t, p.tryDeleteOriginalVSC(t.Context(), "delete-fail-vsc"))
314+
})
315+
}
316+
317+
func TestVSCExecute_CreateSleepDeleteOrder(t *testing.T) {
318+
snapshotHandleStr := "test"
319+
vsc := builder.ForVolumeSnapshotContent("bar").
320+
ObjectMeta(builder.WithLabelsMap(map[string]string{velerov1api.BackupNameLabel: "backup"})).
321+
Status(&snapshotv1api.VolumeSnapshotContentStatus{SnapshotHandle: &snapshotHandleStr}).
322+
Result()
323+
324+
vscMap, err := runtime.DefaultUnstructuredConverter.ToUnstructured(vsc)
325+
require.NoError(t, err)
326+
327+
events := make([]string, 0, 3)
328+
realClient := velerotest.NewFakeControllerRuntimeClient(t)
329+
trackingClient := &fakeClientWithCallTracking{Client: realClient, events: &events}
330+
331+
originalSleep := sleepBetweenTempVSCCreateAndDelete
332+
t.Cleanup(func() {
333+
sleepBetweenTempVSCCreateAndDelete = originalSleep
334+
})
335+
336+
sleepBetweenTempVSCCreateAndDelete = func(d time.Duration) {
337+
require.Equal(t, tempVSCCreateDeleteGap, d)
338+
events = append(events, "sleep")
339+
}
340+
341+
p := volumeSnapshotContentDeleteItemAction{log: logrus.StandardLogger(), crClient: trackingClient}
342+
err = p.Execute(&velero.DeleteItemActionExecuteInput{
343+
Item: &unstructured.Unstructured{Object: vscMap},
344+
Backup: builder.ForBackup("velero", "backup").Result(),
345+
})
346+
require.NoError(t, err)
347+
require.Equal(t, []string{"create", "sleep", "delete"}, events)
348+
}
349+
350+
func boolPtr(b bool) *bool {
351+
return &b
352+
}
353+
354+
func stringPtr(s string) *string {
355+
return &s
209356
}

0 commit comments

Comments
 (0)