Skip to content

Commit fea6f30

Browse files
priyansh17blackpiglet
authored andcommitted
fix: fail backup early when CSI VolumeSnapshot error persists beyond CSISnapshotTimeout
Signed-off-by: Priyansh Choudhary <im1706@gmail.com>
1 parent d3f4b2c commit fea6f30

4 files changed

Lines changed: 79 additions & 3 deletions

File tree

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Fix backup stuck in WaitingForPluginOperations when CSI VolumeSnapshot has a persistent error beyond CSISnapshotTimeout

pkg/apis/velero/v1/labels_annotations.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -178,4 +178,9 @@ const (
178178

179179
// Annotation prefix on Backup to override VGS class per CSI driver
180180
VolumeGroupSnapshotClassAnnotationBackupPrefix = "velero.io/csi-volumegroupsnapshot-class_"
181+
182+
// VSErrorFirstObservedTimeAnnotation is the annotation key used to record the time
183+
// when a VolumeSnapshot error was first observed during an async backup operation.
184+
// This is used by Progress() to detect errors that persist beyond CSISnapshotTimeout.
185+
VSErrorFirstObservedTimeAnnotation = "velero.io/vs-error-first-observed-time"
181186
)

pkg/backup/actions/csi/volumesnapshot_action.go

Lines changed: 39 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -294,8 +294,45 @@ func (p *volumeSnapshotBackupItemAction) Progress(
294294
if vs.Status.Error.Message != nil {
295295
errorMessage = *vs.Status.Error.Message
296296
}
297-
p.log.Warnf("VolumeSnapshot has a temporary error %s. Snapshot controller will retry later.",
298-
errorMessage)
297+
298+
now := time.Now()
299+
firstObservedTime := now
300+
301+
if existingTime, ok := vs.Annotations[velerov1api.VSErrorFirstObservedTimeAnnotation]; ok {
302+
if t, err := time.Parse(time.RFC3339, existingTime); err == nil {
303+
firstObservedTime = t
304+
} else {
305+
p.log.Warnf("VolumeSnapshot %s/%s has an unparseable %s annotation value %q, resetting timer: %v",
306+
vs.Namespace, vs.Name, velerov1api.VSErrorFirstObservedTimeAnnotation, existingTime, err)
307+
}
308+
} else {
309+
// First time we observe this error — stamp the annotation onto the VS.
310+
originVS := vs.DeepCopy()
311+
kubeutil.AddAnnotations(&vs.ObjectMeta, map[string]string{
312+
velerov1api.VSErrorFirstObservedTimeAnnotation: now.Format(time.RFC3339),
313+
})
314+
if patchErr := p.crClient.Patch(
315+
context.Background(), vs, crclient.MergeFrom(originVS),
316+
); patchErr != nil {
317+
p.log.Warnf("Failed to patch VolumeSnapshot %s/%s with error first observed time: %v",
318+
vs.Namespace, vs.Name, patchErr)
319+
}
320+
}
321+
322+
timeout := backup.Spec.CSISnapshotTimeout.Duration
323+
if timeout > 0 && now.Sub(firstObservedTime) >= timeout {
324+
p.log.Errorf(
325+
"VolumeSnapshot %s/%s has a persistent error beyond CSISnapshotTimeout (%s): %s",
326+
vs.Namespace, vs.Name, timeout, errorMessage)
327+
progress.Completed = true
328+
progress.Updated = now
329+
progress.Err = fmt.Sprintf("VolumeSnapshot %s/%s has a persistent error: %s",
330+
vs.Namespace, vs.Name, errorMessage)
331+
return progress, nil
332+
}
333+
334+
p.log.Warnf("VolumeSnapshot %s/%s has an error within the CSISnapshotTimeout window: %s. Snapshot controller will retry later.",
335+
vs.Namespace, vs.Name, errorMessage)
299336

300337
return progress, nil
301338
}

pkg/backup/actions/csi/volumesnapshot_action_test.go

Lines changed: 34 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ package csi
1919
import (
2020
"fmt"
2121
"testing"
22+
"time"
2223

2324
"github.com/google/go-cmp/cmp"
2425
"github.com/google/go-cmp/cmp/cmpopts"
@@ -219,7 +220,7 @@ func TestVSProgress(t *testing.T) {
219220
expectedErr: false,
220221
},
221222
{
222-
name: "VS status has error",
223+
name: "VS status has error, no prior annotation, no CSISnapshotTimeout configured",
223224
operationID: "ns/name/2024-04-11T18:49:00+08:00",
224225
vs: builder.ForVolumeSnapshot("ns", "name").Status().
225226
StatusError(snapshotv1api.VolumeSnapshotError{
@@ -228,6 +229,38 @@ func TestVSProgress(t *testing.T) {
228229
backup: builder.ForBackup("velero", "backup").Result(),
229230
expectedErr: false,
230231
},
232+
{
233+
name: "VS status has error, annotation within CSISnapshotTimeout",
234+
operationID: "ns/name/2024-04-11T18:49:00+08:00",
235+
vs: builder.ForVolumeSnapshot("ns", "name").
236+
ObjectMeta(builder.WithAnnotations(
237+
velerov1api.VSErrorFirstObservedTimeAnnotation, time.Now().Format(time.RFC3339),
238+
)).
239+
Status().
240+
StatusError(snapshotv1api.VolumeSnapshotError{
241+
Message: &errorStr,
242+
}).Result(),
243+
backup: builder.ForBackup("velero", "backup").CSISnapshotTimeout(10 * time.Minute).Result(),
244+
expectedErr: false,
245+
},
246+
{
247+
name: "VS status has persistent error beyond CSISnapshotTimeout",
248+
operationID: "ns/name/2024-04-11T18:49:00+08:00",
249+
vs: builder.ForVolumeSnapshot("ns", "name").
250+
ObjectMeta(builder.WithAnnotations(
251+
velerov1api.VSErrorFirstObservedTimeAnnotation, "2024-04-11T18:49:00+08:00",
252+
)).
253+
Status().
254+
StatusError(snapshotv1api.VolumeSnapshotError{
255+
Message: &errorStr,
256+
}).Result(),
257+
backup: builder.ForBackup("velero", "backup").CSISnapshotTimeout(10 * time.Minute).Result(),
258+
expectedErr: false,
259+
expectedProgress: &velero.OperationProgress{
260+
Completed: true,
261+
Err: fmt.Sprintf("VolumeSnapshot ns/name has a persistent error: %s", errorStr),
262+
},
263+
},
231264
{
232265
name: "Fail to get VSC",
233266
operationID: "ns/name/2024-04-11T18:49:00+08:00",

0 commit comments

Comments
 (0)