Skip to content

Commit f2c3894

Browse files
committed
fix: * improve logic in syncer and snapshot reconciliation to handle e2e tests failures
* Improve logging and assertion in snapshot e2e test
1 parent c368b21 commit f2c3894

File tree

4 files changed

+135
-97
lines changed

4 files changed

+135
-97
lines changed

pkg/controllers/resources/ingresses/syncer.go

Lines changed: 2 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -62,15 +62,8 @@ func (s *ingressSyncer) Syncer() syncertypes.Sync[client.Object] {
6262

6363
func (s *ingressSyncer) SyncToHost(ctx *synccontext.SyncContext, event *synccontext.SyncToHostEvent[*networkingv1.Ingress]) (ctrl.Result, error) {
6464
if s.applyLimitByClass(ctx, event.Virtual) {
65-
s.EventRecorder().Eventf(
66-
event.Virtual,
67-
nil,
68-
"Warning",
69-
"SyncWarning",
70-
"IngressSyncWarning",
71-
"did not sync ingress %q to host because it does not match the selector under 'sync.fromHost.ingressClasses.selector'",
72-
event.Virtual.GetName(),
73-
)
65+
// applyLimitByClass already logs the appropriate error message with details
66+
// (including ingress class name), so we don't need to log another message here
7467
return ctrl.Result{}, nil
7568
}
7669

pkg/snapshot/controller.go

Lines changed: 0 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -154,13 +154,6 @@ func (c *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (result ct
154154
return ctrl.Result{}, nil
155155
}
156156
}
157-
158-
// If a newer snapshot request exists for the same URL, cancel this one if it's in progress.
159-
err = c.cancelIfNewerRequestExists(ctx, snapshotRequest)
160-
if err != nil {
161-
return ctrl.Result{}, fmt.Errorf("failed to check for newer snapshot requests: %w", err)
162-
}
163-
164157
canContinue, err := c.cancelPreviousRequests(ctx, snapshotRequest)
165158
if err != nil {
166159
return ctrl.Result{}, fmt.Errorf("failed to cancel previous snapshot requests: %w", err)
@@ -578,47 +571,3 @@ func (c *Reconciler) cancelPreviousRequests(ctx context.Context, request *Reques
578571

579572
return currentRequestCanContinue, nil
580573
}
581-
582-
func (c *Reconciler) cancelIfNewerRequestExists(ctx context.Context, request *Request) error {
583-
if request.Status.Phase != RequestPhaseCreatingVolumeSnapshots &&
584-
request.Status.Phase != RequestPhaseCreatingEtcdBackup &&
585-
request.Status.Phase != RequestPhaseNotStarted {
586-
return nil
587-
}
588-
589-
var configMaps corev1.ConfigMapList
590-
listOptions := &client.ListOptions{
591-
LabelSelector: labels.SelectorFromSet(map[string]string{
592-
constants.SnapshotRequestLabel: "",
593-
}),
594-
Namespace: c.getRequestNamespace(),
595-
}
596-
if err := c.client().List(ctx, &configMaps, listOptions); err != nil {
597-
return fmt.Errorf("failed to list snapshot request ConfigMaps: %w", err)
598-
}
599-
600-
for _, configMap := range configMaps.Items {
601-
otherRequest, err := UnmarshalSnapshotRequest(&configMap)
602-
if err != nil {
603-
c.logger.Errorf("Failed to unmarshal snapshot request from ConfigMap %s/%s: %v", configMap.Namespace, configMap.Name, err)
604-
continue
605-
}
606-
if otherRequest.Name == request.Name {
607-
continue
608-
}
609-
if otherRequest.Spec.URL != request.Spec.URL {
610-
continue
611-
}
612-
if !otherRequest.CreationTimestamp.Time.After(request.CreationTimestamp.Time) {
613-
continue
614-
}
615-
if otherRequest.Done() {
616-
continue
617-
}
618-
619-
request.Status.Phase = RequestPhaseCanceling
620-
return nil
621-
}
622-
623-
return nil
624-
}

pkg/snapshot/volumes/csi/snapshothandler.go

Lines changed: 104 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -142,48 +142,68 @@ func (h *snapshotHandler) deleteVolumeSnapshot(ctx context.Context, requestLabel
142142
}
143143
volumeSnapshot, volumeSnapshotContent, err := h.getVolumeSnapshotResources(ctx, volumeSnapshotNamespace, volumeSnapshotName, volumeSnapshotContentName)
144144
if err != nil {
145-
return false, fmt.Errorf("failed to get volume snapshot resources for VolumeSnapshot %s/%s: %w", volumeSnapshotNamespace, volumeSnapshotName, err)
145+
return cleanupInProgress, fmt.Errorf("failed to get volume snapshot resources for VolumeSnapshot %s/%s: %w", volumeSnapshotNamespace, volumeSnapshotName, err)
146146
}
147147

148148
resourceRecreated := false
149149
if volumeSnapshotContent == nil && recreateResourceIfNotFound {
150150
h.logger.Debugf("VolumeSnapshotContent %s not found, recreate it", volumeSnapshotContentName)
151151
_, err = h.createVolumeSnapshotContentResource(ctx, requestLabel, requestName, volumeSnapshotRequest, snapshotHandle, snapshotsv1api.VolumeSnapshotContentDelete)
152152
if err != nil {
153-
return false, fmt.Errorf("failed to recreate VolumeSnapshotContent %s: %w", volumeSnapshotContentName, err)
153+
return cleanupInProgress, fmt.Errorf("failed to recreate VolumeSnapshotContent %s: %w", volumeSnapshotContentName, err)
154154
}
155155
resourceRecreated = true
156156
}
157157
if volumeSnapshot == nil && recreateResourceIfNotFound {
158158
h.logger.Debugf("VolumeSnapshot %s/%s not found, recreate it", volumeSnapshotNamespace, volumeSnapshotName)
159159
_, err = h.createPreProvisionedVolumeSnapshot(ctx, requestLabel, requestName, volumeSnapshotRequest)
160160
if err != nil {
161-
return false, fmt.Errorf("failed to recreate VolumeSnapshot %s/%s: %w", volumeSnapshotNamespace, volumeSnapshotName, err)
161+
return cleanupInProgress, fmt.Errorf("failed to recreate VolumeSnapshot %s/%s: %w", volumeSnapshotNamespace, volumeSnapshotName, err)
162162
}
163163
resourceRecreated = true
164164
}
165165
if resourceRecreated {
166-
return false, nil
166+
return cleanupInProgress, nil
167167
}
168168

169169
if volumeSnapshot == nil && volumeSnapshotContent == nil {
170170
// both the VolumeSnapshot and the VolumeSnapshotContent have been deleted
171-
return true, nil
171+
return cleanupComplete, nil
172172
}
173173
if volumeSnapshot != nil {
174-
volumeSnapshotJSON, _ := json.Marshal(volumeSnapshot)
175-
h.logger.Debugf("VolumeSnapshot %s/%s still not deleted: %s", volumeSnapshot.Namespace, volumeSnapshot.Name, volumeSnapshotJSON)
174+
h.logger.Debugf("VolumeSnapshot %s/%s deletion pending (deletionTimestamp=%v)", volumeSnapshot.Namespace, volumeSnapshot.Name, volumeSnapshot.DeletionTimestamp)
176175
}
177176
if volumeSnapshotContent != nil {
178-
volumeSnapshotContentJSON, _ := json.Marshal(volumeSnapshotContent)
179-
h.logger.Debugf("VolumeSnapshotContent %s still not deleted: %s", volumeSnapshotContent.Name, volumeSnapshotContentJSON)
177+
h.logger.Debugf("VolumeSnapshotContent %s deletion pending (policy=%s, deletionTimestamp=%v)", volumeSnapshotContent.Name, volumeSnapshotContent.Spec.DeletionPolicy, volumeSnapshotContent.DeletionTimestamp)
180178
}
181179

182-
err = h.updateAndDeleteVolumeSnapshotResource(ctx, volumeSnapshot, volumeSnapshotContent, snapshotsv1api.VolumeSnapshotContentDelete)
180+
result, err := h.updateAndDeleteVolumeSnapshotResource(ctx, volumeSnapshot, volumeSnapshotContent, snapshotsv1api.VolumeSnapshotContentDelete)
183181
if err != nil {
184-
return false, fmt.Errorf("failed to delete volume snapshot: %w", err)
182+
if kerrors.IsConflict(err) {
183+
h.logger.Debugf("Conflict deleting %s/%s (likely processed by controller), will retry: %v", volumeSnapshotNamespace, volumeSnapshotName, err)
184+
return cleanupInProgress, nil
185+
}
186+
return cleanupInProgress, fmt.Errorf("failed to delete volume snapshot: %w", err)
187+
}
188+
if result.policyUpdated {
189+
h.logger.Debugf("DeletionPolicy updated for %s/%s, waiting for controller reconciliation", volumeSnapshotNamespace, volumeSnapshotName)
190+
return cleanupInProgress, nil
191+
}
192+
return cleanupInProgress, nil
193+
}
194+
195+
const (
196+
cleanupComplete = true
197+
cleanupInProgress = false
198+
)
199+
200+
func isCleanupComplete(volumeSnapshot *snapshotsv1api.VolumeSnapshot, volumeSnapshotContent *snapshotsv1api.VolumeSnapshotContent) bool {
201+
if volumeSnapshot == nil && volumeSnapshotContent == nil {
202+
return true
185203
}
186-
return false, nil
204+
snapshotDeleting := volumeSnapshot == nil || !volumeSnapshot.DeletionTimestamp.IsZero()
205+
contentDeleting := volumeSnapshotContent == nil || !volumeSnapshotContent.DeletionTimestamp.IsZero()
206+
return snapshotDeleting && contentDeleting
187207
}
188208

189209
// cleanupVolumeSnapshotResource deletes the VolumeSnapshot and the VolumeSnapshotContent with the deletion policy set
@@ -192,23 +212,58 @@ func (h *snapshotHandler) deleteVolumeSnapshot(ctx context.Context, requestLabel
192212
func (h *snapshotHandler) cleanupVolumeSnapshotResource(ctx context.Context, volumeSnapshotNamespace, volumeSnapshotName string) (bool, error) {
193213
volumeSnapshot, volumeSnapshotContent, err := h.getVolumeSnapshotResources(ctx, volumeSnapshotNamespace, volumeSnapshotName, "")
194214
if err != nil {
195-
return false, fmt.Errorf("failed to get volume snapshot resources for VolumeSnapshot %s/%s: %w", volumeSnapshotNamespace, volumeSnapshotName, err)
215+
return cleanupInProgress, fmt.Errorf("failed to get volume snapshot resources for VolumeSnapshot %s/%s: %w", volumeSnapshotNamespace, volumeSnapshotName, err)
196216
}
217+
// If resources are gone, cleanup is done
197218
if volumeSnapshot == nil && volumeSnapshotContent == nil {
198-
return true, nil
219+
return cleanupComplete, nil
220+
}
221+
222+
// Check if resources are already being deleted (have DeletionTimestamp set)
223+
// If they are, we consider them as effectively cleaned up
224+
if isCleanupComplete(volumeSnapshot, volumeSnapshotContent) {
225+
h.logger.Debugf("Volume snapshot resources for%s/%s are marked for deletion, cleanup considered complete", volumeSnapshotNamespace, volumeSnapshotName)
226+
return cleanupComplete, nil
199227
}
200-
err = h.updateAndDeleteVolumeSnapshotResource(ctx, volumeSnapshot, volumeSnapshotContent, snapshotsv1api.VolumeSnapshotContentRetain)
228+
229+
result, err := h.updateAndDeleteVolumeSnapshotResource(ctx, volumeSnapshot, volumeSnapshotContent, snapshotsv1api.VolumeSnapshotContentRetain)
201230
if err != nil {
202-
return false, fmt.Errorf("failed to cleanup volume snapshot resources: %w", err)
231+
if kerrors.IsConflict(err) {
232+
h.logger.Debugf("Conflict cleaning up volume snapshot resources for %s/%s (likely being processed), will retry: %v", volumeSnapshotNamespace, volumeSnapshotName, err)
233+
return cleanupInProgress, nil
234+
}
235+
return cleanupInProgress, fmt.Errorf("failed to cleanup volume snapshot resources for %s/%s: %w", volumeSnapshotNamespace, volumeSnapshotName, err)
236+
}
237+
238+
if result.policyUpdated {
239+
h.logger.Debugf("DeletionPolicy updated for %s/%s, waiting for controller reconciliation", volumeSnapshotNamespace, volumeSnapshotName)
240+
return cleanupInProgress, nil
241+
}
242+
243+
// Re-check to confirm deletion has started (DeletionTimestamp set) or resources are gone.
244+
volumeSnapshot, volumeSnapshotContent, err = h.getVolumeSnapshotResources(ctx, volumeSnapshotNamespace, volumeSnapshotName, "")
245+
if err != nil {
246+
return cleanupInProgress, fmt.Errorf("failed to recheck volume snapshot resources for VolumeSnapshot %s/%s: %w", volumeSnapshotNamespace, volumeSnapshotName, err)
247+
}
248+
249+
if isCleanupComplete(volumeSnapshot, volumeSnapshotContent) {
250+
return cleanupComplete, nil
203251
}
204-
return false, nil
252+
253+
h.logger.Debugf("Cleanup initiated for %s/%s but deletion timestamp not yet set. Will retry.", volumeSnapshotNamespace, volumeSnapshotName)
254+
return cleanupInProgress, nil
255+
}
256+
257+
type updateDeleteResult struct {
258+
policyUpdated bool
259+
deleteIssued bool
205260
}
206261

207262
func (h *snapshotHandler) updateAndDeleteVolumeSnapshotResource(
208263
ctx context.Context,
209264
volumeSnapshot *snapshotsv1api.VolumeSnapshot,
210265
volumeSnapshotContent *snapshotsv1api.VolumeSnapshotContent,
211-
requiredVolumeSnapshotContentDeletionPolicy snapshotsv1api.DeletionPolicy) error {
266+
requiredVolumeSnapshotContentDeletionPolicy snapshotsv1api.DeletionPolicy) (updateDeleteResult, error) {
212267
if volumeSnapshotContent != nil &&
213268
volumeSnapshotContent.DeletionTimestamp.IsZero() &&
214269
volumeSnapshotContent.Spec.DeletionPolicy != requiredVolumeSnapshotContentDeletionPolicy {
@@ -217,16 +272,22 @@ func (h *snapshotHandler) updateAndDeleteVolumeSnapshotResource(
217272
// 2. DeletionPolicy=Delete when deleting the volume snapshots
218273
err := h.setVolumeSnapshotContentDeletionPolicy(ctx, volumeSnapshotContent.Name, requiredVolumeSnapshotContentDeletionPolicy)
219274
if err != nil {
220-
return fmt.Errorf("failed to set VolumeSnapshotContent %s DeletionPolicy to %s: %w", volumeSnapshotContent.Name, requiredVolumeSnapshotContentDeletionPolicy, err)
275+
return updateDeleteResult{}, fmt.Errorf("failed to set VolumeSnapshotContent %s DeletionPolicy to %s: %w", volumeSnapshotContent.Name, requiredVolumeSnapshotContentDeletionPolicy, err)
221276
}
222-
return nil
277+
return updateDeleteResult{policyUpdated: true}, nil
223278
}
224279

225-
err := h.deleteVolumeSnapshotResources(ctx, volumeSnapshot, volumeSnapshotContent)
280+
err := h.deleteVolumeSnapshotObj(ctx, volumeSnapshot)
226281
if err != nil {
227-
return fmt.Errorf("failed to delete VolumeSnapshot and/or VolumeSnapshotContent: %w", err)
282+
return updateDeleteResult{}, fmt.Errorf("failed to delete VolumeSnapshot: %w", err)
228283
}
229-
return nil
284+
285+
err = h.deleteVolumeSnapshotContentObj(ctx, volumeSnapshot, volumeSnapshotContent)
286+
if err != nil {
287+
return updateDeleteResult{}, fmt.Errorf("failed to delete VolumeSnapshotContent: %w", err)
288+
}
289+
290+
return updateDeleteResult{deleteIssued: true}, nil
230291
}
231292

232293
func (h *snapshotHandler) setVolumeSnapshotContentDeletionPolicy(ctx context.Context, volumeSnapshotContentName string, deletionPolicy snapshotsv1api.DeletionPolicy) error {
@@ -246,27 +307,35 @@ func (h *snapshotHandler) setVolumeSnapshotContentDeletionPolicy(ctx context.Con
246307
return nil
247308
}
248309

249-
func (h *snapshotHandler) deleteVolumeSnapshotResources(
310+
// deleteVolumeSnapshotObj deletes the VolumeSnapshot resource.
311+
func (h *snapshotHandler) deleteVolumeSnapshotObj(
250312
ctx context.Context,
251-
volumeSnapshot *snapshotsv1api.VolumeSnapshot,
252-
volumeSnapshotContent *snapshotsv1api.VolumeSnapshotContent) error {
313+
volumeSnapshot *snapshotsv1api.VolumeSnapshot) error {
253314
if volumeSnapshot != nil &&
254315
volumeSnapshot.DeletionTimestamp.IsZero() {
255316
h.logger.Debugf("Delete VolumeSnapshot %s/%s", volumeSnapshot.Namespace, volumeSnapshot.Name)
256317
err := h.snapshotsClient.SnapshotV1().VolumeSnapshots(volumeSnapshot.Namespace).Delete(ctx, volumeSnapshot.Name, metav1.DeleteOptions{})
257318
if err != nil && !kerrors.IsNotFound(err) {
258-
// If the error is a conflict (e.g., controller is adding finalizers),
259-
// it means the snapshot is being processed
319+
// If the error is a conflict, we return it to allow the reconciler to retry with backoff.
260320
if kerrors.IsConflict(err) {
261321
h.logger.Debugf("VolumeSnapshot %s/%s deletion conflicted (likely being processed by controller), will retry", volumeSnapshot.Namespace, volumeSnapshot.Name)
262-
} else {
263-
return fmt.Errorf("failed to delete VolumeSnapshot %s/%s: %w", volumeSnapshot.Namespace, volumeSnapshot.Name, err)
264322
}
323+
return fmt.Errorf("failed to delete VolumeSnapshot %s/%s: %w", volumeSnapshot.Namespace, volumeSnapshot.Name, err)
265324
}
266325
}
326+
return nil
327+
}
328+
329+
// deleteVolumeSnapshotContentObj deletes the VolumeSnapshotContent resource.
330+
func (h *snapshotHandler) deleteVolumeSnapshotContentObj(
331+
ctx context.Context,
332+
volumeSnapshot *snapshotsv1api.VolumeSnapshot,
333+
volumeSnapshotContent *snapshotsv1api.VolumeSnapshotContent) error {
267334
if volumeSnapshotContent != nil &&
268335
volumeSnapshotContent.DeletionTimestamp.IsZero() &&
269-
volumeSnapshotContent.Spec.DeletionPolicy == snapshotsv1api.VolumeSnapshotContentRetain {
336+
// If the VolumeSnapshot is already gone, the controller cannot clean up the content.
337+
// Delete it manually even when the policy is Delete to avoid orphaned content.
338+
(volumeSnapshotContent.Spec.DeletionPolicy == snapshotsv1api.VolumeSnapshotContentRetain || volumeSnapshot == nil) {
270339
// Delete the VolumeSnapshotContent manually in case it has the Retain deletion policy.
271340
// Otherwise, the VolumeSnapshotContent resource will be deleted automatically by the snapshot-controller.
272341
// Here we have 2 cases:
@@ -277,6 +346,10 @@ func (h *snapshotHandler) deleteVolumeSnapshotResources(
277346
h.logger.Debugf("Delete VolumeSnapshotContent %s", volumeSnapshotContent.Name)
278347
err := h.snapshotsClient.SnapshotV1().VolumeSnapshotContents().Delete(ctx, volumeSnapshotContent.Name, metav1.DeleteOptions{})
279348
if err != nil && !kerrors.IsNotFound(err) {
349+
// If the error is a conflict, we return it to allow the reconciler to retry with backoff.
350+
if kerrors.IsConflict(err) {
351+
h.logger.Debugf("VolumeSnapshotContent %s deletion conflicted (likely being processed by controller), will retry", volumeSnapshotContent.Name)
352+
}
280353
return fmt.Errorf("failed to delete VolumeSnapshotContent %s: %w", volumeSnapshotContent.Name, err)
281354
}
282355
}

test/e2e/snapshot/snapshot.go

Lines changed: 29 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ import (
99
"time"
1010

1111
"github.com/ghodss/yaml"
12+
snapshotsv1api "github.com/kubernetes-csi/external-snapshotter/client/v8/apis/volumesnapshot/v1"
1213
snapshotsv1 "github.com/kubernetes-csi/external-snapshotter/client/v8/clientset/versioned"
1314
vclusterconfig "github.com/loft-sh/vcluster/pkg/config"
1415
"github.com/loft-sh/vcluster/pkg/constants"
@@ -566,7 +567,6 @@ var _ = Describe("snapshot and restore", Ordered, func() {
566567
Expect(err).NotTo(HaveOccurred())
567568
Expect(volumeSnapshotContents.Items).To(BeEmpty())
568569
})
569-
570570
It("Deletes the PVC with test data", func(ctx context.Context) {
571571
deletePVC(ctx, f, controllerTestNamespaceName, pvcToRestoreName)
572572
})
@@ -720,17 +720,40 @@ var _ = Describe("snapshot and restore", Ordered, func() {
720720
Should(Succeed())
721721
})
722722

723-
It("completed new new snapshot request", func(ctx context.Context) {
723+
It("completed new snapshot request", func(ctx context.Context) {
724724
Eventually(func(g Gomega, ctx context.Context) {
725725
_, newerSnapshotRequest := getTwoSnapshotRequests(g, ctx, f)
726-
g.Expect(newerSnapshotRequest.Status.Phase).Should(
727-
Equal(snapshot.RequestPhaseCompleted),
728-
fmt.Sprintf("Newer snapshot request %s is not completed, got: %s", newerSnapshotRequest.Name, toJSON(newerSnapshotRequest)))
726+
727+
// First check individual volume snapshot phases for better diagnostics
728+
// This helps identify which specific snapshots are stuck
729729
for pvcName, volumeSnapshot := range newerSnapshotRequest.Status.VolumeSnapshots.Snapshots {
730730
g.Expect(volumeSnapshot.Phase).To(
731731
Equal(volumes.RequestPhaseCompleted),
732-
fmt.Sprintf("New volume snapshot request for PVC %s should be completed, got: %s", pvcName, toJSON(volumeSnapshot)))
732+
fmt.Sprintf("Volume snapshot for PVC %s is not completed. Phase: %s, SnapshotHandle: %s, Error: %s. Full snapshot request: %s",
733+
pvcName,
734+
volumeSnapshot.Phase,
735+
volumeSnapshot.SnapshotHandle,
736+
toJSON(volumeSnapshot.Error),
737+
toJSON(newerSnapshotRequest)))
733738
}
739+
740+
// Then check the overall volumeSnapshots phase
741+
g.Expect(newerSnapshotRequest.Status.VolumeSnapshots.Phase).To(
742+
Equal(volumes.RequestPhaseCompleted),
743+
fmt.Sprintf("VolumeSnapshots phase is not completed for snapshot request %s. Phase: %s, Overall phase: %s, VolumeSnapshots: %s",
744+
newerSnapshotRequest.Name,
745+
newerSnapshotRequest.Status.VolumeSnapshots.Phase,
746+
newerSnapshotRequest.Status.Phase,
747+
toJSON(newerSnapshotRequest.Status.VolumeSnapshots)))
748+
749+
// Finally check the overall snapshot request phase
750+
g.Expect(newerSnapshotRequest.Status.Phase).Should(
751+
Equal(snapshot.RequestPhaseCompleted),
752+
fmt.Sprintf("Newer snapshot request %s is not completed. Overall phase: %s, VolumeSnapshots phase: %s, Full request: %s",
753+
newerSnapshotRequest.Name,
754+
newerSnapshotRequest.Status.Phase,
755+
newerSnapshotRequest.Status.VolumeSnapshots.Phase,
756+
toJSON(newerSnapshotRequest)))
734757
}).WithContext(ctx).
735758
WithPolling(framework.PollInterval).
736759
WithTimeout(framework.PollTimeoutLong).

0 commit comments

Comments
 (0)