Skip to content

Commit a2961cf

Browse files
committed
Handle CnsNotRegisteredFault by re-registering volumes in CNS operations
This commit adds handling for CnsNotRegisteredFault in various CNS volume operations for the WORKLOAD cluster flavor. When a volume operation fails with CnsNotRegisteredFault, the driver now attempts to re-register the volume with CNS and retries the operation. Changes include: - Add clusterId and clusterDistribution parameters to GetManager() for volume re-registration - Add reRegisterVolume() function to re-register unregistered volumes - Add IsCnsNotRegisteredFault() helper to detect the fault type - Add IsCnsVolumeAlreadyExistsFault() helper for idempotent re-registration - Handle CnsNotRegisteredFault in: - AttachVolume - DetachVolume - DeleteVolume (with improved idempotency) - UpdateVolumeMetadata - UpdateVolumeCrypto - ExpandVolume (with improved idempotency) - CreateSnapshot (with improved idempotency and with transaction) - DeleteSnapshot - Add TODO for batch attach reconcile to handle unregistered volumes when QueryBackingTypeFromVirtualDiskInfo is used (PR #3799) The re-registration is only attempted once per operation to prevent infinite loops. If re-registration fails or the retry fails, the original error is returned.
1 parent 635368f commit a2961cf

File tree

18 files changed

+390
-99
lines changed

18 files changed

+390
-99
lines changed

go.mod

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ require (
2525
github.com/stretchr/testify v1.11.1
2626
github.com/vmware-tanzu/vm-operator/api v1.9.1-0.20250923172217-bf5a74e51c65
2727
github.com/vmware-tanzu/vm-operator/external/byok v0.0.0-20250509154507-b93e51fc90fa
28-
github.com/vmware/govmomi v0.53.0-alpha.0.0.20251203163802-5ce652387dac
28+
github.com/vmware/govmomi v0.53.0-alpha.0.0.20251203213634-99f18b71ea8e
2929
go.uber.org/zap v1.27.0
3030
golang.org/x/crypto v0.43.0
3131
golang.org/x/sync v0.18.0

go.sum

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -291,8 +291,8 @@ github.com/vmware-tanzu/vm-operator/api v1.9.1-0.20250923172217-bf5a74e51c65 h1:
291291
github.com/vmware-tanzu/vm-operator/api v1.9.1-0.20250923172217-bf5a74e51c65/go.mod h1:nWTPpxfe4gHuuYuFcrs86+NMxfkqPk3a3IlvI8TCWak=
292292
github.com/vmware-tanzu/vm-operator/external/byok v0.0.0-20250509154507-b93e51fc90fa h1:4MKu14YJ7J54O6QKmT4ds5EUpysWLLtQRMff73cVkmU=
293293
github.com/vmware-tanzu/vm-operator/external/byok v0.0.0-20250509154507-b93e51fc90fa/go.mod h1:8tiuyYslzjLIUmOlXZuGKQdQP2ZgWGCVhVeyptmZYnk=
294-
github.com/vmware/govmomi v0.53.0-alpha.0.0.20251203163802-5ce652387dac h1:E3W+2J1I0B5LyIillKYVQHIb6CpslGcogt7Q+8FHT3c=
295-
github.com/vmware/govmomi v0.53.0-alpha.0.0.20251203163802-5ce652387dac/go.mod h1:FM3GTg002dFFN7l2/hNS0YWC4f78HTw4kvgUwAE52cM=
294+
github.com/vmware/govmomi v0.53.0-alpha.0.0.20251203213634-99f18b71ea8e h1:TG9xuPu9N29Ak1gNs85VsMImNv1bd2l0yNfAMc3imOU=
295+
github.com/vmware/govmomi v0.53.0-alpha.0.0.20251203213634-99f18b71ea8e/go.mod h1:FM3GTg002dFFN7l2/hNS0YWC4f78HTw4kvgUwAE52cM=
296296
github.com/x448/float16 v0.8.4 h1:qLwI1I70+NjRFUR3zs1JPUCgaCXSh3SW62uAKT1mSBM=
297297
github.com/x448/float16 v0.8.4/go.mod h1:14CWIYCyZA/cWjXOioeEpHeN/83MdbZDRQHoFcYsOfg=
298298
github.com/xiang90/probing v0.0.0-20221125231312-a49e3df8f510 h1:S2dVYn90KE98chqDkyE9Z4N61UnQd+KOfgp5Iu53llk=

pkg/common/cns-lib/volume/manager.go

Lines changed: 300 additions & 18 deletions
Large diffs are not rendered by default.

pkg/common/cns-lib/volume/util.go

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -632,3 +632,17 @@ func IsCnsVolumeAlreadyExistsFault(ctx context.Context, faultType string) bool {
632632
log.Infof("Checking fault type: %q is vim.fault.CnsVolumeAlreadyExistsFault", faultType)
633633
return faultType == "vim.fault.CnsVolumeAlreadyExistsFault"
634634
}
635+
636+
// IsCnsNotRegisteredFault checks if the fault is CnsNotRegisteredFault
637+
func IsCnsNotRegisteredFault(ctx context.Context, fault *types.LocalizedMethodFault) bool {
638+
log := logger.GetLogger(ctx)
639+
if fault == nil || fault.Fault == nil {
640+
log.Infof("fault is nil or fault.Fault is nil. Not a CnsNotRegisteredFault")
641+
return false
642+
}
643+
if _, ok := fault.Fault.(*cnstypes.CnsNotRegisteredFault); ok {
644+
log.Infof("observed CnsNotRegisteredFault")
645+
return true
646+
}
647+
return false
648+
}

pkg/common/utils/utils_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -139,7 +139,7 @@ func getCommonUtilsTest(t *testing.T) *commonUtilsTest {
139139
t.Fatal(err)
140140
}
141141

142-
volumeManager, err := cnsvolumes.GetManager(ctx, virtualCenter, nil, false, false, false, "")
142+
volumeManager, err := cnsvolumes.GetManager(ctx, virtualCenter, nil, false, false, false, "", "", "")
143143
if err != nil {
144144
t.Fatalf("failed to create an instance of volume manager. err=%v", err)
145145
}

pkg/csi/service/common/vsphereutil.go

Lines changed: 11 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1225,32 +1225,21 @@ func DeleteSnapshotUtil(ctx context.Context, volumeManager cnsvolume.Manager, cs
12251225
return cnsSnapshotInfo, nil
12261226
}
12271227

1228-
// GetCnsVolumeType is the helper function that determines the volume type based on the volume-id
1229-
func GetCnsVolumeType(ctx context.Context, volumeManager cnsvolume.Manager, volumeId string) (string, error) {
1228+
// GetCnsVolumeType is the helper function that determines the volume type based on the volume-id prefix.
1229+
// If volume ID begins with "file:", it is a file volume, otherwise it is a block volume.
1230+
func GetCnsVolumeType(ctx context.Context, volumeId string) string {
12301231
log := logger.GetLogger(ctx)
12311232
var volumeType string
1232-
queryFilter := cnstypes.CnsQueryFilter{
1233-
VolumeIds: []cnstypes.CnsVolumeId{{Id: volumeId}},
1234-
}
1235-
querySelection := cnstypes.CnsQuerySelection{
1236-
Names: []string{
1237-
string(cnstypes.QuerySelectionNameTypeVolumeType),
1238-
},
1239-
}
1240-
// Select only the volume type.
1241-
queryResult, err := volumeManager.QueryAllVolume(ctx, queryFilter, querySelection)
1242-
if err != nil {
1243-
return "", logger.LogNewErrorCodef(log, codes.Internal,
1244-
"queryVolume failed for volumeID: %q with err=%+v", volumeId, err)
1245-
}
12461233

1247-
if len(queryResult.Volumes) == 0 {
1248-
log.Infof("volume: %s not found during query while determining CNS volume type", volumeId)
1249-
return "", ErrNotFound
1234+
// Determine volume type based on volume ID prefix
1235+
if strings.HasPrefix(volumeId, "file:") {
1236+
volumeType = FileVolumeType
1237+
} else {
1238+
volumeType = BlockVolumeType
12501239
}
1251-
volumeType = queryResult.Volumes[0].VolumeType
1252-
log.Infof("volume: %s is of type: %s", volumeId, volumeType)
1253-
return volumeType, nil
1240+
1241+
log.Infof("volume: %s is of type: %s (determined from volume ID prefix)", volumeId, volumeType)
1242+
return volumeType
12541243
}
12551244

12561245
// GetNodeVMsWithAccessToDatastore finds out NodeVMs which have access to the given

pkg/csi/service/vanilla/controller.go

Lines changed: 3 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -232,7 +232,8 @@ func (c *controller) Init(config *cnsconfig.Config, version string) error {
232232
c.managers.VcenterConfigs[vcenterconfig.Host] = vcenterconfig
233233
volumeManager, err := cnsvolume.GetManager(ctx, vcenter,
234234
operationStore, true, true,
235-
multivCenterTopologyDeployment, cnstypes.CnsClusterFlavorVanilla)
235+
multivCenterTopologyDeployment, cnstypes.CnsClusterFlavorVanilla, config.Global.ClusterID,
236+
config.Global.ClusterDistribution)
236237
if err != nil {
237238
return logger.LogNewErrorf(log, "failed to create an instance of volume manager. err=%v", err)
238239
}
@@ -1617,16 +1618,7 @@ func (c *controller) DeleteVolume(ctx context.Context, req *csi.DeleteVolumeRequ
16171618
}
16181619

16191620
if cnsVolumeType == common.UnknownVolumeType {
1620-
cnsVolumeType, err = common.GetCnsVolumeType(ctx, volumeManager, req.VolumeId)
1621-
if err != nil {
1622-
if err.Error() == common.ErrNotFound.Error() {
1623-
// The volume couldn't be found during query, assuming the delete operation as success
1624-
return &csi.DeleteVolumeResponse{}, "", nil
1625-
} else {
1626-
return nil, csifault.CSIInternalFault, logger.LogNewErrorCodef(log, codes.Internal,
1627-
"failed to determine CNS volume type for volume: %q. Error: %+v", req.VolumeId, err)
1628-
}
1629-
}
1621+
cnsVolumeType = common.GetCnsVolumeType(ctx, req.VolumeId)
16301622
volumeType = convertCnsVolumeType(ctx, cnsVolumeType)
16311623
}
16321624
// Check if the volume contains CNS snapshots only for block volumes.

pkg/csi/service/vanilla/controller_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -243,7 +243,7 @@ func getControllerTest(t *testing.T) *controllerTest {
243243

244244
volumeManager, err := cnsvolume.GetManager(ctx, vcenter,
245245
fakeOpStore, true, false,
246-
false, cnstypes.CnsClusterFlavorVanilla)
246+
false, cnstypes.CnsClusterFlavorVanilla, "", "")
247247
if err != nil {
248248
t.Fatalf("failed to create an instance of volume manager. err=%v", err)
249249
}

pkg/csi/service/vanilla/controller_topology_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -415,7 +415,7 @@ func getControllerTestWithTopology(t *testing.T) *controllerTestTopology {
415415

416416
volumeManager, err := cnsvolume.GetManager(ctxtopology, vcenter,
417417
fakeOpStore, true, false,
418-
false, cnstypes.CnsClusterFlavorVanilla)
418+
false, cnstypes.CnsClusterFlavorVanilla, "", "")
419419
if err != nil {
420420
t.Fatalf("failed to create an instance of volume manager. err=%v", err)
421421
}

pkg/csi/service/wcp/controller.go

Lines changed: 20 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -194,7 +194,7 @@ func (c *controller) Init(config *cnsconfig.Config, version string) error {
194194

195195
volumeManager, err := cnsvolume.GetManager(ctx, vcenter, operationStore,
196196
idempotencyHandlingEnabled, false,
197-
false, cnstypes.CnsClusterFlavorWorkload)
197+
false, cnstypes.CnsClusterFlavorWorkload, config.Global.SupervisorID, config.Global.ClusterDistribution)
198198
if err != nil {
199199
return logger.LogNewErrorf(log, "failed to create an instance of volume manager. err=%v", err)
200200
}
@@ -431,10 +431,9 @@ func (c *controller) ReloadConfiguration(reconnectToVCFromNewConfig bool) error
431431
return logger.LogNewErrorf(log, "failed to reset volume manager. err=%v", err)
432432
}
433433
c.manager.VcenterConfig = newVCConfig
434-
435434
volumeManager, err := cnsvolume.GetManager(ctx, vcenter, operationStore,
436435
idempotencyHandlingEnabled, false,
437-
false, cnstypes.CnsClusterFlavorWorkload)
436+
false, cnstypes.CnsClusterFlavorWorkload, cfg.Global.SupervisorID, cfg.Global.ClusterDistribution)
438437
if err != nil {
439438
return logger.LogNewErrorf(log, "failed to create an instance of volume manager. err=%v", err)
440439
}
@@ -1726,16 +1725,7 @@ func (c *controller) DeleteVolume(ctx context.Context, req *csi.DeleteVolumeRequ
17261725
return nil, csifault.CSIInvalidArgumentFault, err
17271726
}
17281727
if cnsVolumeType == common.UnknownVolumeType {
1729-
cnsVolumeType, err = common.GetCnsVolumeType(ctx, c.manager.VolumeManager, req.VolumeId)
1730-
if err != nil {
1731-
if err.Error() == common.ErrNotFound.Error() {
1732-
// The volume couldn't be found during query, assuming the delete operation as success
1733-
return &csi.DeleteVolumeResponse{}, "", nil
1734-
} else {
1735-
return nil, csifault.CSIInternalFault, logger.LogNewErrorCodef(log, codes.Internal,
1736-
"failed to determine CNS volume type for volume: %q. Error: %+v", req.VolumeId, err)
1737-
}
1738-
}
1728+
cnsVolumeType = common.GetCnsVolumeType(ctx, req.VolumeId)
17391729
volumeType = convertCnsVolumeType(ctx, cnsVolumeType)
17401730
}
17411731
// Check if the volume contains CNS snapshots only for block volumes.
@@ -2437,31 +2427,21 @@ func (c *controller) CreateSnapshot(ctx context.Context, req *csi.CreateSnapshot
24372427
}
24382428
volumeID := req.GetSourceVolumeId()
24392429
volumeType = prometheus.PrometheusBlockVolumeType
2440-
// Query capacity in MB for block volume snapshot
2441-
volumeIds := []cnstypes.CnsVolumeId{{Id: volumeID}}
2442-
cnsVolumeDetailsMap, err := utils.QueryVolumeDetailsUtil(ctx, c.manager.VolumeManager, volumeIds)
2443-
if err != nil {
2444-
return nil, err
2445-
}
2446-
if _, ok := cnsVolumeDetailsMap[volumeID]; !ok {
2447-
return nil, logger.LogNewErrorCodef(log, codes.Internal,
2448-
"cns query volume did not return the volume: %s", volumeID)
2449-
}
2450-
snapshotSizeInMB := cnsVolumeDetailsMap[volumeID].SizeInMB
24512430

2452-
if cnsVolumeDetailsMap[volumeID].VolumeType != common.BlockVolumeType {
2431+
cnsvolumeType := common.GetCnsVolumeType(ctx, volumeID)
2432+
if cnsvolumeType != common.BlockVolumeType {
24532433
return nil, logger.LogNewErrorCodef(log, codes.FailedPrecondition,
2454-
"queried volume doesn't have the expected volume type. Expected VolumeType: %v. "+
2455-
"Queried VolumeType: %v", volumeType, cnsVolumeDetailsMap[volumeID].VolumeType)
2434+
"Expected VolumeType: %v. "+
2435+
"Observed VolumeType: %v", volumeType, cnsvolumeType)
24562436
}
2457-
24582437
// TODO: We may need to add logic to check the limit of max number of snapshots by using
24592438
// GlobalMaxSnapshotsPerBlockVolume etc. variables in the future.
24602439

24612440
// the returned snapshotID below is a combination of CNS VolumeID and CNS SnapshotID concatenated by the "+"
24622441
// sign. That is, a string of "<UUID>+<UUID>". Because, all other CNS snapshot APIs still require both
24632442
// VolumeID and SnapshotID as the input, while corresponding snapshot APIs in upstream CSI require SnapshotID.
24642443
// So, we need to bridge the gap in vSphere CSI driver and return a combined SnapshotID to CSI Snapshotter.
2444+
var err error
24652445
var snapshotID string
24662446
var cnsSnapshotInfo *cnsvolume.CnsSnapshotInfo
24672447
var cnsVolumeInfo *cnsvolumeinfov1alpha1.CNSVolumeInfo
@@ -2517,6 +2497,17 @@ func (c *controller) CreateSnapshot(ctx context.Context, req *csi.CreateSnapshot
25172497
"failed to create snapshot on volume %q with error: %v", volumeID, err)
25182498
}
25192499
}
2500+
// Query capacity in MB for block volume snapshot
2501+
volumeIds := []cnstypes.CnsVolumeId{{Id: volumeID}}
2502+
cnsVolumeDetailsMap, err := utils.QueryVolumeDetailsUtil(ctx, c.manager.VolumeManager, volumeIds)
2503+
if err != nil {
2504+
return nil, err
2505+
}
2506+
if _, ok := cnsVolumeDetailsMap[volumeID]; !ok {
2507+
return nil, logger.LogNewErrorCodef(log, codes.Internal,
2508+
"cns query volume did not return the volume: %s", volumeID)
2509+
}
2510+
snapshotSizeInMB := cnsVolumeDetailsMap[volumeID].SizeInMB
25202511
snapshotCreateTimeInProto := timestamppb.New(cnsSnapshotInfo.SnapshotLatestOperationCompleteTime)
25212512
createSnapshotResponse := &csi.CreateSnapshotResponse{
25222513
Snapshot: &csi.Snapshot{
@@ -2708,11 +2699,7 @@ func (c *controller) ControllerExpandVolume(ctx context.Context, req *csi.Contro
27082699
// Later we may need to define different csi faults.
27092700
// Check if the volume contains CNS snapshots only for block volumes.
27102701
if cnsVolumeType == common.UnknownVolumeType {
2711-
cnsVolumeType, err = common.GetCnsVolumeType(ctx, c.manager.VolumeManager, req.VolumeId)
2712-
if err != nil {
2713-
return nil, csifault.CSIInternalFault, logger.LogNewErrorCodef(log, codes.Internal,
2714-
"failed to determine CNS volume type for volume: %q. Error: %+v", req.VolumeId, err)
2715-
}
2702+
cnsVolumeType = common.GetCnsVolumeType(ctx, req.VolumeId)
27162703
volumeType = convertCnsVolumeType(ctx, cnsVolumeType)
27172704
}
27182705
if cnsVolumeType == common.BlockVolumeType &&

0 commit comments

Comments
 (0)