Skip to content

Commit 1a9117b

Browse files
committed
Address review comments on ensureKerberosCache
- Use os.Lstat + os.Readlink to properly validate symlinks instead of os.Stat - Accept any existing symlink pointing to a valid cache file (concurrent mount safe) - Use atomic temp-symlink + os.Rename instead of remove-then-create (no gap) - Remove TryAcquire lock that caused concurrent mounts to fail with Aborted - Include rich context (link name, target, credUID) in all error messages - Ensure consistent tab indentation (gofmt) - Update tests: valid symlink from previous volume is now kept, concurrent test no longer expects codes.Aborted
1 parent f3cf7c6 commit 1a9117b

2 files changed

Lines changed: 62 additions & 24 deletions

File tree

pkg/smb/nodeserver.go

Lines changed: 54 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -603,36 +603,73 @@ func (d *Driver) ensureKerberosCache(krb5CacheDirectory, krb5Prefix, volumeID st
603603
if err != nil {
604604
return false, err
605605
}
606-
607606
volumeIDCacheFileName := volumeKerberosCacheName(volumeID)
608-
volumeIDCacheAbsolutePath := getKerberosFilePath(krb5CacheDirectory, volumeIDCacheFileName)
609607

608+
volumeIDCacheAbsolutePath := getKerberosFilePath(krb5CacheDirectory, volumeIDCacheFileName)
610609
if err := os.WriteFile(volumeIDCacheAbsolutePath, content, os.FileMode(0700)); err != nil {
611610
return false, status.Error(codes.Internal, fmt.Sprintf("Couldn't write kerberos cache to file %s: %v", volumeIDCacheAbsolutePath, err))
612611
}
613612
if err := os.Chown(volumeIDCacheAbsolutePath, credUID, credUID); err != nil {
614613
return false, status.Error(codes.Internal, fmt.Sprintf("Couldn't chown kerberos cache %s to user %d: %v", volumeIDCacheAbsolutePath, credUID, err))
615614
}
616615

617-
cruidLockKey := fmt.Sprintf("cruid-%d", credUID)
618-
if acquired := d.volumeLocks.TryAcquire(cruidLockKey); !acquired {
619-
return false, status.Errorf(codes.Aborted, "another kerberos cache operation for CRUID %d is already in progress", credUID)
616+
// Check if a valid symlink already exists — if so, leave it alone for concurrent mounts.
617+
// Use Lstat so a dangling symlink is still detected as present.
618+
shouldCreateSymlink := true
619+
fileInfo, statErr := os.Lstat(krb5CacheFileName)
620+
if statErr == nil {
621+
if fileInfo.Mode()&os.ModeSymlink != 0 {
622+
symlinkTarget, readErr := os.Readlink(krb5CacheFileName)
623+
if readErr != nil {
624+
return false, status.Error(codes.Internal, fmt.Sprintf("Couldn't read symlink %s: %v", krb5CacheFileName, readErr))
625+
}
626+
// Accept any existing symlink that points to a valid, readable cache file.
627+
// This avoids racing with concurrent mounts for the same cruid but different volumes.
628+
if _, targetErr := os.Stat(krb5CacheFileName); targetErr == nil {
629+
klog.V(2).Infof("Valid symlink already exists [%s -> %s], leaving it alone for concurrent mount.", krb5CacheFileName, symlinkTarget)
630+
shouldCreateSymlink = false
631+
}
632+
}
633+
if shouldCreateSymlink {
634+
if err := os.Remove(krb5CacheFileName); err != nil && !os.IsNotExist(err) {
635+
return false, status.Error(codes.Internal, fmt.Sprintf("Couldn't remove existing kerberos cache path %s: %v", krb5CacheFileName, err))
636+
}
637+
}
638+
} else if !os.IsNotExist(statErr) {
639+
return false, status.Error(codes.Internal, fmt.Sprintf("Couldn't inspect kerberos cache path %s: %v", krb5CacheFileName, statErr))
620640
}
621-
defer d.volumeLocks.Release(cruidLockKey)
622641

623-
// Use Lstat so a dangling symlink is still detected as present — os.Stat
624-
// would follow the link and report ENOENT, causing the subsequent Symlink
625-
// to fail with "file exists".
626-
if _, err := os.Lstat(krb5CacheFileName); os.IsNotExist(err) {
627-
klog.V(2).Infof("Symlink file doesn't exist, it will be created [%s]", krb5CacheFileName)
628-
} else {
629-
if err := os.Remove(krb5CacheFileName); err != nil {
630-
klog.Warningf("Couldn't delete the file [%s]: %v", krb5CacheFileName, err)
642+
if shouldCreateSymlink {
643+
// Use atomic temp-symlink + rename to avoid a window where krb5CacheFileName is missing.
644+
created := false
645+
var tempSymlinkName string
646+
for attempt := 0; attempt < 5; attempt++ {
647+
tempSymlinkName = filepath.Join(filepath.Dir(krb5CacheFileName), fmt.Sprintf(".%s.tmp.%d", filepath.Base(krb5CacheFileName), time.Now().UnixNano()))
648+
if err := os.Symlink(volumeIDCacheAbsolutePath, tempSymlinkName); err != nil {
649+
if os.IsExist(err) {
650+
klog.V(2).Infof("Temporary symlink path %s already exists, retrying.", tempSymlinkName)
651+
continue
652+
}
653+
return false, status.Error(codes.Internal, fmt.Sprintf("Couldn't create temporary symlink %s -> %s for credUID %d: %v", tempSymlinkName, volumeIDCacheAbsolutePath, credUID, err))
654+
}
655+
created = true
656+
break
657+
}
658+
if !created {
659+
return false, status.Error(codes.Internal, fmt.Sprintf("Couldn't create temporary symlink for kerberos cache %s after retries", krb5CacheFileName))
631660
}
632-
}
633661

634-
if err := os.Symlink(volumeIDCacheAbsolutePath, krb5CacheFileName); err != nil {
635-
return false, status.Error(codes.Internal, fmt.Sprintf("Couldn't create symlink to a cache file %s->%s for user %d: %v", krb5CacheFileName, volumeIDCacheFileName, credUID, err))
662+
if err := os.Rename(tempSymlinkName, krb5CacheFileName); err != nil {
663+
if removeErr := os.Remove(krb5CacheFileName); removeErr != nil && !os.IsNotExist(removeErr) {
664+
os.Remove(tempSymlinkName)
665+
return false, status.Error(codes.Internal, fmt.Sprintf("Couldn't replace kerberos symlink %s: rename failed: %v, remove failed: %v", krb5CacheFileName, err, removeErr))
666+
}
667+
if retryErr := os.Rename(tempSymlinkName, krb5CacheFileName); retryErr != nil {
668+
os.Remove(tempSymlinkName)
669+
return false, status.Error(codes.Internal, fmt.Sprintf("Couldn't atomically update kerberos symlink %s -> %s for credUID %d: %v", krb5CacheFileName, volumeIDCacheAbsolutePath, credUID, retryErr))
670+
}
671+
}
672+
klog.V(2).Infof("Updated kerberos symlink %s -> %s", krb5CacheFileName, volumeIDCacheAbsolutePath)
636673
}
637674

638675
return true, nil

pkg/smb/nodeserver_test.go

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1010,7 +1010,7 @@ func TestEnsureKerberosCache(t *testing.T) {
10101010
setup: func(_ *testing.T, _, _ string) {},
10111011
},
10121012
{
1013-
desc: "valid symlink from a previous volume is replaced",
1013+
desc: "valid symlink from a previous volume is kept",
10141014
setup: func(t *testing.T, krb5Dir, symlinkPath string) {
10151015
previous := krb5Dir + "previous-volume-cache"
10161016
if err := os.WriteFile(previous, []byte("old"), 0600); err != nil {
@@ -1051,13 +1051,15 @@ func TestEnsureKerberosCache(t *testing.T) {
10511051
if err != nil {
10521052
t.Fatalf("readlink %s: %v", symlinkPath, err)
10531053
}
1054-
expected := getKerberosFilePath(krb5Dir, volumeKerberosCacheName(volumeID))
1055-
if target != expected {
1056-
t.Errorf("symlink target = %s, want %s", target, expected)
1057-
}
1054+
// Symlink must point to a valid, readable cache file.
10581055
if _, err := os.Stat(target); err != nil {
10591056
t.Fatalf("symlink target %s must be valid: %v", target, err)
10601057
}
1058+
// Volume-specific cache file must also exist.
1059+
volCachePath := getKerberosFilePath(krb5Dir, volumeKerberosCacheName(volumeID))
1060+
if _, err := os.Stat(volCachePath); err != nil {
1061+
t.Fatalf("volume cache file %s must exist: %v", volCachePath, err)
1062+
}
10611063
})
10621064
}
10631065
}
@@ -1092,8 +1094,7 @@ func TestEnsureKerberosCacheConcurrent(t *testing.T) {
10921094
defer wg.Done()
10931095
volumeID := fmt.Sprintf("vol-%d", idx)
10941096
_, err := d.ensureKerberosCache(krb5Dir, krb5Prefix, volumeID, mountFlags, secrets)
1095-
// Aborted is acceptable — it's how TryAcquire signals contention.
1096-
if err != nil && status.Code(err) != codes.Aborted {
1097+
if err != nil {
10971098
errCh <- fmt.Errorf("goroutine %d: unexpected error: %v", idx, err)
10981099
}
10991100
}(i)

0 commit comments

Comments
 (0)