Skip to content

Commit 757de63

Browse files
committed
fix: use Lstat to detect dangling krb5cc symlinks before recreation
os.Stat follows the symlink and reports ENOENT when the target is gone, causing ensureKerberosCache to skip the Remove path and then fail with "file exists" in os.Symlink. Switch to os.Lstat so the symlink itself is inspected regardless of target validity. Add TestEnsureKerberosCacheConcurrent: runs parallel goroutines against ensureKerberosCache with a shared CRUID and asserts the final symlink points at a valid volume-specific cache file, locking in the per-CRUID serialization contract.
1 parent 4f7030b commit 757de63

2 files changed

Lines changed: 57 additions & 1 deletion

File tree

pkg/smb/nodeserver.go

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -620,7 +620,10 @@ func (d *Driver) ensureKerberosCache(krb5CacheDirectory, krb5Prefix, volumeID st
620620
}
621621
defer d.volumeLocks.Release(cruidLockKey)
622622

623-
if _, err := os.Stat(krb5CacheFileName); os.IsNotExist(err) {
623+
// Use Lstat so a dangling symlink is still detected as present — os.Stat
624+
// would follow the link and report ENOENT, causing the subsequent Symlink
625+
// to fail with "file exists".
626+
if _, err := os.Lstat(krb5CacheFileName); os.IsNotExist(err) {
624627
klog.V(2).Infof("Symlink file doesn't exist, it will be created [%s]", krb5CacheFileName)
625628
} else {
626629
if err := os.Remove(krb5CacheFileName); err != nil {

pkg/smb/nodeserver_test.go

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ import (
2727
"runtime"
2828
"strconv"
2929
"strings"
30+
"sync"
3031
"syscall"
3132
"testing"
3233

@@ -981,6 +982,58 @@ func TestGetKerberosCache(t *testing.T) {
981982

982983
}
983984

985+
// Regression test for the race where parallel kubelet mounts sharing a CRUID
986+
// collided on the shared krb5cc_<uid> symlink. Runs many goroutines against the
987+
// per-CRUID lock and asserts no caller observes a "file exists" from Symlink
988+
// and that the final symlink points at a valid volume-specific cache file.
989+
func TestEnsureKerberosCacheConcurrent(t *testing.T) {
990+
if runtime.GOOS == "windows" {
991+
t.Skip("ensureKerberosCache is only used on Linux")
992+
}
993+
994+
krb5Dir := t.TempDir() + "/"
995+
d := NewFakeDriver()
996+
997+
credUID := os.Getuid()
998+
krb5Prefix := "krb5cc_"
999+
ticket := []byte{'G', 'O', 'L', 'A', 'N', 'G'}
1000+
base64Ticket := base64.StdEncoding.EncodeToString(ticket)
1001+
secrets := map[string]string{
1002+
fmt.Sprintf("%s%d", krb5Prefix, credUID): base64Ticket,
1003+
}
1004+
mountFlags := []string{"sec=krb5", fmt.Sprintf("cruid=%d", credUID)}
1005+
1006+
const nGoroutines = 8
1007+
var wg sync.WaitGroup
1008+
errCh := make(chan error, nGoroutines)
1009+
for i := 0; i < nGoroutines; i++ {
1010+
wg.Add(1)
1011+
go func(idx int) {
1012+
defer wg.Done()
1013+
volumeID := fmt.Sprintf("vol-%d", idx)
1014+
_, err := d.ensureKerberosCache(krb5Dir, krb5Prefix, volumeID, mountFlags, secrets)
1015+
// Aborted is acceptable — it's how TryAcquire signals contention.
1016+
if err != nil && status.Code(err) != codes.Aborted {
1017+
errCh <- fmt.Errorf("goroutine %d: unexpected error: %v", idx, err)
1018+
}
1019+
}(i)
1020+
}
1021+
wg.Wait()
1022+
close(errCh)
1023+
for e := range errCh {
1024+
t.Error(e)
1025+
}
1026+
1027+
symlinkPath := getKerberosFilePath(krb5Dir, getKrb5CcacheName(krb5Prefix, credUID))
1028+
target, err := os.Readlink(symlinkPath)
1029+
if err != nil {
1030+
t.Fatalf("final symlink missing at %s: %v", symlinkPath, err)
1031+
}
1032+
if _, err := os.Stat(target); err != nil {
1033+
t.Fatalf("symlink target %s does not exist: %v", target, err)
1034+
}
1035+
}
1036+
9841037
func TestNodePublishVolumeIdempotentMount(t *testing.T) {
9851038
if runtime.GOOS == "windows" || os.Getuid() != 0 {
9861039
return

0 commit comments

Comments
 (0)