@@ -603,36 +603,73 @@ func (d *Driver) ensureKerberosCache(krb5CacheDirectory, krb5Prefix, volumeID st
603603 if err != nil {
604604 return false , err
605605 }
606-
607606 volumeIDCacheFileName := volumeKerberosCacheName (volumeID )
608- volumeIDCacheAbsolutePath := getKerberosFilePath (krb5CacheDirectory , volumeIDCacheFileName )
609607
608+ volumeIDCacheAbsolutePath := getKerberosFilePath (krb5CacheDirectory , volumeIDCacheFileName )
610609 if err := os .WriteFile (volumeIDCacheAbsolutePath , content , os .FileMode (0700 )); err != nil {
611610 return false , status .Error (codes .Internal , fmt .Sprintf ("Couldn't write kerberos cache to file %s: %v" , volumeIDCacheAbsolutePath , err ))
612611 }
613612 if err := os .Chown (volumeIDCacheAbsolutePath , credUID , credUID ); err != nil {
614613 return false , status .Error (codes .Internal , fmt .Sprintf ("Couldn't chown kerberos cache %s to user %d: %v" , volumeIDCacheAbsolutePath , credUID , err ))
615614 }
616615
617- cruidLockKey := fmt .Sprintf ("cruid-%d" , credUID )
618- if acquired := d .volumeLocks .TryAcquire (cruidLockKey ); ! acquired {
619- return false , status .Errorf (codes .Aborted , "another kerberos cache operation for CRUID %d is already in progress" , credUID )
616+ // Check if a valid symlink already exists — if so, leave it alone for concurrent mounts.
617+ // Use Lstat so a dangling symlink is still detected as present.
618+ shouldCreateSymlink := true
619+ fileInfo , statErr := os .Lstat (krb5CacheFileName )
620+ if statErr == nil {
621+ if fileInfo .Mode ()& os .ModeSymlink != 0 {
622+ symlinkTarget , readErr := os .Readlink (krb5CacheFileName )
623+ if readErr != nil {
624+ return false , status .Error (codes .Internal , fmt .Sprintf ("Couldn't read symlink %s: %v" , krb5CacheFileName , readErr ))
625+ }
626+ // Accept any existing symlink that points to a valid, readable cache file.
627+ // This avoids racing with concurrent mounts for the same cruid but different volumes.
628+ if _ , targetErr := os .Stat (krb5CacheFileName ); targetErr == nil {
629+ klog .V (2 ).Infof ("Valid symlink already exists [%s -> %s], leaving it alone for concurrent mount." , krb5CacheFileName , symlinkTarget )
630+ shouldCreateSymlink = false
631+ }
632+ }
633+ if shouldCreateSymlink {
634+ if err := os .Remove (krb5CacheFileName ); err != nil && ! os .IsNotExist (err ) {
635+ return false , status .Error (codes .Internal , fmt .Sprintf ("Couldn't remove existing kerberos cache path %s: %v" , krb5CacheFileName , err ))
636+ }
637+ }
638+ } else if ! os .IsNotExist (statErr ) {
639+ return false , status .Error (codes .Internal , fmt .Sprintf ("Couldn't inspect kerberos cache path %s: %v" , krb5CacheFileName , statErr ))
620640 }
621- defer d .volumeLocks .Release (cruidLockKey )
622641
623- // Use Lstat so a dangling symlink is still detected as present — os.Stat
624- // would follow the link and report ENOENT, causing the subsequent Symlink
625- // to fail with "file exists".
626- if _ , err := os .Lstat (krb5CacheFileName ); os .IsNotExist (err ) {
627- klog .V (2 ).Infof ("Symlink file doesn't exist, it will be created [%s]" , krb5CacheFileName )
628- } else {
629- if err := os .Remove (krb5CacheFileName ); err != nil {
630- klog .Warningf ("Couldn't delete the file [%s]: %v" , krb5CacheFileName , err )
642+ if shouldCreateSymlink {
643+ // Use atomic temp-symlink + rename to avoid a window where krb5CacheFileName is missing.
644+ created := false
645+ var tempSymlinkName string
646+ for attempt := 0 ; attempt < 5 ; attempt ++ {
647+ tempSymlinkName = filepath .Join (filepath .Dir (krb5CacheFileName ), fmt .Sprintf (".%s.tmp.%d" , filepath .Base (krb5CacheFileName ), time .Now ().UnixNano ()))
648+ if err := os .Symlink (volumeIDCacheAbsolutePath , tempSymlinkName ); err != nil {
649+ if os .IsExist (err ) {
650+ klog .V (2 ).Infof ("Temporary symlink path %s already exists, retrying." , tempSymlinkName )
651+ continue
652+ }
653+ return false , status .Error (codes .Internal , fmt .Sprintf ("Couldn't create temporary symlink %s -> %s for credUID %d: %v" , tempSymlinkName , volumeIDCacheAbsolutePath , credUID , err ))
654+ }
655+ created = true
656+ break
657+ }
658+ if ! created {
659+ return false , status .Error (codes .Internal , fmt .Sprintf ("Couldn't create temporary symlink for kerberos cache %s after retries" , krb5CacheFileName ))
631660 }
632- }
633661
634- if err := os .Symlink (volumeIDCacheAbsolutePath , krb5CacheFileName ); err != nil {
635- return false , status .Error (codes .Internal , fmt .Sprintf ("Couldn't create symlink to a cache file %s->%s for user %d: %v" , krb5CacheFileName , volumeIDCacheFileName , credUID , err ))
662+ if err := os .Rename (tempSymlinkName , krb5CacheFileName ); err != nil {
663+ if removeErr := os .Remove (krb5CacheFileName ); removeErr != nil && ! os .IsNotExist (removeErr ) {
664+ os .Remove (tempSymlinkName )
665+ return false , status .Error (codes .Internal , fmt .Sprintf ("Couldn't replace kerberos symlink %s: rename failed: %v, remove failed: %v" , krb5CacheFileName , err , removeErr ))
666+ }
667+ if retryErr := os .Rename (tempSymlinkName , krb5CacheFileName ); retryErr != nil {
668+ os .Remove (tempSymlinkName )
669+ return false , status .Error (codes .Internal , fmt .Sprintf ("Couldn't atomically update kerberos symlink %s -> %s for credUID %d: %v" , krb5CacheFileName , volumeIDCacheAbsolutePath , credUID , retryErr ))
670+ }
671+ }
672+ klog .V (2 ).Infof ("Updated kerberos symlink %s -> %s" , krb5CacheFileName , volumeIDCacheAbsolutePath )
636673 }
637674
638675 return true , nil
0 commit comments