From 3e7f21b6555150ebf1856171d75a33102c0acb51 Mon Sep 17 00:00:00 2001
From: andyzhangx
Date: Sun, 19 Apr 2026 02:20:43 +0000
Subject: [PATCH 1/2] fix: detect and remount stale NFS mounts in NodePublishVolume

When an NFS server restarts, existing mounts become stale. If a pod with
fsGroup is restarted, kubelet calls applyFSGroup which does lstat on the
mount path and fails with ESTALE before the CSI driver gets a chance to
remount.

Fix by checking for stale file handles when the target path is already
mounted. If detected, unmount the stale mount and proceed with a fresh mount.

Ref 927
---
 pkg/nfs/nodeserver.go | 27 ++++++++++++++++++++++++++-
 1 file changed, 26 insertions(+), 1 deletion(-)

diff --git a/pkg/nfs/nodeserver.go b/pkg/nfs/nodeserver.go
index 0079d3af5..9103190c6 100644
--- a/pkg/nfs/nodeserver.go
+++ b/pkg/nfs/nodeserver.go
@@ -18,10 +18,12 @@ package nfs
 
 import (
 	"context"
+	"errors"
 	"fmt"
 	"os"
 	"strconv"
 	"strings"
+	"syscall"
 	"time"
 
 	"github.com/container-storage-interface/spec/lib/go/csi"
@@ -131,7 +133,18 @@ func (ns *NodeServer) NodePublishVolume(_ context.Context, req *csi.NodePublishV
 		}
 	}
 	if !notMnt {
-		return &csi.NodePublishVolumeResponse{}, nil
+		// check if the existing mount is stale (e.g. after NFS server restart)
+		if _, err := os.Lstat(targetPath); err != nil && os.IsPermission(err) {
+			return &csi.NodePublishVolumeResponse{}, nil
+		} else if err != nil && isStaleFileHandle(err) {
+			klog.Warningf("NodePublishVolume: detected stale mount at %s, attempting remount", targetPath)
+			if unmountErr := ns.mounter.Unmount(targetPath); unmountErr != nil {
+				return nil, status.Errorf(codes.Internal, "failed to unmount stale mount %s: %v", targetPath, unmountErr)
+			}
+			// fall through to remount
+		} else {
+			return &csi.NodePublishVolumeResponse{}, nil
+		}
 	}
 
 	klog.V(2).Infof("NodePublishVolume: volumeID(%v) source(%s) targetPath(%s) mountflags(%v)", volumeID, source, targetPath, mountOptions)
@@ -315,3 +328,15 @@ func makeDir(pathname string) error {
 	}
 	return nil
 }
+
+// isStaleFileHandle checks if an error is caused by a stale NFS file handle (ESTALE)
+func isStaleFileHandle(err error) bool {
+	if err == nil {
+		return false
+	}
+	var errno syscall.Errno
+	if errors.As(err, &errno) {
+		return errno == syscall.ESTALE
+	}
+	return strings.Contains(err.Error(), "stale NFS file handle") || strings.Contains(err.Error(), "stale file handle")
+}

From 3ce23a84b5aee77bd48dcc111e82c5bf7e4d0ded Mon Sep 17 00:00:00 2001
From: andyzhangx
Date: Sun, 19 Apr 2026 05:06:21 +0000
Subject: [PATCH 2/2] test: add unit tests for stale NFS mount detection and remount

- Make os.Lstat injectable via lstatFunc package variable for testability
- Add NodePublishVolume test case covering ESTALE detection -> unmount -> remount
- Add TestIsStaleFileHandle with coverage for ESTALE errno, wrapped errors,
  string matching, and negative cases
---
 pkg/nfs/nodeserver.go      |  5 ++++-
 pkg/nfs/nodeserver_test.go | 41 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 45 insertions(+), 1 deletion(-)

diff --git a/pkg/nfs/nodeserver.go b/pkg/nfs/nodeserver.go
index 9103190c6..14d80a85c 100644
--- a/pkg/nfs/nodeserver.go
+++ b/pkg/nfs/nodeserver.go
@@ -36,6 +36,9 @@ import (
 
 const mountTimeoutInSec = 110
 
+// lstatFunc is used for testing to inject stale file handle errors
+var lstatFunc = os.Lstat
+
 // NodeServer driver
 type NodeServer struct {
 	Driver *Driver
@@ -134,7 +137,7 @@ func (ns *NodeServer) NodePublishVolume(_ context.Context, req *csi.NodePublishV
 	}
 	if !notMnt {
 		// check if the existing mount is stale (e.g. after NFS server restart)
-		if _, err := os.Lstat(targetPath); err != nil && os.IsPermission(err) {
+		if _, err := lstatFunc(targetPath); err != nil && os.IsPermission(err) {
 			return &csi.NodePublishVolumeResponse{}, nil
 		} else if err != nil && isStaleFileHandle(err) {
 			klog.Warningf("NodePublishVolume: detected stale mount at %s, attempting remount", targetPath)
diff --git a/pkg/nfs/nodeserver_test.go b/pkg/nfs/nodeserver_test.go
index 0ae5bd47c..33f23b5af 100644
--- a/pkg/nfs/nodeserver_test.go
+++ b/pkg/nfs/nodeserver_test.go
@@ -23,6 +23,7 @@ import (
 	"os"
 	"reflect"
 	"strings"
+	"syscall"
 	"testing"
 
 	"github.com/container-storage-interface/spec/lib/go/csi"
@@ -178,6 +179,24 @@ func TestNodePublishVolume(t *testing.T) {
 				Readonly: true},
 			expectedErr: status.Error(codes.InvalidArgument, "invalid mountPermissions 07ab"),
 		},
+		{
+			desc: "[Success] Stale mount detected and remounted",
+			setup: func() {
+				lstatFunc = func(name string) (os.FileInfo, error) {
+					return nil, syscall.ESTALE
+				}
+			},
+			req: &csi.NodePublishVolumeRequest{
+				VolumeContext:    params,
+				VolumeCapability: &csi.VolumeCapability{AccessMode: &volumeCap},
+				VolumeId:         "vol_1",
+				TargetPath:       alreadyMountedTarget,
+				Readonly:         true},
+			expectedErr: nil,
+			cleanup: func() {
+				lstatFunc = os.Lstat
+			},
+		},
 	}
 
 	// setup
@@ -372,3 +391,25 @@ func TestNodeGetVolumeStats(t *testing.T) {
 	err = os.RemoveAll(fakePath)
 	assert.NoError(t, err)
 }
+
+func TestIsStaleFileHandle(t *testing.T) {
+	tests := []struct {
+		desc     string
+		err      error
+		expected bool
+	}{
+		{"nil error", nil, false},
+		{"ESTALE errno", syscall.ESTALE, true},
+		{"other errno", syscall.ENOENT, false},
+		{"stale NFS file handle string", fmt.Errorf("stale NFS file handle"), true},
+		{"stale file handle string", fmt.Errorf("stale file handle"), true},
+		{"unrelated error", fmt.Errorf("something else"), false},
+		{"wrapped ESTALE", fmt.Errorf("wrap: %w", syscall.ESTALE), true},
+	}
+	for _, tc := range tests {
+		result := isStaleFileHandle(tc.err)
+		if result != tc.expected {
+			t.Errorf("isStaleFileHandle(%v): got %v, want %v", tc.desc, result, tc.expected)
+		}
+	}
+}