From 0fa32fbe88a6f450776b79667c5a6628b00c9747 Mon Sep 17 00:00:00 2001
From: Michael Cuevas
Date: Thu, 20 Feb 2025 12:34:50 -0800
Subject: [PATCH] fs_util: allow ESTALE to be retried

Summary:
# This diff

Adds ESTALE (StaleNetworkFileHandle) to the list of IO error types that should be retried; as with the existing timeout retry, this only takes effect on macOS. Also adds a test to ensure this type of error is retried on macOS.

# Context

ESTALE is another frequent (transient) NFS error that can be retried. It can be returned by an NFS server at any point in time. The error indicates that the file handle that was used to make a request is no longer valid, and that the request should be retried with a new handle.

See the NFSv3 protocol specification (RFC 1813):

>Servers can revoke the access provided by a file handle at any time. If the file handle passed in a call refers to a file system object that no longer exists on the server or access for that file handle has been revoked, the error, NFS3ERR_STALE, should be returned.

>NFS3ERR_STALE: Invalid file handle. The file handle given in the arguments was invalid. The file referred to by that file handle no longer exists or access to it has been revoked.

>A file handle may or may not become stale on a rename. However, server implementors are strongly encouraged to attempt to keep file handles from becoming stale in this fashion.

To respect the NFS spec, we should assume that the EdenFS/NFS server can and will return ESTALE on occasion, and Buck should retry IO requests when ESTALE is returned.

Reviewed By: JakobDegen Differential Revision: D69892541 fbshipit-source-id: 22e8cf395c74deb962eeba9ff60f1a2262fc7b1d --- app/buck2_core/src/fs/fs_util.rs | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/app/buck2_core/src/fs/fs_util.rs b/app/buck2_core/src/fs/fs_util.rs index b0ff1fb553617..b549672e629e0 100644 --- a/app/buck2_core/src/fs/fs_util.rs +++ b/app/buck2_core/src/fs/fs_util.rs @@ -55,7 +55,9 @@ pub struct IoError { } fn is_retryable(err: &io::Error) -> bool { - cfg!(target_os = "macos") && err.kind() == io::ErrorKind::TimedOut + cfg!(target_os = "macos") + && (err.kind() == io::ErrorKind::TimedOut + || err.kind() == io::ErrorKind::StaleNetworkFileHandle) } static MAX_IO_ATTEMPTS: u32 = 3; @@ -1391,6 +1393,14 @@ mod tests { get_test_path("test_timeout", &tempdir), (ErrorKind::TimedOut, expected_attempts, should_succeed), ); + test_cases.insert( + get_test_path("test_stale", &tempdir), + ( + ErrorKind::StaleNetworkFileHandle, + expected_attempts, + should_succeed, + ), + ); // These test cases should behave the same on all platforms test_cases.insert(