Skip to content

Commit a00bf8c

Browse files
erneestocclaude
andauthored
Fix CAS inode corruption: directory cache cleanup must not chmod files (#2347)
Port of TraceMachina/nativelink PR #2243 commit a47774d. Root cause: when DirectoryCache evicts an entry, the cleanup path calls `set_readwrite_recursive` on the cached tree before `remove_dir_all`. That helper chmods every entry — including files — to 0o755/0o644. Files in a cached entry are hardlinked into in-flight action workspaces (via `hardlink_directory_tree` in `get_or_create`) and ultimately share an inode with the underlying `FilesystemStore` CAS blob (via `fs::hard_link` in `download_to_directory`). Chmoding the cached-side file therefore silently mutates the shared inode's mode for every other in-flight action holding a hardlink to the same blob. Production symptom: EACCES on exec for `cc_wrapper.sh`. The CAS mode of 0o555 (r-xr-xr-x) gets clobbered to 0o644 (rw-r--r--), dropping the +x bit while an unrelated action is mid-exec. Fix: introduce `set_dir_writable_recursive` which only chmods directories, never files. On unix, write permission on the parent directory is sufficient to unlink files inside; the files' own modes are irrelevant for unlinking. Switch the eviction cleanup path in `DirectoryCache::evict_lru` to the new helper. Empirically verified: a regression test that hardlinks a 0o555 file into a cached tree and runs the cleanup helper FAILS on pre-fix code (file mode mutated to 0o644) and PASSES on post-fix code. Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent e6def51 commit a00bf8c

2 files changed

Lines changed: 182 additions & 3 deletions

File tree

nativelink-util/src/fs_util.rs

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -286,6 +286,28 @@ pub async fn set_readwrite_recursive(dir: &Path) -> Result<(), Error> {
286286
set_perms_recursive_impl(dir.to_path_buf(), set_readwrite_one_path).await
287287
}
288288

289+
/// Sets only the **directories** in a tree to writable for the current user,
290+
/// leaving files untouched. This is the safe variant for cleanup paths that
291+
/// need to delete a tree containing CAS-hardlinked files.
292+
///
293+
/// On unix, write permission on the parent directory is sufficient to unlink
294+
/// files inside it — the files' own modes are irrelevant for unlinking. Chmoding
295+
/// a CAS-hardlinked file would silently mutate the shared inode's permissions
296+
/// for every other in-flight action that has hardlinked the same blob, leading
297+
/// to EACCES on exec or EPERM on open in unrelated actions.
298+
///
299+
/// # Arguments
300+
/// * `dir` - Directory whose directories should be made writable
301+
///
302+
/// # Platform Notes
303+
/// - Unix: Sets directory permissions to 0o755 (rwxr-xr-x); files are NOT touched.
304+
/// - Windows: Clears `FILE_ATTRIBUTE_READONLY` on directories only; files are NOT touched.
305+
pub async fn set_dir_writable_recursive(dir: &Path) -> Result<(), Error> {
306+
error_if!(!dir.exists(), "Directory does not exist: {}", dir.display());
307+
308+
set_perms_recursive_impl(dir.to_path_buf(), set_dir_writable_one_path).await
309+
}
310+
289311
fn set_readonly_one_path(
290312
path: PathBuf,
291313
metadata: Metadata,
@@ -356,6 +378,43 @@ fn set_readwrite_one_path(
356378
})
357379
}
358380

381+
fn set_dir_writable_one_path(
382+
path: PathBuf,
383+
metadata: Metadata,
384+
) -> Pin<Box<dyn Future<Output = Result<(), Error>> + Send>> {
385+
Box::pin(async move {
386+
// Files are intentionally skipped here. They may be hardlinked into
387+
// the CAS (FilesystemStore); chmoding them would corrupt the shared
388+
// inode's mode for every other in-flight action.
389+
if !metadata.is_dir() {
390+
return Ok(());
391+
}
392+
393+
#[cfg(unix)]
394+
{
395+
use std::os::unix::fs::PermissionsExt;
396+
let mut perms = metadata.permissions();
397+
perms.set_mode(0o755);
398+
399+
fs::set_permissions(&path, perms)
400+
.await
401+
.err_tip(|| format!("Failed to set permissions for: {}", path.display()))?;
402+
}
403+
404+
#[cfg(windows)]
405+
{
406+
let mut perms = metadata.permissions();
407+
perms.set_readonly(false);
408+
409+
fs::set_permissions(&path, perms)
410+
.await
411+
.err_tip(|| format!("Failed to set permissions for: {}", path.display()))?;
412+
}
413+
414+
Ok(())
415+
})
416+
}
417+
359418
fn set_perms_recursive_impl<'a, F>(
360419
path: PathBuf,
361420
perms_fn: F,

nativelink-worker/src/directory_cache.rs

Lines changed: 123 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ use nativelink_store::ac_utils::get_and_decode_digest;
2828
use nativelink_store::cas_utils::is_zero_digest;
2929
use nativelink_util::common::DigestInfo;
3030
use nativelink_util::fs_util::{
31-
CloneMethod, hardlink_directory_tree, set_readonly_recursive, set_readwrite_recursive,
31+
CloneMethod, hardlink_directory_tree, set_dir_writable_recursive, set_readonly_recursive,
3232
};
3333
use nativelink_util::store_trait::{Store, StoreKey, StoreLike};
3434
use tokio::fs;
@@ -448,12 +448,20 @@ impl DirectoryCache {
448448
{
449449
debug!(?digest, size = metadata.size, "Evicting cached directory");
450450

451-
if let Err(e) = set_readwrite_recursive(&metadata.path).await {
451+
// CRITICAL: only chmod directories writable, never files. Cached
452+
// files share an inode with FilesystemStore CAS entries via
453+
// hardlink (see `download_to_directory` in running_actions_manager).
454+
// Calling `set_readwrite_recursive` here would chmod those files
455+
// to 0o644, mutating the CAS inode's mode for every other in-flight
456+
// action that has hardlinked the same blob and causing EACCES on
457+
// exec (e.g. cc_wrapper.sh) or EPERM on open. Directory write
458+
// permission is sufficient on unix to unlink files inside.
459+
if let Err(e) = set_dir_writable_recursive(&metadata.path).await {
452460
warn!(
453461
?digest,
454462
path = ?metadata.path,
455463
error = ?e,
456-
"Unable to mark evicted directory as read/write, will probably fail to remove"
464+
"Unable to mark evicted directory as writable, will probably fail to remove"
457465
);
458466
}
459467

@@ -680,4 +688,116 @@ mod tests {
680688

681689
Ok(())
682690
}
691+
692+
/// Regression test for CAS inode corruption during directory cache
693+
/// eviction.
694+
///
695+
/// Background: a cached directory's files share an inode (via hardlink)
696+
/// with both the `FilesystemStore` CAS entry and every action workspace
697+
/// that has consumed this cached directory. The cleanup path that runs
698+
/// before `fs::remove_dir_all` used to call `set_readwrite_recursive`,
699+
/// which chmods every file in the tree to 0o644 — silently mutating the
700+
/// shared inode's mode for every in-flight action holding a hardlink to
701+
/// the same blob. In production this surfaced as EACCES on exec for
702+
/// `cc_wrapper.sh` (whose CAS mode is 0o555, but eviction turned it into
703+
/// 0o644, dropping the +x bit).
704+
///
705+
/// This test models the real scenario: a "cached" directory tree whose
706+
/// files are hardlinked to a still-active "action workspace" file. The
707+
/// eviction cleanup runs on the cached tree, and we assert the active
708+
/// workspace file's mode is untouched. Before the fix this test fails
709+
/// because `set_readwrite_recursive` chmods the cached-side file, and
710+
/// the same inode underlies the workspace file.
711+
#[cfg(unix)]
712+
#[nativelink_test]
713+
async fn test_eviction_cleanup_preserves_hardlinked_file_mode() -> Result<(), Error> {
714+
use std::os::unix::fs::{MetadataExt, PermissionsExt};
715+
716+
use nativelink_util::fs_util::set_dir_writable_recursive;
717+
718+
let temp_dir = TempDir::new().unwrap();
719+
720+
// Build a "cached" directory tree mimicking what DirectoryCache
721+
// produces: a top-level dir with a nested subdir and an executable
722+
// file inside. Mode 0o555 matches the CAS mode of an executable like
723+
// `cc_wrapper.sh`.
724+
let cache_entry_dir = temp_dir.path().join("cache_entry");
725+
let nested_dir = cache_entry_dir.join("nested");
726+
fs::create_dir_all(&nested_dir).await.unwrap();
727+
let cached_file = nested_dir.join("cc_wrapper.sh");
728+
fs::write(&cached_file, b"#!/bin/sh\necho hi\n")
729+
.await
730+
.unwrap();
731+
fs::set_permissions(&cached_file, std::fs::Permissions::from_mode(0o555))
732+
.await
733+
.unwrap();
734+
735+
// Mark the cached tree read-only the way DirectoryCache does after
736+
// construction (set_readonly_recursive). This drops the file to 0o444
737+
// and the directories to 0o555.
738+
set_readonly_recursive(&cache_entry_dir).await?;
739+
740+
// Simulate an in-flight action workspace that has hardlinked the
741+
// cached file. This is what `hardlink_directory_tree` does in
742+
// `get_or_create` after the cached tree is built and locked down.
743+
let action_workspace = temp_dir.path().join("action_workspace");
744+
fs::create_dir_all(&action_workspace).await.unwrap();
745+
let workspace_file = action_workspace.join("cc_wrapper.sh");
746+
fs::hard_link(&cached_file, &workspace_file).await.unwrap();
747+
748+
// Sanity check: cached file and workspace file share the same inode.
749+
let cached_ino = fs::metadata(&cached_file).await.unwrap().ino();
750+
let workspace_ino = fs::metadata(&workspace_file).await.unwrap().ino();
751+
assert_eq!(
752+
cached_ino, workspace_ino,
753+
"workspace file must share inode with cached file (hardlinked)",
754+
);
755+
let workspace_mode_before = fs::metadata(&workspace_file)
756+
.await
757+
.unwrap()
758+
.permissions()
759+
.mode()
760+
& 0o777;
761+
assert_eq!(workspace_mode_before, 0o444);
762+
763+
// Run the cleanup that `evict_lru` runs before removing the tree.
764+
// After the fix this only chmods directories; before the fix this
765+
// chmoded every file to 0o644, mutating the shared inode.
766+
set_dir_writable_recursive(&cache_entry_dir).await?;
767+
768+
// Critical assertion: the action workspace file's mode (i.e. the
769+
// shared inode's mode) MUST be unchanged. If this fails, the cleanup
770+
// path corrupted the inode for an in-flight action — that is the
771+
// bug we are guarding against.
772+
let workspace_mode_after = fs::metadata(&workspace_file)
773+
.await
774+
.unwrap()
775+
.permissions()
776+
.mode()
777+
& 0o777;
778+
assert_eq!(
779+
workspace_mode_after, workspace_mode_before,
780+
"eviction cleanup mutated the inode mode of an active workspace \
781+
file (was 0o{workspace_mode_before:o}, now 0o{workspace_mode_after:o}); \
782+
this is the CAS inode corruption bug",
783+
);
784+
785+
// We should still be able to remove the cached tree. This proves
786+
// directory writability alone is sufficient to unlink files on unix.
787+
fs::remove_dir_all(&cache_entry_dir).await.unwrap();
788+
assert!(!cache_entry_dir.exists());
789+
790+
// The workspace's file is still intact and the mode survives even
791+
// after the cached tree is gone.
792+
assert!(workspace_file.exists());
793+
let workspace_mode_after_remove = fs::metadata(&workspace_file)
794+
.await
795+
.unwrap()
796+
.permissions()
797+
.mode()
798+
& 0o777;
799+
assert_eq!(workspace_mode_after_remove, workspace_mode_before);
800+
801+
Ok(())
802+
}
683803
}

0 commit comments

Comments
 (0)