Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
65 changes: 61 additions & 4 deletions nativelink-store/src/filesystem_store.rs
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,22 @@ pub struct SharedContext {
content_path: String,
}

impl SharedContext {
/// Test-only constructor that lets external crates fabricate a context
/// matching a FilesystemStore's configured paths so they can build
/// synthetic `EncodedFilePath` / `FileEntryImpl` instances for race
/// simulations. Production code receives this only via `FilesystemStore`
/// construction.
#[doc(hidden)]
pub const fn new_for_test(temp_path: String, content_path: String) -> Self {
Self {
active_drop_spawns: AtomicU64::new(0),
temp_path,
content_path,
}
}
}

#[derive(Eq, PartialEq, Debug)]
enum PathType {
Content,
Expand All @@ -99,6 +115,23 @@ impl EncodedFilePath {
fn get_file_path(&self) -> Cow<'_, OsStr> {
get_file_path_raw(&self.path_type, self.shared_context.as_ref(), &self.key)
}

/// Test-only constructor. Used by races simulation in nativelink-worker
/// to fabricate a `FileEntryImpl` pointing at a known-on-disk file under
/// the store's content path. Production code must not call this; the
/// store constructs its own `EncodedFilePath` values through the
/// `make_and_open_file` -> `emplace_file` flow.
#[doc(hidden)]
pub const fn new_content_for_test(
shared_context: Arc<SharedContext>,
key: StoreKey<'static>,
) -> Self {
Self {
shared_context,
path_type: PathType::Content,
key,
}
}
}

#[inline]
Expand Down Expand Up @@ -444,7 +477,13 @@ pub fn key_from_file(file_name: &str, file_type: FileType) -> Result<StoreKey<'_
/// `add_files_to_cache`.
const SIMULTANEOUS_METADATA_READS: usize = 200;

type FsEvictingMap<'a, Fe> =
/// The concrete `EvictingMap` instantiation used by `FilesystemStore`. This
/// alias is `pub` solely so tests in other crates (e.g. nativelink-worker's
/// `download_to_directory_retries_when_entry_evicted_between_lookup_and_hardlink`)
/// can name the type returned by `FilesystemStore::evicting_map_for_test`.
/// Production code should treat this as an implementation detail.
#[doc(hidden)]
pub type FsEvictingMap<'a, Fe> =
EvictingMap<StoreKeyBorrow, StoreKey<'a>, Arc<Fe>, SystemTime, RemoveItemCallbackHolder>;

async fn add_files_to_cache<Fe: FileEntry>(
Expand Down Expand Up @@ -748,6 +787,16 @@ impl<Fe: FileEntry> FilesystemStore<Fe> {
self.weak_self.upgrade()
}

/// Test-only accessor for the internal evicting map. Used by tests that
/// need to deterministically simulate concurrent insert/displace races
/// (e.g. the loser-entry race exercised by
/// `download_to_directory_retries_when_entry_evicted_between_lookup_and_hardlink`
/// in nativelink-worker). Production code must not depend on this.
#[doc(hidden)]
pub const fn evicting_map_for_test(&self) -> &Arc<FsEvictingMap<'static, Fe>> {
&self.evicting_map
}

pub async fn get_file_entry_for_digest(&self, digest: &DigestInfo) -> Result<Arc<Fe>, Error> {
if is_zero_digest(digest) {
return Ok(Arc::new(Fe::create(
Expand Down Expand Up @@ -845,9 +894,17 @@ impl<Fe: FileEntry> FilesystemStore<Fe> {
// The insert might have resulted in an eviction/unref so we need to check
// it still exists in there. But first, get the lock...
let mut encoded_file_path = entry.get_encoded_file_path().write().await;
// Then check it's still in there...
if evicting_map.get(&key).await.is_none() {
info!(%key, "Got eviction while emplacing, dropping");
// Check that OUR specific entry is still in the map. A concurrent
// write for the same key may have replaced our entry (calling
// unref which deletes our temp file). Checking just the key
// would pass if the replacement entry exists, but our temp file
// would already be deleted → ENOENT on rename.
let still_ours = match evicting_map.get(&key).await {
Some(map_entry) => Arc::ptr_eq(&map_entry, &entry),
None => false,
};
if !still_ours {
info!(%key, "Got eviction or replacement while emplacing, dropping");
return Ok(());
}

Expand Down
144 changes: 117 additions & 27 deletions nativelink-worker/src/running_actions_manager.rs
Original file line number Diff line number Diff line change
Expand Up @@ -157,34 +157,124 @@ pub fn download_to_directory<'a>(
file_slot.write_all(&[]).await?;
}
else {
let file_entry = filesystem_store
.get_file_entry_for_digest(&digest)
.await
.err_tip(|| "During hard link")?;
// TODO: add a test for #2051: deadlock with large number of files
let src_path = file_entry.get_file_path_locked(|src| async move { Ok(PathBuf::from(src)) }).await?;
fs::hard_link(&src_path, &dest)
.await
.map_err(|e| {
if e.code == Code::NotFound {
e.append(
format!(
"Could not make hardlink to {dest}, file was likely evicted from cache.\n\
This error often occurs when the filesystem store's max_bytes is too small for your workload.\n\
To fix this issue:\n\
1. Increase the 'max_bytes' value in your filesystem store configuration\n\
2. Example: Change 'max_bytes: 10000000000' to 'max_bytes: 50000000000' (or higher)\n\
3. The setting is typically found in your nativelink.json config under:\n\
stores -> [your_filesystem_store] -> filesystem -> eviction_policy -> max_bytes\n\
4. Restart NativeLink after making the change\n\n\
If this error persists after increasing max_bytes several times, please report at:\n\
https://github.com/TraceMachina/nativelink/issues\n\
Include your config file and both server and client logs to help us assist you."
))
} else {
e.append(format!("Could not make hardlink to {dest}"))
// Bounded retry around the hard_link to close the
// SECOND-PASS reader-side emplace race that the
// earlier read-lock fix (commit a2f4f4ab) did not
// cover.
//
// Race recap (companion fix to PR #2341):
//
// The first fix held the per-FileEntry read lock
// across hard_link so that, for the entry the
// reader has in hand, a writer's `unref()`
// (write lock on the same RwLock) cannot rename
// the file out from under the syscall. That
// correctly protects the WINNER entry mid-rename.
//
// It does NOT protect against the LOSER-entry
// case: the reader looks up the digest and gets
// `Arc<entry_A>`, then a concurrent writer
// `insert(entry_B)` for the SAME key displaces
// entry_A and triggers `unref(entry_A)`. The
// writer's unref takes entry_A's write lock; the
// reader's read lock request on entry_A may
// queue behind it. When the reader finally
// acquires the lock, entry_A's file has been
// moved to the temp path and the hard_link
// returns ENOENT. The CAS still has the digest
// — under entry_B in the map — so re-fetching
// `get_file_entry_for_digest` returns entry_B,
// whose file is on disk under the writer's
// read-lock-protected rename. A single retry
// resolves the race; we cap at
// `HARDLINK_MAX_RETRIES` so that genuine
// eviction-pressure ENOENT (no writer racing,
// the digest truly is gone) cannot spin.
//
// Re: TODO #2051 (deadlock with large number of
// files): the lock taken here is a per-FileEntry
// read lock, not a global lock. Multiple
// concurrent hard_link calls against DIFFERENT
// digests do not contend, and multiple readers of
// the SAME digest share the read lock. The only
// contention is reader-vs-writer on the same
// digest, which is exactly the contention we need
// for correctness. The outer concurrency cap on
// `download_to_directory` is governed by
// `fs::hard_link`'s open-file semaphore (see
// nativelink_util::fs), not by this RwLock.
// TODO(#2051): revisit if the file-handle
// semaphore proves insufficient under very large
// fan-outs.
const HARDLINK_MAX_RETRIES: u32 = 3;
// Small backoff between retries gives a racing
// writer's `emplace_file` background spawn time
// to finish renaming the temp file into
// `content_path/<digest>` before the next
// attempt's `get_file_entry_for_digest` returns
// the new entry. Without it, all retries can
// race the same write window microseconds apart
// and all observe ENOENT.
const HARDLINK_RETRY_BACKOFF: Duration =
Duration::from_millis(10);
let mut last_err: Option<Error> = None;
for attempt in 0..HARDLINK_MAX_RETRIES {
if attempt > 0 {
tokio::time::sleep(HARDLINK_RETRY_BACKOFF).await;
}
let file_entry = filesystem_store
.get_file_entry_for_digest(&digest)
.await
.err_tip(|| "During hard link")?;
let dest_for_attempt = dest.clone();
let result = file_entry
.get_file_path_locked(|src| {
let dest = dest_for_attempt.clone();
async move { fs::hard_link(src, &dest).await }
})
.await;
match result {
Ok(()) => {
last_err = None;
break;
}
})?;
Err(e)
if e.code == Code::NotFound
&& attempt + 1 < HARDLINK_MAX_RETRIES =>
{
// Reader saw a stale entry that was
// unref'd before we hard_link'd. The
// evicting map should now have the
// winning writer's entry — retry
// after a short backoff (loop
// continues to the next attempt).
last_err = Some(e);
}
Err(e) => {
last_err = Some(e);
break;
}
}
}
if let Some(e) = last_err {
return Err(if e.code == Code::NotFound {
e.append(format!(
"Could not make hardlink to {dest} after {HARDLINK_MAX_RETRIES} attempts, file was likely evicted from cache.\n\
This error often occurs when the filesystem store's max_bytes is too small for your workload.\n\
To fix this issue:\n\
1. Increase the 'max_bytes' value in your filesystem store configuration\n\
2. Example: Change 'max_bytes: 10000000000' to 'max_bytes: 50000000000' (or higher)\n\
3. The setting is typically found in your nativelink.json config under:\n\
stores -> [your_filesystem_store] -> filesystem -> eviction_policy -> max_bytes\n\
4. Restart NativeLink after making the change\n\n\
If this error persists after increasing max_bytes several times, please report at:\n\
https://github.com/TraceMachina/nativelink/issues\n\
Include your config file and both server and client logs to help us assist you."
))
} else {
e.append(format!("Could not make hardlink to {dest}"))
});
}
}
#[cfg(target_family = "unix")]
if let Some(unix_mode) = unix_mode {
Expand Down
Loading
Loading