Skip to content

Commit 7c0b3e5

Browse files
committed
fix(test): make test_schedstat_fd_closed_on_thread_exit not flaky
The previous assertion checked fcntl(fd, F_GETFD) != -1 after the spawned thread exited. Linux readily recycles fd numbers, and other tests in the same suite open many files concurrently (including their own per-thread schedstat fds via tokio worker poll callbacks). When a sibling test opened a file in the small window after our spawned thread closed its schedstat fd, the fd number got reused and the check incorrectly reported a leak. Capture the path the fd points to via /proc/self/fd/<fd> inside the spawned thread, and after join verify that the fd is either closed or now points at a different file. Each thread opens schedstat for its own tid, so /proc/self/task/<spawned_tid>/schedstat is unique to the spawned thread's open and no concurrent test can collide on it. Verified by running the full cargo test -p dial9-tokio-telemetry --lib suite 13 times in a row with zero failures (previously failed roughly 3/10 runs), and confirmed the new test still catches a real leak by temporarily mem::forget'ing the OwnedFd in the production path.
1 parent 5433e82 commit 7c0b3e5

1 file changed

Lines changed: 39 additions & 9 deletions

File tree

dial9-tokio-telemetry/src/telemetry/events.rs

Lines changed: 39 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -609,19 +609,49 @@ mod tests {
609609
#[test]
610610
#[cfg(any(target_os = "linux", target_os = "android"))]
611611
fn test_schedstat_fd_closed_on_thread_exit() {
612-
// fcntl(fd, F_GETFD) returns -1 with errno=EBADF for closed fds.
613-
// SAFETY: F_GETFD with a raw fd is side-effect free.
614-
fn fd_is_open(fd: std::os::fd::RawFd) -> bool {
615-
unsafe { libc::fcntl(fd, libc::F_GETFD) != -1 }
612+
// We cannot just check `fcntl(fd, F_GETFD) != -1` after the thread
613+
// exits: the kernel readily recycles fd numbers, and other tests in the
614+
// suite open many files (including their own per-thread schedstat fds)
615+
// concurrently. A closed fd may immediately be reused by an unrelated
616+
// open in another thread, making a naive open-check flaky.
617+
//
618+
// Instead, record the path the fd points at (via /proc/self/fd/<fd>)
619+
// *inside* the spawned thread, and after join check whether the fd is
620+
// either closed or now points at a different file. Each thread opens
621+
// schedstat for its own tid, so the captured path
622+
// `/proc/self/task/<spawned_tid>/schedstat` is unique to the spawned
623+
// thread's open: no other thread will ever open that exact path. If the
624+
// fd is still open and still points at it, the OwnedFd was leaked.
625+
626+
fn fd_target(fd: std::os::fd::RawFd) -> Option<std::path::PathBuf> {
627+
std::fs::read_link(format!("/proc/self/fd/{fd}")).ok()
616628
}
617629

618-
let fd = std::thread::spawn(|| SchedStat::read_current().unwrap().fd)
619-
.join()
620-
.unwrap();
630+
let (fd, opened_path) = std::thread::spawn(|| {
631+
let fd = SchedStat::read_current().unwrap().fd;
632+
let path = fd_target(fd).expect("readlink /proc/self/fd/<fd> in live thread");
633+
(fd, path)
634+
})
635+
.join()
636+
.unwrap();
621637

638+
// Sanity-check: confirm we actually captured a schedstat path so the
639+
// assertion below is meaningful (rather than passing because we read
640+
// the wrong link).
622641
assert!(
623-
!fd_is_open(fd),
624-
"schedstat fd {fd} leaked after thread exit"
642+
opened_path.to_string_lossy().ends_with("/schedstat"),
643+
"expected schedstat path, got {opened_path:?}"
625644
);
645+
646+
match fd_target(fd) {
647+
None => { /* fd is closed - good. */ }
648+
Some(now) if now != opened_path => {
649+
// fd was closed and the slot was reused for an unrelated open
650+
// in another thread. That still means our OwnedFd was dropped.
651+
}
652+
Some(now) => panic!(
653+
"schedstat fd {fd} leaked after thread exit (still points at {now:?})"
654+
),
655+
}
626656
}
627657
}

0 commit comments

Comments
 (0)