Skip to content

Commit 084506c

Browse files
authored
[rust/rqd] Fix memory bug on OOM killed frames (#2093)
When a frame is killed due to OOM, it is possible the that thread that collects stats before reporting back races the frame wrapping process and gather stats for the frame when some of its procs have died, leading to a incorrect reading of memory for the given frame. # How the bug manifests: For successful frames:** 1. Frame runs → processes accumulate memory → `refresh_procs` updates stats normally 2. Frame completes naturally → all processes exit cleanly together 3. Final stats are captured before the cache is cleared 4. **Memory reported correctly** **For killed frames (OOM):** 1. Frame detected using too much memory (e.g., 12GB actual usage) 2. `kill_session()` is called → child processes start dying 3. **Next `refresh_procs()` cycle happens** (this runs every report interval) 4. `session_processes.clear()` **wipes out all the cached process data** including the high memory readings 5. When rebuilding cache, zombie/dying processes are skipped 6. **Only the session leader remains** (in zombie state or about to become one) 7. `collect_proc_stats()` now reads **only the session leader's memory** (typically very small, just the shell wrapper) 8. **Massively underreported memory** (e.g., reports 1GB instead of 12GB)
1 parent 8597e82 commit 084506c

File tree

2 files changed

+27
-0
lines changed

2 files changed

+27
-0
lines changed

rust/crates/rqd/src/frame/running_frame.rs

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ use std::{
1111
fmt::Display,
1212
path::Path,
1313
process::ExitStatus,
14+
sync::atomic::{AtomicBool, Ordering},
1415
sync::{Arc, RwLock},
1516
};
1617
use std::{process::Stdio, thread};
@@ -59,6 +60,9 @@ pub struct RunningFrame {
5960
pub entrypoint_file_path: String,
6061
state: RwLock<FrameState>,
6162
should_remove_from_cache: RwLock<bool>,
63+
#[serde(skip_serializing)]
64+
#[serde(skip_deserializing)]
65+
stats_frozen: AtomicBool,
6266
}
6367

6468
#[derive(Serialize, Deserialize, Debug)]
@@ -177,6 +181,7 @@ impl RunningFrame {
177181
launch_thread_handle: None,
178182
})),
179183
should_remove_from_cache: RwLock::new(false),
184+
stats_frozen: AtomicBool::new(false),
180185
}
181186
}
182187

@@ -226,6 +231,11 @@ impl RunningFrame {
226231
}
227232

228233
pub fn update_frame_stats(&self, proc_stats: ProcessStats) {
234+
// Don't update stats if they've been frozen (e.g., when frame is being killed for OOM)
235+
if self.stats_frozen.load(Ordering::SeqCst) {
236+
return;
237+
}
238+
229239
self.frame_stats
230240
.write()
231241
.unwrap_or_else(|poisoned| poisoned.into_inner())
@@ -1041,6 +1051,8 @@ impl RunningFrame {
10411051

10421052
// Replace snapshot config with the new config:
10431053
frame.config = config;
1054+
// Initialize stats_frozen (skipped during deserialization)
1055+
frame.stats_frozen = AtomicBool::new(false);
10441056

10451057
let pid = frame.pid();
10461058

@@ -1258,6 +1270,17 @@ Render Frame Completed
12581270
.read()
12591271
.unwrap_or_else(|poisoned| poisoned.into_inner())
12601272
}
1273+
1274+
/// Freezes the frame statistics to prevent further updates.
1275+
///
1276+
/// This is typically called when a frame is being killed for OOM (out of memory),
1277+
/// to capture the accurate memory measurement at the moment of kill detection
1278+
/// and prevent corruption from reading zombie/dying processes.
1279+
///
1280+
/// Once frozen, calls to `update_frame_stats()` will be ignored.
1281+
pub fn freeze_stats(&self) {
1282+
self.stats_frozen.store(true, Ordering::SeqCst);
1283+
}
12611284
}
12621285

12631286
#[cfg(test)]

rust/crates/rqd/src/system/machine.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -408,6 +408,10 @@ impl MachineMonitor {
408408
// Attempt to kill selected frames.
409409
// Logic will ignore kill errors and try again on the next iteration
410410
for frame in frames_to_kill {
411+
// Freeze stats before killing to capture accurate memory measurement.
412+
// This prevents corruption from reading zombie/dying processes after kill signal.
413+
frame.freeze_stats();
414+
411415
if let Ok(manager) = manager::instance().await {
412416
info!("Requesting a kill for {}", &frame);
413417
let kill_result = manager

0 commit comments

Comments
 (0)