Skip to content

Commit dd0a30c

Browse files
committed
[rust/rqd] Fix memory bug on OOM killed frames
When a frame is killed due to OOM, it is possible the that thread that collects stats before reporting back races the frame wrapping process and gather stats for the frame when some of its procs have died, leading to a incorrect reading of memory for the given frame.
1 parent 8597e82 commit dd0a30c

File tree

2 files changed

+27
-0
lines changed

2 files changed

+27
-0
lines changed

rust/crates/rqd/src/frame/running_frame.rs

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ use std::{
1111
fmt::Display,
1212
path::Path,
1313
process::ExitStatus,
14+
sync::atomic::{AtomicBool, Ordering},
1415
sync::{Arc, RwLock},
1516
};
1617
use std::{process::Stdio, thread};
@@ -59,6 +60,9 @@ pub struct RunningFrame {
5960
pub entrypoint_file_path: String,
6061
state: RwLock<FrameState>,
6162
should_remove_from_cache: RwLock<bool>,
63+
#[serde(skip_serializing)]
64+
#[serde(skip_deserializing)]
65+
stats_frozen: AtomicBool,
6266
}
6367

6468
#[derive(Serialize, Deserialize, Debug)]
@@ -177,6 +181,7 @@ impl RunningFrame {
177181
launch_thread_handle: None,
178182
})),
179183
should_remove_from_cache: RwLock::new(false),
184+
stats_frozen: AtomicBool::new(false),
180185
}
181186
}
182187

@@ -226,6 +231,11 @@ impl RunningFrame {
226231
}
227232

228233
pub fn update_frame_stats(&self, proc_stats: ProcessStats) {
234+
// Don't update stats if they've been frozen (e.g., when frame is being killed for OOM)
235+
if self.stats_frozen.load(Ordering::SeqCst) {
236+
return;
237+
}
238+
229239
self.frame_stats
230240
.write()
231241
.unwrap_or_else(|poisoned| poisoned.into_inner())
@@ -1041,6 +1051,8 @@ impl RunningFrame {
10411051

10421052
// Replace snapshot config with the new config:
10431053
frame.config = config;
1054+
// Initialize stats_frozen (skipped during deserialization)
1055+
frame.stats_frozen = AtomicBool::new(false);
10441056

10451057
let pid = frame.pid();
10461058

@@ -1258,6 +1270,17 @@ Render Frame Completed
12581270
.read()
12591271
.unwrap_or_else(|poisoned| poisoned.into_inner())
12601272
}
1273+
1274+
/// Freezes the frame statistics to prevent further updates.
1275+
///
1276+
/// This is typically called when a frame is being killed for OOM (out of memory),
1277+
/// to capture the accurate memory measurement at the moment of kill detection
1278+
/// and prevent corruption from reading zombie/dying processes.
1279+
///
1280+
/// Once frozen, calls to `update_frame_stats()` will be ignored.
1281+
pub fn freeze_stats(&self) {
1282+
self.stats_frozen.store(true, Ordering::SeqCst);
1283+
}
12611284
}
12621285

12631286
#[cfg(test)]

rust/crates/rqd/src/system/machine.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -408,6 +408,10 @@ impl MachineMonitor {
408408
// Attempt to kill selected frames.
409409
// Logic will ignore kill errors and try again on the next iteration
410410
for frame in frames_to_kill {
411+
// Freeze stats before killing to capture accurate memory measurement.
412+
// This prevents corruption from reading zombie/dying processes after kill signal.
413+
frame.freeze_stats();
414+
411415
if let Ok(manager) = manager::instance().await {
412416
info!("Requesting a kill for {}", &frame);
413417
let kill_result = manager

0 commit comments

Comments
 (0)