Skip to content

Commit 34f4fd8

Browse files
authored
[rust/rqd] Log OOM kill on rqd (#2067)
Related to #2064, which handles killing frames but fails to log its actions.
1 parent 0e758f4 commit 34f4fd8

File tree

1 file changed

+28
-16
lines changed

1 file changed

+28
-16
lines changed

rust/crates/rqd/src/system/machine.rs

Lines changed: 28 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -392,25 +392,37 @@ impl MachineMonitor {
392392
}
393393

394394
self.handle_finished_frames(finished_frames).await;
395-
if let Some((memory_usage, total_memory)) = self.memory_usage().await {
396-
let frames_to_kill =
397-
oom::choose_frames_to_kill(memory_usage, total_memory, memory_aggressors);
398-
399-
// Attempt to kill selected frames.
400-
// Logic will ignore kill errors and try again on the next iteration
401-
for frame in frames_to_kill {
402-
if let Ok(manager) = manager::instance().await {
403-
let kill_result = manager
404-
.kill_running_frame(&frame.frame_id, OOM_REASON_MSG.to_string())
405-
.await;
406-
if let Err(err) = kill_result {
407-
warn!(
408-
"Failed to kill frame {} when under OOM pressure. {}",
409-
frame, err
410-
)
395+
396+
match self.memory_usage().await {
397+
Some((memory_usage, total_memory))
398+
if memory_usage > CONFIG.machine.memory_oom_margin_percentage =>
399+
{
400+
warn!(
401+
"Machine memory usage is above allowed threshold ({}). \
402+
Triggering OOM protection",
403+
CONFIG.machine.memory_oom_margin_percentage
404+
);
405+
let frames_to_kill =
406+
oom::choose_frames_to_kill(memory_usage, total_memory, memory_aggressors);
407+
408+
// Attempt to kill selected frames.
409+
// Logic will ignore kill errors and try again on the next iteration
410+
for frame in frames_to_kill {
411+
if let Ok(manager) = manager::instance().await {
412+
info!("Requesting a kill for {}", &frame);
413+
let kill_result = manager
414+
.kill_running_frame(&frame.frame_id, OOM_REASON_MSG.to_string())
415+
.await;
416+
if let Err(err) = kill_result {
417+
warn!(
418+
"Failed to kill frame {} when under OOM pressure. {}",
419+
frame, err
420+
)
421+
}
411422
}
412423
}
413424
}
425+
_ => (),
414426
}
415427

416428
// Sanitize dangling reservations

0 commit comments

Comments
 (0)