Skip to content

Commit e140f8f

Browse files
fix: reduce peak memory for multi-player CFR generation (#244)
Three changes to prevent OOM when generating 3-player CFR games:
- Configure jemalloc with a 1-second dirty/muzzy page decay (down from the default 10s). The ~17GB CFR tree freed between games was overlapping with the next allocation, spiking RSS to ~34GB.
- Drop HoldemSimulation immediately after run() in generate, compare, and the comparison runner. This frees the CFR tree before stats processing or potentially-blocking channel sends.
- Use a 47MB stack for spawned background threads to match the main binary's linker-configured stack, preventing stack overflow during deep CFR recursion with 3+ players.

Also fixes agent builder ordering in the comparison runner to call cfr_context() before game_state(), avoiding a wasted eager CFR allocation.
1 parent d277e5a commit e140f8f

4 files changed

Lines changed: 49 additions & 10 deletions

File tree

src/arena/comparison/runner.rs

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -259,11 +259,13 @@ impl ArenaComparison {
259259
.map(|(idx, &agent_idx)| {
260260
let mut builder = ConfigAgentBuilder::new(agent_configs[agent_idx].clone())
261261
.expect("Failed to create agent builder")
262-
.player_idx(idx)
263-
.game_state(game_state.clone());
262+
.player_idx(idx);
263+
// Inject shared CFR context BEFORE game_state to avoid
264+
// wasted eager allocation in game_state()
264265
if let Some((ref cfr_states, ref ts)) = cfr_context {
265266
builder = builder.cfr_context(cfr_states.clone(), ts.clone());
266267
}
268+
builder = builder.game_state(game_state.clone());
267269
if let Some(ref pool) = self.config.thread_pool {
268270
builder = builder.thread_pool(pool.clone());
269271
}
@@ -311,6 +313,10 @@ impl ArenaComparison {
311313
"Completed permutation"
312314
);
313315

316+
// Drop the simulation immediately to free the CFR tree
317+
// before we read stats or invoke the callback.
318+
drop(sim);
319+
314320
// Extract statistics from the historian via the shared storage
315321
let stats = stats_storage.try_read().map_err(|e| {
316322
ComparisonError::SimulationError(format!("Failed to read stats: {}", e))

src/bin/rsp/arena/compare.rs

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -174,10 +174,17 @@ fn run_comparison_with_tui(comparison: ArenaComparison) -> Result<(), CompareErr
174174

175175
let (tx, rx) = std::sync::mpsc::sync_channel::<SimMessage<GameResult>>(1024);
176176

177+
// Must use a large stack to match the main binary's linker-configured stack
178+
// (47 MB via -Wl,-zstack-size), since CFR traversal with 3+ players recurses
179+
// deeply and overflows the default stack of a spawned thread (2 MiB in Rust's std).
177180
let bg_hand_store = hand_store.clone();
178-
std::thread::spawn(move || {
179-
run_comparison_background(comparison, tx, bg_hand_store, ohh_path);
180-
});
181+
const STACK_SIZE: usize = 47 * 1024 * 1024;
182+
std::thread::Builder::new()
183+
.stack_size(STACK_SIZE)
184+
.spawn(move || {
185+
run_comparison_background(comparison, tx, bg_hand_store, ohh_path);
186+
})
187+
.expect("failed to spawn comparison thread");
181188

182189
let handler = EventHandler::new(rx, Duration::from_millis(33));
183190
let mut tui_app = App::new(Some(total_games));

src/bin/rsp/arena/generate.rs

Lines changed: 18 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -340,6 +340,7 @@ fn run_generation(args: &GenerateArgs, configs: &[AgentConfig]) -> Result<(), Ge
340340
};
341341

342342
sim.run(&mut ctx.rng);
343+
drop(sim);
343344
ctx.record_success();
344345

345346
if ctx.games_completed.is_multiple_of(report_interval) {
@@ -436,6 +437,12 @@ fn run_generation_inner(
436437

437438
sim.run(&mut ctx.rng);
438439

440+
// Drop the simulation immediately to free the CFR tree (~19GB for 3
441+
// players) before we snapshot stats or block on the channel send.
442+
// Without this, the tree stays alive through the potentially-blocking
443+
// tx.send(), keeping peak RSS ~2x higher than necessary.
444+
drop(sim);
445+
439446
// Record the byte offset so the TUI can fetch this hand on demand
440447
hand_store.push_offset(pre_offset);
441448

@@ -447,7 +454,6 @@ fn run_generation_inner(
447454
let seat_stats: Vec<SeatStats> = (0..setup.num_players)
448455
.map(|i| SeatStats::from_storage(&stats_snap, i))
449456
.collect();
450-
// stats_snap (with its 40+ Vecs) is dropped here, on the generation thread
451457
drop(stats_snap);
452458

453459
let game_result = GameResult {
@@ -483,13 +489,20 @@ fn run_generation_with_tui(
483489

484490
let hand_store = HandStore::new(args.output.clone());
485491

486-
// Spawn simulation in background thread
492+
// Spawn simulation in background thread.
493+
// Must use a large stack to match the main binary's linker-configured stack
494+
// (47 MB via -Wl,-zstack-size), since CFR traversal with 3+ players recurses
495+
// deeply and overflows the default stack of a spawned thread (2 MiB in Rust's std).
487496
let bg_args = args.clone();
488497
let bg_configs = configs.clone();
489498
let bg_hand_store = hand_store.clone();
490-
std::thread::spawn(move || {
491-
run_generation_background(bg_args, bg_configs, tx, bg_hand_store);
492-
});
499+
const STACK_SIZE: usize = 47 * 1024 * 1024;
500+
std::thread::Builder::new()
501+
.stack_size(STACK_SIZE)
502+
.spawn(move || {
503+
run_generation_background(bg_args, bg_configs, tx, bg_hand_store);
504+
})
505+
.expect("failed to spawn generation thread");
493506

494507
let handler = EventHandler::new(rx, std::time::Duration::from_millis(33));
495508
let mut tui_app = App::new(games_target);

src/bin/rsp/main.rs

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,19 @@
22
#[global_allocator]
33
static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;
44

5+
/// Configure jemalloc to purge freed pages after 1 second instead of the
6+
/// default 10 seconds.
7+
///
8+
/// CFR solvers allocate ~17GB per game. With the default 10-second dirty page
9+
/// decay, the old tree's pages can overlap with the new allocation, spiking
10+
/// peak RSS to ~34GB. A 1-second decay is fast enough to reclaim pages between
11+
/// games (3-player CFR games take several seconds) while avoiding the syscall
12+
/// overhead of immediate purging (decay_ms:0).
13+
#[cfg(not(target_env = "msvc"))]
14+
#[allow(non_upper_case_globals)]
15+
#[unsafe(no_mangle)]
16+
pub static malloc_conf: &[u8; 40] = b"dirty_decay_ms:1000,muzzy_decay_ms:1000\0";
17+
518
mod arena;
619
mod common;
720
mod holdem;

0 commit comments

Comments
 (0)