Skip to content

Commit 83138a0

Browse files
fix: catalyst teardown timeout — nvidia RM takes ~160s on GV100
The warm_swap phase (unbinding nvsov from the Titan V) triggers the nvidia-470 RM teardown, which takes ~160s for HBM2 dealloc + falcon halt. The previous 10s UNBIND_TIMEOUT killed the child process prematurely, causing the rebind to race with the still-running kernel teardown.

- Add CATALYST_TEARDOWN_TIMEOUT (200s) for nvidia RM teardown
- Use the extended timeout for the catalyst warm_swap unbind
- Raise the RPC timeout from 180s to 420s
- Raise HANDOFF_DEADLINE from 150s to 400s

Validated: the full catalyst pipeline completes end-to-end on Titan V — insmod (400ms), BAR0 capture (78,627 registers), warm_swap (~160s), snapshot persisted, frozen .ko archived, rmmod clean (100ms).

Co-authored-by: Cursor <cursoragent@cursor.com>
1 parent dfff34a commit 83138a0

3 files changed

Lines changed: 18 additions & 8 deletions

File tree

crates/core/cylinder/src/vfio/guarded_sysfs.rs

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -37,10 +37,14 @@ pub const UNBIND_TIMEOUT: Duration = Duration::from_secs(10);
3737
pub const INSMOD_TIMEOUT: Duration = Duration::from_secs(15);
3838
/// Default timeout for `rmmod` operations.
3939
pub const RMMOD_TIMEOUT: Duration = Duration::from_secs(10);
40+
/// Extended timeout for nvidia RM teardown during catalyst unbind.
41+
/// nvidia-470's RM on GV100 takes ~160s to fully tear down (HBM2 dealloc,
42+
/// falcon shutdown, FECS/GPCCS halt). Must exceed this or the child gets
43+
/// killed and the probe/rebind races with still-running kernel teardown.
44+
pub const CATALYST_TEARDOWN_TIMEOUT: Duration = Duration::from_secs(200);
4045
/// Default overall handoff deadline.
41-
/// 150s: catalyst cold-boot on Volta needs ~30s deferred probe delay,
42-
/// 15s settle, and 30-60s RM cold init (HBM2 + falcon + FECS).
43-
pub const HANDOFF_DEADLINE: Duration = Duration::from_secs(150);
46+
/// 400s for catalyst: 15s settle + ~160s RM teardown + 30s BAR0 capture
/// (~205s nominal; the remainder is presumably headroom — confirm).
47+
pub const HANDOFF_DEADLINE: Duration = Duration::from_secs(400);
4448

4549
/// Errors from guarded sysfs operations.
4650
#[derive(Debug, thiserror::Error)]

crates/core/cylinder/src/vfio/sovereign_handoff.rs

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1030,15 +1030,20 @@ pub fn execute_handoff(
10301030
let t = Instant::now();
10311031
if let Some(ref current) = guarded_sysfs::read_current_driver(&config.bdf) {
10321032
let unbind_path = crate::linux_paths::sysfs_pci_driver_unbind(current);
1033+
let unbind_timeout = if is_catalyst {
1034+
guarded_sysfs::CATALYST_TEARDOWN_TIMEOUT
1035+
} else {
1036+
guarded_sysfs::UNBIND_TIMEOUT
1037+
};
10331038
if let Err(e) = guarded_sysfs::sysfs_write_guarded(
1034-
&unbind_path, &config.bdf, guarded_sysfs::UNBIND_TIMEOUT,
1039+
&unbind_path, &config.bdf, unbind_timeout,
10351040
) {
10361041
steps.push(HandoffStep {
10371042
name: "warm_swap".into(), ok: false,
10381043
detail: Some(format!("guarded unbind {current} failed: {e}")),
10391044
duration_ms: t.elapsed().as_millis() as u64,
10401045
});
1041-
1046+
10421047
return halt_result(&config.bdf, "warm_swap", steps, patch_result,
10431048
module_loaded, false, overall, &sibling_state,
10441049
&config.module_name, needs_device_rollback);

crates/server/src/pure_jsonrpc/handler/dispatch/mod.rs

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1077,9 +1077,10 @@ impl DispatchHandler {
10771077
// Wrapped in tokio::time::timeout to prevent indefinite RPC hangs.
10781078
// The handoff itself has internal deadlines via guarded_sysfs, but
10791079
// this outer timeout is the last line of defense.
1080-
// 180s: catalyst cold-boot on Volta requires ~30s PCI probe delay
1081-
// + 15s settle + 30-60s RM init (HBM2 training, falcon boot, FECS).
1082-
let rpc_timeout = std::time::Duration::from_secs(180);
1080+
// 420s: catalyst teardown on GV100 needs ~160s for nvidia RM
1081+
// shutdown (HBM2 dealloc, falcon halt) + 15s settle + 30s probe
1082+
// + 30s BAR0 capture margin.
1083+
let rpc_timeout = std::time::Duration::from_secs(420);
10831084
let blocking_future = tokio::task::spawn_blocking(move || {
10841085
execute_handoff(&config, None)
10851086
});

0 commit comments

Comments
 (0)