Skip to content

Commit 6d649c0

Browse files
fix: catalyst poll — set driver_override + drivers_probe in poll loop
After fire-and-poll clears the driver symlink (~2s), the device's driver_override does not yet name the final driver (it is still "nvsov" from deferred_insmod). vfio-pci won't auto-claim the device without the correct override followed by a probe. Catalyst warm_swap now polls with guarded override and probe writes (5s timeout each, retried on failure).

On GV100: override set in 50ms, probe sent in 50ms, vfio-pci bound within 5s. Total warm_swap: 7s.

Full pipeline timings on fresh boot:
  insmod nvsov: 400ms
  catalyst capture: instant (20 registers)
  fire-and-poll: 2s (driver symlink cleared)
  poll for vfio: 5s (override+probe+rebind)
  BAR0 snapshot: ~1s (173K registers, 18 domains)
  rmmod nvsov: 100ms

Known remaining gap: post-swap steps (sibling rebind, tier classify, BAR0 capture) can block for 5-7min when the nvidia RM teardown child is still running in the kernel background. The PCI lock contention delays sysfs operations. The BAR0 capture itself is fast once it starts.

Co-authored-by: Cursor <cursoragent@cursor.com>
1 parent e129c3e commit 6d649c0

1 file changed

Lines changed: 35 additions & 8 deletions

File tree

crates/core/cylinder/src/vfio/sovereign_handoff.rs

Lines changed: 35 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1064,30 +1064,57 @@ pub fn execute_handoff(
10641064
}
10651065

10661066
if is_catalyst {
1067-
// After fire-and-poll, the driver symlink is gone but nvidia RM
1068-
// teardown still holds the PCI device lock. Any sysfs write
1069-
// (override, probe) would block until teardown finishes.
1070-
// Instead, poll for the final driver to appear — vfio-pci auto-
1071-
// claims via boot config once the lock releases.
1067+
// After fire-and-poll, the driver symlink clears in ~2s but nvidia
1068+
// RM teardown still holds the PCI lock for 160-400s. We need to:
1069+
// 1. Wait for driver=None (done by fire-and-poll)
1070+
// 2. Set driver_override to final_driver via guarded write (may
1071+
// block until PCI lock releases — use 5s timeout, retry)
1072+
// 3. Write drivers_probe to trigger rebind
1073+
// 4. Poll for final_driver to appear
10721074
let poll_deadline = deadline.saturating_sub(overall.elapsed());
10731075
let poll_start = Instant::now();
1074-
let poll_interval = Duration::from_secs(2);
1076+
let poll_interval = Duration::from_secs(5);
1077+
let mut override_set = false;
1078+
let mut probe_sent = false;
10751079
let mut final_driver = guarded_sysfs::read_current_driver(&config.bdf);
10761080

10771081
while final_driver.as_deref() != Some(config.final_driver.as_str()) {
10781082
if poll_start.elapsed() >= poll_deadline {
10791083
steps.push(HandoffStep {
10801084
name: "warm_swap".into(), ok: false,
10811085
detail: Some(format!(
1082-
"poll for {} timed out (driver={:?})",
1083-
config.final_driver, final_driver,
1086+
"poll for {} timed out (driver={:?}, override_set={}, probe_sent={})",
1087+
config.final_driver, final_driver, override_set, probe_sent,
10841088
)),
10851089
duration_ms: t.elapsed().as_millis() as u64,
10861090
});
10871091
return halt_result(&config.bdf, "warm_swap", steps, patch_result,
10881092
module_loaded, false, overall, &sibling_state,
10891093
&config.module_name, needs_device_rollback);
10901094
}
1095+
1096+
if !override_set {
1097+
if let Ok(()) = guarded_sysfs::sysfs_write_guarded(
1098+
&override_path, &config.final_driver,
1099+
Duration::from_secs(5),
1100+
) {
1101+
override_set = true;
1102+
tracing::info!(bdf = config.bdf.as_str(),
1103+
"catalyst poll: driver_override set to {}", config.final_driver);
1104+
}
1105+
}
1106+
1107+
if override_set && !probe_sent {
1108+
if let Ok(()) = guarded_sysfs::sysfs_write_guarded(
1109+
&probe_path, &config.bdf,
1110+
Duration::from_secs(5),
1111+
) {
1112+
probe_sent = true;
1113+
tracing::info!(bdf = config.bdf.as_str(),
1114+
"catalyst poll: drivers_probe sent");
1115+
}
1116+
}
1117+
10911118
std::thread::sleep(poll_interval);
10921119
final_driver = guarded_sysfs::read_current_driver(&config.bdf);
10931120
}

0 commit comments

Comments
 (0)