Skip to content

Commit 2afd12c

Browse files
committed
chore: add pinned host CPU read performance test
Measure default and write-combined pinned allocations on the CPU and guard against choosing read-hostile flags for read-heavy buffers.
1 parent 3cd1734 commit 2afd12c

1 file changed

Lines changed: 39 additions & 0 deletions

File tree

src/driver/safe/core.rs

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2535,6 +2535,7 @@ impl CudaStream {
25352535

25362536
#[cfg(test)]
25372537
mod tests {
2538+
use std::hint::black_box;
25382539
use std::time::Instant;
25392540

25402541
use super::*;
@@ -2716,6 +2717,44 @@ mod tests {
27162717
assert_eq!(&host, &truth);
27172718
}
27182719

2720+
/// Guards against choosing read-hostile pinned allocation flags for read-heavy buffers.
///
/// Allocates one write-combined and one default (flags = `0`) pinned host buffer, fills
/// both with ones, and times repeated CPU reads of each. The test passes only when:
///
/// * both timed runs produce the exact checksum implied by the fill value, proving the
///   reads really happened and returned the written data, and
/// * reading the default allocation is more than twice as fast as reading the
///   write-combined allocation.
#[test]
fn test_default_pinned_host_reads_are_faster_than_write_combined() {
    /// Reads every element of `values` on the CPU `n_samples` times.
    ///
    /// Returns the total wall-clock time of all passes together with the wrapping sum of
    /// every element read, so the caller can confirm the reads were not optimized away.
    fn timed_host_reads(values: &[u32], n_samples: usize) -> (std::time::Duration, u64) {
        let mut sum = 0_u64;
        // Start the clock only once the accumulator exists so that nothing but the
        // read loop is measured.
        let start = Instant::now();
        for _ in 0..n_samples {
            // `black_box` hides the slice from the optimizer on every pass, so the
            // repeated reads cannot be hoisted or collapsed into a single pass.
            for value in black_box(values) {
                sum = sum.wrapping_add(u64::from(*value));
            }
        }
        (start.elapsed(), black_box(sum))
    }

    let ctx = CudaContext::new(0).unwrap();
    let n = 1 << 20;
    let n_samples = 5;

    // SAFETY: NOTE(review): presumably `unsafe` because the returned memory is
    // uninitialized -- confirm against `alloc_pinned_with_flags`' safety contract.
    // Every element is overwritten by `fill` below before it is read.
    let mut write_combined =
        unsafe { ctx.alloc_pinned_with_flags::<u32>(n, sys::CU_MEMHOSTALLOC_WRITECOMBINED) }
            .unwrap();
    // SAFETY: same reasoning as for the write-combined allocation above.
    let mut default = unsafe { ctx.alloc_pinned_with_flags::<u32>(n, 0) }.unwrap();
    write_combined.as_mut_slice().unwrap().fill(1);
    default.as_mut_slice().unwrap().fill(1);

    // Every element is 1, so each timed run must sum to exactly `n * n_samples`.
    // The cast is lossless: `usize` is never wider than 64 bits.
    let expected_sum = (n * n_samples) as u64;

    let (write_combined_elapsed, write_combined_sum) =
        timed_host_reads(write_combined.as_slice().unwrap(), n_samples);
    let (default_elapsed, default_sum) =
        timed_host_reads(default.as_slice().unwrap(), n_samples);

    // Checking against the known value (rather than only against each other) also
    // catches the case where both runs read nothing or read the wrong data.
    assert_eq!(write_combined_sum, expected_sum);
    assert_eq!(default_sum, expected_sum);
    std::println!(
        "default pinned host reads: {default_elapsed:?}; write-combined host reads: {write_combined_elapsed:?}"
    );
    // The performance gap should be large, but leave margin for device and host variance.
    // `Duration * u32` keeps the comparison exact instead of rounding through `f32`.
    assert!(
        default_elapsed * 2 < write_combined_elapsed,
        "{default_elapsed:?} vs {write_combined_elapsed:?}"
    );
}
2757+
27192758
#[test]
27202759
fn test_pinned_copy_is_faster() {
27212760
let ctx = CudaContext::new(0).unwrap();

0 commit comments

Comments
 (0)