Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
88 changes: 83 additions & 5 deletions src/driver/safe/core.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1405,22 +1405,41 @@ impl<T> Drop for PinnedHostSlice<T> {
impl CudaContext {
/// Allocates page locked host memory with [sys::CU_MEMHOSTALLOC_WRITECOMBINED] flags.
///
/// Write-combined memory is intended for memory primarily written by the
/// host before a device transfer; it has poor CPU read performance. Use
/// [CudaContext::alloc_pinned_with_flags()] with `0` for default page
/// locked host memory if CPU reads matter.
Comment on lines +1410 to +1411

Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't see 0 as an option for flags, where are you seeing this from the nvidia docs?

///
/// See [cuda docs](https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g572ca4011bfcb25034888a14d4e035b9)
///
/// # Safety
/// 1. This is unsafe because the returned memory is uninitialized after this call.
pub unsafe fn alloc_pinned<T: DeviceRepr>(
    self: &Arc<Self>,
    len: usize,
) -> Result<PinnedHostSlice<T>, DriverError> {
    // Thin convenience wrapper: delegate to the flag-taking allocator and
    // always request write-combined page locked memory.
    let write_combined = sys::CU_MEMHOSTALLOC_WRITECOMBINED;
    self.alloc_pinned_with_flags::<T>(len, write_combined)
}

/// Allocates page locked host memory with the specified `flags`.
///
/// See [cuda docs](https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g572ca4011bfcb25034888a14d4e035b9)
///
/// # Safety
/// 1. This is unsafe because the returned memory is uninitialized after this call.
pub unsafe fn alloc_pinned_with_flags<T: DeviceRepr>(
self: &Arc<Self>,
len: usize,
flags: u32,
) -> Result<PinnedHostSlice<T>, DriverError> {
self.bind_to_thread()?;
let ptr = result::malloc_host(
len * std::mem::size_of::<T>(),
sys::CU_MEMHOSTALLOC_WRITECOMBINED,
)?;
let num_bytes = len
.checked_mul(std::mem::size_of::<T>())
.expect("Pinned host allocation size overflow");
assert!(num_bytes < isize::MAX as usize);
let ptr = result::malloc_host(num_bytes, flags)?;
let ptr = ptr as *mut T;
assert!(!ptr.is_null());
assert!(len * std::mem::size_of::<T>() < isize::MAX as usize);
assert!(ptr.is_aligned());
let event = self.new_event(Some(sys::CUevent_flags::CU_EVENT_BLOCKING_SYNC))?;
Ok(PinnedHostSlice { ptr, len, event })
Expand Down Expand Up @@ -2519,6 +2538,7 @@ impl CudaStream {

#[cfg(test)]
mod tests {
use std::hint::black_box;
use std::time::Instant;

use super::*;
Expand Down Expand Up @@ -2687,6 +2707,64 @@ mod tests {
assert_eq!(&host, &truth);
}

#[test]
fn test_htod_copy_pinned_with_default_flags() {
    // Reference data that must survive the pinned host -> device -> host round trip.
    let expected: [f32; 10] = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0];

    let ctx = CudaContext::new(0).unwrap();
    let stream = ctx.default_stream();

    // Passing `0` as the flags requests pinned memory without the
    // write-combined flag that `alloc_pinned` uses.
    let mut pinned = unsafe { ctx.alloc_pinned_with_flags::<f32>(expected.len(), 0) }.unwrap();

    // Fill the pinned buffer from the host side and read it straight back.
    pinned.as_mut_slice().unwrap().copy_from_slice(&expected);
    assert_eq!(pinned.as_slice().unwrap(), &expected);

    // Copy pinned host -> device, then device -> host, and compare.
    let device_copy = stream.clone_htod(&pinned).unwrap();
    let round_trip = stream.clone_dtoh(&device_copy).unwrap();
    assert_eq!(&round_trip, &expected);
}

#[test]
#[should_panic(expected = "Pinned host allocation size overflow")]
fn test_alloc_pinned_panics_on_size_overflow() {
    let ctx = CudaContext::new(0).unwrap();

    // `usize::MAX` elements of a 4-byte type cannot be expressed as a byte
    // count in `usize`, so the allocator's checked multiplication must panic
    // before any allocation is attempted.
    let too_many_elements = usize::MAX;
    let _ = unsafe { ctx.alloc_pinned_with_flags::<u32>(too_many_elements, 0) };
}

// Compares host-side read time of two pinned allocations: one created with
// `sys::CU_MEMHOSTALLOC_WRITECOMBINED` and one created with flags `0`.
// The test passes only if the flags-`0` buffer is read more than twice as
// fast as the write-combined one.
// NOTE(review): this is a wall-clock assertion, so it may be flaky on hosts
// where write-combined memory is not much slower to read — confirm on CI hardware.
#[test]
fn test_default_pinned_host_reads_are_faster_than_write_combined() {
// Sums every element of `values`, repeating the full pass `n_samples` times.
// Returns the elapsed wall-clock time together with the (wrapping) sum so the
// caller can verify both buffers produced the same result.
fn timed_host_reads(values: &[u32], n_samples: usize) -> (std::time::Duration, u64) {
let start = Instant::now();
let mut sum = 0_u64;
for _ in 0..n_samples {
// `black_box` hides the slice from the optimizer on every pass so the
// reads cannot be hoisted or folded away.
for value in black_box(values) {
sum = sum.wrapping_add(u64::from(*value));
}
}
// `black_box` on the result keeps the accumulated sum (and the loop) live.
(start.elapsed(), black_box(sum))
}

let ctx = CudaContext::new(0).unwrap();
// 2^20 `u32` elements per buffer, read 5 times each.
let n = 1 << 20;
let n_samples = 5;
let mut write_combined =
unsafe { ctx.alloc_pinned_with_flags::<u32>(n, sys::CU_MEMHOSTALLOC_WRITECOMBINED) }
.unwrap();
let mut default = unsafe { ctx.alloc_pinned_with_flags::<u32>(n, 0) }.unwrap();
// Initialize both buffers identically; the allocations are made through an
// `unsafe` call, so they are written before anything reads them.
write_combined.as_mut_slice().unwrap().fill(1);
default.as_mut_slice().unwrap().fill(1);

let (write_combined_elapsed, write_combined_sum) =
timed_host_reads(write_combined.as_slice().unwrap(), n_samples);
let (default_elapsed, default_sum) =
timed_host_reads(default.as_slice().unwrap(), n_samples);
// Sanity check: both buffers hold the same data, so the sums must agree.
assert_eq!(write_combined_sum, default_sum);
std::println!(
"default pinned host reads: {default_elapsed:?}; write-combined host reads: {write_combined_elapsed:?}"
);
// The performance gap should be large, but leave margin for device and host variance.
assert!(
default_elapsed.as_secs_f32() * 2.0 < write_combined_elapsed.as_secs_f32(),
"{default_elapsed:?} vs {write_combined_elapsed:?}"
);
}

#[test]
fn test_pinned_copy_is_faster() {
let ctx = CudaContext::new(0).unwrap();
Expand Down
Loading