-
Notifications
You must be signed in to change notification settings - Fork 157
Expand file tree
/
Copy pathbase.rs
More file actions
140 lines (124 loc) · 4.75 KB
/
base.rs
File metadata and controls
140 lines (124 loc) · 4.75 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
use super::controller::PinnedMemoryManagedAllocController;
use crate::compute::{CudaContext, MB};
use cubecl_common::bytes::Bytes;
use cubecl_core::server::{CopyDescriptor, IoError};
use cubecl_runtime::memory_management::MemoryHandle;
use cubecl_runtime::stride::{is_contiguous, is_inner_contiguous_rows, row_pitch_elems};
use cudarc::driver::sys::{CUDA_MEMCPY2D_st, CUmemorytype, cuMemcpy2DAsync_v2};
use std::{ffi::c_void, ops::DerefMut};
/// Registers multiple lazy buffer copies to [Bytes], potentially using pinned memory.
///
/// # Arguments
///
/// * `ctx` - The CUDA context for managing memory and streams.
/// * `descriptors` - A vector of copy descriptors specifying the source data.
///
/// # Returns
///
/// A [Result] containing a vector of [Bytes] with the copied data, or an [IoError] if any copy fails.
pub fn register_copies_to_bytes(
    ctx: &mut CudaContext,
    descriptors: Vec<CopyDescriptor<'_>>,
) -> Result<Vec<Bytes>, IoError> {
    // Collect fallibly: the first failing copy short-circuits the whole batch,
    // matching the behavior of pushing each result through `?` in a loop.
    descriptors
        .into_iter()
        .map(|descriptor| register_copy_to_bytes(ctx, descriptor, false))
        .collect()
}
/// Registers a single lazy buffer copy to [Bytes], potentially using pinned memory.
///
/// # Arguments
///
/// * `ctx` - The CUDA context for managing memory and streams.
/// * `descriptor` - The copy descriptor specifying the source data.
/// * `marked_pinned` - Whether to force the use of pinned memory for the copy.
///
/// # Returns
///
/// A [Result] containing the copied data as [Bytes], or an [IoError] if the copy fails.
pub fn register_copy_to_bytes(
    ctx: &mut CudaContext,
    descriptor: CopyDescriptor<'_>,
    marked_pinned: bool,
) -> Result<Bytes, IoError> {
    let CopyDescriptor {
        binding,
        shape,
        strides,
        elem_size,
    } = descriptor;
    // Only layouts that are fully contiguous, or whose innermost rows are
    // contiguous, can be serviced with (at most) a single 2D memcpy.
    if !(is_contiguous(shape, strides) || is_inner_contiguous_rows(shape, strides)) {
        return Err(IoError::UnsupportedStrides);
    }
    let num_bytes = shape.iter().product::<usize>() * elem_size;
    let resource = ctx
        .memory_management_gpu
        .get_resource(binding.memory, binding.offset_start, binding.offset_end)
        .ok_or(IoError::InvalidHandle)?;
    // Prefer a managed pinned staging buffer; fall back to a regular zeroed
    // allocation when pinned memory is not used (see bytes_from_managed_pinned_memory).
    let mut bytes = bytes_from_managed_pinned_memory(ctx, num_bytes, marked_pinned)
        .unwrap_or_else(|| Bytes::from_bytes_vec(vec![0; num_bytes]));
    let rank = shape.len();
    if rank <= 1 {
        // Rank 0/1 that passed the stride check is a flat buffer: one async
        // device-to-host copy on the context's stream is sufficient.
        unsafe {
            cudarc::driver::result::memcpy_dtoh_async(bytes.deref_mut(), resource.ptr, ctx.stream)
                .map_err(|e| IoError::Unknown(format!("CUDA memcpy failed: {}", e)))?;
        }
        return Ok(bytes);
    }
    // Treat the tensor as `dim_y` rows of `width_bytes` each: device rows may be
    // padded to `pitch` bytes, while the destination host buffer is densely packed.
    let dim_x = shape[rank - 1];
    let width_bytes = dim_x * elem_size;
    let dim_y: usize = shape.iter().rev().skip(1).product();
    // The stride validation above implies a well-defined row pitch; `None` here
    // would be an internal invariant violation, so state it instead of a bare unwrap.
    let pitch = row_pitch_elems(shape, strides)
        .expect("strides validated above imply a well-defined row pitch")
        * elem_size;
    let slice = bytes.deref_mut();
    let cpy = CUDA_MEMCPY2D_st {
        srcMemoryType: CUmemorytype::CU_MEMORYTYPE_DEVICE,
        srcDevice: resource.ptr,
        srcPitch: pitch,
        dstMemoryType: CUmemorytype::CU_MEMORYTYPE_HOST,
        dstHost: slice.as_mut_ptr() as *mut c_void,
        dstPitch: width_bytes,
        WidthInBytes: width_bytes,
        Height: dim_y,
        ..Default::default()
    };
    unsafe {
        cuMemcpy2DAsync_v2(&cpy, ctx.stream)
            .result()
            .map_err(|e| IoError::Unknown(format!("CUDA 2D memcpy failed: {}", e)))?;
    }
    Ok(bytes)
}
/// Creates a [Bytes] instance from pinned memory, if suitable for the given size.
///
/// For small data transfers (<= 100 MB) or when explicitly marked as pinned, this function
/// uses pinned memory to optimize performance. For larger transfers, it falls back to regular memory.
///
/// # Arguments
///
/// * `ctx` - The CUDA context for managing memory.
/// * `num_bytes` - The number of bytes to allocate.
/// * `marked_pinned` - Whether to force the use of pinned memory.
///
/// # Returns
///
/// An [Option] containing a [Bytes] instance if pinned memory is used, or [None] if regular memory should be used instead.
fn bytes_from_managed_pinned_memory(
    ctx: &mut CudaContext,
    num_bytes: usize,
    marked_pinned: bool,
) -> Option<Bytes> {
    // Use pinned memory for small transfers (<= 100 MB) or when explicitly marked.
    if !marked_pinned && num_bytes > 100 * MB {
        return None;
    }
    // Any reservation failure silently falls back to regular memory (None).
    let handle = ctx.memory_management_cpu.reserve(num_bytes as u64).ok()?;
    let binding = handle.binding();
    // `get_resource` already returns an Option, so propagate it directly:
    // the previous `.ok_or(IoError::InvalidHandle).ok()` round-trip was a no-op.
    let resource = ctx
        .memory_management_cpu
        .get_resource(binding.clone(), None, None)?;
    let (controller, alloc) = PinnedMemoryManagedAllocController::init(binding, resource);
    // SAFETY: `alloc` comes from the controller that owns the pinned allocation;
    // presumably it is valid for `num_bytes` and stays alive as long as the Bytes
    // holds the boxed controller — TODO(review): confirm against the controller's contract.
    Some(unsafe { Bytes::from_raw_parts(alloc, num_bytes, Box::new(controller)) })
}