From 29fab7ab0bfb7f6d0d8bf16681f937210060de7c Mon Sep 17 00:00:00 2001 From: David Maseda Neira Date: Sat, 1 Nov 2025 10:35:33 +0100 Subject: [PATCH 1/3] Add safe API for CUDA constant memory operations --- examples/09-constant-memory.rs | 74 ++++++++++++++++++++++++ examples/constant_memory.cu | 19 +++++++ examples/constant_memory.ptx | 60 ++++++++++++++++++++ src/driver/result.rs | 18 ++++++ src/driver/safe/core.rs | 100 +++++++++++++++++++++++++++++++++ 5 files changed, 271 insertions(+) create mode 100644 examples/09-constant-memory.rs create mode 100644 examples/constant_memory.cu create mode 100644 examples/constant_memory.ptx diff --git a/examples/09-constant-memory.rs b/examples/09-constant-memory.rs new file mode 100644 index 00000000..8b40af2e --- /dev/null +++ b/examples/09-constant-memory.rs @@ -0,0 +1,74 @@ +use cudarc::{ + driver::{CudaContext, DriverError, LaunchConfig, PushKernelArg}, + nvrtc::Ptx, +}; + +fn main() -> Result<(), DriverError> { + let ctx = CudaContext::new(0)?; + let stream = ctx.default_stream(); + + // Load the module containing the kernel with constant memory + let module = ctx.load_module(Ptx::from_file("./examples/constant_memory.ptx"))?; + + // Get the constant memory symbol + let coefficients_symbol = module.get_global("coefficients")?; + println!( + "Constant memory symbol 'coefficients' has {} bytes", + coefficients_symbol.num_bytes() + ); + + // Set up polynomial coefficients: 1.0 + 2.0*x + 3.0*x^2 + 4.0*x^3 + let coefficients = [1.0f32, 2.0, 3.0, 4.0]; + + // Copy coefficients to constant memory + stream.memcpy_htos(&coefficients, &coefficients_symbol)?; + + // Load the kernel function + let polynomial_kernel = module.load_function("polynomial_kernel")?; + + // Prepare input data + let input = vec![0.0f32, 1.0, 2.0, 3.0, 4.0, 5.0]; + let n = input.len(); + + // Copy input to device + let input_dev = stream.memcpy_stod(&input)?; + let mut output_dev = stream.alloc_zeros::(n)?; + + // Launch kernel + let cfg = LaunchConfig::for_num_elems(n as u32); + unsafe { + stream + .launch_builder(&polynomial_kernel) + .arg(&mut output_dev) + .arg(&input_dev) + .arg(&(n as i32)) + .launch(cfg) + }?; + + // Copy results back + let output = stream.memcpy_dtov(&output_dev)?; + + // Verify results + println!("\nPolynomial evaluation (1.0 + 2.0*x + 3.0*x^2 + 4.0*x^3):"); + for (i, (&x, &y)) in input.iter().zip(output.iter()).enumerate() { + let expected = coefficients[0] + + coefficients[1] * x + + coefficients[2] * x * x + + coefficients[3] * x * x * x; + println!( + " f({:.1}) = {:.1} (expected {:.1})", + x, y, expected + ); + assert!( + (y - expected).abs() < 1e-4, + "Mismatch at index {}: got {}, expected {}", + i, + y, + expected + ); + } + + println!("\nAll results match expected values!"); + + Ok(()) +} diff --git a/examples/constant_memory.cu b/examples/constant_memory.cu new file mode 100644 index 00000000..ed2d4f7d --- /dev/null +++ b/examples/constant_memory.cu @@ -0,0 +1,19 @@ +// Constant memory - faster than global memory for read-only data +// accessed by all threads +__constant__ float coefficients[4]; + +extern "C" __global__ void polynomial_kernel( + float *out, + const float *inp, + int numel +) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < numel) { + float x = inp[i]; + // Compute polynomial: coefficients[0] + coefficients[1]*x + coefficients[2]*x^2 + coefficients[3]*x^3 + out[i] = coefficients[0] + + coefficients[1] * x + + coefficients[2] * x * x + + coefficients[3] * x * x * x; + } +} diff --git a/examples/constant_memory.ptx b/examples/constant_memory.ptx new file mode 100644 index 00000000..f13d2992 --- /dev/null +++ b/examples/constant_memory.ptx @@ -0,0 +1,60 @@ +// +// Generated by NVIDIA NVVM Compiler +// +// Compiler Build ID: CL-36424714 +// Cuda compilation tools, release 13.0, V13.0.88 +// Based on NVVM 7.0.1 +// + +.version 9.0 +.target sm_75 +.address_size 64 + + // .globl polynomial_kernel +.const .align 4 .b8 coefficients[16]; + +.visible .entry polynomial_kernel( + .param .u64 polynomial_kernel_param_0, + .param .u64 polynomial_kernel_param_1, + .param .u32 polynomial_kernel_param_2 +) +{ + .reg .pred %p<2>; + .reg .f32 %f<12>; + .reg .b32 %r<6>; + .reg .b64 %rd<8>; + + + ld.param.u64 %rd1, [polynomial_kernel_param_0]; + ld.param.u64 %rd2, [polynomial_kernel_param_1]; + ld.param.u32 %r2, [polynomial_kernel_param_2]; + mov.u32 %r3, %ctaid.x; + mov.u32 %r4, %ntid.x; + mov.u32 %r5, %tid.x; + mad.lo.s32 %r1, %r3, %r4, %r5; + setp.ge.s32 %p1, %r1, %r2; + @%p1 bra $L__BB0_2; + + cvta.to.global.u64 %rd3, %rd2; + mul.wide.s32 %rd4, %r1, 4; + add.s64 %rd5, %rd3, %rd4; + ld.const.f32 %f1, [coefficients+4]; + ld.global.f32 %f2, [%rd5]; + ld.const.f32 %f3, [coefficients]; + fma.rn.f32 %f4, %f2, %f1, %f3; + ld.const.f32 %f5, [coefficients+8]; + mul.f32 %f6, %f2, %f5; + fma.rn.f32 %f7, %f2, %f6, %f4; + ld.const.f32 %f8, [coefficients+12]; + mul.f32 %f9, %f2, %f8; + mul.f32 %f10, %f2, %f9; + fma.rn.f32 %f11, %f2, %f10, %f7; + cvta.to.global.u64 %rd6, %rd1; + add.s64 %rd7, %rd6, %rd4; + st.global.f32 [%rd7], %f11; + +$L__BB0_2: + ret; + +} + diff --git a/src/driver/result.rs b/src/driver/result.rs index 3111f4e0..499e6da3 100644 --- a/src/driver/result.rs +++ b/src/driver/result.rs @@ -965,6 +965,24 @@ pub mod module { Ok(func.assume_init()) } + /// Returns a pointer to a global/constant symbol in the module. + /// + /// See [cuda docs](https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MODULE.html#group__CUDA__MODULE_1gf3e43972c23c2d5c8a662f2d9a4d0c24) + /// + /// # Safety + /// `module` must be a properly allocated and not freed module. + pub unsafe fn get_global( + module: sys::CUmodule, + name: CString, + ) -> Result<(sys::CUdeviceptr, usize), DriverError> { + let name_ptr = name.as_c_str().as_ptr(); + let mut dptr = MaybeUninit::uninit(); + let mut bytes = MaybeUninit::uninit(); + sys::cuModuleGetGlobal_v2(dptr.as_mut_ptr(), bytes.as_mut_ptr(), module, name_ptr) + .result()?; + Ok((dptr.assume_init(), bytes.assume_init())) + } + /// Unloads a module. /// /// See [cuda docs](https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MODULE.html#group__CUDA__MODULE_1g8ea3d716524369de3763104ced4ea57b) diff --git a/src/driver/safe/core.rs b/src/driver/safe/core.rs index 97a55982..e3b7dcd8 100644 --- a/src/driver/safe/core.rs +++ b/src/driver/safe/core.rs @@ -1298,6 +1298,62 @@ impl CudaStream { unsafe { result::memcpy_dtod_async(dst, src, num_bytes, self.cu_stream) } } + /// Copy a `[T]`/`Vec`/[`PinnedHostSlice`] to a global/constant symbol. + /// + /// This is used to copy data into `__constant__` memory declared in CUDA kernels. + /// + /// # Example + /// + /// ```ignore + /// // In CUDA: __constant__ float my_const[256]; + /// let symbol = module.get_global("my_const")?; + /// let data = vec![1.0f32; 256]; + /// stream.memcpy_htos(&data, &symbol)?; + /// ``` + pub fn memcpy_htos + ?Sized>( + self: &Arc, + src: &Src, + symbol: &CudaSymbol, + ) -> Result<(), DriverError> { + let src_bytes = std::mem::size_of::() * src.len(); + assert!( + symbol.bytes >= src_bytes, + "Symbol size ({} bytes) is smaller than source data ({} bytes)", + symbol.bytes, + src_bytes + ); + let (src, _record_src) = unsafe { src.stream_synced_slice(self) }; + unsafe { result::memcpy_htod_async(symbol.cu_device_ptr, src, self.cu_stream) } + } + + /// Copy a [`CudaSlice`]/[`CudaView`] to a global/constant symbol. + /// + /// This is used to copy data into `__constant__` memory declared in CUDA kernels. + /// + /// # Example + /// + /// ```ignore + /// // In CUDA: __constant__ float my_const[256]; + /// let symbol = module.get_global("my_const")?; + /// let device_data = stream.memcpy_stod(&vec![1.0f32; 256])?; + /// stream.memcpy_dtos(&device_data, &symbol)?; + /// ``` + pub fn memcpy_dtos>( + self: &Arc, + src: &Src, + symbol: &CudaSymbol, + ) -> Result<(), DriverError> { + let src_bytes = src.num_bytes(); + assert!( + symbol.bytes >= src_bytes, + "Symbol size ({} bytes) is smaller than source data ({} bytes)", + symbol.bytes, + src_bytes + ); + let (src, _record_src) = src.device_ptr(self); + unsafe { result::memcpy_dtod_async(symbol.cu_device_ptr, src, src_bytes, self.cu_stream) } + } + /// Copy a [`CudaSlice`]/[`CudaView`] to a new [`CudaSlice`]. pub fn clone_dtod>( self: &Arc, @@ -1750,6 +1806,50 @@ impl CudaModule { module: self.clone(), }) } + + /// Gets a global/constant symbol from the loaded module. + /// + /// This can be used to access `__constant__` memory declared in CUDA kernels. + /// + /// # Example + /// + /// ```ignore + /// // In CUDA: __constant__ float my_const[256]; + /// let symbol = module.get_global("my_const")?; + /// stream.memcpy_htos(&data, &symbol)?; + /// ``` + pub fn get_global(self: &Arc, name: &str) -> Result { + let name_c = CString::new(name).unwrap(); + let (cu_device_ptr, bytes) = + unsafe { result::module::get_global(self.cu_module, name_c) }?; + Ok(CudaSymbol { + cu_device_ptr, + bytes, + module: self.clone(), + }) + } +} + +/// Wrapper around a global/constant symbol from a CUDA module. +/// +/// Created with [CudaModule::get_global()]. Use [CudaStream::memcpy_htos()] +/// or [CudaStream::memcpy_dtos()] to copy data to the symbol. +#[derive(Debug, Clone)] +pub struct CudaSymbol { + pub(crate) cu_device_ptr: sys::CUdeviceptr, + pub(crate) bytes: usize, + #[allow(unused)] + pub(crate) module: Arc, +} + +unsafe impl Send for CudaSymbol {} +unsafe impl Sync for CudaSymbol {} + +impl CudaSymbol { + /// Returns the size of the symbol in bytes. + pub fn num_bytes(&self) -> usize { + self.bytes + } } impl CudaFunction { From 9a8045f7dfd8ec756b3769a847bf03dcc80ae5ee Mon Sep 17 00:00:00 2001 From: David Maseda Neira Date: Sat, 1 Nov 2025 10:41:21 +0100 Subject: [PATCH 2/3] Don't panic on invalid name Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- src/driver/safe/core.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/driver/safe/core.rs b/src/driver/safe/core.rs index e3b7dcd8..00657450 100644 --- a/src/driver/safe/core.rs +++ b/src/driver/safe/core.rs @@ -1819,7 +1819,7 @@ impl CudaModule { /// stream.memcpy_htos(&data, &symbol)?; /// ``` pub fn get_global(self: &Arc, name: &str) -> Result { - let name_c = CString::new(name).unwrap(); + let name_c = CString::new(name).map_err(|_| DriverError(sys::cudaError_enum::CUDA_ERROR_INVALID_VALUE))?; let (cu_device_ptr, bytes) = unsafe { result::module::get_global(self.cu_module, name_c) }?; Ok(CudaSymbol { From 0b7fb979dc8bf0967279bed24d01ea5fa42feb7a Mon Sep 17 00:00:00 2001 From: David Maseda Neira Date: Sat, 1 Nov 2025 16:58:58 +0100 Subject: [PATCH 3/3] Moved get_global to return CudaSlice --- examples/09-constant-memory.rs | 17 +++-- src/driver/safe/core.rs | 110 +++++++-------------------------- 2 files changed, 29 insertions(+), 98 deletions(-) diff --git a/examples/09-constant-memory.rs b/examples/09-constant-memory.rs index 8b40af2e..2a4d3e50 100644 --- a/examples/09-constant-memory.rs +++ b/examples/09-constant-memory.rs @@ -10,18 +10,20 @@ fn main() -> Result<(), DriverError> { // Load the module containing the kernel with constant memory let module = ctx.load_module(Ptx::from_file("./examples/constant_memory.ptx"))?; - // Get the constant memory symbol - let coefficients_symbol = module.get_global("coefficients")?; + // Get the constant memory symbol as a CudaSlice + let mut coefficients_symbol = module.get_global("coefficients", &stream)?; println!( "Constant memory symbol 'coefficients' has {} bytes", - coefficients_symbol.num_bytes() + coefficients_symbol.len() ); // Set up polynomial coefficients: 1.0 + 2.0*x + 3.0*x^2 + 4.0*x^3 let coefficients = [1.0f32, 2.0, 3.0, 4.0]; - // Copy coefficients to constant memory - stream.memcpy_htos(&coefficients, &coefficients_symbol)?; + // Transmute the symbol to f32 and copy coefficients to constant memory + let mut symbol_view = coefficients_symbol.as_view_mut(); + let mut symbol_f32 = unsafe { symbol_view.transmute_mut::(4).unwrap() }; + stream.memcpy_htod(&coefficients, &mut symbol_f32)?; // Load the kernel function let polynomial_kernel = module.load_function("polynomial_kernel")?; @@ -55,10 +57,7 @@ fn main() -> Result<(), DriverError> { + coefficients[1] * x + coefficients[2] * x * x + coefficients[3] * x * x * x; - println!( - " f({:.1}) = {:.1} (expected {:.1})", - x, y, expected - ); + println!(" f({:.1}) = {:.1} (expected {:.1})", x, y, expected); assert!( (y - expected).abs() < 1e-4, "Mismatch at index {}: got {}, expected {}", diff --git a/src/driver/safe/core.rs b/src/driver/safe/core.rs index 00657450..b854155b 100644 --- a/src/driver/safe/core.rs +++ b/src/driver/safe/core.rs @@ -1298,62 +1298,6 @@ impl CudaStream { unsafe { result::memcpy_dtod_async(dst, src, num_bytes, self.cu_stream) } } - /// Copy a `[T]`/`Vec`/[`PinnedHostSlice`] to a global/constant symbol. - /// - /// This is used to copy data into `__constant__` memory declared in CUDA kernels. - /// - /// # Example - /// - /// ```ignore - /// // In CUDA: __constant__ float my_const[256]; - /// let symbol = module.get_global("my_const")?; - /// let data = vec![1.0f32; 256]; - /// stream.memcpy_htos(&data, &symbol)?; - /// ``` - pub fn memcpy_htos + ?Sized>( - self: &Arc, - src: &Src, - symbol: &CudaSymbol, - ) -> Result<(), DriverError> { - let src_bytes = std::mem::size_of::() * src.len(); - assert!( - symbol.bytes >= src_bytes, - "Symbol size ({} bytes) is smaller than source data ({} bytes)", - symbol.bytes, - src_bytes - ); - let (src, _record_src) = unsafe { src.stream_synced_slice(self) }; - unsafe { result::memcpy_htod_async(symbol.cu_device_ptr, src, self.cu_stream) } - } - - /// Copy a [`CudaSlice`]/[`CudaView`] to a global/constant symbol. - /// - /// This is used to copy data into `__constant__` memory declared in CUDA kernels. - /// - /// # Example - /// - /// ```ignore - /// // In CUDA: __constant__ float my_const[256]; - /// let symbol = module.get_global("my_const")?; - /// let device_data = stream.memcpy_stod(&vec![1.0f32; 256])?; - /// stream.memcpy_dtos(&device_data, &symbol)?; - /// ``` - pub fn memcpy_dtos>( - self: &Arc, - src: &Src, - symbol: &CudaSymbol, - ) -> Result<(), DriverError> { - let src_bytes = src.num_bytes(); - assert!( - symbol.bytes >= src_bytes, - "Symbol size ({} bytes) is smaller than source data ({} bytes)", - symbol.bytes, - src_bytes - ); - let (src, _record_src) = src.device_ptr(self); - unsafe { result::memcpy_dtod_async(symbol.cu_device_ptr, src, src_bytes, self.cu_stream) } - } - /// Copy a [`CudaSlice`]/[`CudaView`] to a new [`CudaSlice`]. pub fn clone_dtod>( self: &Arc, @@ -1807,51 +1751,39 @@ impl CudaModule { }) } - /// Gets a global/constant symbol from the loaded module. + /// Gets a global/constant symbol from the loaded module as a [CudaSlice]. /// /// This can be used to access `__constant__` memory declared in CUDA kernels. + /// The returned slice can be transmuted to the appropriate type via views. /// /// # Example /// /// ```ignore - /// // In CUDA: __constant__ float my_const[256]; - /// let symbol = module.get_global("my_const")?; - /// stream.memcpy_htos(&data, &symbol)?; + /// // In CUDA: __constant__ float my_const[4]; + /// let symbol = module.get_global("my_const", &stream)?; + /// let mut symbol_view = symbol.as_view_mut(); + /// let mut symbol_f32 = unsafe { symbol_view.transmute_mut::(4).unwrap() }; + /// stream.memcpy_htod(&[1.0f32, 2.0, 3.0, 4.0], &mut symbol_f32)?; /// ``` - pub fn get_global(self: &Arc, name: &str) -> Result { - let name_c = CString::new(name).map_err(|_| DriverError(sys::cudaError_enum::CUDA_ERROR_INVALID_VALUE))?; - let (cu_device_ptr, bytes) = - unsafe { result::module::get_global(self.cu_module, name_c) }?; - Ok(CudaSymbol { + pub fn get_global( + self: &Arc, + name: &str, + stream: &Arc, + ) -> Result, DriverError> { + let name_c = + CString::new(name).map_err(|_| DriverError(sys::CUresult::CUDA_ERROR_INVALID_VALUE))?; + let (cu_device_ptr, bytes) = unsafe { result::module::get_global(self.cu_module, name_c) }?; + Ok(CudaSlice { cu_device_ptr, - bytes, - module: self.clone(), + len: bytes, + read: None, + write: None, + stream: stream.clone(), + marker: PhantomData, }) } } -/// Wrapper around a global/constant symbol from a CUDA module. -/// -/// Created with [CudaModule::get_global()]. Use [CudaStream::memcpy_htos()] -/// or [CudaStream::memcpy_dtos()] to copy data to the symbol. -#[derive(Debug, Clone)] -pub struct CudaSymbol { - pub(crate) cu_device_ptr: sys::CUdeviceptr, - pub(crate) bytes: usize, - #[allow(unused)] - pub(crate) module: Arc, -} - -unsafe impl Send for CudaSymbol {} -unsafe impl Sync for CudaSymbol {} - -impl CudaSymbol { - /// Returns the size of the symbol in bytes. - pub fn num_bytes(&self) -> usize { - self.bytes - } -} - impl CudaFunction { pub fn occupancy_available_dynamic_smem_per_block( &self,