diff --git a/examples/09-constant-memory.rs b/examples/09-constant-memory.rs new file mode 100644 index 00000000..2a4d3e50 --- /dev/null +++ b/examples/09-constant-memory.rs @@ -0,0 +1,73 @@ +use cudarc::{ + driver::{CudaContext, DriverError, LaunchConfig, PushKernelArg}, + nvrtc::Ptx, +}; + +fn main() -> Result<(), DriverError> { + let ctx = CudaContext::new(0)?; + let stream = ctx.default_stream(); + + // Load the module containing the kernel with constant memory + let module = ctx.load_module(Ptx::from_file("./examples/constant_memory.ptx"))?; + + // Get the constant memory symbol as a CudaSlice + let mut coefficients_symbol = module.get_global("coefficients", &stream)?; + println!( + "Constant memory symbol 'coefficients' has {} bytes", + coefficients_symbol.len() + ); + + // Set up polynomial coefficients: 1.0 + 2.0*x + 3.0*x^2 + 4.0*x^3 + let coefficients = [1.0f32, 2.0, 3.0, 4.0]; + + // Transmute the symbol to f32 and copy coefficients to constant memory + let mut symbol_view = coefficients_symbol.as_view_mut(); + let mut symbol_f32 = unsafe { symbol_view.transmute_mut::(4).unwrap() }; + stream.memcpy_htod(&coefficients, &mut symbol_f32)?; + + // Load the kernel function + let polynomial_kernel = module.load_function("polynomial_kernel")?; + + // Prepare input data + let input = vec![0.0f32, 1.0, 2.0, 3.0, 4.0, 5.0]; + let n = input.len(); + + // Copy input to device + let input_dev = stream.memcpy_stod(&input)?; + let mut output_dev = stream.alloc_zeros::(n)?; + + // Launch kernel + let cfg = LaunchConfig::for_num_elems(n as u32); + unsafe { + stream + .launch_builder(&polynomial_kernel) + .arg(&mut output_dev) + .arg(&input_dev) + .arg(&(n as i32)) + .launch(cfg) + }?; + + // Copy results back + let output = stream.memcpy_dtov(&output_dev)?; + + // Verify results + println!("\nPolynomial evaluation (1.0 + 2.0*x + 3.0*x^2 + 4.0*x^3):"); + for (i, (&x, &y)) in input.iter().zip(output.iter()).enumerate() { + let expected = coefficients[0] + + coefficients[1] * x + + coefficients[2] * x * x + + coefficients[3] * x * x * x; + println!(" f({:.1}) = {:.1} (expected {:.1})", x, y, expected); + assert!( + (y - expected).abs() < 1e-4, + "Mismatch at index {}: got {}, expected {}", + i, + y, + expected + ); + } + + println!("\nAll results match expected values!"); + + Ok(()) +} diff --git a/examples/constant_memory.cu b/examples/constant_memory.cu new file mode 100644 index 00000000..ed2d4f7d --- /dev/null +++ b/examples/constant_memory.cu @@ -0,0 +1,19 @@ +// Constant memory - faster than global memory for read-only data +// accessed by all threads +__constant__ float coefficients[4]; + +extern "C" __global__ void polynomial_kernel( + float *out, + const float *inp, + int numel +) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < numel) { + float x = inp[i]; + // Compute polynomial: coefficients[0] + coefficients[1]*x + coefficients[2]*x^2 + coefficients[3]*x^3 + out[i] = coefficients[0] + + coefficients[1] * x + + coefficients[2] * x * x + + coefficients[3] * x * x * x; + } +} diff --git a/examples/constant_memory.ptx b/examples/constant_memory.ptx new file mode 100644 index 00000000..f13d2992 --- /dev/null +++ b/examples/constant_memory.ptx @@ -0,0 +1,60 @@ +// +// Generated by NVIDIA NVVM Compiler +// +// Compiler Build ID: CL-36424714 +// Cuda compilation tools, release 13.0, V13.0.88 +// Based on NVVM 7.0.1 +// + +.version 9.0 +.target sm_75 +.address_size 64 + + // .globl polynomial_kernel +.const .align 4 .b8 coefficients[16]; + +.visible .entry polynomial_kernel( + .param .u64 polynomial_kernel_param_0, + .param .u64 polynomial_kernel_param_1, + .param .u32 polynomial_kernel_param_2 +) +{ + .reg .pred %p<2>; + .reg .f32 %f<12>; + .reg .b32 %r<6>; + .reg .b64 %rd<8>; + + + ld.param.u64 %rd1, [polynomial_kernel_param_0]; + ld.param.u64 %rd2, [polynomial_kernel_param_1]; + ld.param.u32 %r2, [polynomial_kernel_param_2]; + mov.u32 %r3, %ctaid.x; + mov.u32 %r4, %ntid.x; + mov.u32 %r5, %tid.x; + mad.lo.s32 %r1, %r3, %r4, %r5; + setp.ge.s32 %p1, %r1, %r2; + @%p1 bra $L__BB0_2; + + cvta.to.global.u64 %rd3, %rd2; + mul.wide.s32 %rd4, %r1, 4; + add.s64 %rd5, %rd3, %rd4; + ld.const.f32 %f1, [coefficients+4]; + ld.global.f32 %f2, [%rd5]; + ld.const.f32 %f3, [coefficients]; + fma.rn.f32 %f4, %f2, %f1, %f3; + ld.const.f32 %f5, [coefficients+8]; + mul.f32 %f6, %f2, %f5; + fma.rn.f32 %f7, %f2, %f6, %f4; + ld.const.f32 %f8, [coefficients+12]; + mul.f32 %f9, %f2, %f8; + mul.f32 %f10, %f2, %f9; + fma.rn.f32 %f11, %f2, %f10, %f7; + cvta.to.global.u64 %rd6, %rd1; + add.s64 %rd7, %rd6, %rd4; + st.global.f32 [%rd7], %f11; + +$L__BB0_2: + ret; + +} + diff --git a/src/driver/result.rs b/src/driver/result.rs index 3111f4e0..499e6da3 100644 --- a/src/driver/result.rs +++ b/src/driver/result.rs @@ -965,6 +965,24 @@ pub mod module { Ok(func.assume_init()) } + /// Returns a pointer to a global/constant symbol in the module. + /// + /// See [cuda docs](https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MODULE.html#group__CUDA__MODULE_1gf3e43972c23c2d5c8a662f2d9a4d0c24) + /// + /// # Safety + /// `module` must be a properly allocated and not freed module. + pub unsafe fn get_global( + module: sys::CUmodule, + name: CString, + ) -> Result<(sys::CUdeviceptr, usize), DriverError> { + let name_ptr = name.as_c_str().as_ptr(); + let mut dptr = MaybeUninit::uninit(); + let mut bytes = MaybeUninit::uninit(); + sys::cuModuleGetGlobal_v2(dptr.as_mut_ptr(), bytes.as_mut_ptr(), module, name_ptr) + .result()?; + Ok((dptr.assume_init(), bytes.assume_init())) + } + /// Unloads a module. /// /// See [cuda docs](https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MODULE.html#group__CUDA__MODULE_1g8ea3d716524369de3763104ced4ea57b) diff --git a/src/driver/safe/core.rs b/src/driver/safe/core.rs index 97a55982..b854155b 100644 --- a/src/driver/safe/core.rs +++ b/src/driver/safe/core.rs @@ -1750,6 +1750,38 @@ impl CudaModule { module: self.clone(), }) } + + /// Gets a global/constant symbol from the loaded module as a [CudaSlice]. + /// + /// This can be used to access `__constant__` memory declared in CUDA kernels. + /// The returned slice can be transmuted to the appropriate type via views. + /// + /// # Example + /// + /// ```ignore + /// // In CUDA: __constant__ float my_const[4]; + /// let symbol = module.get_global("my_const", &stream)?; + /// let mut symbol_view = symbol.as_view_mut(); + /// let mut symbol_f32 = unsafe { symbol_view.transmute_mut::(4).unwrap() }; + /// stream.memcpy_htod(&[1.0f32, 2.0, 3.0, 4.0], &mut symbol_f32)?; + /// ``` + pub fn get_global( + self: &Arc, + name: &str, + stream: &Arc, + ) -> Result, DriverError> { + let name_c = + CString::new(name).map_err(|_| DriverError(sys::CUresult::CUDA_ERROR_INVALID_VALUE))?; + let (cu_device_ptr, bytes) = unsafe { result::module::get_global(self.cu_module, name_c) }?; + Ok(CudaSlice { + cu_device_ptr, + len: bytes, + read: None, + write: None, + stream: stream.clone(), + marker: PhantomData, + }) + } } impl CudaFunction {