Add safe API for CUDA constant memory operations (#478)

wizenink · gemini-code-assist[bot] · web-flow · commit 12cbbab8420b · 2025-11-01T17:58:50.000-04:00
* Add safe API for CUDA constant memory operations

* Don't panic on invalid name

Co-authored-by: gemini-code-assist[bot] &lt;176961590+gemini-code-assist[bot]@users.noreply.github.com&gt;

* Moved get_global to return CudaSlice&lt;u8&gt;

---------

Co-authored-by: gemini-code-assist[bot] &lt;176961590+gemini-code-assist[bot]@users.noreply.github.com&gt;
diff --git a/examples/09-constant-memory.rs b/examples/09-constant-memory.rs
@@ -0,0 +1,73 @@
+use cudarc::{
+    driver::{CudaContext, DriverError, LaunchConfig, PushKernelArg},
+    nvrtc::Ptx,
+};
+
+fn main() -> Result<(), DriverError> {
+    let ctx = CudaContext::new(0)?;
+    let stream = ctx.default_stream();
+
+    // Load the module containing the kernel with constant memory
+    let module = ctx.load_module(Ptx::from_file("./examples/constant_memory.ptx"))?;
+
+    // Get the constant memory symbol as a CudaSlice<u8>
+    let mut coefficients_symbol = module.get_global("coefficients", &stream)?;
+    println!(
+        "Constant memory symbol 'coefficients' has {} bytes",
+        coefficients_symbol.len()
+    );
+
+    // Set up polynomial coefficients: 1.0 + 2.0*x + 3.0*x^2 + 4.0*x^3
+    let coefficients = [1.0f32, 2.0, 3.0, 4.0];
+
+    // Transmute the symbol to f32 and copy coefficients to constant memory
+    let mut symbol_view = coefficients_symbol.as_view_mut();
+    let mut symbol_f32 = unsafe { symbol_view.transmute_mut::<f32>(4).unwrap() };
+    stream.memcpy_htod(&coefficients, &mut symbol_f32)?;
+
+    // Load the kernel function
+    let polynomial_kernel = module.load_function("polynomial_kernel")?;
+
+    // Prepare input data
+    let input = vec![0.0f32, 1.0, 2.0, 3.0, 4.0, 5.0];
+    let n = input.len();
+
+    // Copy input to device
+    let input_dev = stream.memcpy_stod(&input)?;
+    let mut output_dev = stream.alloc_zeros::<f32>(n)?;
+
+    // Launch kernel
+    let cfg = LaunchConfig::for_num_elems(n as u32);
+    unsafe {
+        stream
+            .launch_builder(&polynomial_kernel)
+            .arg(&mut output_dev)
+            .arg(&input_dev)
+            .arg(&(n as i32))
+            .launch(cfg)
+    }?;
+
+    // Copy results back
+    let output = stream.memcpy_dtov(&output_dev)?;
+
+    // Verify results
+    println!("\nPolynomial evaluation (1.0 + 2.0*x + 3.0*x^2 + 4.0*x^3):");
+    for (i, (&x, &y)) in input.iter().zip(output.iter()).enumerate() {
+        let expected = coefficients[0]
+            + coefficients[1] * x
+            + coefficients[2] * x * x
+            + coefficients[3] * x * x * x;
+        println!("  f({:.1}) = {:.1} (expected {:.1})", x, y, expected);
+        assert!(
+            (y - expected).abs() < 1e-4,
+            "Mismatch at index {}: got {}, expected {}",
+            i,
+            y,
+            expected
+        );
+    }
+
+    println!("\nAll results match expected values!");
+
+    Ok(())
+}
diff --git a/examples/constant_memory.cu b/examples/constant_memory.cu
@@ -0,0 +1,19 @@
+// Constant memory - faster than global memory for read-only data
+// accessed by all threads
+__constant__ float coefficients[4];
+
+extern "C" __global__ void polynomial_kernel(
+    float *out,
+    const float *inp,
+    int numel
+) {
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i < numel) {
+        float x = inp[i];
+        // Compute polynomial: coefficients[0] + coefficients[1]*x + coefficients[2]*x^2 + coefficients[3]*x^3
+        out[i] = coefficients[0] +
+                 coefficients[1] * x +
+                 coefficients[2] * x * x +
+                 coefficients[3] * x * x * x;
+    }
+}
diff --git a/examples/constant_memory.ptx b/examples/constant_memory.ptx
@@ -0,0 +1,60 @@
+//
+// Generated by NVIDIA NVVM Compiler
+//
+// Compiler Build ID: CL-36424714
+// Cuda compilation tools, release 13.0, V13.0.88
+// Based on NVVM 7.0.1
+//
+
+.version 9.0
+.target sm_75
+.address_size 64
+
+	// .globl	polynomial_kernel
+.const .align 4 .b8 coefficients[16];
+
+.visible .entry polynomial_kernel(
+	.param .u64 polynomial_kernel_param_0,
+	.param .u64 polynomial_kernel_param_1,
+	.param .u32 polynomial_kernel_param_2
+)
+{
+	.reg .pred 	%p<2>;
+	.reg .f32 	%f<12>;
+	.reg .b32 	%r<6>;
+	.reg .b64 	%rd<8>;
+
+
+	ld.param.u64 	%rd1, [polynomial_kernel_param_0];
+	ld.param.u64 	%rd2, [polynomial_kernel_param_1];
+	ld.param.u32 	%r2, [polynomial_kernel_param_2];
+	mov.u32 	%r3, %ctaid.x;
+	mov.u32 	%r4, %ntid.x;
+	mov.u32 	%r5, %tid.x;
+	mad.lo.s32 	%r1, %r3, %r4, %r5;
+	setp.ge.s32 	%p1, %r1, %r2;
+	@%p1 bra 	$L__BB0_2;
+
+	cvta.to.global.u64 	%rd3, %rd2;
+	mul.wide.s32 	%rd4, %r1, 4;
+	add.s64 	%rd5, %rd3, %rd4;
+	ld.const.f32 	%f1, [coefficients+4];
+	ld.global.f32 	%f2, [%rd5];
+	ld.const.f32 	%f3, [coefficients];
+	fma.rn.f32 	%f4, %f2, %f1, %f3;
+	ld.const.f32 	%f5, [coefficients+8];
+	mul.f32 	%f6, %f2, %f5;
+	fma.rn.f32 	%f7, %f2, %f6, %f4;
+	ld.const.f32 	%f8, [coefficients+12];
+	mul.f32 	%f9, %f2, %f8;
+	mul.f32 	%f10, %f2, %f9;
+	fma.rn.f32 	%f11, %f2, %f10, %f7;
+	cvta.to.global.u64 	%rd6, %rd1;
+	add.s64 	%rd7, %rd6, %rd4;
+	st.global.f32 	[%rd7], %f11;
+
+$L__BB0_2:
+	ret;
+
+}
+
diff --git a/src/driver/result.rs b/src/driver/result.rs
@@ -965,6 +965,24 @@ pub mod module {
         Ok(func.assume_init())
     }
 
+    /// Returns a pointer to a global/constant symbol in the module.
+    ///
+    /// See [cuda docs](https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MODULE.html#group__CUDA__MODULE_1gf3e43972c23c2d5c8a662f2d9a4d0c24)
+    ///
+    /// # Safety
+    /// `module` must be a properly allocated and not freed module.
+    pub unsafe fn get_global(
+        module: sys::CUmodule,
+        name: CString,
+    ) -> Result<(sys::CUdeviceptr, usize), DriverError> {
+        let name_ptr = name.as_c_str().as_ptr();
+        let mut dptr = MaybeUninit::uninit();
+        let mut bytes = MaybeUninit::uninit();
+        sys::cuModuleGetGlobal_v2(dptr.as_mut_ptr(), bytes.as_mut_ptr(), module, name_ptr)
+            .result()?;
+        Ok((dptr.assume_init(), bytes.assume_init()))
+    }
+
     /// Unloads a module.
     ///
     /// See [cuda docs](https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MODULE.html#group__CUDA__MODULE_1g8ea3d716524369de3763104ced4ea57b)
diff --git a/src/driver/safe/core.rs b/src/driver/safe/core.rs
@@ -1761,6 +1761,38 @@ impl CudaModule {
             module: self.clone(),
         })
     }
+
+    /// Gets a global/constant symbol from the loaded module as a [CudaSlice<u8>].
+    ///
+    /// This can be used to access `__constant__` memory declared in CUDA kernels.
+    /// The returned slice can be transmuted to the appropriate type via views.
+    ///
+    /// # Example
+    ///
+    /// ```ignore
+    /// // In CUDA: __constant__ float my_const[4];
+    /// let symbol = module.get_global("my_const", &stream)?;
+    /// let mut symbol_view = symbol.as_view_mut();
+    /// let mut symbol_f32 = unsafe { symbol_view.transmute_mut::<f32>(4).unwrap() };
+    /// stream.memcpy_htod(&[1.0f32, 2.0, 3.0, 4.0], &mut symbol_f32)?;
+    /// ```
+    pub fn get_global(
+        self: &Arc<Self>,
+        name: &str,
+        stream: &Arc<CudaStream>,
+    ) -> Result<CudaSlice<u8>, DriverError> {
+        let name_c =
+            CString::new(name).map_err(|_| DriverError(sys::CUresult::CUDA_ERROR_INVALID_VALUE))?;
+        let (cu_device_ptr, bytes) = unsafe { result::module::get_global(self.cu_module, name_c) }?;
+        Ok(CudaSlice {
+            cu_device_ptr,
+            len: bytes,
+            read: None,
+            write: None,
+            stream: stream.clone(),
+            marker: PhantomData,
+        })
+    }
 }
 
 impl CudaFunction {